
Commit a5ac4bf

Replace direct cudaMemcpyAsync calls with utility functions (within /src) (#17550)
Replaced the calls to `cudaMemcpyAsync` with the new `cuda_memcpy`/`cuda_memcpy_async` utilities, which optionally avoid using the copy engine. Also took the opportunity to use `cudf::detail::host_vector` and its factories to enable wider pinned memory use. Remaining instances are either not viable (e.g. copying `h_needs_fallback`, interop) or D2D copies.

Authors:
- Vukasin Milovanovic (https://github.com/vuule)

Approvers:
- David Wendt (https://github.com/davidwendt)
- Nghia Truong (https://github.com/ttnghia)

URL: #17550
1 parent e975ca3 commit a5ac4bf
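
The pattern applied across the files below is uniform: a raw `cudaMemcpyAsync` followed by an explicit `stream.synchronize()` becomes a single call to the span-based `cudf::detail::cuda_memcpy` / `cuda_memcpy_async` utilities, with host-side staging moved into pinned `cudf::detail::host_vector` buffers created by the vector factories. The following is a minimal sketch of the resulting shape only, assuming a cuDF development build where the internal detail headers are available; the function and variable names are illustrative, not taken from the commit.

// Illustrative sketch (not part of the commit): stage host data in pinned memory
// and copy it to the device with the utility instead of a raw
// cudaMemcpyAsync + stream.synchronize() pair.
#include <cudf/detail/utilities/cuda_memcpy.hpp>
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/utilities/span.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

#include <numeric>

void stage_to_device(rmm::device_uvector<int>& d_vals, rmm::cuda_stream_view stream)
{
  // Pinned host staging buffer (a cudf::detail::host_vector<int>).
  auto staging = cudf::detail::make_host_vector<int>(d_vals.size(), stream);
  std::iota(staging.begin(), staging.end(), 0);  // fill on the host

  // H2D copy; per the commit description the utility can avoid the copy engine
  // for pinned transfers. The async variant does not synchronize, so `staging`
  // must outlive the copy.
  cudf::detail::cuda_memcpy_async<int>(d_vals, staging, stream);
}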

10 files changed: +93 −109 lines

cpp/include/cudf/detail/device_scalar.hpp

Lines changed: 1 addition & 1 deletion
@@ -78,7 +78,7 @@ class device_scalar : public rmm::device_scalar<T> {
   [[nodiscard]] T value(rmm::cuda_stream_view stream) const
   {
     cuda_memcpy<T>(bounce_buffer, device_span<T const>{this->data(), 1}, stream);
-    return bounce_buffer[0];
+    return std::move(bounce_buffer[0]);
   }
 
   void set_value_async(T const& value, rmm::cuda_stream_view stream)
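
Here `device_scalar` is the internal `cudf::detail` wrapper around `rmm::device_scalar` that reads back through a host-side bounce buffer (pinned where available) via `cuda_memcpy`, so no raw `cudaMemcpyAsync` appears in `value()`; the added `std::move` lets a non-trivially-copyable `T` be moved out of that buffer instead of copied. A small usage sketch, assuming a cuDF development build (the free function name is illustrative):

// Illustrative usage only: reading a device-side scalar back to the host.
#include <cudf/detail/device_scalar.hpp>

#include <rmm/cuda_stream_view.hpp>

int read_back(cudf::detail::device_scalar<int> const& d_val, rmm::cuda_stream_view stream)
{
  // value() copies D2H through the bounce buffer via the synchronous cuda_memcpy,
  // so the returned value is safe to use immediately.
  return d_val.value(stream);
}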

cpp/src/bitmask/is_element_valid.cpp

Lines changed: 7 additions & 7 deletions
@@ -15,6 +15,7 @@
  */
 
 #include <cudf/detail/is_element_valid.hpp>
+#include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/utilities/bit.hpp>
 #include <cudf/utilities/error.hpp>
 
@@ -30,15 +31,14 @@ bool is_element_valid_sync(column_view const& col_view,
   CUDF_EXPECTS(element_index >= 0 and element_index < col_view.size(), "invalid index.");
   if (!col_view.nullable()) { return true; }
 
-  bitmask_type word = 0;
   // null_mask() returns device ptr to bitmask without offset
   size_type const index = element_index + col_view.offset();
-  CUDF_CUDA_TRY(cudaMemcpyAsync(&word,
-                                col_view.null_mask() + word_index(index),
-                                sizeof(bitmask_type),
-                                cudaMemcpyDefault,
-                                stream.value()));
-  stream.synchronize();
+
+  auto const word =
+    cudf::detail::make_host_vector_sync(
+      device_span<bitmask_type const>{col_view.null_mask() + word_index(index), 1}, stream)
+      .front();
+
   return static_cast<bool>(word & (bitmask_type{1} << intra_word_index(index)));
 }
 
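
The rewritten lookup wraps exactly one bitmask word in a `device_span` and materializes it in pinned host memory with `make_host_vector_sync`, which also performs the stream synchronization that was previously explicit; the old version copied into a stack variable, i.e. pageable host memory, which the wider pinned-memory use in this commit is meant to avoid. A minimal sketch of that single-word read, assuming the internal vector-factory header (names are illustrative):

// Sketch of reading one bitmask word from device memory (names are illustrative).
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/span.hpp>

#include <rmm/cuda_stream_view.hpp>

cudf::bitmask_type read_one_word(cudf::bitmask_type const* d_mask,
                                 cudf::size_type word_idx,
                                 rmm::cuda_stream_view stream)
{
  // Wrap exactly one device word in a span, copy it into pinned host memory,
  // and synchronize before the host touches the value.
  auto const host_word = cudf::detail::make_host_vector_sync(
    cudf::device_span<cudf::bitmask_type const>{d_mask + word_idx, 1}, stream);
  return host_word.front();
}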

cpp/src/column/column_device_view.cu

Lines changed: 5 additions & 11 deletions
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,6 +16,7 @@
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_view.hpp>
 #include <cudf/detail/iterator.cuh>
+#include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/error.hpp>
 
@@ -60,13 +61,12 @@ create_device_view_from_view(ColumnView const& source, rmm::cuda_stream_view str
   // A buffer of CPU memory is allocated to hold the ColumnDeviceView
   // objects. Once filled, the CPU memory is copied to device memory
   // and then set into the d_children member pointer.
-  std::vector<char> staging_buffer(descendant_storage_bytes);
+  auto staging_buffer = detail::make_host_vector<char>(descendant_storage_bytes, stream);
 
   // Each ColumnDeviceView instance may have child objects that
   // require setting some internal device pointers before being copied
   // from CPU to device.
-  rmm::device_buffer* const descendant_storage =
-    new rmm::device_buffer(descendant_storage_bytes, stream);
+  auto const descendant_storage = new rmm::device_uvector<char>(descendant_storage_bytes, stream);
 
   auto deleter = [descendant_storage](ColumnDeviceView* v) {
     v->destroy();
@@ -77,13 +77,7 @@ create_device_view_from_view(ColumnView const& source, rmm::cuda_stream_view str
     new ColumnDeviceView(source, staging_buffer.data(), descendant_storage->data()), deleter};
 
   // copy the CPU memory with all the children into device memory
-  CUDF_CUDA_TRY(cudaMemcpyAsync(descendant_storage->data(),
-                                staging_buffer.data(),
-                                descendant_storage->size(),
-                                cudaMemcpyDefault,
-                                stream.value()));
-
-  stream.synchronize();
+  detail::cuda_memcpy<char>(*descendant_storage, staging_buffer, stream);
 
   return result;
 }
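
This is the host-to-device direction of the same idea: the staging buffer becomes a pinned `host_vector<char>`, the device allocation becomes a typed `rmm::device_uvector<char>`, and the copy-plus-synchronize pair collapses into one synchronous `cuda_memcpy<char>` call. A reduced sketch of that staging flow, again assuming the cuDF detail headers (names are illustrative); unlike the async variant, the synchronous call lets the staging buffer go out of scope right away:

// Illustrative staging sketch: build bytes on the host, then copy them to the device.
#include <cudf/detail/utilities/cuda_memcpy.hpp>
#include <cudf/detail/utilities/vector_factories.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

#include <cstddef>
#include <cstring>

rmm::device_uvector<char> stage_bytes(void const* host_src,
                                      std::size_t num_bytes,
                                      rmm::cuda_stream_view stream)
{
  // Pinned host staging buffer sized to the payload.
  auto staging = cudf::detail::make_host_vector<char>(num_bytes, stream);
  std::memcpy(staging.data(), host_src, num_bytes);

  // Device destination; cuda_memcpy (the synchronous variant) copies and then
  // synchronizes the stream, so `staging` may be destroyed immediately afterwards.
  rmm::device_uvector<char> d_bytes(num_bytes, stream);
  cudf::detail::cuda_memcpy<char>(d_bytes, staging, stream);
  return d_bytes;
}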

cpp/src/copying/contiguous_split.cu

Lines changed: 43 additions & 52 deletions
@@ -998,7 +998,8 @@ struct packed_split_indices_and_src_buf_info {
       src_buf_info_size(
         cudf::util::round_up_safe(num_src_bufs * sizeof(src_buf_info), split_align)),
       // host-side
-      h_indices_and_source_info(indices_size + src_buf_info_size),
+      h_indices_and_source_info{
+        detail::make_host_vector<uint8_t>(indices_size + src_buf_info_size, stream)},
       h_indices{reinterpret_cast<size_type*>(h_indices_and_source_info.data())},
       h_src_buf_info{
         reinterpret_cast<src_buf_info*>(h_indices_and_source_info.data() + indices_size)}
@@ -1025,15 +1026,18 @@ struct packed_split_indices_and_src_buf_info {
       reinterpret_cast<size_type*>(reinterpret_cast<uint8_t*>(d_indices_and_source_info.data()) +
                                    indices_size + src_buf_info_size);
 
-    CUDF_CUDA_TRY(cudaMemcpyAsync(
-      d_indices, h_indices, indices_size + src_buf_info_size, cudaMemcpyDefault, stream.value()));
+    detail::cuda_memcpy_async<uint8_t>(
+      device_span<uint8_t>{static_cast<uint8_t*>(d_indices_and_source_info.data()),
+                           h_indices_and_source_info.size()},
+      h_indices_and_source_info,
+      stream);
   }
 
   size_type const indices_size;
   std::size_t const src_buf_info_size;
   std::size_t offset_stack_size;
 
-  std::vector<uint8_t> h_indices_and_source_info;
+  detail::host_vector<uint8_t> h_indices_and_source_info;
   rmm::device_buffer d_indices_and_source_info;
 
   size_type* const h_indices;
@@ -1055,27 +1059,26 @@ struct packed_partition_buf_size_and_dst_buf_info {
       buf_sizes_size{cudf::util::round_up_safe(num_partitions * sizeof(std::size_t), split_align)},
       dst_buf_info_size{cudf::util::round_up_safe(num_bufs * sizeof(dst_buf_info), split_align)},
       // host-side
-      h_buf_sizes_and_dst_info(buf_sizes_size + dst_buf_info_size),
+      h_buf_sizes_and_dst_info{
+        detail::make_host_vector<uint8_t>(buf_sizes_size + dst_buf_info_size, stream)},
       h_buf_sizes{reinterpret_cast<std::size_t*>(h_buf_sizes_and_dst_info.data())},
       h_dst_buf_info{
-        reinterpret_cast<dst_buf_info*>(h_buf_sizes_and_dst_info.data() + buf_sizes_size)},
+        reinterpret_cast<dst_buf_info*>(h_buf_sizes_and_dst_info.data() + buf_sizes_size),
+        num_bufs,
+        h_buf_sizes_and_dst_info.get_allocator().is_device_accessible()},
       // device-side
-      d_buf_sizes_and_dst_info(buf_sizes_size + dst_buf_info_size, stream, temp_mr),
+      d_buf_sizes_and_dst_info(h_buf_sizes_and_dst_info.size(), stream, temp_mr),
       d_buf_sizes{reinterpret_cast<std::size_t*>(d_buf_sizes_and_dst_info.data())},
       // destination buffer info
-      d_dst_buf_info{reinterpret_cast<dst_buf_info*>(
-        static_cast<uint8_t*>(d_buf_sizes_and_dst_info.data()) + buf_sizes_size)}
+      d_dst_buf_info{
+        reinterpret_cast<dst_buf_info*>(d_buf_sizes_and_dst_info.data() + buf_sizes_size), num_bufs}
   {
   }
 
   void copy_to_host()
   {
     // DtoH buf sizes and col info back to the host
-    CUDF_CUDA_TRY(cudaMemcpyAsync(h_buf_sizes,
-                                  d_buf_sizes,
-                                  buf_sizes_size + dst_buf_info_size,
-                                  cudaMemcpyDefault,
-                                  stream.value()));
+    detail::cuda_memcpy_async<uint8_t>(h_buf_sizes_and_dst_info, d_buf_sizes_and_dst_info, stream);
   }
 
   rmm::cuda_stream_view const stream;
@@ -1084,13 +1087,13 @@ struct packed_partition_buf_size_and_dst_buf_info {
   std::size_t const buf_sizes_size;
   std::size_t const dst_buf_info_size;
 
-  std::vector<uint8_t> h_buf_sizes_and_dst_info;
+  detail::host_vector<uint8_t> h_buf_sizes_and_dst_info;
   std::size_t* const h_buf_sizes;
-  dst_buf_info* const h_dst_buf_info;
+  host_span<dst_buf_info> const h_dst_buf_info;
 
-  rmm::device_buffer d_buf_sizes_and_dst_info;
+  rmm::device_uvector<uint8_t> d_buf_sizes_and_dst_info;
   std::size_t* const d_buf_sizes;
-  dst_buf_info* const d_dst_buf_info;
+  device_span<dst_buf_info> const d_dst_buf_info;
 };
 
 // Packed block of memory 3:
@@ -1106,11 +1109,12 @@ struct packed_src_and_dst_pointers {
       src_bufs_size{cudf::util::round_up_safe(num_src_bufs * sizeof(uint8_t*), split_align)},
       dst_bufs_size{cudf::util::round_up_safe(num_partitions * sizeof(uint8_t*), split_align)},
       // host-side
-      h_src_and_dst_buffers(src_bufs_size + dst_bufs_size),
+      h_src_and_dst_buffers{
+        detail::make_host_vector<uint8_t>(src_bufs_size + dst_bufs_size, stream)},
       h_src_bufs{reinterpret_cast<uint8_t const**>(h_src_and_dst_buffers.data())},
       h_dst_bufs{reinterpret_cast<uint8_t**>(h_src_and_dst_buffers.data() + src_bufs_size)},
       // device-side
-      d_src_and_dst_buffers{rmm::device_buffer(src_bufs_size + dst_bufs_size, stream, temp_mr)},
+      d_src_and_dst_buffers{h_src_and_dst_buffers.size(), stream, temp_mr},
       d_src_bufs{reinterpret_cast<uint8_t const**>(d_src_and_dst_buffers.data())},
       d_dst_bufs{reinterpret_cast<uint8_t**>(
         reinterpret_cast<uint8_t*>(d_src_and_dst_buffers.data()) + src_bufs_size)}
@@ -1121,18 +1125,18 @@ struct packed_src_and_dst_pointers {
 
   void copy_to_device()
   {
-    CUDF_CUDA_TRY(cudaMemcpyAsync(d_src_and_dst_buffers.data(),
-                                  h_src_and_dst_buffers.data(),
-                                  src_bufs_size + dst_bufs_size,
-                                  cudaMemcpyDefault,
-                                  stream.value()));
+    detail::cuda_memcpy_async<uint8_t>(
+      device_span<uint8_t>{static_cast<uint8_t*>(d_src_and_dst_buffers.data()),
+                           d_src_and_dst_buffers.size()},
+      h_src_and_dst_buffers,
+      stream);
   }
 
   rmm::cuda_stream_view const stream;
   std::size_t const src_bufs_size;
   std::size_t const dst_bufs_size;
 
-  std::vector<uint8_t> h_src_and_dst_buffers;
+  detail::host_vector<uint8_t> h_src_and_dst_buffers;
   uint8_t const** const h_src_bufs;
   uint8_t** const h_dst_bufs;
 
@@ -1205,7 +1209,7 @@ std::unique_ptr<packed_partition_buf_size_and_dst_buf_info> compute_splits(
     std::make_unique<packed_partition_buf_size_and_dst_buf_info>(
      num_partitions, num_bufs, stream, temp_mr);
 
-  auto const d_dst_buf_info = partition_buf_size_and_dst_buf_info->d_dst_buf_info;
+  auto const d_dst_buf_info = partition_buf_size_and_dst_buf_info->d_dst_buf_info.begin();
   auto const d_buf_sizes = partition_buf_size_and_dst_buf_info->d_buf_sizes;
 
   auto const split_indices_and_src_buf_info = packed_split_indices_and_src_buf_info(
@@ -1518,26 +1522,19 @@ std::unique_ptr<chunk_iteration_state> chunk_iteration_state::create(
    */
   if (user_buffer_size != 0) {
     // copy the batch offsets back to host
-    std::vector<std::size_t> h_offsets(num_batches + 1);
-    {
-      rmm::device_uvector<std::size_t> offsets(h_offsets.size(), stream, temp_mr);
+    auto const h_offsets = [&] {
+      rmm::device_uvector<std::size_t> offsets(num_batches + 1, stream, temp_mr);
       auto const batch_byte_size_iter = cudf::detail::make_counting_transform_iterator(
        0, batch_byte_size_function{num_batches, d_batched_dst_buf_info.begin()});
 
-      thrust::exclusive_scan(rmm::exec_policy(stream, temp_mr),
+      thrust::exclusive_scan(rmm::exec_policy_nosync(stream, temp_mr),
                              batch_byte_size_iter,
-                             batch_byte_size_iter + num_batches + 1,
+                             batch_byte_size_iter + offsets.size(),
                              offsets.begin());
 
-      CUDF_CUDA_TRY(cudaMemcpyAsync(h_offsets.data(),
-                                    offsets.data(),
-                                    sizeof(std::size_t) * offsets.size(),
-                                    cudaMemcpyDefault,
-                                    stream.value()));
-
       // the next part is working on the CPU, so we want to synchronize here
-      stream.synchronize();
-    }
+      return detail::make_host_vector_sync(offsets, stream);
+    }();
 
     std::vector<std::size_t> num_batches_per_iteration;
     std::vector<std::size_t> size_of_batches_per_iteration;
@@ -1699,7 +1696,7 @@ void copy_data(int num_batches_to_copy,
                int starting_batch,
                uint8_t const** d_src_bufs,
                uint8_t** d_dst_bufs,
-               rmm::device_uvector<dst_buf_info>& d_dst_buf_info,
+               device_span<dst_buf_info> d_dst_buf_info,
                uint8_t* user_buffer,
                rmm::cuda_stream_view stream)
 {
@@ -1833,15 +1830,9 @@ struct contiguous_split_state {
                           keys + num_batches_total,
                           values,
                           thrust::make_discard_iterator(),
-                          dst_valid_count_output_iterator{d_orig_dst_buf_info});
-
-    CUDF_CUDA_TRY(cudaMemcpyAsync(h_orig_dst_buf_info,
-                                  d_orig_dst_buf_info,
-                                  partition_buf_size_and_dst_buf_info->dst_buf_info_size,
-                                  cudaMemcpyDefault,
-                                  stream.value()));
+                          dst_valid_count_output_iterator{d_orig_dst_buf_info.begin()});
 
-    stream.synchronize();
+    detail::cuda_memcpy<dst_buf_info>(h_orig_dst_buf_info, d_orig_dst_buf_info, stream);
 
     // not necessary for the non-chunked case, but it makes it so further calls to has_next
     // return false, just in case
@@ -1889,7 +1880,7 @@ struct contiguous_split_state {
    }
 
    auto& h_dst_buf_info = partition_buf_size_and_dst_buf_info->h_dst_buf_info;
-   auto cur_dst_buf_info = h_dst_buf_info;
+   auto cur_dst_buf_info = h_dst_buf_info.data();
    detail::metadata_builder mb{input.num_columns()};
 
    populate_metadata(input.begin(), input.end(), cur_dst_buf_info, mb);
@@ -1927,7 +1918,7 @@ struct contiguous_split_state {
 
    // Second pass: uses `dst_buf_info` to break down the work into 1MB batches.
    chunk_iter_state = compute_batches(num_bufs,
-                                      partition_buf_size_and_dst_buf_info->d_dst_buf_info,
+                                      partition_buf_size_and_dst_buf_info->d_dst_buf_info.data(),
                                       partition_buf_size_and_dst_buf_info->h_buf_sizes,
                                       num_partitions,
                                       user_buffer_size,
@@ -1963,7 +1954,7 @@ struct contiguous_split_state {
    auto& h_dst_buf_info = partition_buf_size_and_dst_buf_info->h_dst_buf_info;
    auto& h_dst_bufs = src_and_dst_pointers->h_dst_bufs;
 
-   auto cur_dst_buf_info = h_dst_buf_info;
+   auto cur_dst_buf_info = h_dst_buf_info.data();
    detail::metadata_builder mb(input.num_columns());
 
    for (std::size_t idx = 0; idx < num_partitions; idx++) {
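
Besides the memcpy replacement, this file switches several struct members and the `copy_data` parameter from raw pointers and concrete containers to `host_span` / `device_span` views. The practical effect is that callers can hand over a `device_uvector`, another span, or a pointer plus size without the callee caring which; a brief hedged sketch of that span-parameter pattern (the functions here are illustrative, not from the commit):

// Illustrative sketch: a span parameter accepts several container types unchanged.
#include <cudf/utilities/span.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

#include <cstddef>

// The callee only needs a view of contiguous device memory.
void process(cudf::device_span<int const> d_vals, rmm::cuda_stream_view stream)
{
  // ... launch a kernel or call a detail utility over d_vals on `stream` ...
  (void)d_vals;
  (void)stream;
}

void caller(rmm::device_uvector<int> const& owned,
            int const* raw_ptr,
            std::size_t n,
            rmm::cuda_stream_view stream)
{
  process(owned, stream);                                     // device_uvector converts implicitly
  process(cudf::device_span<int const>{raw_ptr, n}, stream);  // pointer + size wraps explicitly
}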

cpp/src/io/csv/reader_impl.cu

Lines changed: 1 addition & 1 deletion
@@ -21,13 +21,13 @@
 
 #include "csv_common.hpp"
 #include "csv_gpu.hpp"
-#include "cudf/detail/utilities/cuda_memcpy.hpp"
 #include "io/comp/io_uncomp.hpp"
 #include "io/utilities/column_buffer.hpp"
 #include "io/utilities/hostdevice_vector.hpp"
 #include "io/utilities/parsing_utils.cuh"
 
 #include <cudf/detail/utilities/cuda.cuh>
+#include <cudf/detail/utilities/cuda_memcpy.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/detail/utilities/visitor_overload.hpp>
 #include <cudf/io/csv.hpp>

cpp/src/io/orc/writer_impl.cu

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,6 @@
  * @brief cuDF-IO ORC writer class implementation
  */
 
-#include "cudf/detail/utilities/cuda_memcpy.hpp"
 #include "io/comp/nvcomp_adapter.hpp"
 #include "io/orc/orc_gpu.hpp"
 #include "io/statistics/column_statistics.cuh"
@@ -30,6 +29,7 @@
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/utilities/batched_memcpy.hpp>
 #include <cudf/detail/utilities/cuda.cuh>
+#include <cudf/detail/utilities/cuda_memcpy.hpp>
 #include <cudf/detail/utilities/stream_pool.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/logger.hpp>

cpp/src/reductions/minmax.cu

Lines changed: 4 additions & 7 deletions
@@ -218,9 +218,8 @@ struct minmax_functor {
     auto dev_result = reduce<cudf::string_view>(col, stream);
     // copy the minmax_pair to the host; does not copy the strings
     using OutputType = minmax_pair<cudf::string_view>;
-    OutputType host_result;
-    CUDF_CUDA_TRY(cudaMemcpyAsync(
-      &host_result, dev_result.data(), sizeof(OutputType), cudaMemcpyDefault, stream.value()));
+
+    auto const host_result = dev_result.value(stream);
     // strings are copied to create the scalars here
     return {std::make_unique<string_scalar>(host_result.min_val, true, stream, mr),
             std::make_unique<string_scalar>(host_result.max_val, true, stream, mr)};
@@ -236,10 +235,8 @@
     // compute minimum and maximum values
     auto dev_result = reduce<T>(col, stream);
     // copy the minmax_pair to the host to call get_element
-    using OutputType = minmax_pair<T>;
-    OutputType host_result;
-    CUDF_CUDA_TRY(cudaMemcpyAsync(
-      &host_result, dev_result.data(), sizeof(OutputType), cudaMemcpyDefault, stream.value()));
+    using OutputType = minmax_pair<T>;
+    OutputType host_result = dev_result.value(stream);
     // get the keys for those indexes
     auto const keys = dictionary_column_view(col).keys();
     return {detail::get_element(keys, static_cast<size_type>(host_result.min_val), stream, mr),

cpp/src/scalar/scalar.cpp

Lines changed: 4 additions & 5 deletions
@@ -114,11 +114,10 @@ string_scalar::operator std::string() const { return this->to_string(cudf::get_d
 
 std::string string_scalar::to_string(rmm::cuda_stream_view stream) const
 {
-  std::string result;
-  result.resize(_data.size());
-  CUDF_CUDA_TRY(
-    cudaMemcpyAsync(&result[0], _data.data(), _data.size(), cudaMemcpyDefault, stream.value()));
-  stream.synchronize();
+  std::string result(size(), '\0');
+  detail::cuda_memcpy(host_span<char>{result.data(), result.size()},
+                      device_span<char const>{data(), _data.size()},
+                      stream);
   return result;
 }
 
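
Note that the destination of `cuda_memcpy` does not have to be a `host_vector`: the call above constructs both spans explicitly, so any valid host buffer, here the `std::string`'s own storage, can be wrapped in a `host_span<char>` directly. A small sketch of that span-wrapping pattern, assuming the detail headers (names are illustrative):

// Illustrative sketch: copy device chars into a std::string via spans,
// mirroring the to_string change above.
#include <cudf/detail/utilities/cuda_memcpy.hpp>
#include <cudf/utilities/span.hpp>

#include <rmm/cuda_stream_view.hpp>

#include <cstddef>
#include <string>

std::string device_chars_to_string(char const* d_chars,
                                   std::size_t num_bytes,
                                   rmm::cuda_stream_view stream)
{
  std::string result(num_bytes, '\0');
  // Wrap the string's storage and the device buffer in spans; the synchronous
  // cuda_memcpy waits on `stream`, so `result` is ready to use on return.
  cudf::detail::cuda_memcpy(cudf::host_span<char>{result.data(), result.size()},
                            cudf::device_span<char const>{d_chars, num_bytes},
                            stream);
  return result;
}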
