@@ -998,7 +998,8 @@ struct packed_split_indices_and_src_buf_info {
998998 src_buf_info_size(
999999 cudf::util::round_up_safe (num_src_bufs * sizeof (src_buf_info), split_align)),
10001000 // host-side
1001- h_indices_and_source_info(indices_size + src_buf_info_size),
1001+ h_indices_and_source_info{
1002+ detail::make_host_vector<uint8_t >(indices_size + src_buf_info_size, stream)},
10021003 h_indices{reinterpret_cast <size_type*>(h_indices_and_source_info.data ())},
10031004 h_src_buf_info{
10041005 reinterpret_cast <src_buf_info*>(h_indices_and_source_info.data () + indices_size)}
@@ -1025,15 +1026,18 @@ struct packed_split_indices_and_src_buf_info {
10251026 reinterpret_cast <size_type*>(reinterpret_cast <uint8_t *>(d_indices_and_source_info.data ()) +
10261027 indices_size + src_buf_info_size);
10271028
1028- CUDF_CUDA_TRY (cudaMemcpyAsync (
1029- d_indices, h_indices, indices_size + src_buf_info_size, cudaMemcpyDefault, stream.value ()));
1029+ detail::cuda_memcpy_async<uint8_t >(
1030+ device_span<uint8_t >{static_cast <uint8_t *>(d_indices_and_source_info.data ()),
1031+ h_indices_and_source_info.size ()},
1032+ h_indices_and_source_info,
1033+ stream);
10301034 }
10311035
10321036 size_type const indices_size;
10331037 std::size_t const src_buf_info_size;
10341038 std::size_t offset_stack_size;
10351039
1036- std::vector <uint8_t > h_indices_and_source_info;
1040+ detail::host_vector <uint8_t > h_indices_and_source_info;
10371041 rmm::device_buffer d_indices_and_source_info;
10381042
10391043 size_type* const h_indices;
@@ -1055,27 +1059,26 @@ struct packed_partition_buf_size_and_dst_buf_info {
10551059 buf_sizes_size{cudf::util::round_up_safe (num_partitions * sizeof (std::size_t ), split_align)},
10561060 dst_buf_info_size{cudf::util::round_up_safe (num_bufs * sizeof (dst_buf_info), split_align)},
10571061 // host-side
1058- h_buf_sizes_and_dst_info (buf_sizes_size + dst_buf_info_size),
1062+ h_buf_sizes_and_dst_info{
1063+ detail::make_host_vector<uint8_t >(buf_sizes_size + dst_buf_info_size, stream)},
10591064 h_buf_sizes{reinterpret_cast <std::size_t *>(h_buf_sizes_and_dst_info.data ())},
10601065 h_dst_buf_info{
1061- reinterpret_cast <dst_buf_info*>(h_buf_sizes_and_dst_info.data () + buf_sizes_size)},
1066+ reinterpret_cast <dst_buf_info*>(h_buf_sizes_and_dst_info.data () + buf_sizes_size),
1067+ num_bufs,
1068+ h_buf_sizes_and_dst_info.get_allocator ().is_device_accessible ()},
10621069 // device-side
1063- d_buf_sizes_and_dst_info (buf_sizes_size + dst_buf_info_size , stream, temp_mr),
1070+ d_buf_sizes_and_dst_info (h_buf_sizes_and_dst_info.size() , stream, temp_mr),
10641071 d_buf_sizes{reinterpret_cast <std::size_t *>(d_buf_sizes_and_dst_info.data ())},
10651072 // destination buffer info
1066- d_dst_buf_info{reinterpret_cast <dst_buf_info*>(
1067- static_cast < uint8_t *>(d_buf_sizes_and_dst_info.data ()) + buf_sizes_size)}
1073+ d_dst_buf_info{
1074+ reinterpret_cast <dst_buf_info *>(d_buf_sizes_and_dst_info.data () + buf_sizes_size), num_bufs }
10681075 {
10691076 }
10701077
10711078 void copy_to_host ()
10721079 {
10731080 // DtoH buf sizes and col info back to the host
1074- CUDF_CUDA_TRY (cudaMemcpyAsync (h_buf_sizes,
1075- d_buf_sizes,
1076- buf_sizes_size + dst_buf_info_size,
1077- cudaMemcpyDefault,
1078- stream.value ()));
1081+ detail::cuda_memcpy_async<uint8_t >(h_buf_sizes_and_dst_info, d_buf_sizes_and_dst_info, stream);
10791082 }
10801083
10811084 rmm::cuda_stream_view const stream;
@@ -1084,13 +1087,13 @@ struct packed_partition_buf_size_and_dst_buf_info {
10841087 std::size_t const buf_sizes_size;
10851088 std::size_t const dst_buf_info_size;
10861089
1087- std::vector <uint8_t > h_buf_sizes_and_dst_info;
1090+ detail::host_vector <uint8_t > h_buf_sizes_and_dst_info;
10881091 std::size_t * const h_buf_sizes;
1089- dst_buf_info* const h_dst_buf_info;
1092+ host_span< dst_buf_info> const h_dst_buf_info;
10901093
1091- rmm::device_buffer d_buf_sizes_and_dst_info;
1094+ rmm::device_uvector< uint8_t > d_buf_sizes_and_dst_info;
10921095 std::size_t * const d_buf_sizes;
1093- dst_buf_info* const d_dst_buf_info;
1096+ device_span< dst_buf_info> const d_dst_buf_info;
10941097};
10951098
10961099// Packed block of memory 3:
@@ -1106,11 +1109,12 @@ struct packed_src_and_dst_pointers {
11061109 src_bufs_size{cudf::util::round_up_safe (num_src_bufs * sizeof (uint8_t *), split_align)},
11071110 dst_bufs_size{cudf::util::round_up_safe (num_partitions * sizeof (uint8_t *), split_align)},
11081111 // host-side
1109- h_src_and_dst_buffers (src_bufs_size + dst_bufs_size),
1112+ h_src_and_dst_buffers{
1113+ detail::make_host_vector<uint8_t >(src_bufs_size + dst_bufs_size, stream)},
11101114 h_src_bufs{reinterpret_cast <uint8_t const **>(h_src_and_dst_buffers.data ())},
11111115 h_dst_bufs{reinterpret_cast <uint8_t **>(h_src_and_dst_buffers.data () + src_bufs_size)},
11121116 // device-side
1113- d_src_and_dst_buffers{rmm::device_buffer (src_bufs_size + dst_bufs_size , stream, temp_mr) },
1117+ d_src_and_dst_buffers{h_src_and_dst_buffers. size () , stream, temp_mr},
11141118 d_src_bufs{reinterpret_cast <uint8_t const **>(d_src_and_dst_buffers.data ())},
11151119 d_dst_bufs{reinterpret_cast <uint8_t **>(
11161120 reinterpret_cast <uint8_t *>(d_src_and_dst_buffers.data ()) + src_bufs_size)}
@@ -1121,18 +1125,18 @@ struct packed_src_and_dst_pointers {
11211125
11221126 void copy_to_device ()
11231127 {
1124- CUDF_CUDA_TRY ( cudaMemcpyAsync (d_src_and_dst_buffers. data (),
1125- h_src_and_dst_buffers .data (),
1126- src_bufs_size + dst_bufs_size ,
1127- cudaMemcpyDefault ,
1128- stream. value ()) );
1128+ detail::cuda_memcpy_async< uint8_t >(
1129+ device_span< uint8_t >{ static_cast < uint8_t *>(d_src_and_dst_buffers .data () ),
1130+ d_src_and_dst_buffers. size ()} ,
1131+ h_src_and_dst_buffers ,
1132+ stream);
11291133 }
11301134
11311135 rmm::cuda_stream_view const stream;
11321136 std::size_t const src_bufs_size;
11331137 std::size_t const dst_bufs_size;
11341138
1135- std::vector <uint8_t > h_src_and_dst_buffers;
1139+ detail::host_vector <uint8_t > h_src_and_dst_buffers;
11361140 uint8_t const ** const h_src_bufs;
11371141 uint8_t ** const h_dst_bufs;
11381142
@@ -1205,7 +1209,7 @@ std::unique_ptr<packed_partition_buf_size_and_dst_buf_info> compute_splits(
12051209 std::make_unique<packed_partition_buf_size_and_dst_buf_info>(
12061210 num_partitions, num_bufs, stream, temp_mr);
12071211
1208- auto const d_dst_buf_info = partition_buf_size_and_dst_buf_info->d_dst_buf_info ;
1212+ auto const d_dst_buf_info = partition_buf_size_and_dst_buf_info->d_dst_buf_info . begin () ;
12091213 auto const d_buf_sizes = partition_buf_size_and_dst_buf_info->d_buf_sizes ;
12101214
12111215 auto const split_indices_and_src_buf_info = packed_split_indices_and_src_buf_info (
@@ -1518,26 +1522,19 @@ std::unique_ptr<chunk_iteration_state> chunk_iteration_state::create(
15181522 */
15191523 if (user_buffer_size != 0 ) {
15201524 // copy the batch offsets back to host
1521- std::vector<std::size_t > h_offsets (num_batches + 1 );
1522- {
1523- rmm::device_uvector<std::size_t > offsets (h_offsets.size (), stream, temp_mr);
1525+ auto const h_offsets = [&] {
1526+ rmm::device_uvector<std::size_t > offsets (num_batches + 1 , stream, temp_mr);
15241527 auto const batch_byte_size_iter = cudf::detail::make_counting_transform_iterator (
15251528 0 , batch_byte_size_function{num_batches, d_batched_dst_buf_info.begin ()});
15261529
1527- thrust::exclusive_scan (rmm::exec_policy (stream, temp_mr),
1530+ thrust::exclusive_scan (rmm::exec_policy_nosync (stream, temp_mr),
15281531 batch_byte_size_iter,
1529- batch_byte_size_iter + num_batches + 1 ,
1532+ batch_byte_size_iter + offsets. size () ,
15301533 offsets.begin ());
15311534
1532- CUDF_CUDA_TRY (cudaMemcpyAsync (h_offsets.data (),
1533- offsets.data (),
1534- sizeof (std::size_t ) * offsets.size (),
1535- cudaMemcpyDefault,
1536- stream.value ()));
1537-
15381535 // the next part is working on the CPU, so we want to synchronize here
1539- stream. synchronize ( );
1540- }
1536+ return detail::make_host_vector_sync (offsets, stream );
1537+ }();
15411538
15421539 std::vector<std::size_t > num_batches_per_iteration;
15431540 std::vector<std::size_t > size_of_batches_per_iteration;
@@ -1699,7 +1696,7 @@ void copy_data(int num_batches_to_copy,
16991696 int starting_batch,
17001697 uint8_t const ** d_src_bufs,
17011698 uint8_t ** d_dst_bufs,
1702- rmm::device_uvector <dst_buf_info>& d_dst_buf_info,
1699+ device_span <dst_buf_info> d_dst_buf_info,
17031700 uint8_t * user_buffer,
17041701 rmm::cuda_stream_view stream)
17051702{
@@ -1833,15 +1830,9 @@ struct contiguous_split_state {
18331830 keys + num_batches_total,
18341831 values,
18351832 thrust::make_discard_iterator (),
1836- dst_valid_count_output_iterator{d_orig_dst_buf_info});
1837-
1838- CUDF_CUDA_TRY (cudaMemcpyAsync (h_orig_dst_buf_info,
1839- d_orig_dst_buf_info,
1840- partition_buf_size_and_dst_buf_info->dst_buf_info_size ,
1841- cudaMemcpyDefault,
1842- stream.value ()));
1833+ dst_valid_count_output_iterator{d_orig_dst_buf_info.begin ()});
18431834
1844- stream. synchronize ( );
1835+ detail::cuda_memcpy<dst_buf_info>(h_orig_dst_buf_info, d_orig_dst_buf_info, stream );
18451836
18461837 // not necessary for the non-chunked case, but it makes it so further calls to has_next
18471838 // return false, just in case
@@ -1889,7 +1880,7 @@ struct contiguous_split_state {
18891880 }
18901881
18911882 auto & h_dst_buf_info = partition_buf_size_and_dst_buf_info->h_dst_buf_info ;
1892- auto cur_dst_buf_info = h_dst_buf_info;
1883+ auto cur_dst_buf_info = h_dst_buf_info. data () ;
18931884 detail::metadata_builder mb{input.num_columns ()};
18941885
18951886 populate_metadata (input.begin (), input.end (), cur_dst_buf_info, mb);
@@ -1927,7 +1918,7 @@ struct contiguous_split_state {
19271918
19281919 // Second pass: uses `dst_buf_info` to break down the work into 1MB batches.
19291920 chunk_iter_state = compute_batches (num_bufs,
1930- partition_buf_size_and_dst_buf_info->d_dst_buf_info ,
1921+ partition_buf_size_and_dst_buf_info->d_dst_buf_info . data () ,
19311922 partition_buf_size_and_dst_buf_info->h_buf_sizes ,
19321923 num_partitions,
19331924 user_buffer_size,
@@ -1963,7 +1954,7 @@ struct contiguous_split_state {
19631954 auto & h_dst_buf_info = partition_buf_size_and_dst_buf_info->h_dst_buf_info ;
19641955 auto & h_dst_bufs = src_and_dst_pointers->h_dst_bufs ;
19651956
1966- auto cur_dst_buf_info = h_dst_buf_info;
1957+ auto cur_dst_buf_info = h_dst_buf_info. data () ;
19671958 detail::metadata_builder mb (input.num_columns ());
19681959
19691960 for (std::size_t idx = 0 ; idx < num_partitions; idx++) {
0 commit comments