Skip to content

Commit 3be772f

Browse files
authored
Global stream pool (#13922)
#13637 added a static stream pool object for use by the Parquet reader. This PR expands upon that by: - Moving the stream pool to the `cudf::detail` namespace. - Adding a debugging implementation that always returns the default stream. - Hiding implementation details behind a more streamlined interface. - Using CUDA events for synchronization. Authors: - Ed Seidl (https://github.com/etseidl) - Vukasin Milovanovic (https://github.com/vuule) - Mark Harris (https://github.com/harrism) Approvers: - Bradley Dice (https://github.com/bdice) - Vukasin Milovanovic (https://github.com/vuule) - Mark Harris (https://github.com/harrism) URL: #13922
1 parent 258e0fe commit 3be772f

File tree

5 files changed

+341
-71
lines changed

5 files changed

+341
-71
lines changed

cpp/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -633,6 +633,7 @@ add_library(
633633
src/utilities/linked_column.cpp
634634
src/utilities/logger.cpp
635635
src/utilities/stacktrace.cpp
636+
src/utilities/stream_pool.cpp
636637
src/utilities/traits.cpp
637638
src/utilities/type_checks.cpp
638639
src/utilities/type_dispatcher.cpp
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
/*
2+
* Copyright (c) 2023, NVIDIA CORPORATION.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#pragma once
18+
19+
#include <cudf/utilities/span.hpp>
20+
21+
#include <rmm/cuda_stream_view.hpp>
22+
23+
#include <cstddef>
24+
#include <vector>
25+
26+
namespace cudf::detail {
27+
28+
/**
29+
* @brief Acquire a set of `cuda_stream_view` objects and synchronize them to an event on another
30+
* stream.
31+
*
32+
* By default an underlying `rmm::cuda_stream_pool` is used to obtain the streams. The only other
33+
* implementation at present is a debugging version that always returns the stream returned by
34+
* `cudf::get_default_stream()`. To use this debugging version, set the environment variable
35+
* `LIBCUDF_USE_DEBUG_STREAM_POOL`.
36+
*
37+
* Example usage:
38+
* @code{.cpp}
39+
* auto stream = cudf::get_default_stream();
40+
* auto const num_streams = 2;
41+
* // do work on stream
42+
* // allocate streams and wait for an event on stream before executing on any of streams
43+
 * auto streams = cudf::detail::fork_streams(stream, num_streams);
44+
* // do work on streams[0] and streams[1]
45+
* // wait for event on streams before continuing to do work on stream
46+
* cudf::detail::join_streams(streams, stream);
47+
* @endcode
48+
*
49+
* @param stream Stream that the returned streams will wait on.
50+
* @param count The number of `cuda_stream_view` objects to return.
51+
* @return Vector containing `count` stream views.
52+
*/
53+
[[nodiscard]] std::vector<rmm::cuda_stream_view> fork_streams(rmm::cuda_stream_view stream,
54+
std::size_t count);
55+
56+
/**
57+
* @brief Synchronize a stream to an event on a set of streams.
58+
*
59+
* @param streams Streams to wait on.
60+
* @param stream Joined stream that synchronizes with the waited-on streams.
61+
*/
62+
void join_streams(host_span<rmm::cuda_stream_view const> streams, rmm::cuda_stream_view stream);
63+
64+
} // namespace cudf::detail

cpp/src/io/parquet/reader_impl.cpp

Lines changed: 13 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -18,31 +18,15 @@
1818

1919
#include <cudf/detail/stream_compaction.hpp>
2020
#include <cudf/detail/transform.hpp>
21+
#include <cudf/detail/utilities/stream_pool.hpp>
2122
#include <cudf/detail/utilities/vector_factories.hpp>
2223
#include <rmm/cuda_stream_pool.hpp>
2324

25+
#include <bitset>
2426
#include <numeric>
2527

2628
namespace cudf::io::detail::parquet {
2729

28-
namespace {
29-
30-
int constexpr NUM_DECODERS = 3; // how many decode kernels are there to run
31-
int constexpr APPROX_NUM_THREADS = 4; // guestimate from DaveB
32-
int constexpr STREAM_POOL_SIZE = NUM_DECODERS * APPROX_NUM_THREADS;
33-
34-
auto& get_stream_pool()
35-
{
36-
// TODO: creating this on the heap because there were issues with trying to call the
37-
// stream pool destructor during cuda shutdown that lead to a segmentation fault in
38-
// nvbench. this allocation is being deliberately leaked to avoid the above, but still
39-
// results in non-fatal warnings when running nvbench in cuda-gdb.
40-
static auto pool = new rmm::cuda_stream_pool{STREAM_POOL_SIZE};
41-
return *pool;
42-
}
43-
44-
} // namespace
45-
4630
void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows)
4731
{
4832
auto& chunks = _file_itm_data.chunks;
@@ -178,34 +162,33 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows)
178162
chunks.host_to_device_async(_stream);
179163
chunk_nested_valids.host_to_device_async(_stream);
180164
chunk_nested_data.host_to_device_async(_stream);
181-
_stream.synchronize();
182165

183-
auto const level_type_size = _file_itm_data.level_type_size;
166+
// get the number of streams we need from the pool and tell them to wait on the H2D copies
167+
int const nkernels = std::bitset<32>(kernel_mask).count();
168+
auto streams = cudf::detail::fork_streams(_stream, nkernels);
184169

185-
// vector of launched streams
186-
std::vector<rmm::cuda_stream_view> streams;
170+
auto const level_type_size = _file_itm_data.level_type_size;
187171

188172
// launch string decoder
173+
int s_idx = 0;
189174
if (has_strings) {
190-
streams.push_back(get_stream_pool().get_stream());
191-
chunk_nested_str_data.host_to_device_async(streams.back());
192-
gpu::DecodeStringPageData(pages, chunks, num_rows, skip_rows, level_type_size, streams.back());
175+
auto& stream = streams[s_idx++];
176+
chunk_nested_str_data.host_to_device_async(stream);
177+
gpu::DecodeStringPageData(pages, chunks, num_rows, skip_rows, level_type_size, stream);
193178
}
194179

195180
// launch delta binary decoder
196181
if ((kernel_mask & gpu::KERNEL_MASK_DELTA_BINARY) != 0) {
197-
streams.push_back(get_stream_pool().get_stream());
198-
gpu::DecodeDeltaBinary(pages, chunks, num_rows, skip_rows, level_type_size, streams.back());
182+
gpu::DecodeDeltaBinary(pages, chunks, num_rows, skip_rows, level_type_size, streams[s_idx++]);
199183
}
200184

201185
// launch the catch-all page decoder
202186
if ((kernel_mask & gpu::KERNEL_MASK_GENERAL) != 0) {
203-
streams.push_back(get_stream_pool().get_stream());
204-
gpu::DecodePageData(pages, chunks, num_rows, skip_rows, level_type_size, streams.back());
187+
gpu::DecodePageData(pages, chunks, num_rows, skip_rows, level_type_size, streams[s_idx++]);
205188
}
206189

207190
// synchronize the streams
208-
std::for_each(streams.begin(), streams.end(), [](auto& stream) { stream.synchronize(); });
191+
cudf::detail::join_streams(streams, _stream);
209192

210193
pages.device_to_host_async(_stream);
211194
page_nesting.device_to_host_async(_stream);

cpp/src/io/text/multibyte_split.cu

Lines changed: 7 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
#include <cudf/detail/nvtx/ranges.hpp>
2323
#include <cudf/detail/utilities/cuda.cuh>
2424
#include <cudf/detail/utilities/integer_utils.hpp>
25+
#include <cudf/detail/utilities/stream_pool.hpp>
2526
#include <cudf/io/text/byte_range_info.hpp>
2627
#include <cudf/io/text/data_chunk_source.hpp>
2728
#include <cudf/io/text/detail/multistate.hpp>
@@ -32,7 +33,6 @@
3233
#include <cudf/utilities/default_stream.hpp>
3334
#include <cudf/utilities/span.hpp>
3435

35-
#include <rmm/cuda_stream_pool.hpp>
3636
#include <rmm/cuda_stream_view.hpp>
3737
#include <rmm/exec_policy.hpp>
3838
#include <rmm/mr/device/device_memory_resource.hpp>
@@ -301,44 +301,12 @@ namespace io {
301301
namespace text {
302302
namespace detail {
303303

304-
void fork_stream(std::vector<rmm::cuda_stream_view> streams, rmm::cuda_stream_view stream)
305-
{
306-
cudaEvent_t event;
307-
CUDF_CUDA_TRY(cudaEventCreate(&event));
308-
CUDF_CUDA_TRY(cudaEventRecord(event, stream));
309-
for (uint32_t i = 0; i < streams.size(); i++) {
310-
CUDF_CUDA_TRY(cudaStreamWaitEvent(streams[i], event, 0));
311-
}
312-
CUDF_CUDA_TRY(cudaEventDestroy(event));
313-
}
314-
315-
void join_stream(std::vector<rmm::cuda_stream_view> streams, rmm::cuda_stream_view stream)
316-
{
317-
cudaEvent_t event;
318-
CUDF_CUDA_TRY(cudaEventCreate(&event));
319-
for (uint32_t i = 0; i < streams.size(); i++) {
320-
CUDF_CUDA_TRY(cudaEventRecord(event, streams[i]));
321-
CUDF_CUDA_TRY(cudaStreamWaitEvent(stream, event, 0));
322-
}
323-
CUDF_CUDA_TRY(cudaEventDestroy(event));
324-
}
325-
326-
std::vector<rmm::cuda_stream_view> get_streams(int32_t count, rmm::cuda_stream_pool& stream_pool)
327-
{
328-
auto streams = std::vector<rmm::cuda_stream_view>();
329-
for (int32_t i = 0; i < count; i++) {
330-
streams.emplace_back(stream_pool.get_stream());
331-
}
332-
return streams;
333-
}
334-
335304
std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source const& source,
336305
std::string const& delimiter,
337306
byte_range_info byte_range,
338307
bool strip_delimiters,
339308
rmm::cuda_stream_view stream,
340-
rmm::mr::device_memory_resource* mr,
341-
rmm::cuda_stream_pool& stream_pool)
309+
rmm::mr::device_memory_resource* mr)
342310
{
343311
CUDF_FUNC_RANGE();
344312

@@ -365,8 +333,7 @@ std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source
365333
CUDF_EXPECTS(delimiter.size() < multistate::max_segment_value,
366334
"delimiter contains too many total tokens to produce a deterministic result.");
367335

368-
auto concurrency = 2;
369-
auto streams = get_streams(concurrency, stream_pool);
336+
auto const concurrency = 2;
370337

371338
// must be at least 32 when using warp-reduce on partials
372339
// must be at least 1 more than max possible concurrent tiles
@@ -411,7 +378,7 @@ std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source
411378
output_builder<byte_offset> row_offset_storage(ITEMS_PER_CHUNK, max_growth, stream);
412379
output_builder<char> char_storage(ITEMS_PER_CHUNK, max_growth, stream);
413380

414-
fork_stream(streams, stream);
381+
auto streams = cudf::detail::fork_streams(stream, concurrency);
415382

416383
cudaEvent_t last_launch_event;
417384
CUDF_CUDA_TRY(cudaEventCreate(&last_launch_event));
@@ -532,7 +499,7 @@ std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source
532499

533500
CUDF_CUDA_TRY(cudaEventDestroy(last_launch_event));
534501

535-
join_stream(streams, stream);
502+
cudf::detail::join_streams(streams, stream);
536503

537504
// if the input was empty, we didn't find a delimiter at all,
538505
// or the first delimiter was also the last: empty output
@@ -602,11 +569,10 @@ std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source
602569
parse_options options,
603570
rmm::mr::device_memory_resource* mr)
604571
{
605-
auto stream = cudf::get_default_stream();
606-
auto stream_pool = rmm::cuda_stream_pool(2);
572+
auto stream = cudf::get_default_stream();
607573

608574
auto result = detail::multibyte_split(
609-
source, delimiter, options.byte_range, options.strip_delimiters, stream, mr, stream_pool);
575+
source, delimiter, options.byte_range, options.strip_delimiters, stream, mr);
610576

611577
return result;
612578
}

0 commit comments

Comments
 (0)