11/*
2- * SPDX-FileCopyrightText: Copyright (c) 2020-2025 , NVIDIA CORPORATION.
2+ * SPDX-FileCopyrightText: Copyright (c) 2020-2026 , NVIDIA CORPORATION.
33 * SPDX-License-Identifier: Apache-2.0
44 */
55
1010#include < cudf/detail/gather.cuh>
1111#include < cudf/detail/indexalator.cuh>
1212#include < cudf/detail/nvtx/ranges.hpp>
13+ #include < cudf/detail/utilities/stream_pool.hpp>
1314#include < cudf/dictionary/detail/update_keys.hpp>
1415#include < cudf/dictionary/dictionary_column_view.hpp>
1516#include < cudf/dictionary/dictionary_factories.hpp>
@@ -397,22 +398,32 @@ std::unique_ptr<table> scatter(table_view const& source,
397398 thrust::make_transform_iterator (scatter_map_begin, index_converter<MapType>{target.num_rows ()});
398399 auto updated_scatter_map_end =
399400 thrust::make_transform_iterator (scatter_map_end, index_converter<MapType>{target.num_rows ()});
400- auto result = std::vector<std::unique_ptr<column>>(target.num_columns ());
401-
402- std::transform (source.begin (),
403- source.end (),
404- target.begin (),
405- result.begin (),
406- [=](auto const & source_col, auto const & target_col) {
407- return type_dispatcher<dispatch_storage_type>(source_col.type (),
408- column_scatterer{},
409- source_col,
410- updated_scatter_map_begin,
411- updated_scatter_map_end,
412- target_col,
413- stream,
414- mr);
415- });
401+
402+ auto const num_columns = target.num_columns ();
403+ auto result = std::vector<std::unique_ptr<column>>(num_columns);
404+
405+ // The data scatter for n columns is executed over n forked streams. If there is
406+ // only a single column, the passed-in stream is used directly to avoid fork/join overhead.
407+ auto streams = std::vector<rmm::cuda_stream_view>{};
408+ if (num_columns > 1 ) {
409+ streams = cudf::detail::fork_streams (stream, num_columns);
410+ } else {
411+ streams.push_back (stream);
412+ }
413+
414+ auto it = thrust::make_counting_iterator<size_type>(0 );
415+
416+ std::transform (it, it + num_columns, result.begin (), [&](size_type i) {
417+ auto const & source_col = source.column (i);
418+ return type_dispatcher<dispatch_storage_type>(source_col.type (),
419+ column_scatterer{},
420+ source_col,
421+ updated_scatter_map_begin,
422+ updated_scatter_map_end,
423+ target.column (i),
424+ streams[i],
425+ mr);
426+ });
416427
417428 // We still need to call `gather_bitmask` even when the source columns are not nullable,
418429 // as if the target has null_mask, that null_mask needs to be updated after scattering.
@@ -426,7 +437,9 @@ std::unique_ptr<table> scatter(table_view const& source,
426437
427438 // For struct columns, we need to superimpose the null_mask of the parent over the null_mask of
428439 // the children.
429- std::for_each (result.begin (), result.end (), [=](auto & col) {
440+ auto it = thrust::make_counting_iterator<size_type>(0 );
441+ std::for_each (it, it + num_columns, [&](size_type i) {
442+ auto & col = result[i];
430443 auto const col_view = col->view ();
431444 if (col_view.type ().id () == type_id::STRUCT and col_view.nullable ()) {
432445 auto const num_rows = col_view.size ();
@@ -438,11 +451,17 @@ std::unique_ptr<table> scatter(table_view const& source,
438451 std::move (contents.children ),
439452 null_count,
440453 std::move (*contents.null_mask ),
441- stream ,
454+ streams[i] ,
442455 mr);
443456 }
444457 });
445458 }
459+
460+ // Join streams as late as possible so that null mask computations can run on
461+ // the passed-in stream while the other streams are scattering. Skip joining if
462+ // there is only one column, since it used the passed-in stream rather than forking.
463+ if (num_columns > 1 ) { cudf::detail::join_streams (streams, stream); }
464+
446465 return std::make_unique<table>(std::move (result));
447466}
448467} // namespace detail
0 commit comments