Skip to content

Commit 45ac8ec

Browse files
authored
Merge branch 'branch-25.02' into cudf-polars-multi-groupby
2 parents 22cebeb + dc2a75c commit 45ac8ec

38 files changed

+410
-418
lines changed

cpp/include/cudf/detail/utilities/integer_utils.hpp

+9-3
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
/*
22
* Copyright 2019 BlazingDB, Inc.
33
* Copyright 2019 Eyal Rozenberg <[email protected]>
4-
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
4+
* Copyright (c) 2020-2025, NVIDIA CORPORATION.
55
*
66
* Licensed under the Apache License, Version 2.0 (the "License");
77
* you may not use this file except in compliance with the License.
@@ -23,6 +23,8 @@
2323
*/
2424

2525
#include <cudf/fixed_point/temporary.hpp>
26+
#include <cudf/types.hpp>
27+
#include <cudf/utilities/error.hpp>
2628

2729
#include <cmath>
2830
#include <cstdlib>
@@ -44,13 +46,17 @@ namespace util {
4446
* `modulus` is positive. The safety is in regard to rollover.
4547
*/
4648
template <typename S>
47-
constexpr S round_up_safe(S number_to_round, S modulus)
49+
CUDF_HOST_DEVICE constexpr S round_up_safe(S number_to_round, S modulus)
4850
{
4951
auto remainder = number_to_round % modulus;
5052
if (remainder == 0) { return number_to_round; }
5153
auto rounded_up = number_to_round - remainder + modulus;
5254
if (rounded_up < number_to_round) {
53-
throw std::invalid_argument("Attempt to round up beyond the type's maximum value");
55+
#ifndef __CUDA_ARCH__
56+
CUDF_FAIL("Attempt to round up beyond the type's maximum value", cudf::data_type_error);
57+
#else
58+
CUDF_UNREACHABLE("Attempt to round up beyond the type's maximum value");
59+
#endif
5460
}
5561
return rounded_up;
5662
}

cpp/include/cudf/utilities/span.hpp

+24-16
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
2+
* Copyright (c) 2020-2025, NVIDIA CORPORATION.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -197,11 +197,16 @@ struct host_span : public cudf::detail::span_base<T, Extent, host_span<T, Extent
197197

198198
constexpr host_span() noexcept : base() {} // required to compile on centos
199199

200-
/// Constructor from pointer and size
201-
/// @param data Pointer to the first element in the span
202-
/// @param size The number of elements in the span
203-
/// @param is_device_accessible Whether the data is device accessible (e.g. pinned memory)
204-
constexpr host_span(T* data, std::size_t size, bool is_device_accessible)
200+
/**
201+
* @brief Constructor from pointer and size
202+
*
203+
* @note This needs to be host-device , as it's used by a host-device function in base_2dspan
204+
*
205+
* @param data Pointer to the first element in the span
206+
* @param size The number of elements in the span
207+
* @param is_device_accessible Whether the data is device accessible (e.g. pinned memory)
208+
*/
209+
CUDF_HOST_DEVICE constexpr host_span(T* data, std::size_t size, bool is_device_accessible)
205210
: base(data, size), _is_device_accessible{is_device_accessible}
206211
{
207212
}
@@ -311,8 +316,8 @@ struct host_span : public cudf::detail::span_base<T, Extent, host_span<T, Extent
311316
* @param count The number of elements in the subspan
312317
* @return A subspan of the sequence, of requested count and offset
313318
*/
314-
[[nodiscard]] constexpr host_span subspan(typename base::size_type offset,
315-
typename base::size_type count) const noexcept
319+
[[nodiscard]] CUDF_HOST_DEVICE constexpr host_span subspan(
320+
typename base::size_type offset, typename base::size_type count) const noexcept
316321
{
317322
return host_span{this->data() + offset, count, _is_device_accessible};
318323
}
@@ -434,8 +439,8 @@ struct device_span : public cudf::detail::span_base<T, Extent, device_span<T, Ex
434439
* @param count The number of elements in the subspan
435440
* @return A subspan of the sequence, of requested count and offset
436441
*/
437-
[[nodiscard]] constexpr device_span subspan(typename base::size_type offset,
438-
typename base::size_type count) const noexcept
442+
[[nodiscard]] CUDF_HOST_DEVICE constexpr device_span subspan(
443+
typename base::size_type offset, typename base::size_type count) const noexcept
439444
{
440445
return device_span{this->data() + offset, count};
441446
}
@@ -475,28 +480,28 @@ class base_2dspan {
475480
*
476481
* @return A pointer to the first element of the span
477482
*/
478-
[[nodiscard]] constexpr auto data() const noexcept { return _flat.data(); }
483+
[[nodiscard]] CUDF_HOST_DEVICE constexpr auto data() const noexcept { return _flat.data(); }
479484

480485
/**
481486
* @brief Returns the size in the span as pair.
482487
*
483488
* @return pair representing rows and columns size of the span
484489
*/
485-
[[nodiscard]] constexpr auto size() const noexcept { return _size; }
490+
[[nodiscard]] CUDF_HOST_DEVICE constexpr auto size() const noexcept { return _size; }
486491

487492
/**
488493
* @brief Returns the number of elements in the span.
489494
*
490495
* @return Number of elements in the span
491496
*/
492-
[[nodiscard]] constexpr auto count() const noexcept { return _flat.size(); }
497+
[[nodiscard]] CUDF_HOST_DEVICE constexpr auto count() const noexcept { return _flat.size(); }
493498

494499
/**
495500
* @brief Checks if the span is empty.
496501
*
497502
* @return True if the span is empty, false otherwise
498503
*/
499-
[[nodiscard]] constexpr bool is_empty() const noexcept { return count() == 0; }
504+
[[nodiscard]] CUDF_HOST_DEVICE constexpr bool is_empty() const noexcept { return count() == 0; }
500505

501506
/**
502507
* @brief Returns a reference to the row-th element of the sequence.
@@ -507,7 +512,7 @@ class base_2dspan {
507512
* @param row the index of the element to access
508513
* @return A reference to the row-th element of the sequence, i.e., `data()[row]`
509514
*/
510-
constexpr RowType<T, dynamic_extent> operator[](size_t row) const
515+
CUDF_HOST_DEVICE constexpr RowType<T, dynamic_extent> operator[](size_t row) const
511516
{
512517
return _flat.subspan(row * _size.second, _size.second);
513518
}
@@ -517,7 +522,10 @@ class base_2dspan {
517522
*
518523
* @return A flattened span of the 2D span
519524
*/
520-
[[nodiscard]] constexpr RowType<T, dynamic_extent> flat_view() const { return _flat; }
525+
[[nodiscard]] CUDF_HOST_DEVICE constexpr RowType<T, dynamic_extent> flat_view() const
526+
{
527+
return _flat;
528+
}
521529

522530
/**
523531
* @brief Construct a 2D span from another 2D span of convertible type

cpp/src/io/json/read_json.cu

+61-10
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2022-2024, NVIDIA CORPORATION.
2+
* Copyright (c) 2022-2025, NVIDIA CORPORATION.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -30,19 +30,33 @@
3030
#include <cudf/utilities/memory_resource.hpp>
3131
#include <cudf/utilities/span.hpp>
3232

33+
#include <rmm/cuda_stream_pool.hpp>
3334
#include <rmm/device_uvector.hpp>
3435
#include <rmm/exec_policy.hpp>
3536

3637
#include <thrust/distance.h>
3738
#include <thrust/iterator/constant_iterator.h>
3839
#include <thrust/scatter.h>
3940

41+
#include <BS_thread_pool.hpp>
42+
#include <BS_thread_pool_utils.hpp>
43+
4044
#include <numeric>
4145

4246
namespace cudf::io::json::detail {
4347

4448
namespace {
4549

50+
namespace pools {
51+
52+
BS::thread_pool& tpool()
53+
{
54+
static BS::thread_pool _tpool(std::thread::hardware_concurrency());
55+
return _tpool;
56+
}
57+
58+
} // namespace pools
59+
4660
class compressed_host_buffer_source final : public datasource {
4761
public:
4862
explicit compressed_host_buffer_source(std::unique_ptr<datasource> const& src,
@@ -51,8 +65,8 @@ class compressed_host_buffer_source final : public datasource {
5165
{
5266
auto ch_buffer = host_span<uint8_t const>(reinterpret_cast<uint8_t const*>(_dbuf_ptr->data()),
5367
_dbuf_ptr->size());
54-
if (comptype == compression_type::GZIP || comptype == compression_type::ZIP ||
55-
comptype == compression_type::SNAPPY) {
68+
if (_comptype == compression_type::GZIP || _comptype == compression_type::ZIP ||
69+
_comptype == compression_type::SNAPPY) {
5670
_decompressed_ch_buffer_size = cudf::io::detail::get_uncompressed_size(_comptype, ch_buffer);
5771
} else {
5872
_decompressed_buffer = cudf::io::detail::decompress(_comptype, ch_buffer);
@@ -96,7 +110,22 @@ class compressed_host_buffer_source final : public datasource {
96110
return std::make_unique<non_owning_buffer>(_decompressed_buffer.data() + offset, count);
97111
}
98112

99-
[[nodiscard]] bool supports_device_read() const override { return false; }
113+
std::future<size_t> device_read_async(size_t offset,
114+
size_t size,
115+
uint8_t* dst,
116+
rmm::cuda_stream_view stream) override
117+
{
118+
auto& thread_pool = pools::tpool();
119+
return thread_pool.submit_task([this, offset, size, dst, stream] {
120+
auto hbuf = host_read(offset, size);
121+
CUDF_CUDA_TRY(
122+
cudaMemcpyAsync(dst, hbuf->data(), hbuf->size(), cudaMemcpyHostToDevice, stream.value()));
123+
stream.synchronize();
124+
return hbuf->size();
125+
});
126+
}
127+
128+
[[nodiscard]] bool supports_device_read() const override { return true; }
100129

101130
[[nodiscard]] size_t size() const override { return _decompressed_ch_buffer_size; }
102131

@@ -431,6 +460,8 @@ device_span<char> ingest_raw_input(device_span<char> buffer,
431460
// line of file i+1 don't end up on the same JSON line, if file i does not already end with a line
432461
// delimiter.
433462
auto constexpr num_delimiter_chars = 1;
463+
std::vector<std::future<size_t>> thread_tasks;
464+
auto stream_pool = cudf::detail::fork_streams(stream, pools::tpool().get_thread_count());
434465

435466
auto delimiter_map = cudf::detail::make_empty_host_vector<std::size_t>(sources.size(), stream);
436467
std::vector<std::size_t> prefsum_source_sizes(sources.size());
@@ -447,13 +478,17 @@ device_span<char> ingest_raw_input(device_span<char> buffer,
447478

448479
auto const total_bytes_to_read = std::min(range_size, prefsum_source_sizes.back() - range_offset);
449480
range_offset -= start_source ? prefsum_source_sizes[start_source - 1] : 0;
450-
for (std::size_t i = start_source; i < sources.size() && bytes_read < total_bytes_to_read; i++) {
481+
for (std::size_t i = start_source, cur_stream = 0;
482+
i < sources.size() && bytes_read < total_bytes_to_read;
483+
i++) {
451484
if (sources[i]->is_empty()) continue;
452485
auto data_size = std::min(sources[i]->size() - range_offset, total_bytes_to_read - bytes_read);
453486
auto destination = reinterpret_cast<uint8_t*>(buffer.data()) + bytes_read +
454487
(num_delimiter_chars * delimiter_map.size());
455-
if (sources[i]->is_device_read_preferred(data_size)) {
456-
bytes_read += sources[i]->device_read(range_offset, data_size, destination, stream);
488+
if (sources[i]->supports_device_read()) {
489+
thread_tasks.emplace_back(sources[i]->device_read_async(
490+
range_offset, data_size, destination, stream_pool[cur_stream++ % stream_pool.size()]));
491+
bytes_read += data_size;
457492
} else {
458493
h_buffers.emplace_back(sources[i]->host_read(range_offset, data_size));
459494
auto const& h_buffer = h_buffers.back();
@@ -481,6 +516,15 @@ device_span<char> ingest_raw_input(device_span<char> buffer,
481516
buffer.data());
482517
}
483518
stream.synchronize();
519+
520+
if (thread_tasks.size()) {
521+
auto const bytes_read = std::accumulate(
522+
thread_tasks.begin(), thread_tasks.end(), std::size_t{0}, [](std::size_t sum, auto& task) {
523+
return sum + task.get();
524+
});
525+
CUDF_EXPECTS(bytes_read == total_bytes_to_read, "something's fishy");
526+
}
527+
484528
return buffer.first(bytes_read + (delimiter_map.size() * num_delimiter_chars));
485529
}
486530

@@ -505,10 +549,17 @@ table_with_metadata read_json(host_span<std::unique_ptr<datasource>> sources,
505549
return read_json_impl(sources, reader_opts, stream, mr);
506550

507551
std::vector<std::unique_ptr<datasource>> compressed_sources;
508-
for (size_t i = 0; i < sources.size(); i++) {
509-
compressed_sources.emplace_back(
510-
std::make_unique<compressed_host_buffer_source>(sources[i], reader_opts.get_compression()));
552+
std::vector<std::future<std::unique_ptr<compressed_host_buffer_source>>> thread_tasks;
553+
auto& thread_pool = pools::tpool();
554+
for (auto& src : sources) {
555+
thread_tasks.emplace_back(thread_pool.submit_task([&reader_opts, &src] {
556+
return std::make_unique<compressed_host_buffer_source>(src, reader_opts.get_compression());
557+
}));
511558
}
559+
std::transform(thread_tasks.begin(),
560+
thread_tasks.end(),
561+
std::back_inserter(compressed_sources),
562+
[](auto& task) { return task.get(); });
512563
// in read_json_impl, we need the compressed source size to actually be the
513564
// uncompressed source size for correct batching
514565
return read_json_impl(compressed_sources, reader_opts, stream, mr);

cpp/src/io/parquet/reader_impl.cpp

+16-32
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
2+
* Copyright (c) 2019-2025, NVIDIA CORPORATION.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -97,38 +97,24 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num
9797
_stream);
9898
}
9999

100-
// Compute column string sizes (using page string offsets) for this subpass
100+
// Compute column string sizes (using page string offsets) for this output table chunk
101101
col_string_sizes = calculate_page_string_offsets();
102102

103-
// ensure cumulative column string sizes have been initialized
104-
if (pass.cumulative_col_string_sizes.empty()) {
105-
pass.cumulative_col_string_sizes.resize(_input_columns.size(), 0);
106-
}
107-
108-
// Add to the cumulative column string sizes of this pass
109-
std::transform(pass.cumulative_col_string_sizes.begin(),
110-
pass.cumulative_col_string_sizes.end(),
111-
col_string_sizes.begin(),
112-
pass.cumulative_col_string_sizes.begin(),
113-
std::plus<>{});
114-
115103
// Check for overflow in cumulative column string sizes of this pass so that the page string
116104
// offsets of overflowing (large) string columns are treated as 64-bit.
117105
auto const threshold = static_cast<size_t>(strings::detail::get_offset64_threshold());
118-
auto const has_large_strings = std::any_of(pass.cumulative_col_string_sizes.cbegin(),
119-
pass.cumulative_col_string_sizes.cend(),
106+
auto const has_large_strings = std::any_of(col_string_sizes.cbegin(),
107+
col_string_sizes.cend(),
120108
[=](std::size_t sz) { return sz > threshold; });
121109
if (has_large_strings and not strings::detail::is_large_strings_enabled()) {
122110
CUDF_FAIL("String column exceeds the column size limit", std::overflow_error);
123111
}
124112

125-
// Mark any chunks for which the cumulative column string size has exceeded the
126-
// large strings threshold
127-
if (has_large_strings) {
128-
for (auto& chunk : pass.chunks) {
129-
auto const idx = chunk.src_col_index;
130-
if (pass.cumulative_col_string_sizes[idx] > threshold) { chunk.is_large_string_col = true; }
131-
}
113+
// Mark/unmark column-chunk descriptors depending on the string sizes of corresponding output
114+
// column chunks and the large strings threshold.
115+
for (auto& chunk : pass.chunks) {
116+
auto const idx = chunk.src_col_index;
117+
chunk.is_large_string_col = (col_string_sizes[idx] > threshold);
132118
}
133119
}
134120

@@ -210,11 +196,9 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num
210196
// only do string buffer for leaf
211197
if (idx == max_depth - 1 and out_buf.string_size() == 0 and
212198
col_string_sizes[pass.chunks[c].src_col_index] > 0) {
213-
out_buf.create_string_data(
214-
col_string_sizes[pass.chunks[c].src_col_index],
215-
pass.cumulative_col_string_sizes[pass.chunks[c].src_col_index] >
216-
static_cast<size_t>(strings::detail::get_offset64_threshold()),
217-
_stream);
199+
out_buf.create_string_data(col_string_sizes[pass.chunks[c].src_col_index],
200+
pass.chunks[c].is_large_string_col,
201+
_stream);
218202
}
219203
if (has_strings) { str_data[idx] = out_buf.string_data(); }
220204
out_buf.user_data |=
@@ -416,11 +400,11 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num
416400
final_offsets.emplace_back(offset);
417401
out_buf.user_data |= PARQUET_COLUMN_BUFFER_FLAG_LIST_TERMINATED;
418402
} else if (out_buf.type.id() == type_id::STRING) {
419-
// need to cap off the string offsets column
420-
auto const sz = static_cast<size_type>(col_string_sizes[idx]);
421-
if (sz <= strings::detail::get_offset64_threshold()) {
403+
// only if it is not a large strings column
404+
if (col_string_sizes[idx] <=
405+
static_cast<size_t>(strings::detail::get_offset64_threshold())) {
422406
out_buffers.emplace_back(static_cast<size_type*>(out_buf.data()) + out_buf.size);
423-
final_offsets.emplace_back(sz);
407+
final_offsets.emplace_back(static_cast<size_type>(col_string_sizes[idx]));
424408
}
425409
}
426410
}

cpp/src/io/parquet/reader_impl_chunking.hpp

+1-4
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2023-2024, NVIDIA CORPORATION.
2+
* Copyright (c) 2023-2025, NVIDIA CORPORATION.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -130,9 +130,6 @@ struct pass_intermediate_data {
130130
rmm::device_buffer decomp_dict_data{0, cudf::get_default_stream()};
131131
rmm::device_uvector<string_index_pair> str_dict_index{0, cudf::get_default_stream()};
132132

133-
// cumulative strings column sizes.
134-
std::vector<size_t> cumulative_col_string_sizes{};
135-
136133
int level_type_size{0};
137134

138135
// skip_rows / num_rows for this pass.

0 commit comments

Comments
 (0)