Skip to content

Commit 559cda2

Browse files
authored
Use 64-bit offsets only if the current strings column output chunk size exceeds threshold (#17693)
This PR improves on #17207: 64-bit offsets are now used only when the current output chunk of a strings column exceeds the large-strings threshold, instead of basing the decision on cumulative strings-column sizes at the `pass` or `row group` level.

Authors:
- Muhammad Haseeb (https://github.com/mhaseeb123)

Approvers:
- Karthikeyan (https://github.com/karthikeyann)
- David Wendt (https://github.com/davidwendt)
- Yunsong Wang (https://github.com/PointKernel)

URL: #17693
1 parent a8a4197 commit 559cda2

File tree

2 files changed

+17
-36
lines changed

2 files changed

+17
-36
lines changed

cpp/src/io/parquet/reader_impl.cpp

Lines changed: 16 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
2+
* Copyright (c) 2019-2025, NVIDIA CORPORATION.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -97,38 +97,24 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num
9797
_stream);
9898
}
9999

100-
// Compute column string sizes (using page string offsets) for this subpass
100+
// Compute column string sizes (using page string offsets) for this output table chunk
101101
col_string_sizes = calculate_page_string_offsets();
102102

103-
// ensure cumulative column string sizes have been initialized
104-
if (pass.cumulative_col_string_sizes.empty()) {
105-
pass.cumulative_col_string_sizes.resize(_input_columns.size(), 0);
106-
}
107-
108-
// Add to the cumulative column string sizes of this pass
109-
std::transform(pass.cumulative_col_string_sizes.begin(),
110-
pass.cumulative_col_string_sizes.end(),
111-
col_string_sizes.begin(),
112-
pass.cumulative_col_string_sizes.begin(),
113-
std::plus<>{});
114-
115103
// Check for overflow in cumulative column string sizes of this pass so that the page string
116104
// offsets of overflowing (large) string columns are treated as 64-bit.
117105
auto const threshold = static_cast<size_t>(strings::detail::get_offset64_threshold());
118-
auto const has_large_strings = std::any_of(pass.cumulative_col_string_sizes.cbegin(),
119-
pass.cumulative_col_string_sizes.cend(),
106+
auto const has_large_strings = std::any_of(col_string_sizes.cbegin(),
107+
col_string_sizes.cend(),
120108
[=](std::size_t sz) { return sz > threshold; });
121109
if (has_large_strings and not strings::detail::is_large_strings_enabled()) {
122110
CUDF_FAIL("String column exceeds the column size limit", std::overflow_error);
123111
}
124112

125-
// Mark any chunks for which the cumulative column string size has exceeded the
126-
// large strings threshold
127-
if (has_large_strings) {
128-
for (auto& chunk : pass.chunks) {
129-
auto const idx = chunk.src_col_index;
130-
if (pass.cumulative_col_string_sizes[idx] > threshold) { chunk.is_large_string_col = true; }
131-
}
113+
// Mark/unmark column-chunk descriptors depending on the string sizes of corresponding output
114+
// column chunks and the large strings threshold.
115+
for (auto& chunk : pass.chunks) {
116+
auto const idx = chunk.src_col_index;
117+
chunk.is_large_string_col = (col_string_sizes[idx] > threshold);
132118
}
133119
}
134120

@@ -210,11 +196,9 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num
210196
// only do string buffer for leaf
211197
if (idx == max_depth - 1 and out_buf.string_size() == 0 and
212198
col_string_sizes[pass.chunks[c].src_col_index] > 0) {
213-
out_buf.create_string_data(
214-
col_string_sizes[pass.chunks[c].src_col_index],
215-
pass.cumulative_col_string_sizes[pass.chunks[c].src_col_index] >
216-
static_cast<size_t>(strings::detail::get_offset64_threshold()),
217-
_stream);
199+
out_buf.create_string_data(col_string_sizes[pass.chunks[c].src_col_index],
200+
pass.chunks[c].is_large_string_col,
201+
_stream);
218202
}
219203
if (has_strings) { str_data[idx] = out_buf.string_data(); }
220204
out_buf.user_data |=
@@ -416,11 +400,11 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num
416400
final_offsets.emplace_back(offset);
417401
out_buf.user_data |= PARQUET_COLUMN_BUFFER_FLAG_LIST_TERMINATED;
418402
} else if (out_buf.type.id() == type_id::STRING) {
419-
// need to cap off the string offsets column
420-
auto const sz = static_cast<size_type>(col_string_sizes[idx]);
421-
if (sz <= strings::detail::get_offset64_threshold()) {
403+
// only if it is not a large strings column
404+
if (col_string_sizes[idx] <=
405+
static_cast<size_t>(strings::detail::get_offset64_threshold())) {
422406
out_buffers.emplace_back(static_cast<size_type*>(out_buf.data()) + out_buf.size);
423-
final_offsets.emplace_back(sz);
407+
final_offsets.emplace_back(static_cast<size_type>(col_string_sizes[idx]));
424408
}
425409
}
426410
}

cpp/src/io/parquet/reader_impl_chunking.hpp

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2023-2024, NVIDIA CORPORATION.
2+
* Copyright (c) 2023-2025, NVIDIA CORPORATION.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -130,9 +130,6 @@ struct pass_intermediate_data {
130130
rmm::device_buffer decomp_dict_data{0, cudf::get_default_stream()};
131131
rmm::device_uvector<string_index_pair> str_dict_index{0, cudf::get_default_stream()};
132132

133-
// cumulative strings column sizes.
134-
std::vector<size_t> cumulative_col_string_sizes{};
135-
136133
int level_type_size{0};
137134

138135
// skip_rows / num_rows for this pass.

0 commit comments

Comments (0)