|
1 | 1 | /* |
2 | | - * Copyright (c) 2019-2024, NVIDIA CORPORATION. |
| 2 | + * Copyright (c) 2019-2025, NVIDIA CORPORATION. |
3 | 3 | * |
4 | 4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | 5 | * you may not use this file except in compliance with the License. |
@@ -97,38 +97,24 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num |
97 | 97 | _stream); |
98 | 98 | } |
99 | 99 |
|
100 | | - // Compute column string sizes (using page string offsets) for this subpass |
| 100 | + // Compute column string sizes (using page string offsets) for this output table chunk |
101 | 101 | col_string_sizes = calculate_page_string_offsets(); |
102 | 102 |
|
103 | | - // ensure cumulative column string sizes have been initialized |
104 | | - if (pass.cumulative_col_string_sizes.empty()) { |
105 | | - pass.cumulative_col_string_sizes.resize(_input_columns.size(), 0); |
106 | | - } |
107 | | - |
108 | | - // Add to the cumulative column string sizes of this pass |
109 | | - std::transform(pass.cumulative_col_string_sizes.begin(), |
110 | | - pass.cumulative_col_string_sizes.end(), |
111 | | - col_string_sizes.begin(), |
112 | | - pass.cumulative_col_string_sizes.begin(), |
113 | | - std::plus<>{}); |
114 | | - |
115 | 103 | // Check for overflow in the column string sizes of this output table chunk so that the page |
116 | 104 | // string offsets of overflowing (large) string columns are treated as 64-bit. |
117 | 105 | auto const threshold = static_cast<size_t>(strings::detail::get_offset64_threshold()); |
118 | | - auto const has_large_strings = std::any_of(pass.cumulative_col_string_sizes.cbegin(), |
119 | | - pass.cumulative_col_string_sizes.cend(), |
| 106 | + auto const has_large_strings = std::any_of(col_string_sizes.cbegin(), |
| 107 | + col_string_sizes.cend(), |
120 | 108 | [=](std::size_t sz) { return sz > threshold; }); |
121 | 109 | if (has_large_strings and not strings::detail::is_large_strings_enabled()) { |
122 | 110 | CUDF_FAIL("String column exceeds the column size limit", std::overflow_error); |
123 | 111 | } |
124 | 112 |
|
125 | | - // Mark any chunks for which the cumulative column string size has exceeded the |
126 | | - // large strings threshold |
127 | | - if (has_large_strings) { |
128 | | - for (auto& chunk : pass.chunks) { |
129 | | - auto const idx = chunk.src_col_index; |
130 | | - if (pass.cumulative_col_string_sizes[idx] > threshold) { chunk.is_large_string_col = true; } |
131 | | - } |
| 113 | + // Mark/unmark column-chunk descriptors depending on the string sizes of corresponding output |
| 114 | + // column chunks and the large strings threshold. |
| 115 | + for (auto& chunk : pass.chunks) { |
| 116 | + auto const idx = chunk.src_col_index; |
| 117 | + chunk.is_large_string_col = (col_string_sizes[idx] > threshold); |
132 | 118 | } |
133 | 119 | } |
134 | 120 |
|
@@ -210,11 +196,9 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num |
210 | 196 | // only do string buffer for leaf |
211 | 197 | if (idx == max_depth - 1 and out_buf.string_size() == 0 and |
212 | 198 | col_string_sizes[pass.chunks[c].src_col_index] > 0) { |
213 | | - out_buf.create_string_data( |
214 | | - col_string_sizes[pass.chunks[c].src_col_index], |
215 | | - pass.cumulative_col_string_sizes[pass.chunks[c].src_col_index] > |
216 | | - static_cast<size_t>(strings::detail::get_offset64_threshold()), |
217 | | - _stream); |
| 199 | + out_buf.create_string_data(col_string_sizes[pass.chunks[c].src_col_index], |
| 200 | + pass.chunks[c].is_large_string_col, |
| 201 | + _stream); |
218 | 202 | } |
219 | 203 | if (has_strings) { str_data[idx] = out_buf.string_data(); } |
220 | 204 | out_buf.user_data |= |
@@ -416,11 +400,11 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num |
416 | 400 | final_offsets.emplace_back(offset); |
417 | 401 | out_buf.user_data |= PARQUET_COLUMN_BUFFER_FLAG_LIST_TERMINATED; |
418 | 402 | } else if (out_buf.type.id() == type_id::STRING) { |
419 | | - // need to cap off the string offsets column |
420 | | - auto const sz = static_cast<size_type>(col_string_sizes[idx]); |
421 | | - if (sz <= strings::detail::get_offset64_threshold()) { |
| 403 | + // cap off the string offsets column, but only if it is not a large strings column |
| 404 | + if (col_string_sizes[idx] <= |
| 405 | + static_cast<size_t>(strings::detail::get_offset64_threshold())) { |
422 | 406 | out_buffers.emplace_back(static_cast<size_type*>(out_buf.data()) + out_buf.size); |
423 | | - final_offsets.emplace_back(sz); |
| 407 | + final_offsets.emplace_back(static_cast<size_type>(col_string_sizes[idx])); |
424 | 408 | } |
425 | 409 | } |
426 | 410 | } |
|
0 commit comments