
Commit 5743030

Merge branch 'branch-25.02' into dask-expr-migration
2 parents 2a6821d + dc2a75c commit 5743030

Only a subset of the 46 changed files is shown below; large commits hide some diffs by default.

46 files changed: +423 -436 lines changed

conda/environments/all_cuda-118_arch-x86_64.yaml

Lines changed: 1 addition & 1 deletion
@@ -55,7 +55,7 @@ dependencies:
 - nbsphinx
 - ninja
 - notebook
-- numba-cuda>=0.0.13,<0.0.18
+- numba-cuda>=0.2.0,<0.3.0
 - numpy>=1.23,<3.0a0
 - numpydoc
 - nvcc_linux-64=11.8

conda/environments/all_cuda-125_arch-x86_64.yaml

Lines changed: 1 addition & 1 deletion
@@ -54,7 +54,7 @@ dependencies:
 - nbsphinx
 - ninja
 - notebook
-- numba-cuda>=0.0.13,<0.0.18
+- numba-cuda>=0.2.0,<0.3.0
 - numpy>=1.23,<3.0a0
 - numpydoc
 - nvcomp==4.1.0.6

conda/recipes/cudf/meta.yaml

Lines changed: 2 additions & 2 deletions
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2024, NVIDIA CORPORATION.
+# Copyright (c) 2018-2025, NVIDIA CORPORATION.

 {% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') %}
 {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %}
@@ -80,7 +80,7 @@ requirements:
 - typing_extensions >=4.0.0
 - pandas >=2.0,<2.2.4dev0
 - cupy >=12.0.0
-- numba-cuda >=0.0.13,<0.0.18
+- numba-cuda >=0.2.0,<0.3.0
 - numpy >=1.23,<3.0a0
 - pyarrow>=14.0.0,<18.0.0a0
 - libcudf ={{ version }}

cpp/include/cudf/detail/utilities/integer_utils.hpp

Lines changed: 9 additions & 3 deletions
@@ -1,7 +1,7 @@
 /*
  * Copyright 2019 BlazingDB, Inc.
  * Copyright 2019 Eyal Rozenberg <[email protected]>
- * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -23,6 +23,8 @@
  */

 #include <cudf/fixed_point/temporary.hpp>
+#include <cudf/types.hpp>
+#include <cudf/utilities/error.hpp>

 #include <cmath>
 #include <cstdlib>
@@ -44,13 +46,17 @@ namespace util {
  * `modulus` is positive. The safety is in regard to rollover.
  */
 template <typename S>
-constexpr S round_up_safe(S number_to_round, S modulus)
+CUDF_HOST_DEVICE constexpr S round_up_safe(S number_to_round, S modulus)
 {
   auto remainder = number_to_round % modulus;
   if (remainder == 0) { return number_to_round; }
   auto rounded_up = number_to_round - remainder + modulus;
   if (rounded_up < number_to_round) {
-    throw std::invalid_argument("Attempt to round up beyond the type's maximum value");
+#ifndef __CUDA_ARCH__
+    CUDF_FAIL("Attempt to round up beyond the type's maximum value", cudf::data_type_error);
+#else
+    CUDF_UNREACHABLE("Attempt to round up beyond the type's maximum value");
+#endif
   }
   return rounded_up;
 }
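
The change above makes round_up_safe usable in device code, where C++ exceptions are unavailable: on the host an overflow is reported through CUDF_FAIL, while on the device the same condition hits CUDF_UNREACHABLE. Below is a minimal standalone sketch (requires nvcc) of that host/device error-handling split; SKETCH_FAIL and round_up_safe_sketch are illustrative names for this example, not cudf's API.

// Standalone sketch of the host/device error-handling split used above.
// SKETCH_FAIL and round_up_safe_sketch are illustrative names, not cudf's macros.
#include <cstddef>
#include <stdexcept>

#if defined(__CUDA_ARCH__)
// Device code cannot throw C++ exceptions; abort the kernel instead.
#define SKETCH_FAIL(msg) __trap()
#else
// Host code keeps the original exception-throwing behaviour.
#define SKETCH_FAIL(msg) throw std::invalid_argument(msg)
#endif

template <typename S>
__host__ __device__ constexpr S round_up_safe_sketch(S number_to_round, S modulus)
{
  auto remainder = number_to_round % modulus;
  if (remainder == 0) { return number_to_round; }
  auto rounded_up = number_to_round - remainder + modulus;
  // Rollover check: rounding up past the type's maximum wraps around and becomes smaller.
  if (rounded_up < number_to_round) { SKETCH_FAIL("round up overflows the type"); }
  return rounded_up;
}

// Example: round_up_safe_sketch<std::size_t>(1000, 256) == 1024 on both host and device.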

cpp/include/cudf/utilities/span.hpp

Lines changed: 24 additions & 16 deletions
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -197,11 +197,16 @@ struct host_span : public cudf::detail::span_base<T, Extent, host_span<T, Extent

   constexpr host_span() noexcept : base() {}  // required to compile on centos

-  /// Constructor from pointer and size
-  /// @param data Pointer to the first element in the span
-  /// @param size The number of elements in the span
-  /// @param is_device_accessible Whether the data is device accessible (e.g. pinned memory)
-  constexpr host_span(T* data, std::size_t size, bool is_device_accessible)
+  /**
+   * @brief Constructor from pointer and size
+   *
+   * @note This needs to be host-device , as it's used by a host-device function in base_2dspan
+   *
+   * @param data Pointer to the first element in the span
+   * @param size The number of elements in the span
+   * @param is_device_accessible Whether the data is device accessible (e.g. pinned memory)
+   */
+  CUDF_HOST_DEVICE constexpr host_span(T* data, std::size_t size, bool is_device_accessible)
     : base(data, size), _is_device_accessible{is_device_accessible}
   {
   }
@@ -311,8 +316,8 @@ struct host_span : public cudf::detail::span_base<T, Extent, host_span<T, Extent
   * @param count The number of elements in the subspan
   * @return A subspan of the sequence, of requested count and offset
   */
-  [[nodiscard]] constexpr host_span subspan(typename base::size_type offset,
-                                            typename base::size_type count) const noexcept
+  [[nodiscard]] CUDF_HOST_DEVICE constexpr host_span subspan(
+    typename base::size_type offset, typename base::size_type count) const noexcept
   {
     return host_span{this->data() + offset, count, _is_device_accessible};
   }
@@ -434,8 +439,8 @@ struct device_span : public cudf::detail::span_base<T, Extent, device_span<T, Ex
   * @param count The number of elements in the subspan
   * @return A subspan of the sequence, of requested count and offset
   */
-  [[nodiscard]] constexpr device_span subspan(typename base::size_type offset,
-                                              typename base::size_type count) const noexcept
+  [[nodiscard]] CUDF_HOST_DEVICE constexpr device_span subspan(
+    typename base::size_type offset, typename base::size_type count) const noexcept
   {
     return device_span{this->data() + offset, count};
   }
@@ -475,28 +480,28 @@ class base_2dspan {
   *
   * @return A pointer to the first element of the span
   */
-  [[nodiscard]] constexpr auto data() const noexcept { return _flat.data(); }
+  [[nodiscard]] CUDF_HOST_DEVICE constexpr auto data() const noexcept { return _flat.data(); }

  /**
   * @brief Returns the size in the span as pair.
   *
   * @return pair representing rows and columns size of the span
   */
-  [[nodiscard]] constexpr auto size() const noexcept { return _size; }
+  [[nodiscard]] CUDF_HOST_DEVICE constexpr auto size() const noexcept { return _size; }

  /**
   * @brief Returns the number of elements in the span.
   *
   * @return Number of elements in the span
   */
-  [[nodiscard]] constexpr auto count() const noexcept { return _flat.size(); }
+  [[nodiscard]] CUDF_HOST_DEVICE constexpr auto count() const noexcept { return _flat.size(); }

  /**
   * @brief Checks if the span is empty.
   *
   * @return True if the span is empty, false otherwise
   */
-  [[nodiscard]] constexpr bool is_empty() const noexcept { return count() == 0; }
+  [[nodiscard]] CUDF_HOST_DEVICE constexpr bool is_empty() const noexcept { return count() == 0; }

  /**
   * @brief Returns a reference to the row-th element of the sequence.
@@ -507,7 +512,7 @@ class base_2dspan {
   * @param row the index of the element to access
   * @return A reference to the row-th element of the sequence, i.e., `data()[row]`
   */
-  constexpr RowType<T, dynamic_extent> operator[](size_t row) const
+  CUDF_HOST_DEVICE constexpr RowType<T, dynamic_extent> operator[](size_t row) const
   {
     return _flat.subspan(row * _size.second, _size.second);
   }
@@ -517,7 +522,10 @@ class base_2dspan {
   *
   * @return A flattened span of the 2D span
   */
-  [[nodiscard]] constexpr RowType<T, dynamic_extent> flat_view() const { return _flat; }
+  [[nodiscard]] CUDF_HOST_DEVICE constexpr RowType<T, dynamic_extent> flat_view() const
+  {
+    return _flat;
+  }

  /**
   * @brief Construct a 2D span from another 2D span of convertible type
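
The CUDF_HOST_DEVICE annotations above make subspan, operator[], flat_view, and the other accessors callable from device code as well as host code; operator[] in particular carves row subspans out of the flat buffer. The sketch below (requires nvcc) shows the same row-over-flat-buffer indexing pattern in a self-contained form; flat_view_2d is an illustrative stand-in, not a cudf type.

// Self-contained sketch of the 2D-view-over-flat-storage pattern; flat_view_2d is illustrative.
#include <cstddef>

template <typename T>
struct flat_view_2d {
  T* data;  // flat buffer holding rows * cols elements
  std::size_t rows;
  std::size_t cols;

  // Row r starts at r * cols in the flat buffer, mirroring base_2dspan::operator[].
  __host__ __device__ constexpr T* row(std::size_t r) const { return data + r * cols; }
  __host__ __device__ constexpr std::size_t count() const { return rows * cols; }
  __host__ __device__ constexpr bool is_empty() const { return count() == 0; }
};

// Once the accessors are host-device, a kernel can index rows directly, e.g.:
//   __global__ void zero_row(flat_view_2d<int> view, std::size_t r)
//   {
//     if (threadIdx.x < view.cols) { view.row(r)[threadIdx.x] = 0; }
//   }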

cpp/src/io/json/read_json.cu

Lines changed: 61 additions & 10 deletions
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -30,19 +30,33 @@
 #include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>

+#include <rmm/cuda_stream_pool.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>

 #include <thrust/distance.h>
 #include <thrust/iterator/constant_iterator.h>
 #include <thrust/scatter.h>

+#include <BS_thread_pool.hpp>
+#include <BS_thread_pool_utils.hpp>
+
 #include <numeric>

 namespace cudf::io::json::detail {

 namespace {

+namespace pools {
+
+BS::thread_pool& tpool()
+{
+  static BS::thread_pool _tpool(std::thread::hardware_concurrency());
+  return _tpool;
+}
+
+}  // namespace pools
+
 class compressed_host_buffer_source final : public datasource {
  public:
   explicit compressed_host_buffer_source(std::unique_ptr<datasource> const& src,
@@ -51,8 +65,8 @@ class compressed_host_buffer_source final : public datasource {
   {
     auto ch_buffer = host_span<uint8_t const>(reinterpret_cast<uint8_t const*>(_dbuf_ptr->data()),
                                               _dbuf_ptr->size());
-    if (comptype == compression_type::GZIP || comptype == compression_type::ZIP ||
-        comptype == compression_type::SNAPPY) {
+    if (_comptype == compression_type::GZIP || _comptype == compression_type::ZIP ||
+        _comptype == compression_type::SNAPPY) {
       _decompressed_ch_buffer_size = cudf::io::detail::get_uncompressed_size(_comptype, ch_buffer);
     } else {
       _decompressed_buffer = cudf::io::detail::decompress(_comptype, ch_buffer);
@@ -96,7 +110,22 @@ class compressed_host_buffer_source final : public datasource {
     return std::make_unique<non_owning_buffer>(_decompressed_buffer.data() + offset, count);
   }

-  [[nodiscard]] bool supports_device_read() const override { return false; }
+  std::future<size_t> device_read_async(size_t offset,
+                                        size_t size,
+                                        uint8_t* dst,
+                                        rmm::cuda_stream_view stream) override
+  {
+    auto& thread_pool = pools::tpool();
+    return thread_pool.submit_task([this, offset, size, dst, stream] {
+      auto hbuf = host_read(offset, size);
+      CUDF_CUDA_TRY(
+        cudaMemcpyAsync(dst, hbuf->data(), hbuf->size(), cudaMemcpyHostToDevice, stream.value()));
+      stream.synchronize();
+      return hbuf->size();
+    });
+  }
+
+  [[nodiscard]] bool supports_device_read() const override { return true; }

   [[nodiscard]] size_t size() const override { return _decompressed_ch_buffer_size; }

@@ -431,6 +460,8 @@ device_span<char> ingest_raw_input(device_span<char> buffer,
   // line of file i+1 don't end up on the same JSON line, if file i does not already end with a line
   // delimiter.
   auto constexpr num_delimiter_chars = 1;
+  std::vector<std::future<size_t>> thread_tasks;
+  auto stream_pool = cudf::detail::fork_streams(stream, pools::tpool().get_thread_count());

   auto delimiter_map = cudf::detail::make_empty_host_vector<std::size_t>(sources.size(), stream);
   std::vector<std::size_t> prefsum_source_sizes(sources.size());
@@ -447,13 +478,17 @@ device_span<char> ingest_raw_input(device_span<char> buffer,

   auto const total_bytes_to_read = std::min(range_size, prefsum_source_sizes.back() - range_offset);
   range_offset -= start_source ? prefsum_source_sizes[start_source - 1] : 0;
-  for (std::size_t i = start_source; i < sources.size() && bytes_read < total_bytes_to_read; i++) {
+  for (std::size_t i = start_source, cur_stream = 0;
+       i < sources.size() && bytes_read < total_bytes_to_read;
+       i++) {
     if (sources[i]->is_empty()) continue;
     auto data_size = std::min(sources[i]->size() - range_offset, total_bytes_to_read - bytes_read);
     auto destination = reinterpret_cast<uint8_t*>(buffer.data()) + bytes_read +
                        (num_delimiter_chars * delimiter_map.size());
-    if (sources[i]->is_device_read_preferred(data_size)) {
-      bytes_read += sources[i]->device_read(range_offset, data_size, destination, stream);
+    if (sources[i]->supports_device_read()) {
+      thread_tasks.emplace_back(sources[i]->device_read_async(
+        range_offset, data_size, destination, stream_pool[cur_stream++ % stream_pool.size()]));
+      bytes_read += data_size;
     } else {
       h_buffers.emplace_back(sources[i]->host_read(range_offset, data_size));
       auto const& h_buffer = h_buffers.back();
@@ -481,6 +516,15 @@ device_span<char> ingest_raw_input(device_span<char> buffer,
                     buffer.data());
   }
   stream.synchronize();
+
+  if (thread_tasks.size()) {
+    auto const bytes_read = std::accumulate(
+      thread_tasks.begin(), thread_tasks.end(), std::size_t{0}, [](std::size_t sum, auto& task) {
+        return sum + task.get();
+      });
+    CUDF_EXPECTS(bytes_read == total_bytes_to_read, "something's fishy");
+  }
+
   return buffer.first(bytes_read + (delimiter_map.size() * num_delimiter_chars));
 }

@@ -505,10 +549,17 @@ table_with_metadata read_json(host_span<std::unique_ptr<datasource>> sources,
     return read_json_impl(sources, reader_opts, stream, mr);

   std::vector<std::unique_ptr<datasource>> compressed_sources;
-  for (size_t i = 0; i < sources.size(); i++) {
-    compressed_sources.emplace_back(
-      std::make_unique<compressed_host_buffer_source>(sources[i], reader_opts.get_compression()));
+  std::vector<std::future<std::unique_ptr<compressed_host_buffer_source>>> thread_tasks;
+  auto& thread_pool = pools::tpool();
+  for (auto& src : sources) {
+    thread_tasks.emplace_back(thread_pool.submit_task([&reader_opts, &src] {
+      return std::make_unique<compressed_host_buffer_source>(src, reader_opts.get_compression());
+    }));
   }
+  std::transform(thread_tasks.begin(),
+                 thread_tasks.end(),
+                 std::back_inserter(compressed_sources),
+                 [](auto& task) { return task.get(); });
   // in read_json_impl, we need the compressed source size to actually be the
   // uncompressed source size for correct batching
   return read_json_impl(compressed_sources, reader_opts, stream, mr);
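
The reader changes above push per-source work onto a shared BS::thread_pool (pools::tpool()): each source that supports device reads gets a device_read_async task that copies its bytes on one of several forked CUDA streams, and the resulting futures are gathered with std::accumulate to check the total byte count. The sketch below isolates that submit-and-gather pattern with made-up chunk sizes; it exercises only BS::thread_pool::submit_task and the future accumulation, not cudf's datasource machinery.

// Standalone sketch of the submit-and-gather pattern; chunk_sizes are made up for illustration.
#include <BS_thread_pool.hpp>

#include <cstddef>
#include <future>
#include <numeric>
#include <thread>
#include <vector>

int main()
{
  BS::thread_pool pool(std::thread::hardware_concurrency());

  std::vector<std::size_t> chunk_sizes{1024, 2048, 4096};  // hypothetical per-source byte counts
  std::vector<std::future<std::size_t>> tasks;

  for (auto size : chunk_sizes) {
    // In the real reader each task performs a host read plus an async H2D copy;
    // here it only reports how many bytes it would have copied.
    tasks.emplace_back(pool.submit_task([size] { return size; }));
  }

  // Block on every future and total the bytes, mirroring the std::accumulate in ingest_raw_input.
  auto const total_bytes = std::accumulate(
    tasks.begin(), tasks.end(), std::size_t{0}, [](std::size_t sum, auto& task) {
      return sum + task.get();
    });

  return total_bytes == 1024 + 2048 + 4096 ? 0 : 1;
}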

cpp/src/io/parquet/decode_fixed.cu

Lines changed: 1 addition & 4 deletions
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -961,9 +961,6 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8)
     return;
   }

-  // if we have no work to do (eg, in a skip_rows/num_rows case) in this page.
-  if (s->num_rows == 0) { return; }
-
   using value_decoder_type = std::conditional_t<
     split_decode_t,
     decode_fixed_width_split_values_func<decode_block_size_t, has_lists_t, state_buf_t>,
