Commit dc99d2f

Introduce some simple benchmarks for rolling window aggregations (#17613)
Previously we did not have any benchmarks for rolling aggregations. Introduce some, so we can measure the effects of any performance improvements we might make.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - MithunR (https://github.com/mythrocks)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #17613
1 parent 30c6caa commit dc99d2f

3 files changed (+210 −1 lines)

cpp/benchmarks/CMakeLists.txt (+6 −1)
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2018-2024, NVIDIA CORPORATION.
+# Copyright (c) 2018-2025, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -425,6 +425,11 @@ ConfigureNVBench(DECIMAL_NVBENCH decimal/convert_floating.cpp)
 # ---------------------------------------------------------------------------------
 ConfigureNVBench(RESHAPE_NVBENCH reshape/interleave.cpp)
 
+# ##################################################################################################
+# * rolling benchmark
+# ---------------------------------------------------------------------------------
+ConfigureNVBench(ROLLING_NVBENCH rolling/grouped_rolling_sum.cpp rolling/rolling_sum.cpp)
+
 add_custom_target(
   run_benchmarks
   DEPENDS CUDF_BENCHMARKS
cpp/benchmarks/rolling/grouped_rolling_sum.cpp (+70, new file)

@@ -0,0 +1,70 @@
/*
 * Copyright (c) 2024-2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>

#include <cudf/aggregation.hpp>
#include <cudf/rolling.hpp>
#include <cudf/sorting.hpp>
#include <cudf/utilities/default_stream.hpp>

#include <nvbench/nvbench.cuh>

template <typename Type>
void bench_row_grouped_rolling_sum(nvbench::state& state, nvbench::type_list<Type>)
{
  auto const num_rows       = static_cast<cudf::size_type>(state.get_int64("num_rows"));
  auto const cardinality    = static_cast<cudf::size_type>(state.get_int64("cardinality"));
  auto const preceding_size = static_cast<cudf::size_type>(state.get_int64("preceding_size"));
  auto const following_size = static_cast<cudf::size_type>(state.get_int64("following_size"));
  auto const min_periods    = static_cast<cudf::size_type>(state.get_int64("min_periods"));

  auto const keys = [&] {
    data_profile const profile =
      data_profile_builder()
        .cardinality(cardinality)
        .no_validity()
        .distribution(cudf::type_to_id<int32_t>(), distribution_id::UNIFORM, 0, num_rows);
    auto keys = create_random_column(cudf::type_to_id<int32_t>(), row_count{num_rows}, profile);
    return cudf::sort(cudf::table_view{{keys->view()}});
  }();
  data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution(
    cudf::type_to_id<Type>(), distribution_id::UNIFORM, 0, 100);
  auto vals = create_random_column(cudf::type_to_id<Type>(), row_count{num_rows}, profile);

  auto req = cudf::make_sum_aggregation<cudf::rolling_aggregation>();

  auto const mem_stats_logger = cudf::memory_stats_logger();
  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
    auto const result = cudf::grouped_rolling_window(
      keys->view(), vals->view(), preceding_size, following_size, min_periods, *req);
  });
  auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
  state.add_element_count(static_cast<double>(num_rows) / elapsed_time / 1'000'000., "Mrows/s");
  state.add_buffer_size(
    mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
}

NVBENCH_BENCH_TYPES(bench_row_grouped_rolling_sum,
                    NVBENCH_TYPE_AXES(nvbench::type_list<std::int32_t, double>))
  .set_name("row_grouped_rolling_sum")
  .add_int64_power_of_two_axis("num_rows", {14, 28})
  .add_int64_axis("preceding_size", {1, 10})
  .add_int64_axis("following_size", {2})
  .add_int64_axis("min_periods", {1})
  .add_int64_axis("cardinality", {10, 100, 1'000'000, 100'000'000});
cpp/benchmarks/rolling/rolling_sum.cpp (+134, new file)

@@ -0,0 +1,134 @@
/*
 * Copyright (c) 2024-2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>

#include <cudf/aggregation.hpp>
#include <cudf/rolling.hpp>
#include <cudf/sorting.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/default_stream.hpp>

#include <rmm/device_buffer.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/iterator/counting_iterator.h>

#include <nvbench/nvbench.cuh>

#include <algorithm>

template <typename Type>
void bench_row_fixed_rolling_sum(nvbench::state& state, nvbench::type_list<Type>)
{
  auto const num_rows       = static_cast<cudf::size_type>(state.get_int64("num_rows"));
  auto const preceding_size = static_cast<cudf::size_type>(state.get_int64("preceding_size"));
  auto const following_size = static_cast<cudf::size_type>(state.get_int64("following_size"));
  auto const min_periods    = static_cast<cudf::size_type>(state.get_int64("min_periods"));

  data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution(
    cudf::type_to_id<Type>(), distribution_id::UNIFORM, 0, 100);
  auto vals = create_random_column(cudf::type_to_id<Type>(), row_count{num_rows}, profile);

  auto req = cudf::make_sum_aggregation<cudf::rolling_aggregation>();

  auto const mem_stats_logger = cudf::memory_stats_logger();
  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
    auto const result =
      cudf::rolling_window(vals->view(), preceding_size, following_size, min_periods, *req);
  });
  auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
  state.add_element_count(static_cast<double>(num_rows) / elapsed_time / 1'000'000., "Mrows/s");
  state.add_buffer_size(
    mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
}

template <typename Type>
void bench_row_variable_rolling_sum(nvbench::state& state, nvbench::type_list<Type>)
{
  auto const num_rows       = static_cast<cudf::size_type>(state.get_int64("num_rows"));
  auto const preceding_size = static_cast<cudf::size_type>(state.get_int64("preceding_size"));
  auto const following_size = static_cast<cudf::size_type>(state.get_int64("following_size"));

  auto vals = [&]() {
    data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution(
      cudf::type_to_id<Type>(), distribution_id::UNIFORM, 0, 100);
    return create_random_column(cudf::type_to_id<Type>(), row_count{num_rows}, profile);
  }();

  auto preceding = [&]() {
    auto data = std::vector<cudf::size_type>(num_rows);
    auto it   = thrust::make_counting_iterator<cudf::size_type>(0);
    std::transform(it, it + num_rows, data.begin(), [num_rows, preceding_size](auto i) {
      return std::min(i + 1, std::max(preceding_size, i + 1 - num_rows));
    });
    auto buf = rmm::device_buffer(
      data.data(), num_rows * sizeof(cudf::size_type), cudf::get_default_stream());
    cudf::get_default_stream().synchronize();
    return std::make_unique<cudf::column>(cudf::data_type(cudf::type_to_id<cudf::size_type>()),
                                          num_rows,
                                          std::move(buf),
                                          rmm::device_buffer{},
                                          0);
  }();

  auto following = [&]() {
    auto data = std::vector<cudf::size_type>(num_rows);
    auto it   = thrust::make_counting_iterator<cudf::size_type>(0);
    std::transform(it, it + num_rows, data.begin(), [num_rows, following_size](auto i) {
      return std::max(-i - 1, std::min(following_size, num_rows - i - 1));
    });
    auto buf = rmm::device_buffer(
      data.data(), num_rows * sizeof(cudf::size_type), cudf::get_default_stream());
    cudf::get_default_stream().synchronize();
    return std::make_unique<cudf::column>(cudf::data_type(cudf::type_to_id<cudf::size_type>()),
                                          num_rows,
                                          std::move(buf),
                                          rmm::device_buffer{},
                                          0);
  }();

  auto req = cudf::make_sum_aggregation<cudf::rolling_aggregation>();

  auto const mem_stats_logger = cudf::memory_stats_logger();
  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
    auto const result =
      cudf::rolling_window(vals->view(), preceding->view(), following->view(), 1, *req);
  });
  auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
  state.add_element_count(static_cast<double>(num_rows) / elapsed_time / 1'000'000., "Mrows/s");
  state.add_buffer_size(
    mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
}

NVBENCH_BENCH_TYPES(bench_row_fixed_rolling_sum,
                    NVBENCH_TYPE_AXES(nvbench::type_list<std::int32_t, double>))
  .set_name("row_fixed_rolling_sum")
  .add_int64_power_of_two_axis("num_rows", {14, 22, 28})
  .add_int64_axis("preceding_size", {1, 10, 100})
  .add_int64_axis("following_size", {2})
  .add_int64_axis("min_periods", {1, 20});

NVBENCH_BENCH_TYPES(bench_row_variable_rolling_sum,
                    NVBENCH_TYPE_AXES(nvbench::type_list<std::int32_t, double>))
  .set_name("row_variable_rolling_sum")
  .add_int64_power_of_two_axis("num_rows", {14, 22, 28})
  .add_int64_axis("preceding_size", {10, 100})
  .add_int64_axis("following_size", {2});
