rapidsai
diff --git a/‎.github/workflows/build.yaml
+1-1 b/‎.github/workflows/build.yaml
+1-1
diff --git a/‎.github/workflows/pr.yaml
+3-3 b/‎.github/workflows/pr.yaml
+3-3
diff --git a/‎.github/workflows/test.yaml
+3-3 b/‎.github/workflows/test.yaml
+3-3
diff --git a/‎ci/test_python_other.sh
+3-10 b/‎ci/test_python_other.sh
+3-10
diff --git a/‎ci/test_wheel_dask_cudf.sh
+3-13 b/‎ci/test_wheel_dask_cudf.sh
+3-13
diff --git a/‎cpp/CMakeLists.txt
+1 b/‎cpp/CMakeLists.txt
+1
diff --git a/‎cpp/benchmarks/CMakeLists.txt
+6-1 b/‎cpp/benchmarks/CMakeLists.txt
+6-1
diff --git a/‎cpp/benchmarks/rolling/grouped_rolling_sum.cpp
+70 b/‎cpp/benchmarks/rolling/grouped_rolling_sum.cpp
+70
diff --git a/‎cpp/benchmarks/rolling/rolling_sum.cpp
+134 b/‎cpp/benchmarks/rolling/rolling_sum.cpp
+134
diff --git a/‎cpp/include/cudf/hashing.hpp
+21-1 b/‎cpp/include/cudf/hashing.hpp
+21-1
@@ -62,7 +62,7 @@ jobs:
       arch: "amd64"
       branch: ${{ inputs.branch }}
       build_type: ${{ inputs.build_type || 'branch' }}
-      container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11"
+      container_image: "rapidsai/ci-conda:latest"
       date: ${{ inputs.date }}
       node_type: "gpu-v100-latest-1"
       run_script: "ci/build_docs.sh"
 
@@ -186,7 +186,7 @@ jobs:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
       arch: "amd64"
-      container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11"
+      container_image: "rapidsai/ci-conda:latest"
       run_script: "ci/test_java.sh"
   static-configure:
     needs: checks
@@ -207,7 +207,7 @@ jobs:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
       arch: "amd64"
-      container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11"
+      container_image: "rapidsai/ci-conda:latest"
       run_script: "ci/test_notebooks.sh"
   docs-build:
     needs: conda-python-build
@@ -217,7 +217,7 @@ jobs:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
       arch: "amd64"
-      container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11"
+      container_image: "rapidsai/ci-conda:latest"
       run_script: "ci/build_docs.sh"
   wheel-build-libcudf:
     needs: checks
 
@@ -41,7 +41,7 @@ jobs:
       sha: ${{ inputs.sha }}
       node_type: "gpu-v100-latest-1"
       arch: "amd64"
-      container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11"
+      container_image: "rapidsai/ci-conda:latest"
       run_script: "ci/test_cpp_memcheck.sh"
   static-configure:
     secrets: inherit
@@ -94,7 +94,7 @@ jobs:
       sha: ${{ inputs.sha }}
       node_type: "gpu-v100-latest-1"
       arch: "amd64"
-      container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11"
+      container_image: "rapidsai/ci-conda:latest"
       run_script: "ci/test_java.sh"
   conda-notebook-tests:
     secrets: inherit
@@ -106,7 +106,7 @@ jobs:
       sha: ${{ inputs.sha }}
       node_type: "gpu-v100-latest-1"
       arch: "amd64"
-      container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11"
+      container_image: "rapidsai/ci-conda:latest"
       run_script: "ci/test_notebooks.sh"
   wheel-tests-cudf:
     secrets: inherit
 
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2022-2024, NVIDIA CORPORATION.
+# Copyright (c) 2022-2025, NVIDIA CORPORATION.
 
 # Support invoking test_python_cudf.sh outside the script directory
 cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../
@@ -24,8 +24,8 @@ EXITCODE=0
 trap "EXITCODE=1" ERR
 set +e
 
-rapids-logger "pytest dask_cudf (dask-expr)"
-DASK_DATAFRAME__QUERY_PLANNING=True ./ci/run_dask_cudf_pytests.sh \
+rapids-logger "pytest dask_cudf"
+./ci/run_dask_cudf_pytests.sh \
   --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf.xml" \
   --numprocesses=8 \
   --dist=worksteal \
@@ -34,13 +34,6 @@ DASK_DATAFRAME__QUERY_PLANNING=True ./ci/run_dask_cudf_pytests.sh \
   --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/dask-cudf-coverage.xml" \
   --cov-report=term
 
-rapids-logger "pytest dask_cudf (legacy)"
-DASK_DATAFRAME__QUERY_PLANNING=False ./ci/run_dask_cudf_pytests.sh \
-  --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf-legacy.xml" \
-  --numprocesses=8 \
-  --dist=worksteal \
-  .
-
 rapids-logger "pytest cudf_kafka"
 ./ci/run_cudf_kafka_pytests.sh \
   --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf-kafka.xml"
 
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+# Copyright (c) 2023-2025, NVIDIA CORPORATION.
 
 set -eou pipefail
 
@@ -30,21 +30,11 @@ RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${RESULTS_DIR}/test-results"}/
 mkdir -p "${RAPIDS_TESTS_DIR}"
 
 # Run tests in dask_cudf/tests and dask_cudf/io/tests
-rapids-logger "pytest dask_cudf (dask-expr)"
+rapids-logger "pytest dask_cudf"
 pushd python/dask_cudf/dask_cudf
-DASK_DATAFRAME__QUERY_PLANNING=True python -m pytest \
+python -m pytest \
   --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf.xml" \
   --numprocesses=8 \
   --dist=worksteal \
   .
 popd
-
-# Run tests in dask_cudf/tests and dask_cudf/io/tests (legacy)
-rapids-logger "pytest dask_cudf (legacy)"
-pushd python/dask_cudf/dask_cudf
-DASK_DATAFRAME__QUERY_PLANNING=False python -m pytest \
-  --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf-legacy.xml" \
-  --numprocesses=8 \
-  --dist=worksteal \
-  .
-popd
@@ -461,6 +461,7 @@ add_library(
   src/hash/sha256_hash.cu
   src/hash/sha384_hash.cu
   src/hash/sha512_hash.cu
+  src/hash/xxhash_32.cu
   src/hash/xxhash_64.cu
   src/interop/dlpack.cpp
   src/interop/arrow_utilities.cpp
 
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2018-2024, NVIDIA CORPORATION.
+# Copyright (c) 2018-2025, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -425,6 +425,11 @@ ConfigureNVBench(DECIMAL_NVBENCH decimal/convert_floating.cpp)
 # ---------------------------------------------------------------------------------
 ConfigureNVBench(RESHAPE_NVBENCH reshape/interleave.cpp)
 
+# ##################################################################################################
+# * rolling benchmark
+# ---------------------------------------------------------------------------------
+ConfigureNVBench(ROLLING_NVBENCH rolling/grouped_rolling_sum.cpp rolling/rolling_sum.cpp)
+
 add_custom_target(
   run_benchmarks
   DEPENDS CUDF_BENCHMARKS
 
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+
+#include <cudf/aggregation.hpp>
+#include <cudf/rolling.hpp>
+#include <cudf/sorting.hpp>
+#include <cudf/utilities/default_stream.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+// Benchmarks a row-based rolling SUM evaluated per group over a sorted int32 key column.
+template <typename Type>
+void bench_row_grouped_rolling_sum(nvbench::state& state, nvbench::type_list<Type>)
+{
+  auto const axis = [&](char const* n) { return static_cast<cudf::size_type>(state.get_int64(n)); };
+  auto const num_rows       = axis("num_rows");
+  auto const cardinality    = axis("cardinality");
+  auto const preceding_size = axis("preceding_size");
+  auto const following_size = axis("following_size");
+  auto const min_periods    = axis("min_periods");
+
+  // The unsorted column is local to the lambda, so only the sorted key table outlives it.
+  auto const keys = [&] {
+    auto const key_type = cudf::type_to_id<int32_t>();
+    data_profile const key_profile =
+      data_profile_builder().cardinality(cardinality).no_validity().distribution(
+        key_type, distribution_id::UNIFORM, 0, num_rows);
+    auto unsorted = create_random_column(key_type, row_count{num_rows}, key_profile);
+    return cudf::sort(cudf::table_view{{unsorted->view()}});
+  }();
+  data_profile const val_profile = data_profile_builder().cardinality(0).no_validity().distribution(
+    cudf::type_to_id<Type>(), distribution_id::UNIFORM, 0, 100);
+  auto vals = create_random_column(cudf::type_to_id<Type>(), row_count{num_rows}, val_profile);
+  auto agg  = cudf::make_sum_aggregation<cudf::rolling_aggregation>();
+
+  auto const mem_logger = cudf::memory_stats_logger();
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    auto const result = cudf::grouped_rolling_window(
+      keys->view(), vals->view(), preceding_size, following_size, min_periods, *agg);
+  });
+  auto const mean_gpu_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
+  state.add_element_count(static_cast<double>(num_rows) / mean_gpu_time / 1'000'000., "Mrows/s");
+  state.add_buffer_size(mem_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
+}
+
+NVBENCH_BENCH_TYPES(bench_row_grouped_rolling_sum,
+                    NVBENCH_TYPE_AXES(nvbench::type_list<std::int32_t, double>))
+  .set_name("row_grouped_rolling_sum")
+  .add_int64_power_of_two_axis("num_rows", {14, 28})
+  .add_int64_axis("preceding_size", {1, 10})
+  .add_int64_axis("following_size", {2})
+  .add_int64_axis("min_periods", {1})
+  .add_int64_axis("cardinality", {10, 100, 1'000'000, 100'000'000});
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+
+#include <cudf/aggregation.hpp>
+#include <cudf/rolling.hpp>
+#include <cudf/sorting.hpp>
+#include <cudf/types.hpp>
+#include <cudf/utilities/default_stream.hpp>
+
+#include <rmm/device_buffer.hpp>
+#include <rmm/device_uvector.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <thrust/iterator/counting_iterator.h>
+
+#include <nvbench/nvbench.cuh>
+
+#include <algorithm>
+
+// Benchmarks a rolling SUM that uses the same preceding/following window sizes for every row.
+template <typename Type>
+void bench_row_fixed_rolling_sum(nvbench::state& state, nvbench::type_list<Type>)
+{
+  auto const axis = [&](char const* n) { return static_cast<cudf::size_type>(state.get_int64(n)); };
+  auto const num_rows       = axis("num_rows");
+  auto const preceding_size = axis("preceding_size");
+  auto const following_size = axis("following_size");
+  auto const min_periods    = axis("min_periods");
+
+  data_profile const val_profile = data_profile_builder().cardinality(0).no_validity().distribution(
+    cudf::type_to_id<Type>(), distribution_id::UNIFORM, 0, 100);
+  auto vals = create_random_column(cudf::type_to_id<Type>(), row_count{num_rows}, val_profile);
+  auto agg  = cudf::make_sum_aggregation<cudf::rolling_aggregation>();
+
+  auto const mem_logger = cudf::memory_stats_logger();
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    auto const result =
+      cudf::rolling_window(vals->view(), preceding_size, following_size, min_periods, *agg);
+  });
+  auto const mean_gpu_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
+  state.add_element_count(static_cast<double>(num_rows) / mean_gpu_time / 1'000'000., "Mrows/s");
+  state.add_buffer_size(mem_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
+}
+
+// Benchmarks a rolling SUM whose window bounds are supplied as per-row preceding/following
+// columns, built on the host from the `preceding_size` and `following_size` axes.
+template <typename Type>
+void bench_row_variable_rolling_sum(nvbench::state& state, nvbench::type_list<Type>)
+{
+  // Axis values are read as int64; narrow each one to cudf::size_type.
+  auto const axis = [&](char const* n) { return static_cast<cudf::size_type>(state.get_int64(n)); };
+  auto const num_rows       = axis("num_rows");
+  auto const preceding_size = axis("preceding_size");
+  auto const following_size = axis("following_size");
+
+  auto vals = [&]() {
+    data_profile const val_profile =
+      data_profile_builder().cardinality(0).no_validity().distribution(
+        cudf::type_to_id<Type>(), distribution_id::UNIFORM, 0, 100);
+    return create_random_column(cudf::type_to_id<Type>(), row_count{num_rows}, val_profile);
+  }();
+
+  // Evaluates `bound(i)` for each row index on the host, then copies the values into a
+  // size_type column that has no null mask.
+  auto const make_window_column = [num_rows](auto bound) {
+    auto host_sizes = std::vector<cudf::size_type>(num_rows);
+    auto first_row  = thrust::make_counting_iterator<cudf::size_type>(0);
+    std::transform(first_row, first_row + num_rows, host_sizes.begin(), bound);
+    auto device_sizes = rmm::device_buffer(
+      host_sizes.data(), num_rows * sizeof(cudf::size_type), cudf::get_default_stream());
+    // Synchronize before `host_sizes` is destroyed at the end of this lambda.
+    cudf::get_default_stream().synchronize();
+    return std::make_unique<cudf::column>(cudf::data_type(cudf::type_to_id<cudf::size_type>()),
+                                          num_rows,
+                                          std::move(device_sizes),
+                                          rmm::device_buffer{},
+                                          0);
+  };
+
+  // preceding[i] is `preceding_size` limited to the range [i + 1 - num_rows, i + 1].
+  auto preceding = make_window_column([num_rows, preceding_size](auto i) {
+    return std::min(i + 1, std::max(preceding_size, i + 1 - num_rows));
+  });
+
+  // following[i] is `following_size` limited to the range [-i - 1, num_rows - i - 1].
+  auto following = make_window_column([num_rows, following_size](auto i) {
+    return std::max(-i - 1, std::min(following_size, num_rows - i - 1));
+  });
+
+  auto agg = cudf::make_sum_aggregation<cudf::rolling_aggregation>();
+
+  auto const mem_logger = cudf::memory_stats_logger();
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    // min_periods is fixed at 1 here rather than being a benchmark axis.
+    auto const result =
+      cudf::rolling_window(vals->view(), preceding->view(), following->view(), 1, *agg);
+  });
+  auto const mean_gpu_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
+  state.add_element_count(static_cast<double>(num_rows) / mean_gpu_time / 1'000'000., "Mrows/s");
+  state.add_buffer_size(mem_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
+}
+
+NVBENCH_BENCH_TYPES(bench_row_fixed_rolling_sum,
+                    NVBENCH_TYPE_AXES(nvbench::type_list<std::int32_t, double>))
+  .set_name("row_fixed_rolling_sum")
+  .add_int64_power_of_two_axis("num_rows", {14, 22, 28})
+  .add_int64_axis("preceding_size", {1, 10, 100})
+  .add_int64_axis("following_size", {2})
+  .add_int64_axis("min_periods", {1, 20});
+
+NVBENCH_BENCH_TYPES(bench_row_variable_rolling_sum,
+                    NVBENCH_TYPE_AXES(nvbench::type_list<std::int32_t, double>))
+  .set_name("row_variable_rolling_sum")
+  .add_int64_power_of_two_axis("num_rows", {14, 22, 28})
+  .add_int64_axis("preceding_size", {10, 100})
+  .add_int64_axis("following_size", {2});
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -166,6 +166,26 @@ std::unique_ptr<column> sha512(
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
+/**
+ * @brief Computes the XXHash_32 hash value of each row in the given table
+ *
+ * This function computes the hash of each column using the `seed` for the first column
+ * and the resulting hash as a seed for the next column and so on.
+ * The result is a uint32 value for each row.
+ *
+ * @param input The table of columns to hash
+ * @param seed Optional seed value to use for the hash function
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ *
+ * @returns A column where each row is the hash of a row from the input
+ */
+std::unique_ptr<column> xxhash_32(
+  table_view const& input,
+  uint32_t seed                     = DEFAULT_HASH_SEED,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
+
 /**
  * @brief Computes the XXHash_64 hash value of each row in the given table
  *