From f84725ea7b7377069528026d2c3b89e3abae327e Mon Sep 17 00:00:00 2001 From: Ray Douglass Date: Thu, 23 Jan 2025 15:06:01 -0500 Subject: [PATCH 01/17] DOC v25.04 Updates [skip ci] --- .../cuda11.8-conda/devcontainer.json | 6 +-- .devcontainer/cuda11.8-pip/devcontainer.json | 6 +-- .../cuda12.5-conda/devcontainer.json | 8 +-- .devcontainer/cuda12.5-pip/devcontainer.json | 6 +-- .github/workflows/build.yaml | 28 +++++----- .github/workflows/pandas-tests.yaml | 2 +- .github/workflows/pr.yaml | 54 +++++++++---------- .../workflows/pr_issue_status_automation.yml | 8 +-- .github/workflows/test.yaml | 30 +++++------ .../trigger-breaking-change-alert.yaml | 2 +- README.md | 2 +- VERSION | 2 +- .../all_cuda-118_arch-x86_64.yaml | 10 ++-- .../all_cuda-125_arch-x86_64.yaml | 10 ++-- cpp/examples/versions.cmake | 4 +- dependencies.yaml | 50 ++++++++--------- java/ci/README.md | 4 +- java/pom.xml | 2 +- .../dependencies.yaml | 6 +-- python/cudf/pyproject.toml | 14 ++--- python/cudf_kafka/pyproject.toml | 4 +- python/cudf_polars/docs/overview.md | 2 +- python/cudf_polars/pyproject.toml | 4 +- python/custreamz/pyproject.toml | 6 +-- python/dask_cudf/pyproject.toml | 6 +-- python/libcudf/pyproject.toml | 6 +-- python/pylibcudf/pyproject.toml | 12 ++--- 27 files changed, 147 insertions(+), 147 deletions(-) diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json index e793dda3823..8c2226bfa98 100644 --- a/.devcontainer/cuda11.8-conda/devcontainer.json +++ b/.devcontainer/cuda11.8-conda/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:25.02-cpp-cuda11.8-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:25.04-cpp-cuda11.8-mambaforge-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.02-cuda11.8-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.04-cuda11.8-conda" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.2": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.4": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json index ba959f2bc27..a0c2e65b337 100644 --- a/.devcontainer/cuda11.8-pip/devcontainer.json +++ b/.devcontainer/cuda11.8-pip/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:25.02-cpp-cuda11.8-ubuntu22.04" + "BASE": "rapidsai/devcontainers:25.04-cpp-cuda11.8-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.02-cuda11.8-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.04-cuda11.8-pip" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.2": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.4": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda12.5-conda/devcontainer.json b/.devcontainer/cuda12.5-conda/devcontainer.json index ecc88038136..244c624b871 100644 --- a/.devcontainer/cuda12.5-conda/devcontainer.json +++ b/.devcontainer/cuda12.5-conda/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": 
"12.5", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:25.02-cpp-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:25.04-cpp-mambaforge-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.02-cuda12.5-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.04-cuda12.5-conda" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/cuda:25.2": { + "ghcr.io/rapidsai/devcontainers/features/cuda:25.4": { "version": "12.5", "installCompilers": false, "installProfilers": true, @@ -36,7 +36,7 @@ "installnvJPEG": false, "pruneStaticLibs": true }, - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.2": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.4": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/cuda", diff --git a/.devcontainer/cuda12.5-pip/devcontainer.json b/.devcontainer/cuda12.5-pip/devcontainer.json index b4828038f7d..a4b0c1acb7d 100644 --- a/.devcontainer/cuda12.5-pip/devcontainer.json +++ b/.devcontainer/cuda12.5-pip/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "12.5", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:25.02-cpp-cuda12.5-ubuntu22.04" + "BASE": "rapidsai/devcontainers:25.04-cpp-cuda12.5-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.02-cuda12.5-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.04-cuda12.5-pip" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.2": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.4": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 65aebfb7f8c..85bac8395a2 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.02 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.02 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-25.02 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-25.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -57,7 +57,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.02 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.04 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -69,7 +69,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-libcudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02 + 
uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.04 with: # build for every combination of arch and CUDA version, but only for the latest Python matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) @@ -81,7 +81,7 @@ jobs: wheel-publish-libcudf: needs: wheel-build-libcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -92,7 +92,7 @@ jobs: wheel-build-pylibcudf: needs: [wheel-publish-libcudf] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -102,7 +102,7 @@ jobs: wheel-publish-pylibcudf: needs: wheel-build-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -113,7 +113,7 @@ jobs: wheel-build-cudf: needs: wheel-publish-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -123,7 +123,7 @@ jobs: wheel-publish-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -134,7 +134,7 @@ jobs: wheel-build-dask-cudf: needs: wheel-publish-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.04 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -146,7 +146,7 @@ jobs: wheel-publish-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -157,7 +157,7 @@ jobs: wheel-build-cudf-polars: needs: wheel-publish-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.04 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -169,7 +169,7 @@ jobs: wheel-publish-cudf-polars: needs: wheel-build-cudf-polars secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pandas-tests.yaml b/.github/workflows/pandas-tests.yaml index a29babb218f..fea393c549e 100644 --- a/.github/workflows/pandas-tests.yaml +++ b/.github/workflows/pandas-tests.yaml @@ -17,7 +17,7 @@ jobs: pandas-tests: # run the Pandas unit tests secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.04 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index e955b8f1f80..9c0bb7d6840 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -42,7 +42,7 @@ jobs: - pandas-tests-diff - telemetry-setup secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-25.02 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-25.04 if: always() with: needs: ${{ toJSON(needs) }} @@ -70,7 +70,7 @@ jobs: changed-files: secrets: inherit needs: telemetry-setup - uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@branch-25.02 + uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@branch-25.04 with: files_yaml: | test_cpp: @@ -123,47 +123,47 @@ jobs: checks: secrets: inherit needs: telemetry-setup - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-25.02 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-25.04 with: enable_check_generated_files: false ignored_pr_jobs: "telemetry-summarize" conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.02 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.04 with: build_type: pull-request cpp-linters: secrets: inherit needs: checks - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.02 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.04 with: build_type: pull-request run_script: "ci/cpp_linters.sh" conda-cpp-checks: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-25.02 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-25.04 with: build_type: pull-request enable_check_symbols: true conda-cpp-tests: needs: [conda-cpp-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-25.02 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-25.04 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_cpp with: build_type: pull-request conda-python-build: needs: conda-cpp-build secrets: inherit - uses: 
rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.02 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.04 with: build_type: pull-request conda-python-cudf-tests: needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.02 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.04 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request @@ -172,7 +172,7 @@ jobs: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.02 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.04 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request @@ -180,7 +180,7 @@ jobs: conda-java-tests: needs: [conda-cpp-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.02 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.04 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_java with: build_type: pull-request @@ -191,7 +191,7 @@ jobs: static-configure: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.02 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.04 with: build_type: pull-request # Use the wheel container so we can skip conda solves and since our @@ -201,7 +201,7 @@ jobs: conda-notebook-tests: needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.02 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.04 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_notebooks with: build_type: pull-request @@ -212,7 +212,7 @@ jobs: docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.02 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.04 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -222,7 +222,7 @@ jobs: wheel-build-libcudf: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.04 with: # build for every combination of arch and CUDA version, but only for the latest Python matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) @@ -231,21 +231,21 @@ jobs: wheel-build-pylibcudf: needs: [checks, wheel-build-libcudf] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.04 with: build_type: pull-request script: "ci/build_wheel_pylibcudf.sh" wheel-build-cudf: needs: wheel-build-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.04 with: build_type: pull-request script: "ci/build_wheel_cudf.sh" wheel-tests-cudf: needs: [wheel-build-cudf, changed-files] 
secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.04 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request @@ -253,7 +253,7 @@ jobs: wheel-build-cudf-polars: needs: wheel-build-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.04 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -262,7 +262,7 @@ jobs: wheel-tests-cudf-polars: needs: [wheel-build-cudf-polars, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.04 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". @@ -272,7 +272,7 @@ jobs: cudf-polars-polars-tests: needs: wheel-build-cudf-polars secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.04 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -281,7 +281,7 @@ jobs: wheel-build-dask-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.04 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -290,7 +290,7 @@ jobs: wheel-tests-dask-cudf: needs: [wheel-build-dask-cudf, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.04 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". @@ -300,7 +300,7 @@ jobs: devcontainer: secrets: inherit needs: telemetry-setup - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-25.02 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-25.04 with: arch: '["amd64"]' cuda: '["12.5"]' @@ -311,7 +311,7 @@ jobs: unit-tests-cudf-pandas: needs: [wheel-build-cudf, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.04 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python || fromJSON(needs.changed-files.outputs.changed_file_groups).test_cudf_pandas with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
@@ -322,7 +322,7 @@ jobs: # run the Pandas unit tests using PR branch needs: [wheel-build-cudf, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.04 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python || fromJSON(needs.changed-files.outputs.changed_file_groups).test_cudf_pandas with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". @@ -334,7 +334,7 @@ jobs: pandas-tests-diff: # diff the results of running the Pandas unit tests and publish a job summary needs: pandas-tests - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.02 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.04 with: node_type: cpu4 build_type: pull-request diff --git a/.github/workflows/pr_issue_status_automation.yml b/.github/workflows/pr_issue_status_automation.yml index 20db9623e1b..44e48f691a2 100644 --- a/.github/workflows/pr_issue_status_automation.yml +++ b/.github/workflows/pr_issue_status_automation.yml @@ -23,7 +23,7 @@ on: jobs: get-project-id: - uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@branch-25.02 + uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@branch-25.04 if: github.event.pull_request.state == 'open' secrets: inherit permissions: @@ -34,7 +34,7 @@ jobs: update-status: # This job sets the PR and its linked issues to "In Progress" status - uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-25.02 + uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-25.04 if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} needs: get-project-id with: @@ -50,7 +50,7 @@ jobs: update-sprint: # This job sets the PR and its linked issues to the current "Weekly Sprint" - uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@branch-25.02 + uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@branch-25.04 if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} needs: get-project-id with: @@ -79,7 +79,7 @@ jobs: update-release: # This job sets the PR and its linked issues to the release they are targeting - uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-25.02 + uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-25.04 if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} needs: [get-project-id, process-branch-name] with: diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index dc82c17022a..233d15dd145 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-25.02 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-25.04 with: build_type: nightly branch: ${{ inputs.branch }} @@ -25,7 +25,7 @@ jobs: enable_check_symbols: true conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-25.02 + uses: 
rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-25.04 with: build_type: nightly branch: ${{ inputs.branch }} @@ -33,7 +33,7 @@ jobs: sha: ${{ inputs.sha }} conda-cpp-memcheck-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.02 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.04 with: build_type: nightly branch: ${{ inputs.branch }} @@ -45,7 +45,7 @@ jobs: run_script: "ci/test_cpp_memcheck.sh" static-configure: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.02 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.04 with: build_type: nightly branch: ${{ inputs.branch }} @@ -57,7 +57,7 @@ jobs: run_script: "ci/configure_cpp_static.sh" cpp-linters: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.02 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.04 with: build_type: nightly branch: ${{ inputs.branch }} @@ -67,7 +67,7 @@ jobs: file_to_upload: iwyu_results.txt conda-python-cudf-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.02 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.04 with: build_type: nightly branch: ${{ inputs.branch }} @@ -77,7 +77,7 @@ jobs: conda-python-other-tests: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.02 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.04 with: build_type: nightly branch: ${{ inputs.branch }} @@ -86,7 +86,7 @@ jobs: script: "ci/test_python_other.sh" conda-java-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.02 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.04 with: build_type: nightly branch: ${{ inputs.branch }} @@ -98,7 +98,7 @@ jobs: run_script: "ci/test_java.sh" conda-notebook-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.02 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.04 with: build_type: nightly branch: ${{ inputs.branch }} @@ -110,7 +110,7 @@ jobs: run_script: "ci/test_notebooks.sh" wheel-tests-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.04 with: build_type: nightly branch: ${{ inputs.branch }} @@ -119,7 +119,7 @@ jobs: script: ci/test_wheel_cudf.sh wheel-tests-dask-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.04 with: build_type: nightly branch: ${{ inputs.branch }} @@ -128,7 +128,7 @@ jobs: script: ci/test_wheel_dask_cudf.sh unit-tests-cudf-pandas: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.04 with: build_type: nightly branch: ${{ inputs.branch }} @@ -137,7 +137,7 @@ jobs: script: ci/cudf_pandas_scripts/run_tests.sh third-party-integration-tests-cudf-pandas: secrets: inherit - uses: 
rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.02 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.04 with: build_type: nightly branch: ${{ inputs.branch }} @@ -149,7 +149,7 @@ jobs: ci/cudf_pandas_scripts/third-party-integration/test.sh python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml wheel-tests-cudf-polars: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.04 with: build_type: nightly branch: ${{ inputs.branch }} @@ -158,7 +158,7 @@ jobs: script: "ci/test_wheel_cudf_polars.sh" cudf-polars-polars-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.04 with: build_type: nightly branch: ${{ inputs.branch }} diff --git a/.github/workflows/trigger-breaking-change-alert.yaml b/.github/workflows/trigger-breaking-change-alert.yaml index 01dd2436beb..9764c62c15c 100644 --- a/.github/workflows/trigger-breaking-change-alert.yaml +++ b/.github/workflows/trigger-breaking-change-alert.yaml @@ -12,7 +12,7 @@ jobs: trigger-notifier: if: contains(github.event.pull_request.labels.*.name, 'breaking') secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@branch-25.02 + uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@branch-25.04 with: sender_login: ${{ github.event.sender.login }} sender_avatar: ${{ github.event.sender.avatar_url }} diff --git a/README.md b/README.md index b83d2140a33..20b1d64a5e0 100644 --- a/README.md +++ b/README.md @@ -83,7 +83,7 @@ cuDF can be installed with conda (via [miniforge](https://github.com/conda-forge ```bash conda install -c rapidsai -c conda-forge -c nvidia \ - cudf=25.02 python=3.12 cuda-version=12.5 + cudf=25.04 python=3.12 cuda-version=12.5 ``` We also provide [nightly Conda packages](https://anaconda.org/rapidsai-nightly) built from the HEAD diff --git a/VERSION b/VERSION index 72eefaf7c79..b922658ff3f 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -25.02.00 +25.04.00 diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 60d8e96d932..0e4c190c885 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -26,7 +26,7 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.3 -- dask-cuda==25.2.*,>=0.0.0a0 +- dask-cuda==25.4.*,>=0.0.0a0 - dlpack>=0.8,<1.0 - doxygen=1.9.1 - fastavro>=0.22.9 @@ -42,9 +42,9 @@ dependencies: - libcufile=1.4.0.31 - libcurand-dev=10.3.0.86 - libcurand=10.3.0.86 -- libkvikio==25.2.*,>=0.0.0a0 +- libkvikio==25.4.*,>=0.0.0a0 - librdkafka>=2.5.0,<2.6.0a0 -- librmm==25.2.*,>=0.0.0a0 +- librmm==25.4.*,>=0.0.0a0 - make - mmh3 - moto>=4.0.8 @@ -82,9 +82,9 @@ dependencies: - python-xxhash - python>=3.10,<3.13 - rapids-build-backend>=0.3.0,<0.4.0.dev0 -- rapids-dask-dependency==25.2.*,>=0.0.0a0 +- rapids-dask-dependency==25.4.*,>=0.0.0a0 - rich -- rmm==25.2.*,>=0.0.0a0 +- rmm==25.4.*,>=0.0.0a0 - s3fs>=2022.3.0 - scikit-build-core>=0.10.0 - scipy diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index fe1a32ccb87..770ac4a80c8 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -27,7 +27,7 @@ 
dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.3 -- dask-cuda==25.2.*,>=0.0.0a0 +- dask-cuda==25.4.*,>=0.0.0a0 - dlpack>=0.8,<1.0 - doxygen=1.9.1 - fastavro>=0.22.9 @@ -41,9 +41,9 @@ dependencies: - jupyter_client - libcufile-dev - libcurand-dev -- libkvikio==25.2.*,>=0.0.0a0 +- libkvikio==25.4.*,>=0.0.0a0 - librdkafka>=2.5.0,<2.6.0a0 -- librmm==25.2.*,>=0.0.0a0 +- librmm==25.4.*,>=0.0.0a0 - make - mmh3 - moto>=4.0.8 @@ -81,9 +81,9 @@ dependencies: - python>=3.10,<3.13 - pytorch>=2.4.0 - rapids-build-backend>=0.3.0,<0.4.0.dev0 -- rapids-dask-dependency==25.2.*,>=0.0.0a0 +- rapids-dask-dependency==25.4.*,>=0.0.0a0 - rich -- rmm==25.2.*,>=0.0.0a0 +- rmm==25.4.*,>=0.0.0a0 - s3fs>=2022.3.0 - scikit-build-core>=0.10.0 - scipy diff --git a/cpp/examples/versions.cmake b/cpp/examples/versions.cmake index 13e0cf81625..c6c07dbc150 100644 --- a/cpp/examples/versions.cmake +++ b/cpp/examples/versions.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -12,4 +12,4 @@ # the License. # ============================================================================= -set(CUDF_TAG branch-25.02) +set(CUDF_TAG branch-25.04) diff --git a/dependencies.yaml b/dependencies.yaml index edd83e6e07d..c3122671e94 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -634,7 +634,7 @@ dependencies: - output_types: [conda] packages: - breathe>=4.35.0 - - dask-cuda==25.2.*,>=0.0.0a0 + - dask-cuda==25.4.*,>=0.0.0a0 - *doxygen - make - myst-nb @@ -782,13 +782,13 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - rapids-dask-dependency==25.2.*,>=0.0.0a0 + - rapids-dask-dependency==25.4.*,>=0.0.0a0 run_dask_cudf: common: - output_types: [conda, requirements, pyproject] packages: - pynvml>=12.0.0,<13.0.0a0 - - rapids-dask-dependency==25.2.*,>=0.0.0a0 + - rapids-dask-dependency==25.4.*,>=0.0.0a0 run_custreamz: common: - output_types: conda @@ -926,7 +926,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - dask-cuda==25.2.*,>=0.0.0a0 + - dask-cuda==25.4.*,>=0.0.0a0 - *numba-cuda-dep - *numba-dep specific: @@ -947,7 +947,7 @@ dependencies: common: - output_types: conda packages: - - &libcudf_unsuffixed libcudf==25.2.*,>=0.0.0a0 + - &libcudf_unsuffixed libcudf==25.4.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -961,18 +961,18 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - libcudf-cu12==25.2.*,>=0.0.0a0 + - libcudf-cu12==25.4.*,>=0.0.0a0 - matrix: cuda: "11.*" cuda_suffixed: "true" packages: - - libcudf-cu11==25.2.*,>=0.0.0a0 + - libcudf-cu11==25.4.*,>=0.0.0a0 - {matrix: null, packages: [*libcudf_unsuffixed]} depends_on_pylibcudf: common: - output_types: conda packages: - - &pylibcudf_unsuffixed pylibcudf==25.2.*,>=0.0.0a0 + - &pylibcudf_unsuffixed pylibcudf==25.4.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -986,18 +986,18 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - pylibcudf-cu12==25.2.*,>=0.0.0a0 + - pylibcudf-cu12==25.4.*,>=0.0.0a0 - matrix: cuda: "11.*" cuda_suffixed: "true" packages: - - pylibcudf-cu11==25.2.*,>=0.0.0a0 
+ - pylibcudf-cu11==25.4.*,>=0.0.0a0 - {matrix: null, packages: [*pylibcudf_unsuffixed]} depends_on_cudf: common: - output_types: conda packages: - - &cudf_unsuffixed cudf==25.2.*,>=0.0.0a0 + - &cudf_unsuffixed cudf==25.4.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -1011,18 +1011,18 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - cudf-cu12==25.2.*,>=0.0.0a0 + - cudf-cu12==25.4.*,>=0.0.0a0 - matrix: cuda: "11.*" cuda_suffixed: "true" packages: - - cudf-cu11==25.2.*,>=0.0.0a0 + - cudf-cu11==25.4.*,>=0.0.0a0 - {matrix: null, packages: [*cudf_unsuffixed]} depends_on_cudf_kafka: common: - output_types: conda packages: - - &cudf_kafka_unsuffixed cudf_kafka==25.2.*,>=0.0.0a0 + - &cudf_kafka_unsuffixed cudf_kafka==25.4.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -1036,12 +1036,12 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - cudf_kafka-cu12==25.2.*,>=0.0.0a0 + - cudf_kafka-cu12==25.4.*,>=0.0.0a0 - matrix: cuda: "11.*" cuda_suffixed: "true" packages: - - cudf_kafka-cu11==25.2.*,>=0.0.0a0 + - cudf_kafka-cu11==25.4.*,>=0.0.0a0 - {matrix: null, packages: [*cudf_kafka_unsuffixed]} depends_on_cupy: common: @@ -1062,7 +1062,7 @@ dependencies: common: - output_types: conda packages: - - &libkvikio_unsuffixed libkvikio==25.2.*,>=0.0.0a0 + - &libkvikio_unsuffixed libkvikio==25.4.*,>=0.0.0a0 - output_types: requirements packages: - --extra-index-url=https://pypi.nvidia.com @@ -1074,12 +1074,12 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - libkvikio-cu12==25.2.*,>=0.0.0a0 + - libkvikio-cu12==25.4.*,>=0.0.0a0 - matrix: cuda: "11.*" cuda_suffixed: "true" packages: - - libkvikio-cu11==25.2.*,>=0.0.0a0 + - libkvikio-cu11==25.4.*,>=0.0.0a0 - matrix: packages: - *libkvikio_unsuffixed @@ -1087,7 +1087,7 @@ dependencies: common: - output_types: conda packages: - - &librmm_unsuffixed librmm==25.2.*,>=0.0.0a0 + - &librmm_unsuffixed librmm==25.4.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -1101,12 +1101,12 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - librmm-cu12==25.2.*,>=0.0.0a0 + - librmm-cu12==25.4.*,>=0.0.0a0 - matrix: cuda: "11.*" cuda_suffixed: "true" packages: - - librmm-cu11==25.2.*,>=0.0.0a0 + - librmm-cu11==25.4.*,>=0.0.0a0 - matrix: packages: - *librmm_unsuffixed @@ -1114,7 +1114,7 @@ dependencies: common: - output_types: conda packages: - - &rmm_unsuffixed rmm==25.2.*,>=0.0.0a0 + - &rmm_unsuffixed rmm==25.4.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -1128,12 +1128,12 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - rmm-cu12==25.2.*,>=0.0.0a0 + - rmm-cu12==25.4.*,>=0.0.0a0 - matrix: cuda: "11.*" cuda_suffixed: "true" packages: - - rmm-cu11==25.2.*,>=0.0.0a0 + - rmm-cu11==25.4.*,>=0.0.0a0 - matrix: packages: - *rmm_unsuffixed diff --git a/java/ci/README.md b/java/ci/README.md index bfb35bc1d23..cc8ab77bf6c 100644 --- a/java/ci/README.md +++ b/java/ci/README.md @@ -34,7 +34,7 @@ nvidia-docker run -it cudf-build:11.8.0-devel-rocky8 bash You can download the cuDF repo in the docker container or you can mount it into the container. Here I choose to download again in the container. 
```bash -git clone --recursive https://github.com/rapidsai/cudf.git -b branch-25.02 +git clone --recursive https://github.com/rapidsai/cudf.git -b branch-25.04 ``` ### Build cuDF jar with devtoolset @@ -47,4 +47,4 @@ scl enable gcc-toolset-11 "java/ci/build-in-docker.sh" ### The output -You can find the cuDF jar in java/target/ like cudf-25.02.0-SNAPSHOT-cuda11.jar. +You can find the cuDF jar in java/target/ like cudf-25.04.0-SNAPSHOT-cuda11.jar. diff --git a/java/pom.xml b/java/pom.xml index 8bbeac20c99..1f80381dd7e 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -21,7 +21,7 @@ ai.rapids cudf - 25.02.0-SNAPSHOT + 25.04.0-SNAPSHOT cudfjni diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml b/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml index 3891110e9d3..56f0586f89a 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml @@ -182,7 +182,7 @@ dependencies: common: - output_types: conda packages: - - cudf==25.2.*,>=0.0.0a0 + - cudf==25.4.*,>=0.0.0a0 - pandas - pytest - pytest-xdist @@ -248,13 +248,13 @@ dependencies: common: - output_types: conda packages: - - cuml==25.2.*,>=0.0.0a0 + - cuml==25.4.*,>=0.0.0a0 - scikit-learn test_cugraph: common: - output_types: conda packages: - - cugraph==25.2.*,>=0.0.0a0 + - cugraph==25.4.*,>=0.0.0a0 - networkx test_ibis: common: diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 2b03f515657..478581138e4 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -23,7 +23,7 @@ dependencies = [ "cuda-python>=11.8.5,<12.0a0", "cupy-cuda11x>=12.0.0", "fsspec>=0.6.0", - "libcudf==25.2.*,>=0.0.0a0", + "libcudf==25.4.*,>=0.0.0a0", "numba-cuda>=0.2.0,<0.3.0a0", "numba>=0.59.1,<0.61.0a0", "numpy>=1.23,<3.0a0", @@ -33,9 +33,9 @@ dependencies = [ "ptxcompiler", "pyarrow>=14.0.0,<19.0.0a0,!=17.0.0; platform_machine=='aarch64'", "pyarrow>=14.0.0,<19.0.0a0; platform_machine=='x86_64'", - "pylibcudf==25.2.*,>=0.0.0a0", + "pylibcudf==25.4.*,>=0.0.0a0", "rich", - "rmm==25.2.*,>=0.0.0a0", + "rmm==25.4.*,>=0.0.0a0", "typing_extensions>=4.0.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ @@ -120,11 +120,11 @@ matrix-entry = "cuda_suffixed=true" requires = [ "cmake>=3.26.4,!=3.30.0", "cython>=3.0.3", - "libcudf==25.2.*,>=0.0.0a0", - "librmm==25.2.*,>=0.0.0a0", + "libcudf==25.4.*,>=0.0.0a0", + "librmm==25.4.*,>=0.0.0a0", "ninja", - "pylibcudf==25.2.*,>=0.0.0a0", - "rmm==25.2.*,>=0.0.0a0", + "pylibcudf==25.4.*,>=0.0.0a0", + "rmm==25.4.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [tool.scikit-build] diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index a9d937435e9..a1f15574d2d 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. [build-system] build-backend = "rapids_build_backend.build" @@ -18,7 +18,7 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.10" dependencies = [ - "cudf==25.2.*,>=0.0.0a0", + "cudf==25.4.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. 
To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project.optional-dependencies] diff --git a/python/cudf_polars/docs/overview.md b/python/cudf_polars/docs/overview.md index a8cad5622fb..be48d500a36 100644 --- a/python/cudf_polars/docs/overview.md +++ b/python/cudf_polars/docs/overview.md @@ -8,7 +8,7 @@ You will need: preferred configuration. Or else, use [rustup](https://www.rust-lang.org/tools/install) 2. A [cudf development - environment](https://github.com/rapidsai/cudf/blob/branch-25.02/CONTRIBUTING.md#setting-up-your-build-environment). + environment](https://github.com/rapidsai/cudf/blob/branch-25.04/CONTRIBUTING.md#setting-up-your-build-environment). The combined devcontainer works, or whatever your favourite approach is. :::{note} diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index 9fb9bbf391e..290c3cff92f 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -20,7 +20,7 @@ license = { text = "Apache 2.0" } requires-python = ">=3.10" dependencies = [ "polars>=1.11,<1.18", - "pylibcudf==25.2.*,>=0.0.0a0", + "pylibcudf==25.4.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ "Intended Audience :: Developers", @@ -41,7 +41,7 @@ test = [ "pytest<8", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. experimental = [ - "rapids-dask-dependency==25.2.*,>=0.0.0a0", + "rapids-dask-dependency==25.4.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project.urls] diff --git a/python/custreamz/pyproject.toml b/python/custreamz/pyproject.toml index 7820157d89b..665b0a76ecf 100644 --- a/python/custreamz/pyproject.toml +++ b/python/custreamz/pyproject.toml @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. [build-system] build-backend = "rapids_build_backend.build" @@ -20,8 +20,8 @@ license = { text = "Apache 2.0" } requires-python = ">=3.10" dependencies = [ "confluent-kafka>=2.5.0,<2.6.0a0", - "cudf==25.2.*,>=0.0.0a0", - "cudf_kafka==25.2.*,>=0.0.0a0", + "cudf==25.4.*,>=0.0.0a0", + "cudf_kafka==25.4.*,>=0.0.0a0", "streamz", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index 3725722a8ae..87bf282f376 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -19,13 +19,13 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.10" dependencies = [ - "cudf==25.2.*,>=0.0.0a0", + "cudf==25.4.*,>=0.0.0a0", "cupy-cuda11x>=12.0.0", "fsspec>=0.6.0", "numpy>=1.23,<3.0a0", "pandas>=2.0,<2.2.4dev0", "pynvml>=12.0.0,<13.0.0a0", - "rapids-dask-dependency==25.2.*,>=0.0.0a0", + "rapids-dask-dependency==25.4.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
classifiers = [ "Intended Audience :: Developers", @@ -46,7 +46,7 @@ cudf = "dask_cudf.backends:CudfBackendEntrypoint" [project.optional-dependencies] test = [ - "dask-cuda==25.2.*,>=0.0.0a0", + "dask-cuda==25.4.*,>=0.0.0a0", "numba-cuda>=0.2.0,<0.3.0a0", "numba>=0.59.1,<0.61.0a0", "pytest-cov", diff --git a/python/libcudf/pyproject.toml b/python/libcudf/pyproject.toml index 9fe930d754c..9c54864ccc8 100644 --- a/python/libcudf/pyproject.toml +++ b/python/libcudf/pyproject.toml @@ -38,7 +38,7 @@ classifiers = [ "Environment :: GPU :: NVIDIA CUDA", ] dependencies = [ - "libkvikio==25.2.*,>=0.0.0a0", + "libkvikio==25.4.*,>=0.0.0a0", "nvidia-nvcomp==4.1.0.6", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. @@ -78,7 +78,7 @@ dependencies-file = "../../dependencies.yaml" matrix-entry = "cuda_suffixed=true;use_cuda_wheels=true" requires = [ "cmake>=3.26.4,!=3.30.0", - "libkvikio==25.2.*,>=0.0.0a0", - "librmm==25.2.*,>=0.0.0a0", + "libkvikio==25.4.*,>=0.0.0a0", + "librmm==25.4.*,>=0.0.0a0", "ninja", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml index e0055d5ebf8..1bfda4ce74d 100644 --- a/python/pylibcudf/pyproject.toml +++ b/python/pylibcudf/pyproject.toml @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. [build-system] build-backend = "rapids_build_backend.build" @@ -19,12 +19,12 @@ license = { text = "Apache 2.0" } requires-python = ">=3.10" dependencies = [ "cuda-python>=11.8.5,<12.0a0", - "libcudf==25.2.*,>=0.0.0a0", + "libcudf==25.4.*,>=0.0.0a0", "nvtx>=0.2.1", "packaging", "pyarrow>=14.0.0,<19.0.0a0,!=17.0.0; platform_machine=='aarch64'", "pyarrow>=14.0.0,<19.0.0a0; platform_machine=='x86_64'", - "rmm==25.2.*,>=0.0.0a0", + "rmm==25.4.*,>=0.0.0a0", "typing_extensions>=4.0.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ @@ -111,10 +111,10 @@ matrix-entry = "cuda_suffixed=true" requires = [ "cmake>=3.26.4,!=3.30.0", "cython>=3.0.3", - "libcudf==25.2.*,>=0.0.0a0", - "librmm==25.2.*,>=0.0.0a0", + "libcudf==25.4.*,>=0.0.0a0", + "librmm==25.4.*,>=0.0.0a0", "ninja", - "rmm==25.2.*,>=0.0.0a0", + "rmm==25.4.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [tool.scikit-build] From a6f90f0737d6306a364671dee59c05a2cf3d33b4 Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Wed, 29 Jan 2025 08:35:23 -0600 Subject: [PATCH 02/17] Add multi-partition `Shuffle` operation to cuDF Polars (#17744) This PR pulls out the `Shuffle` logic from https://github.com/rapidsai/cudf/pull/17518 to simplify the review process. The goal is to establish the shuffle groundwork for multi-partition `Join` and `Sort` operations. 
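A minimal usage sketch, distilled from the `test_shuffle.py` added in this PR — the `Shuffle` node is constructed by hand here purely for illustration; in a real query it would be inserted during lowering:

```python
# Sketch based on the new tests: build a query IR, wrap it in the new
# Shuffle node, and evaluate it with the multi-partition Dask executor.
import polars as pl

from cudf_polars import Translator
from cudf_polars.dsl.expr import Col, NamedExpr
from cudf_polars.experimental.parallel import evaluate_dask
from cudf_polars.experimental.shuffle import Shuffle

engine = pl.GPUEngine(
    raise_on_fail=True,
    executor="dask-experimental",
    executor_options={"max_rows_per_partition": 4},
)
df = pl.LazyFrame({"x": [1, 2, 3, 4, 5, 6, 7]})

# Translate the polars logical plan into the cudf-polars IR.
qir = Translator(df._ldf.visit(), engine).translate_ir()

# Hash-partition on column "x"; the empty dict uses default shuffle options.
keys = (NamedExpr("x", Col(qir.schema["x"], "x")),)
shuffled = Shuffle(qir.schema, keys, {}, qir)

# Evaluate the task graph and collect back to a polars DataFrame.
result = evaluate_dask(shuffled).to_polars()
```

Under the hood, `_partition_dataframe` assigns each row to an output partition by murmurhash-ing the key columns modulo the output partition count, and `lower_ir_node` collapses consecutive shuffles on identical keys into a single node.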
Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/17744 --- .../cudf_polars/experimental/base.py | 26 ++- .../cudf_polars/experimental/parallel.py | 5 +- .../cudf_polars/experimental/shuffle.py | 204 ++++++++++++++++++ .../tests/experimental/test_shuffle.py | 66 ++++++ 4 files changed, 289 insertions(+), 12 deletions(-) create mode 100644 python/cudf_polars/cudf_polars/experimental/shuffle.py create mode 100644 python/cudf_polars/tests/experimental/test_shuffle.py diff --git a/python/cudf_polars/cudf_polars/experimental/base.py b/python/cudf_polars/cudf_polars/experimental/base.py index 8f660632df2..36c7745c3f4 100644 --- a/python/cudf_polars/cudf_polars/experimental/base.py +++ b/python/cudf_polars/cudf_polars/experimental/base.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. # SPDX-License-Identifier: Apache-2.0 """Multi-partition base classes.""" @@ -12,20 +12,26 @@ from collections.abc import Iterator, Sequence from cudf_polars.containers import DataFrame + from cudf_polars.dsl.expr import NamedExpr from cudf_polars.dsl.nodebase import Node class PartitionInfo: - """ - Partitioning information. - - This class only tracks the partition count (for now). - """ - - __slots__ = ("count",) - - def __init__(self, count: int): + """Partitioning information.""" + + __slots__ = ("count", "partitioned_on") + count: int + """Partition count.""" + partitioned_on: tuple[NamedExpr, ...] + """Columns the data is hash-partitioned on.""" + + def __init__( + self, + count: int, + partitioned_on: tuple[NamedExpr, ...] = (), + ): self.count = count + self.partitioned_on = partitioned_on def keys(self, node: Node) -> Iterator[tuple[str, int]]: """Return the partitioned keys for a given node.""" diff --git a/python/cudf_polars/cudf_polars/experimental/parallel.py b/python/cudf_polars/cudf_polars/experimental/parallel.py index 6843ed9ee2e..5a5eaab8b2f 100644 --- a/python/cudf_polars/cudf_polars/experimental/parallel.py +++ b/python/cudf_polars/cudf_polars/experimental/parallel.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. # SPDX-License-Identifier: Apache-2.0 """Multi-partition Dask execution.""" @@ -10,7 +10,8 @@ from typing import TYPE_CHECKING, Any import cudf_polars.experimental.io -import cudf_polars.experimental.select # noqa: F401 +import cudf_polars.experimental.select +import cudf_polars.experimental.shuffle # noqa: F401 from cudf_polars.dsl.ir import IR, Cache, Filter, HStack, Projection, Select, Union from cudf_polars.dsl.traversal import CachingVisitor, traversal from cudf_polars.experimental.base import PartitionInfo, _concat, get_key_name diff --git a/python/cudf_polars/cudf_polars/experimental/shuffle.py b/python/cudf_polars/cudf_polars/experimental/shuffle.py new file mode 100644 index 00000000000..d49f13375ed --- /dev/null +++ b/python/cudf_polars/cudf_polars/experimental/shuffle.py @@ -0,0 +1,204 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 +"""Shuffle Logic.""" + +from __future__ import annotations + +import json +import operator +from typing import TYPE_CHECKING, Any + +import pyarrow as pa + +import pylibcudf as plc + +from cudf_polars.containers import DataFrame +from cudf_polars.dsl.ir import IR +from cudf_polars.experimental.base import _concat, get_key_name +from cudf_polars.experimental.dispatch import generate_ir_tasks, lower_ir_node + +if TYPE_CHECKING: + from collections.abc import Hashable, MutableMapping + + from cudf_polars.dsl.expr import NamedExpr + from cudf_polars.experimental.dispatch import LowerIRTransformer + from cudf_polars.experimental.parallel import PartitionInfo + from cudf_polars.typing import Schema + + +class Shuffle(IR): + """ + Shuffle multi-partition data. + + Notes + ----- + Only hash-based partitioning is supported (for now). + """ + + __slots__ = ("keys", "options") + _non_child = ("schema", "keys", "options") + keys: tuple[NamedExpr, ...] + """Keys to shuffle on.""" + options: dict[str, Any] + """Shuffling options.""" + + def __init__( + self, + schema: Schema, + keys: tuple[NamedExpr, ...], + options: dict[str, Any], + df: IR, + ): + self.schema = schema + self.keys = keys + self.options = options + self._non_child_args = (schema, keys, options) + self.children = (df,) + + def get_hashable(self) -> Hashable: + """Hashable representation of the node.""" + return ( + type(self), + tuple(self.schema.items()), + self.keys, + json.dumps(self.options), + self.children, + ) + + @classmethod + def do_evaluate( + cls, + schema: Schema, + keys: tuple[NamedExpr, ...], + options: dict[str, Any], + df: DataFrame, + ): # pragma: no cover + """Evaluate and return a dataframe.""" + # Single-partition Shuffle evaluation is a no-op + return df + + +def _partition_dataframe( + df: DataFrame, + keys: tuple[NamedExpr, ...], + count: int, +) -> dict[int, DataFrame]: + """ + Partition an input DataFrame for shuffling. + + Notes + ----- + This utility only supports hash partitioning (for now). + + Parameters + ---------- + df + DataFrame to partition. + keys + Shuffle key(s). + count + Total number of output partitions. + + Returns + ------- + A dictionary mapping between int partition indices and + DataFrame fragments. 
+ """ + # Hash the specified keys to calculate the output + # partition for each row + partition_map = plc.binaryop.binary_operation( + plc.hashing.murmurhash3_x86_32( + DataFrame([expr.evaluate(df) for expr in keys]).table + ), + plc.interop.from_arrow(pa.scalar(count, type="uint32")), + plc.binaryop.BinaryOperator.PYMOD, + plc.types.DataType(plc.types.TypeId.UINT32), + ) + + # Apply partitioning + t, offsets = plc.partitioning.partition( + df.table, + partition_map, + count, + ) + + # Split and return the partitioned result + return { + i: DataFrame.from_table( + split, + df.column_names, + ) + for i, split in enumerate(plc.copying.split(t, offsets[1:-1])) + } + + +def _simple_shuffle_graph( + name_out: str, + name_in: str, + keys: tuple[NamedExpr, ...], + count_in: int, + count_out: int, +) -> MutableMapping[Any, Any]: + """Make a simple all-to-all shuffle graph.""" + split_name = f"split-{name_out}" + inter_name = f"inter-{name_out}" + + graph: MutableMapping[Any, Any] = {} + for part_out in range(count_out): + _concat_list = [] + for part_in in range(count_in): + graph[(split_name, part_in)] = ( + _partition_dataframe, + (name_in, part_in), + keys, + count_out, + ) + _concat_list.append((inter_name, part_out, part_in)) + graph[_concat_list[-1]] = ( + operator.getitem, + (split_name, part_in), + part_out, + ) + graph[(name_out, part_out)] = (_concat, _concat_list) + return graph + + +@lower_ir_node.register(Shuffle) +def _( + ir: Shuffle, rec: LowerIRTransformer +) -> tuple[IR, MutableMapping[IR, PartitionInfo]]: + # Simple lower_ir_node handling for the default hash-based shuffle. + # More-complex logic (e.g. joining and sorting) should + # be handled separately. + from cudf_polars.experimental.parallel import PartitionInfo + + (child,) = ir.children + + new_child, pi = rec(child) + if pi[new_child].count == 1 or ir.keys == pi[new_child].partitioned_on: + # Already shuffled + return new_child, pi + new_node = ir.reconstruct([new_child]) + pi[new_node] = PartitionInfo( + # Default shuffle preserves partition count + count=pi[new_child].count, + # Add partitioned_on info + partitioned_on=ir.keys, + ) + return new_node, pi + + +@generate_ir_tasks.register(Shuffle) +def _( + ir: Shuffle, partition_info: MutableMapping[IR, PartitionInfo] +) -> MutableMapping[Any, Any]: + # Use a simple all-to-all shuffle graph. + + # TODO: Optionally use rapidsmp. + return _simple_shuffle_graph( + get_key_name(ir), + get_key_name(ir.children[0]), + ir.keys, + partition_info[ir.children[0]].count, + partition_info[ir].count, + ) diff --git a/python/cudf_polars/tests/experimental/test_shuffle.py b/python/cudf_polars/tests/experimental/test_shuffle.py new file mode 100644 index 00000000000..294557fd0d6 --- /dev/null +++ b/python/cudf_polars/tests/experimental/test_shuffle.py @@ -0,0 +1,66 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import pytest + +import polars as pl +from polars.testing import assert_frame_equal + +from cudf_polars import Translator +from cudf_polars.dsl.expr import Col, NamedExpr +from cudf_polars.experimental.parallel import evaluate_dask, lower_ir_graph +from cudf_polars.experimental.shuffle import Shuffle + + +@pytest.fixture(scope="module") +def engine(): + return pl.GPUEngine( + raise_on_fail=True, + executor="dask-experimental", + executor_options={"max_rows_per_partition": 4}, + ) + + +@pytest.fixture(scope="module") +def df(): + return pl.LazyFrame( + { + "x": [1, 2, 3, 4, 5, 6, 7], + "y": [1, 1, 1, 1, 1, 1, 1], + "z": ["a", "b", "c", "d", "e", "f", "g"], + } + ) + + +def test_hash_shuffle(df, engine): + # Extract translated IR + qir = Translator(df._ldf.visit(), engine).translate_ir() + + # Add first Shuffle node + keys = (NamedExpr("x", Col(qir.schema["x"], "x")),) + options = {} + qir1 = Shuffle(qir.schema, keys, options, qir) + + # Add second Shuffle node (on the same keys) + qir2 = Shuffle(qir.schema, keys, options, qir1) + + # Check that sequential shuffles on the same keys + # are replaced with a single shuffle node + partition_info = lower_ir_graph(qir2)[1] + assert len([node for node in partition_info if isinstance(node, Shuffle)]) == 1 + + # Add second Shuffle node (on different keys) + keys2 = (NamedExpr("z", Col(qir.schema["z"], "z")),) + qir3 = Shuffle(qir2.schema, keys2, options, qir2) + + # Check that we have an additional shuffle + # node after shuffling on different keys + partition_info = lower_ir_graph(qir3)[1] + assert len([node for node in partition_info if isinstance(node, Shuffle)]) == 2 + + # Check that Dask evaluation works + result = evaluate_dask(qir3).to_polars() + expect = df.collect(engine="cpu") + assert_frame_equal(result, expect, check_row_order=False) From 0cf72b6604a2c1a0e05148b4d32fc3c6d83f9b63 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 30 Jan 2025 08:43:55 -0500 Subject: [PATCH 03/17] fix style check violations --- .devcontainer/cuda12.8-conda/devcontainer.json | 2 +- .devcontainer/cuda12.8-pip/devcontainer.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.devcontainer/cuda12.8-conda/devcontainer.json b/.devcontainer/cuda12.8-conda/devcontainer.json index b8f2b4ca77b..3977a1c5f86 100644 --- a/.devcontainer/cuda12.8-conda/devcontainer.json +++ b/.devcontainer/cuda12.8-conda/devcontainer.json @@ -71,4 +71,4 @@ ] } } -} \ No newline at end of file +} diff --git a/.devcontainer/cuda12.8-pip/devcontainer.json b/.devcontainer/cuda12.8-pip/devcontainer.json index 9c3678e9b44..ab83fa88300 100644 --- a/.devcontainer/cuda12.8-pip/devcontainer.json +++ b/.devcontainer/cuda12.8-pip/devcontainer.json @@ -48,4 +48,4 @@ ] } } -} \ No newline at end of file +} From ca06c398b3d60f30a2416b05efb31354f9437dc5 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Thu, 30 Jan 2025 11:49:47 -0500 Subject: [PATCH 04/17] [DOC] Make pylibcudf docs more visible (#17803) Closes https://github.com/rapidsai/cudf/issues/17190 Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/17803 --- docs/cudf/source/developer_guide/index.md | 1 - .../source/developer_guide/library_design.md | 9 +++++ docs/cudf/source/index.rst | 1 + .../api_docs}/aggregation.rst | 0 .../api_docs}/binaryop.rst | 0 .../api_docs}/column.rst | 0 
.../api_docs}/column_factories.rst | 0 .../api_docs}/concatenate.rst | 0 .../api_docs}/copying.rst | 0 .../api_docs}/datetime.rst | 0 .../api_docs}/expressions.rst | 0 .../api_docs}/filling.rst | 0 .../api_docs}/gpumemoryview.rst | 0 .../api_docs}/groupby.rst | 0 .../api_docs}/hashing.rst | 0 .../api_docs}/index.rst | 6 +-- .../api_docs}/interop.rst | 0 .../api_docs}/io/avro.rst | 0 .../api_docs}/io/csv.rst | 0 .../api_docs}/io/index.rst | 0 .../api_docs}/io/json.rst | 0 .../api_docs}/io/parquet.rst | 0 .../api_docs}/io/parquet_metadata.rst | 0 .../api_docs}/io/text.rst | 0 .../api_docs}/io/timezone.rst | 0 .../pylibcudf => pylibcudf/api_docs}/join.rst | 0 .../pylibcudf => pylibcudf/api_docs}/json.rst | 0 .../api_docs}/labeling.rst | 0 .../api_docs}/lists.rst | 0 .../api_docs}/merge.rst | 0 .../api_docs}/null_mask.rst | 0 .../api_docs}/nvtext/byte_pair_encode.rst | 0 .../api_docs}/nvtext/edit_distance.rst | 0 .../api_docs}/nvtext/generate_ngrams.rst | 0 .../api_docs}/nvtext/index.rst | 0 .../api_docs}/nvtext/jaccard.rst | 0 .../api_docs}/nvtext/minhash.rst | 0 .../api_docs}/nvtext/ngrams_tokenize.rst | 0 .../api_docs}/nvtext/normalize.rst | 0 .../api_docs}/nvtext/replace.rst | 0 .../api_docs}/nvtext/stemmer.rst | 0 .../api_docs}/nvtext/subword_tokenize.rst | 0 .../api_docs}/nvtext/tokenize.rst | 0 .../api_docs}/partitioning.rst | 0 .../api_docs}/quantiles.rst | 0 .../api_docs}/reduce.rst | 0 .../api_docs}/replace.rst | 0 .../api_docs}/reshape.rst | 0 .../api_docs}/rolling.rst | 0 .../api_docs}/round.rst | 0 .../api_docs}/scalar.rst | 0 .../api_docs}/search.rst | 0 .../api_docs}/sorting.rst | 0 .../api_docs}/stream_compaction.rst | 0 .../api_docs}/strings/capitalize.rst | 0 .../api_docs}/strings/char_types.rst | 0 .../api_docs}/strings/combine.rst | 0 .../api_docs}/strings/contains.rst | 0 .../strings/convert/convert_booleans.rst | 0 .../strings/convert/convert_datetime.rst | 0 .../strings/convert/convert_durations.rst | 0 .../strings/convert/convert_fixed_point.rst | 0 .../strings/convert/convert_floats.rst | 0 .../strings/convert/convert_integers.rst | 0 .../strings/convert/convert_ipv4.rst | 0 .../strings/convert/convert_lists.rst | 0 .../strings/convert/convert_urls.rst | 0 .../api_docs}/strings/convert/index.rst | 0 .../api_docs}/strings/extract.rst | 0 .../api_docs}/strings/find.rst | 0 .../api_docs}/strings/find_multiple.rst | 0 .../api_docs}/strings/findall.rst | 0 .../api_docs}/strings/index.rst | 0 .../api_docs}/strings/padding.rst | 0 .../api_docs}/strings/regex_flags.rst | 0 .../api_docs}/strings/regex_program.rst | 0 .../api_docs}/strings/repeat.rst | 0 .../api_docs}/strings/replace.rst | 0 .../api_docs}/strings/replace_re.rst | 0 .../api_docs}/strings/side_type.rst | 0 .../api_docs}/strings/slice.rst | 0 .../api_docs}/strings/split.rst | 0 .../api_docs}/strings/strip.rst | 0 .../api_docs}/strings/wrap.rst | 0 .../api_docs}/table.rst | 0 .../api_docs}/traits.rst | 0 .../api_docs}/transform.rst | 0 .../api_docs}/transpose.rst | 0 .../api_docs}/types.rst | 0 .../api_docs}/unary.rst | 0 .../developer_docs.md} | 2 +- docs/cudf/source/pylibcudf/index.rst | 38 +++++++++++++++++++ .../cudf/source/user_guide/api_docs/index.rst | 1 - 93 files changed, 52 insertions(+), 6 deletions(-) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/aggregation.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/binaryop.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/column.rst (100%) rename 
docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/column_factories.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/concatenate.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/copying.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/datetime.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/expressions.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/filling.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/gpumemoryview.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/groupby.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/hashing.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/index.rst (94%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/interop.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/io/avro.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/io/csv.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/io/index.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/io/json.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/io/parquet.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/io/parquet_metadata.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/io/text.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/io/timezone.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/join.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/json.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/labeling.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/lists.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/merge.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/null_mask.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/nvtext/byte_pair_encode.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/nvtext/edit_distance.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/nvtext/generate_ngrams.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/nvtext/index.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/nvtext/jaccard.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/nvtext/minhash.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/nvtext/ngrams_tokenize.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/nvtext/normalize.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/nvtext/replace.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/nvtext/stemmer.rst (100%) rename 
docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/nvtext/subword_tokenize.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/nvtext/tokenize.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/partitioning.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/quantiles.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/reduce.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/replace.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/reshape.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/rolling.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/round.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/scalar.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/search.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/sorting.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/stream_compaction.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/strings/capitalize.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/strings/char_types.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/strings/combine.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/strings/contains.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/strings/convert/convert_booleans.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/strings/convert/convert_datetime.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/strings/convert/convert_durations.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/strings/convert/convert_fixed_point.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/strings/convert/convert_floats.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/strings/convert/convert_integers.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/strings/convert/convert_ipv4.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/strings/convert/convert_lists.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/strings/convert/convert_urls.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/strings/convert/index.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/strings/extract.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/strings/find.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/strings/find_multiple.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/strings/findall.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/strings/index.rst (100%) rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/strings/padding.rst (100%) rename 
docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/strings/regex_flags.rst (100%)
 rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/strings/regex_program.rst (100%)
 rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/strings/repeat.rst (100%)
 rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/strings/replace.rst (100%)
 rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/strings/replace_re.rst (100%)
 rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/strings/side_type.rst (100%)
 rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/strings/slice.rst (100%)
 rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/strings/split.rst (100%)
 rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/strings/strip.rst (100%)
 rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/strings/wrap.rst (100%)
 rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/table.rst (100%)
 rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/traits.rst (100%)
 rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/transform.rst (100%)
 rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/transpose.rst (100%)
 rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/types.rst (100%)
 rename docs/cudf/source/{user_guide/api_docs/pylibcudf => pylibcudf/api_docs}/unary.rst (100%)
 rename docs/cudf/source/{developer_guide/pylibcudf.md => pylibcudf/developer_docs.md} (99%)
 create mode 100644 docs/cudf/source/pylibcudf/index.rst

diff --git a/docs/cudf/source/developer_guide/index.md b/docs/cudf/source/developer_guide/index.md
index 5e099631fc5..a1cc1c9d586 100644
--- a/docs/cudf/source/developer_guide/index.md
+++ b/docs/cudf/source/developer_guide/index.md
@@ -26,6 +26,5 @@ documentation
 testing
 benchmarking
 options
-pylibcudf
 cudf_pandas
 ```
diff --git a/docs/cudf/source/developer_guide/library_design.md b/docs/cudf/source/developer_guide/library_design.md
index b2c9ddf9fe4..9cccbbdc22a 100644
--- a/docs/cudf/source/developer_guide/library_design.md
+++ b/docs/cudf/source/developer_guide/library_design.md
@@ -1,5 +1,10 @@
 # Library Design
+```{note}
+This page is significantly outdated!
+It will be updated in 25.04 to reflect the current state of cuDF, which includes libcudf, pylibcudf, cudf classic, cudf.pandas, and cudf.polars.
+```
+
 
 At a high level, cuDF is structured in three layers, each of which serves a distinct purpose:
 
 1. The Frame layer: The user-facing implementation of pandas-like data structures like `DataFrame` and `Series`.
@@ -273,6 +278,10 @@ To have each worker in dask print spill statistics, do something like:
 
 ## The Cython layer
 
+```{note}
+As of 25.02, most of the functionality in the Cython layer has been moved to pylibcudf. All that remains is the Column layer, which will be removed in a future release.
+```
+
 The lowest level of cuDF is its interaction with `libcudf` via Cython.
 The Cython layer is composed of two components: C++ bindings and Cython wrappers.
The first component consists of [`.pxd` files](https://cython.readthedocs.io/en/latest/src/tutorial/pxd_files.html), diff --git a/docs/cudf/source/index.rst b/docs/cudf/source/index.rst index 1b86cafeb48..6ce7c8dceb8 100644 --- a/docs/cudf/source/index.rst +++ b/docs/cudf/source/index.rst @@ -30,5 +30,6 @@ other operations. user_guide/index cudf_pandas/index cudf_polars/index + pylibcudf/index libcudf_docs/index developer_guide/index diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/aggregation.rst b/docs/cudf/source/pylibcudf/api_docs/aggregation.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/aggregation.rst rename to docs/cudf/source/pylibcudf/api_docs/aggregation.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/binaryop.rst b/docs/cudf/source/pylibcudf/api_docs/binaryop.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/binaryop.rst rename to docs/cudf/source/pylibcudf/api_docs/binaryop.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/column.rst b/docs/cudf/source/pylibcudf/api_docs/column.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/column.rst rename to docs/cudf/source/pylibcudf/api_docs/column.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/column_factories.rst b/docs/cudf/source/pylibcudf/api_docs/column_factories.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/column_factories.rst rename to docs/cudf/source/pylibcudf/api_docs/column_factories.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/concatenate.rst b/docs/cudf/source/pylibcudf/api_docs/concatenate.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/concatenate.rst rename to docs/cudf/source/pylibcudf/api_docs/concatenate.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/copying.rst b/docs/cudf/source/pylibcudf/api_docs/copying.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/copying.rst rename to docs/cudf/source/pylibcudf/api_docs/copying.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/datetime.rst b/docs/cudf/source/pylibcudf/api_docs/datetime.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/datetime.rst rename to docs/cudf/source/pylibcudf/api_docs/datetime.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/expressions.rst b/docs/cudf/source/pylibcudf/api_docs/expressions.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/expressions.rst rename to docs/cudf/source/pylibcudf/api_docs/expressions.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/filling.rst b/docs/cudf/source/pylibcudf/api_docs/filling.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/filling.rst rename to docs/cudf/source/pylibcudf/api_docs/filling.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/gpumemoryview.rst b/docs/cudf/source/pylibcudf/api_docs/gpumemoryview.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/gpumemoryview.rst rename to docs/cudf/source/pylibcudf/api_docs/gpumemoryview.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/groupby.rst b/docs/cudf/source/pylibcudf/api_docs/groupby.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/groupby.rst rename to docs/cudf/source/pylibcudf/api_docs/groupby.rst 
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/hashing.rst b/docs/cudf/source/pylibcudf/api_docs/hashing.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/hashing.rst rename to docs/cudf/source/pylibcudf/api_docs/hashing.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/pylibcudf/api_docs/index.rst similarity index 94% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst rename to docs/cudf/source/pylibcudf/api_docs/index.rst index 997ece6d29c..04e2d199f75 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst +++ b/docs/cudf/source/pylibcudf/api_docs/index.rst @@ -1,6 +1,6 @@ -========= -pylibcudf -========= +============= +API Reference +============= This page provides API documentation for pylibcudf. diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/interop.rst b/docs/cudf/source/pylibcudf/api_docs/interop.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/interop.rst rename to docs/cudf/source/pylibcudf/api_docs/interop.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/avro.rst b/docs/cudf/source/pylibcudf/api_docs/io/avro.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/io/avro.rst rename to docs/cudf/source/pylibcudf/api_docs/io/avro.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/csv.rst b/docs/cudf/source/pylibcudf/api_docs/io/csv.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/io/csv.rst rename to docs/cudf/source/pylibcudf/api_docs/io/csv.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst b/docs/cudf/source/pylibcudf/api_docs/io/index.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst rename to docs/cudf/source/pylibcudf/api_docs/io/index.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/json.rst b/docs/cudf/source/pylibcudf/api_docs/io/json.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/io/json.rst rename to docs/cudf/source/pylibcudf/api_docs/io/json.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/parquet.rst b/docs/cudf/source/pylibcudf/api_docs/io/parquet.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/io/parquet.rst rename to docs/cudf/source/pylibcudf/api_docs/io/parquet.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/parquet_metadata.rst b/docs/cudf/source/pylibcudf/api_docs/io/parquet_metadata.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/io/parquet_metadata.rst rename to docs/cudf/source/pylibcudf/api_docs/io/parquet_metadata.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/text.rst b/docs/cudf/source/pylibcudf/api_docs/io/text.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/io/text.rst rename to docs/cudf/source/pylibcudf/api_docs/io/text.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/timezone.rst b/docs/cudf/source/pylibcudf/api_docs/io/timezone.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/io/timezone.rst rename to docs/cudf/source/pylibcudf/api_docs/io/timezone.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/join.rst b/docs/cudf/source/pylibcudf/api_docs/join.rst similarity index 100% rename from 
docs/cudf/source/user_guide/api_docs/pylibcudf/join.rst rename to docs/cudf/source/pylibcudf/api_docs/join.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/json.rst b/docs/cudf/source/pylibcudf/api_docs/json.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/json.rst rename to docs/cudf/source/pylibcudf/api_docs/json.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/labeling.rst b/docs/cudf/source/pylibcudf/api_docs/labeling.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/labeling.rst rename to docs/cudf/source/pylibcudf/api_docs/labeling.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/lists.rst b/docs/cudf/source/pylibcudf/api_docs/lists.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/lists.rst rename to docs/cudf/source/pylibcudf/api_docs/lists.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/merge.rst b/docs/cudf/source/pylibcudf/api_docs/merge.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/merge.rst rename to docs/cudf/source/pylibcudf/api_docs/merge.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/null_mask.rst b/docs/cudf/source/pylibcudf/api_docs/null_mask.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/null_mask.rst rename to docs/cudf/source/pylibcudf/api_docs/null_mask.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/byte_pair_encode.rst b/docs/cudf/source/pylibcudf/api_docs/nvtext/byte_pair_encode.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/byte_pair_encode.rst rename to docs/cudf/source/pylibcudf/api_docs/nvtext/byte_pair_encode.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/edit_distance.rst b/docs/cudf/source/pylibcudf/api_docs/nvtext/edit_distance.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/edit_distance.rst rename to docs/cudf/source/pylibcudf/api_docs/nvtext/edit_distance.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/generate_ngrams.rst b/docs/cudf/source/pylibcudf/api_docs/nvtext/generate_ngrams.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/generate_ngrams.rst rename to docs/cudf/source/pylibcudf/api_docs/nvtext/generate_ngrams.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst b/docs/cudf/source/pylibcudf/api_docs/nvtext/index.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst rename to docs/cudf/source/pylibcudf/api_docs/nvtext/index.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/jaccard.rst b/docs/cudf/source/pylibcudf/api_docs/nvtext/jaccard.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/jaccard.rst rename to docs/cudf/source/pylibcudf/api_docs/nvtext/jaccard.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/minhash.rst b/docs/cudf/source/pylibcudf/api_docs/nvtext/minhash.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/minhash.rst rename to docs/cudf/source/pylibcudf/api_docs/nvtext/minhash.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/ngrams_tokenize.rst b/docs/cudf/source/pylibcudf/api_docs/nvtext/ngrams_tokenize.rst similarity index 100% rename from 
docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/ngrams_tokenize.rst rename to docs/cudf/source/pylibcudf/api_docs/nvtext/ngrams_tokenize.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/normalize.rst b/docs/cudf/source/pylibcudf/api_docs/nvtext/normalize.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/normalize.rst rename to docs/cudf/source/pylibcudf/api_docs/nvtext/normalize.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/replace.rst b/docs/cudf/source/pylibcudf/api_docs/nvtext/replace.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/replace.rst rename to docs/cudf/source/pylibcudf/api_docs/nvtext/replace.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/stemmer.rst b/docs/cudf/source/pylibcudf/api_docs/nvtext/stemmer.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/stemmer.rst rename to docs/cudf/source/pylibcudf/api_docs/nvtext/stemmer.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/subword_tokenize.rst b/docs/cudf/source/pylibcudf/api_docs/nvtext/subword_tokenize.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/subword_tokenize.rst rename to docs/cudf/source/pylibcudf/api_docs/nvtext/subword_tokenize.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/tokenize.rst b/docs/cudf/source/pylibcudf/api_docs/nvtext/tokenize.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/tokenize.rst rename to docs/cudf/source/pylibcudf/api_docs/nvtext/tokenize.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/partitioning.rst b/docs/cudf/source/pylibcudf/api_docs/partitioning.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/partitioning.rst rename to docs/cudf/source/pylibcudf/api_docs/partitioning.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/quantiles.rst b/docs/cudf/source/pylibcudf/api_docs/quantiles.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/quantiles.rst rename to docs/cudf/source/pylibcudf/api_docs/quantiles.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/reduce.rst b/docs/cudf/source/pylibcudf/api_docs/reduce.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/reduce.rst rename to docs/cudf/source/pylibcudf/api_docs/reduce.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/replace.rst b/docs/cudf/source/pylibcudf/api_docs/replace.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/replace.rst rename to docs/cudf/source/pylibcudf/api_docs/replace.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/reshape.rst b/docs/cudf/source/pylibcudf/api_docs/reshape.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/reshape.rst rename to docs/cudf/source/pylibcudf/api_docs/reshape.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/rolling.rst b/docs/cudf/source/pylibcudf/api_docs/rolling.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/rolling.rst rename to docs/cudf/source/pylibcudf/api_docs/rolling.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/round.rst b/docs/cudf/source/pylibcudf/api_docs/round.rst similarity index 100% rename from 
docs/cudf/source/user_guide/api_docs/pylibcudf/round.rst rename to docs/cudf/source/pylibcudf/api_docs/round.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/scalar.rst b/docs/cudf/source/pylibcudf/api_docs/scalar.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/scalar.rst rename to docs/cudf/source/pylibcudf/api_docs/scalar.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/search.rst b/docs/cudf/source/pylibcudf/api_docs/search.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/search.rst rename to docs/cudf/source/pylibcudf/api_docs/search.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/sorting.rst b/docs/cudf/source/pylibcudf/api_docs/sorting.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/sorting.rst rename to docs/cudf/source/pylibcudf/api_docs/sorting.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/stream_compaction.rst b/docs/cudf/source/pylibcudf/api_docs/stream_compaction.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/stream_compaction.rst rename to docs/cudf/source/pylibcudf/api_docs/stream_compaction.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/capitalize.rst b/docs/cudf/source/pylibcudf/api_docs/strings/capitalize.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/strings/capitalize.rst rename to docs/cudf/source/pylibcudf/api_docs/strings/capitalize.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/char_types.rst b/docs/cudf/source/pylibcudf/api_docs/strings/char_types.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/strings/char_types.rst rename to docs/cudf/source/pylibcudf/api_docs/strings/char_types.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/combine.rst b/docs/cudf/source/pylibcudf/api_docs/strings/combine.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/strings/combine.rst rename to docs/cudf/source/pylibcudf/api_docs/strings/combine.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/contains.rst b/docs/cudf/source/pylibcudf/api_docs/strings/contains.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/strings/contains.rst rename to docs/cudf/source/pylibcudf/api_docs/strings/contains.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_booleans.rst b/docs/cudf/source/pylibcudf/api_docs/strings/convert/convert_booleans.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_booleans.rst rename to docs/cudf/source/pylibcudf/api_docs/strings/convert/convert_booleans.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_datetime.rst b/docs/cudf/source/pylibcudf/api_docs/strings/convert/convert_datetime.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_datetime.rst rename to docs/cudf/source/pylibcudf/api_docs/strings/convert/convert_datetime.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_durations.rst b/docs/cudf/source/pylibcudf/api_docs/strings/convert/convert_durations.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_durations.rst rename to 
docs/cudf/source/pylibcudf/api_docs/strings/convert/convert_durations.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_fixed_point.rst b/docs/cudf/source/pylibcudf/api_docs/strings/convert/convert_fixed_point.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_fixed_point.rst rename to docs/cudf/source/pylibcudf/api_docs/strings/convert/convert_fixed_point.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_floats.rst b/docs/cudf/source/pylibcudf/api_docs/strings/convert/convert_floats.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_floats.rst rename to docs/cudf/source/pylibcudf/api_docs/strings/convert/convert_floats.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_integers.rst b/docs/cudf/source/pylibcudf/api_docs/strings/convert/convert_integers.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_integers.rst rename to docs/cudf/source/pylibcudf/api_docs/strings/convert/convert_integers.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_ipv4.rst b/docs/cudf/source/pylibcudf/api_docs/strings/convert/convert_ipv4.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_ipv4.rst rename to docs/cudf/source/pylibcudf/api_docs/strings/convert/convert_ipv4.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_lists.rst b/docs/cudf/source/pylibcudf/api_docs/strings/convert/convert_lists.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_lists.rst rename to docs/cudf/source/pylibcudf/api_docs/strings/convert/convert_lists.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_urls.rst b/docs/cudf/source/pylibcudf/api_docs/strings/convert/convert_urls.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_urls.rst rename to docs/cudf/source/pylibcudf/api_docs/strings/convert/convert_urls.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/index.rst b/docs/cudf/source/pylibcudf/api_docs/strings/convert/index.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/index.rst rename to docs/cudf/source/pylibcudf/api_docs/strings/convert/index.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/extract.rst b/docs/cudf/source/pylibcudf/api_docs/strings/extract.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/strings/extract.rst rename to docs/cudf/source/pylibcudf/api_docs/strings/extract.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/find.rst b/docs/cudf/source/pylibcudf/api_docs/strings/find.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/strings/find.rst rename to docs/cudf/source/pylibcudf/api_docs/strings/find.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/find_multiple.rst b/docs/cudf/source/pylibcudf/api_docs/strings/find_multiple.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/strings/find_multiple.rst rename to docs/cudf/source/pylibcudf/api_docs/strings/find_multiple.rst diff --git 
a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/findall.rst b/docs/cudf/source/pylibcudf/api_docs/strings/findall.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/strings/findall.rst rename to docs/cudf/source/pylibcudf/api_docs/strings/findall.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/pylibcudf/api_docs/strings/index.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst rename to docs/cudf/source/pylibcudf/api_docs/strings/index.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/padding.rst b/docs/cudf/source/pylibcudf/api_docs/strings/padding.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/strings/padding.rst rename to docs/cudf/source/pylibcudf/api_docs/strings/padding.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/regex_flags.rst b/docs/cudf/source/pylibcudf/api_docs/strings/regex_flags.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/strings/regex_flags.rst rename to docs/cudf/source/pylibcudf/api_docs/strings/regex_flags.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/regex_program.rst b/docs/cudf/source/pylibcudf/api_docs/strings/regex_program.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/strings/regex_program.rst rename to docs/cudf/source/pylibcudf/api_docs/strings/regex_program.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/repeat.rst b/docs/cudf/source/pylibcudf/api_docs/strings/repeat.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/strings/repeat.rst rename to docs/cudf/source/pylibcudf/api_docs/strings/repeat.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/replace.rst b/docs/cudf/source/pylibcudf/api_docs/strings/replace.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/strings/replace.rst rename to docs/cudf/source/pylibcudf/api_docs/strings/replace.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/replace_re.rst b/docs/cudf/source/pylibcudf/api_docs/strings/replace_re.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/strings/replace_re.rst rename to docs/cudf/source/pylibcudf/api_docs/strings/replace_re.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/side_type.rst b/docs/cudf/source/pylibcudf/api_docs/strings/side_type.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/strings/side_type.rst rename to docs/cudf/source/pylibcudf/api_docs/strings/side_type.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/slice.rst b/docs/cudf/source/pylibcudf/api_docs/strings/slice.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/strings/slice.rst rename to docs/cudf/source/pylibcudf/api_docs/strings/slice.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/split.rst b/docs/cudf/source/pylibcudf/api_docs/strings/split.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/strings/split.rst rename to docs/cudf/source/pylibcudf/api_docs/strings/split.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/strip.rst b/docs/cudf/source/pylibcudf/api_docs/strings/strip.rst similarity index 100% rename from 
docs/cudf/source/user_guide/api_docs/pylibcudf/strings/strip.rst rename to docs/cudf/source/pylibcudf/api_docs/strings/strip.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/wrap.rst b/docs/cudf/source/pylibcudf/api_docs/strings/wrap.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/strings/wrap.rst rename to docs/cudf/source/pylibcudf/api_docs/strings/wrap.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/table.rst b/docs/cudf/source/pylibcudf/api_docs/table.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/table.rst rename to docs/cudf/source/pylibcudf/api_docs/table.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/traits.rst b/docs/cudf/source/pylibcudf/api_docs/traits.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/traits.rst rename to docs/cudf/source/pylibcudf/api_docs/traits.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/transform.rst b/docs/cudf/source/pylibcudf/api_docs/transform.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/transform.rst rename to docs/cudf/source/pylibcudf/api_docs/transform.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/transpose.rst b/docs/cudf/source/pylibcudf/api_docs/transpose.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/transpose.rst rename to docs/cudf/source/pylibcudf/api_docs/transpose.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/types.rst b/docs/cudf/source/pylibcudf/api_docs/types.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/types.rst rename to docs/cudf/source/pylibcudf/api_docs/types.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/unary.rst b/docs/cudf/source/pylibcudf/api_docs/unary.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/unary.rst rename to docs/cudf/source/pylibcudf/api_docs/unary.rst diff --git a/docs/cudf/source/developer_guide/pylibcudf.md b/docs/cudf/source/pylibcudf/developer_docs.md similarity index 99% rename from docs/cudf/source/developer_guide/pylibcudf.md rename to docs/cudf/source/pylibcudf/developer_docs.md index 1ee828e7c4e..cce0d0b25b8 100644 --- a/docs/cudf/source/developer_guide/pylibcudf.md +++ b/docs/cudf/source/pylibcudf/developer_docs.md @@ -1,4 +1,4 @@ -# pylibcudf +# Developer Documentation pylibcudf is a lightweight Cython wrapper around libcudf. It aims to provide a near-zero overhead interface to accessing libcudf in Python. diff --git a/docs/cudf/source/pylibcudf/index.rst b/docs/cudf/source/pylibcudf/index.rst new file mode 100644 index 00000000000..7a015b33196 --- /dev/null +++ b/docs/cudf/source/pylibcudf/index.rst @@ -0,0 +1,38 @@ +pylibcudf documentation +======================= + +pylibcudf is a lightweight Cython interface to libcudf that provides near-zero overhead for GPU-accelerated data processing in Python. +It aims to provide minimal overhead interfaces to the C++ libcudf library, while integrating seamlessly with community protocols like ``__cuda_array_interface__``, and common libraries such as CuPy and Numba. +Both our zero-code pandas accelerator (``cudf.pandas``) and our polars GPU execution engine (``cudf.polars``) are built on top of pylibcudf. + +Ex: Reading data from a parquet file + +pylibcudf: + +.. 
code-block:: python
+
+    import pylibcudf as plc
+
+    source = plc.io.SourceInfo(["dataset.parquet"])
+    options = plc.io.parquet.ParquetReaderOptions.builder(source).build()
+    table = plc.io.parquet.read_parquet(options)
+
+libcudf:
+
+.. code-block:: cpp
+
+    #include <cudf/io/parquet.hpp>
+
+    int main()
+    {
+        auto source  = cudf::io::source_info("dataset.parquet");
+        auto options = cudf::io::parquet_reader_options::builder(source);
+        auto table   = cudf::io::read_parquet(options);
+    }
+
+.. toctree::
+    :maxdepth: 1
+    :caption: Contents:
+
+    api_docs/index.rst
+    developer_docs
diff --git a/docs/cudf/source/user_guide/api_docs/index.rst b/docs/cudf/source/user_guide/api_docs/index.rst
index f711327f9ed..249a41ade17 100644
--- a/docs/cudf/source/user_guide/api_docs/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/index.rst
@@ -25,5 +25,4 @@ This page provides a list of all publicly accessible modules, methods and classe
    struct_handling
    options
    extension_dtypes
-   pylibcudf/index.rst
    performance_tracking

From 94229d58031182305a0841a9f9307d37f398f3df Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic
Date: Thu, 30 Jan 2025 12:57:16 -0800
Subject: [PATCH 05/17] Include more information in error messages in the
 nvcomp adapter (#17814)

Error messages in the nvcomp adapter don't include any information about
the error, making it harder to triage issues. Errors due to an unsupported
compression type now include the type. When an nvCOMP API returns a
non-success status, the status is included in the error message as well.

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Karthikeyan (https://github.com/karthikeyann)
  - Tianyu Liu (https://github.com/kingcrimsontianyu)

URL: https://github.com/rapidsai/cudf/pull/17814
---
 cpp/src/io/comp/nvcomp_adapter.cpp | 85 +++++++++++++++++++-----------
 1 file changed, 55 insertions(+), 30 deletions(-)

diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp
index ac81dd421fa..7c191b03350 100644
--- a/cpp/src/io/comp/nvcomp_adapter.cpp
+++ b/cpp/src/io/comp/nvcomp_adapter.cpp
@@ -33,6 +33,49 @@ namespace cudf::io::detail::nvcomp {
 namespace {
+[[nodiscard]] std::string nvcomp_status_to_string(nvcompStatus_t status)
+{
+  switch (status) {
+    case nvcompStatus_t::nvcompSuccess: return "nvcompSuccess";
+    case nvcompStatus_t::nvcompErrorInvalidValue: return "nvcompErrorInvalidValue";
+    case nvcompStatus_t::nvcompErrorNotSupported: return "nvcompErrorNotSupported";
+    case nvcompStatus_t::nvcompErrorCannotDecompress: return "nvcompErrorCannotDecompress";
+    case nvcompStatus_t::nvcompErrorBadChecksum: return "nvcompErrorBadChecksum";
+    case nvcompStatus_t::nvcompErrorCannotVerifyChecksums:
+      return "nvcompErrorCannotVerifyChecksums";
+    case nvcompStatus_t::nvcompErrorOutputBufferTooSmall: return "nvcompErrorOutputBufferTooSmall";
+    case nvcompStatus_t::nvcompErrorWrongHeaderLength: return "nvcompErrorWrongHeaderLength";
+    case nvcompStatus_t::nvcompErrorAlignment: return "nvcompErrorAlignment";
+    case nvcompStatus_t::nvcompErrorChunkSizeTooLarge: return "nvcompErrorChunkSizeTooLarge";
+    case nvcompStatus_t::nvcompErrorCudaError: return "nvcompErrorCudaError";
+    case nvcompStatus_t::nvcompErrorInternal: return "nvcompErrorInternal";
+  }
+  return "nvcompStatus_t(" + std::to_string(static_cast<int>(status)) + ")";
+}
+
+[[nodiscard]] std::string compression_type_name(compression_type compression)
+{
+  switch (compression) {
+    case compression_type::SNAPPY: return "Snappy";
+    case compression_type::ZSTD: return "Zstandard";
+    case compression_type::DEFLATE: return "Deflate";
+    case compression_type::LZ4: return "LZ4";
+    case compression_type::GZIP: return "GZIP";
+  }
+  return "compression_type(" + std::to_string(static_cast<int>(compression)) + ")";
+}
+
+#define CHECK_NVCOMP_STATUS(status)                                    \
+  do {                                                                 \
+    CUDF_EXPECTS(status == nvcompStatus_t::nvcompSuccess,              \
+                 "nvCOMP error: " + nvcomp_status_to_string(status));  \
+  } while (0)
+
+#define UNSUPPORTED_COMPRESSION(compression)                                           \
+  do {                                                                                 \
+    CUDF_FAIL("Unsupported compression type: " + compression_type_name(compression)); \
+  } while (0)
+
 // Dispatcher for nvcompBatchedDecompressGetTempSizeEx
 template <typename... Args>
 auto batched_decompress_get_temp_size_ex(compression_type compression, Args&&... args)
@@ -48,7 +91,7 @@ auto batched_decompress_get_temp_size_ex(compression_type compression, Args&&...
       return nvcompBatchedDeflateDecompressGetTempSizeEx(std::forward<Args>(args)...);
     case compression_type::GZIP:
       return nvcompBatchedGzipDecompressGetTempSizeEx(std::forward<Args>(args)...);
-    default: CUDF_FAIL("Unsupported compression type");
+    default: UNSUPPORTED_COMPRESSION(compression);
   }
 }
@@ -66,22 +109,10 @@ auto batched_decompress_async(compression_type compression, Args&&... args)
     case compression_type::LZ4: return nvcompBatchedLZ4DecompressAsync(std::forward<Args>(args)...);
     case compression_type::GZIP:
       return nvcompBatchedGzipDecompressAsync(std::forward<Args>(args)...);
-    default: CUDF_FAIL("Unsupported compression type");
+    default: UNSUPPORTED_COMPRESSION(compression);
   }
 }
-[[maybe_unused]] std::string compression_type_name(compression_type compression)
-{
-  switch (compression) {
-    case compression_type::SNAPPY: return "Snappy";
-    case compression_type::ZSTD: return "Zstandard";
-    case compression_type::DEFLATE: return "Deflate";
-    case compression_type::LZ4: return "LZ4";
-    case compression_type::GZIP: return "GZIP";
-  }
-  return "compression_type(" + std::to_string(static_cast<int>(compression)) + ")";
-}
-
 size_t batched_compress_temp_size(compression_type compression,
                                   size_t batch_size,
                                   size_t max_uncompressed_chunk_bytes,
@@ -118,11 +149,9 @@ size_t batched_compress_temp_size(compression_type compression,
                                            &temp_size,
                                            max_total_uncompressed_bytes);
       break;
-    default: CUDF_FAIL("Unsupported compression type");
+    default: UNSUPPORTED_COMPRESSION(compression);
   }
-
-  CUDF_EXPECTS(nvcomp_status == nvcompStatus_t::nvcompSuccess,
-               "Unable to get scratch size for compression");
+  CHECK_NVCOMP_STATUS(nvcomp_status);
   return temp_size;
 }
@@ -188,9 +217,9 @@ void batched_compress_async(compression_type compression,
                             nvcompBatchedLZ4DefaultOpts,
                             stream.value());
       break;
-    default: CUDF_FAIL("Unsupported compression type");
+    default: UNSUPPORTED_COMPRESSION(compression);
   }
-  CUDF_EXPECTS(nvcomp_status == nvcompStatus_t::nvcompSuccess, "Error in compression");
+  CHECK_NVCOMP_STATUS(nvcomp_status);
 }
 bool is_aligned(void const* ptr, std::uintptr_t alignment) noexcept
@@ -254,9 +283,7 @@ size_t batched_decompress_temp_size(compression_type compression,
   size_t temp_size = 0;
   nvcompStatus_t const nvcomp_status = batched_decompress_get_temp_size_ex(
     compression, num_chunks, max_uncomp_chunk_size, &temp_size, max_total_uncomp_size);
-
-  CUDF_EXPECTS(nvcomp_status == nvcompStatus_t::nvcompSuccess,
-               "Unable to get scratch size for decompression");
+  CHECK_NVCOMP_STATUS(nvcomp_status);
   return temp_size;
 }
@@ -289,7 +316,7 @@ void batched_decompress(compression_type compression,
                            nvcomp_args.output_data_ptrs.data(),
                            nvcomp_statuses.data(),
                            stream.value());
-  CUDF_EXPECTS(nvcomp_status == nvcompStatus_t::nvcompSuccess, "unable to perform decompression");
+  CHECK_NVCOMP_STATUS(nvcomp_status);
   update_compression_results(nvcomp_statuses, actual_uncompressed_data_sizes, results, stream);
 }
@@ -321,11 +348,9 @@ size_t compress_max_output_chunk_size(compression_type compression,
       status = nvcompBatchedLZ4CompressGetMaxOutputChunkSize(
         capped_uncomp_bytes, nvcompBatchedLZ4DefaultOpts, &max_comp_chunk_size);
       break;
-    default: CUDF_FAIL("Unsupported compression type");
+    default: UNSUPPORTED_COMPRESSION(compression);
   }
-
-  CUDF_EXPECTS(status == nvcompStatus_t::nvcompSuccess,
-               "failed to get max uncompressed chunk size");
+  CHECK_NVCOMP_STATUS(status);
   return max_comp_chunk_size;
 }
@@ -463,7 +488,7 @@ size_t required_alignment(compression_type compression)
     case compression_type::SNAPPY: return nvcompSnappyRequiredAlignment;
     case compression_type::ZSTD: return nvcompZstdRequiredAlignment;
     case compression_type::LZ4: return nvcompLZ4RequiredAlignment;
-    default: CUDF_FAIL("Unsupported compression type");
+    default: UNSUPPORTED_COMPRESSION(compression);
   }
 }
@@ -474,7 +499,7 @@ std::optional<size_t> compress_max_allowed_chunk_size(compression_type compressi
     case compression_type::SNAPPY: return nvcompSnappyCompressionMaxAllowedChunkSize;
     case compression_type::ZSTD: return nvcompZstdCompressionMaxAllowedChunkSize;
     case compression_type::LZ4: return nvcompLZ4CompressionMaxAllowedChunkSize;
-    default: CUDF_FAIL("Unsupported compression type");
+    default: UNSUPPORTED_COMPRESSION(compression);
   }
 }

From f949deed9a39edae914b8e701852554fe01224dd Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR
Date: Thu, 30 Jan 2025 16:02:59 -0600
Subject: [PATCH 06/17] Move `isinstance_cudf_pandas` to `fast_slow_proxy`
 (#17875)

https://github.com/rapidsai/cudf/pull/17629/files#diff-8731197057aec7c2ece5535ff5fb740a7d2109b213bb859ccd19290d40b7b703R11
broke a number of cuml pytests. This was because `pandas_compatible` mode
was being set in `._wrappers.pandas`, and the import introduced in the
above PR triggered it. This PR fixes it by moving `isinstance_cudf_pandas`
to the `fast_slow_proxy` module.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/17875
---
 docs/cudf/source/cudf_pandas/faq.md           | 14 +++++-----
 python/cudf/cudf/pandas/__init__.py           | 11 +++++---
 python/cudf/cudf/pandas/_wrappers/pandas.py   |  7 -----
 python/cudf/cudf/pandas/fast_slow_proxy.py    |  4 +++
 python/cudf/cudf/pandas/module_accelerator.py |  3 ++-
 .../cudf_pandas_tests/test_cudf_pandas.py     | 26 +++++++++----------
 6 files changed, 34 insertions(+), 31 deletions(-)

diff --git a/docs/cudf/source/cudf_pandas/faq.md b/docs/cudf/source/cudf_pandas/faq.md
index 4e3cc319605..92165a32266 100644
--- a/docs/cudf/source/cudf_pandas/faq.md
+++ b/docs/cudf/source/cudf_pandas/faq.md
@@ -144,13 +144,13 @@ provides a similar configuration-based plugin for Spark.
 ## How do I know if an object is a `cudf.pandas` proxy object?
-To determine if an object is a `cudf.pandas` proxy object, you can use the `isinstance_cudf_pandas` API. This function checks if the given object is a proxy object that wraps either a `cudf` or `pandas` object. Here is an example of how to use this API:
+To determine if an object is a `cudf.pandas` proxy object, you can use the `is_proxy_instance` API. This function checks if the given object is a proxy object that wraps either a `cudf` or `pandas` object.
Here is an example of how to use this API: ```python -from cudf.pandas import isinstance_cudf_pandas +from cudf.pandas import is_proxy_instance obj = ... # Your object here -if isinstance_cudf_pandas(obj, pd.Series): +if is_proxy_instance(obj, pd.Series): print("The object is a cudf.pandas proxy Series object.") else: print("The object is not a cudf.pandas proxy Series object.") @@ -158,10 +158,10 @@ else: To detect `Series`, `DataFrame`, `Index`, and `ndarray` objects separately, you can pass the type names as the second parameter: -* `isinstance_cudf_pandas(obj, pd.Series)`: Detects if the object is a `cudf.pandas` proxy `Series`. -* `isinstance_cudf_pandas(obj, pd.DataFrame)`: Detects if the object is a `cudf.pandas` proxy `DataFrame`. -* `isinstance_cudf_pandas(obj, pd.Index)`: Detects if the object is a `cudf.pandas` proxy `Index`. -* `isinstance_cudf_pandas(obj, np.ndarray)`: Detects if the object is a `cudf.pandas` proxy `ndarray`. +* `is_proxy_instance(obj, pd.Series)`: Detects if the object is a `cudf.pandas` proxy `Series`. +* `is_proxy_instance(obj, pd.DataFrame)`: Detects if the object is a `cudf.pandas` proxy `DataFrame`. +* `is_proxy_instance(obj, pd.Index)`: Detects if the object is a `cudf.pandas` proxy `Index`. +* `is_proxy_instance(obj, np.ndarray)`: Detects if the object is a `cudf.pandas` proxy `ndarray`. ## How can I access the underlying GPU or CPU objects? diff --git a/python/cudf/cudf/pandas/__init__.py b/python/cudf/cudf/pandas/__init__.py index 70ab7d48879..52fc945709e 100644 --- a/python/cudf/cudf/pandas/__init__.py +++ b/python/cudf/cudf/pandas/__init__.py @@ -8,12 +8,17 @@ import pylibcudf import rmm.mr -from ._wrappers.pandas import isinstance_cudf_pandas -from .fast_slow_proxy import is_proxy_object +from .fast_slow_proxy import is_proxy_instance, is_proxy_object from .magics import load_ipython_extension from .profiler import Profiler -__all__ = ["Profiler", "install", "is_proxy_object", "load_ipython_extension"] +__all__ = [ + "Profiler", + "install", + "is_proxy_instance", + "is_proxy_object", + "load_ipython_extension", +] LOADED = False diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py index 1cda27ba1d6..526778b4ecb 100644 --- a/python/cudf/cudf/pandas/_wrappers/pandas.py +++ b/python/cudf/cudf/pandas/_wrappers/pandas.py @@ -37,7 +37,6 @@ _FunctionProxy, _maybe_wrap_result, _Unusable, - is_proxy_object, make_final_proxy_type as _make_final_proxy_type, make_intermediate_proxy_type as _make_intermediate_proxy_type, register_proxy_func, @@ -70,8 +69,6 @@ except ImportError: ipython_shell = None -cudf.set_option("mode.pandas_compatible", True) - def _pandas_util_dir(): # In pandas 2.0, pandas.util contains public APIs under @@ -1713,10 +1710,6 @@ def holiday_calendar_factory_wrapper(*args, **kwargs): ) -def isinstance_cudf_pandas(obj, type): - return is_proxy_object(obj) and obj.__class__.__name__ == type.__name__ - - # timestamps and timedeltas are not proxied, but non-proxied # pandas types are currently not picklable. 
Thus, we define # custom reducer/unpicker functions for these types: diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py index c189280be09..46df2b047a4 100644 --- a/python/cudf/cudf/pandas/fast_slow_proxy.py +++ b/python/cudf/cudf/pandas/fast_slow_proxy.py @@ -1335,6 +1335,10 @@ def _get_proxy_base_class(cls): return object +def is_proxy_instance(obj, type): + return is_proxy_object(obj) and obj.__class__.__name__ == type.__name__ + + PROXY_BASE_CLASSES: set[type] = { ProxyNDarrayBase, } diff --git a/python/cudf/cudf/pandas/module_accelerator.py b/python/cudf/cudf/pandas/module_accelerator.py index 9e549713f7b..818971105cb 100644 --- a/python/cudf/cudf/pandas/module_accelerator.py +++ b/python/cudf/cudf/pandas/module_accelerator.py @@ -595,9 +595,10 @@ def install( ) mode = deduce_cudf_pandas_mode(slow_lib, fast_lib) if mode.use_fast_lib: - importlib.import_module( + pandas_wrappers = importlib.import_module( f".._wrappers.{mode.slow_lib}", __name__ ) + pandas_wrappers.cudf.set_option("mode.pandas_compatible", True) try: (self,) = ( p diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index efa875cf73a..938d22de076 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -66,7 +66,7 @@ ) from cudf.pandas import ( - isinstance_cudf_pandas, + is_proxy_instance, ) # Accelerated pandas has the real pandas and cudf modules as attributes @@ -1902,23 +1902,23 @@ def test_is_cudf_pandas(): df = xpd.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) index = xpd.Index([1, 2, 3]) - assert isinstance_cudf_pandas(s, pd.Series) - assert isinstance_cudf_pandas(df, pd.DataFrame) - assert isinstance_cudf_pandas(index, pd.Index) - assert isinstance_cudf_pandas(index.values, np.ndarray) + assert is_proxy_instance(s, pd.Series) + assert is_proxy_instance(df, pd.DataFrame) + assert is_proxy_instance(index, pd.Index) + assert is_proxy_instance(index.values, np.ndarray) for obj in [s, df, index, index.values]: - assert not isinstance_cudf_pandas(obj._fsproxy_slow, pd.Series) - assert not isinstance_cudf_pandas(obj._fsproxy_fast, pd.Series) + assert not is_proxy_instance(obj._fsproxy_slow, pd.Series) + assert not is_proxy_instance(obj._fsproxy_fast, pd.Series) - assert not isinstance_cudf_pandas(obj._fsproxy_slow, pd.DataFrame) - assert not isinstance_cudf_pandas(obj._fsproxy_fast, pd.DataFrame) + assert not is_proxy_instance(obj._fsproxy_slow, pd.DataFrame) + assert not is_proxy_instance(obj._fsproxy_fast, pd.DataFrame) - assert not isinstance_cudf_pandas(obj._fsproxy_slow, pd.Index) - assert not isinstance_cudf_pandas(obj._fsproxy_fast, pd.Index) + assert not is_proxy_instance(obj._fsproxy_slow, pd.Index) + assert not is_proxy_instance(obj._fsproxy_fast, pd.Index) - assert not isinstance_cudf_pandas(obj._fsproxy_slow, np.ndarray) - assert not isinstance_cudf_pandas(obj._fsproxy_fast, np.ndarray) + assert not is_proxy_instance(obj._fsproxy_slow, np.ndarray) + assert not is_proxy_instance(obj._fsproxy_fast, np.ndarray) def test_series_dtype_property(): From a46307a576e815e19a0ab1f4be17a985c0d312ad Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 30 Jan 2025 14:12:41 -0800 Subject: [PATCH 07/17] Remove cudf.Scalar from scatter APIs (#17847) Towards https://github.com/rapidsai/cudf/issues/17843 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray 
(https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/17847 --- python/cudf/cudf/core/_internals/copying.py | 4 +-- python/cudf/cudf/core/column/categorical.py | 18 +++++++++---- python/cudf/cudf/core/column/column.py | 28 +++++++++------------ python/cudf/cudf/core/column/datetime.py | 7 ++++-- python/cudf/cudf/core/column/lists.py | 3 ++- python/cudf/cudf/core/column/numerical.py | 11 +++++--- python/cudf/cudf/core/indexed_frame.py | 4 ++- python/cudf/cudf/core/multiindex.py | 7 +++--- 8 files changed, 49 insertions(+), 33 deletions(-) diff --git a/python/cudf/cudf/core/_internals/copying.py b/python/cudf/cudf/core/_internals/copying.py index 76122f89445..9e63ec63828 100644 --- a/python/cudf/cudf/core/_internals/copying.py +++ b/python/cudf/cudf/core/_internals/copying.py @@ -36,7 +36,7 @@ def gather( @acquire_spill_lock() def scatter( - sources: list[ColumnBase | cudf.Scalar], + sources: list[ColumnBase | plc.Scalar], scatter_map: NumericalColumn, target_columns: list[ColumnBase], bounds_check: bool = True, @@ -67,7 +67,7 @@ def scatter( plc_tbl = plc.copying.scatter( plc.Table([col.to_pylibcudf(mode="read") for col in sources]) # type: ignore[union-attr] if isinstance(sources[0], cudf._lib.column.Column) - else [slr.device_value for slr in sources], # type: ignore[union-attr] + else sources, # type: ignore[union-attr] scatter_map.to_pylibcudf(mode="read"), plc.Table([col.to_pylibcudf(mode="read") for col in target_columns]), ) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 9be47107b14..985b689f087 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -11,10 +11,13 @@ import pyarrow as pa from typing_extensions import Self +import pylibcudf as plc + import cudf from cudf.core.column import column from cudf.core.column.methods import ColumnMethods from cudf.core.dtypes import CategoricalDtype, IntervalDtype +from cudf.core.scalar import pa_scalar_to_plc_scalar from cudf.utils.dtypes import ( SIZE_TYPE_DTYPE, find_common_type, @@ -657,7 +660,7 @@ def __setitem__(self, key, value): def _fill( self, - fill_value: ScalarLike, + fill_value: plc.Scalar, begin: int, end: int, inplace: bool = False, @@ -665,9 +668,14 @@ def _fill( if end <= begin or begin >= self.size: return self if inplace else self.copy() - fill_code = self._encode(fill_value) + fill_code = self._encode(plc.interop.to_arrow(fill_value)) result = self if inplace else self.copy() - result.codes._fill(fill_code, begin, end, inplace=True) + result.codes._fill( + pa_scalar_to_plc_scalar(pa.scalar(fill_code)), + begin, + end, + inplace=True, + ) return result def slice(self, start: int, stop: int, stride: int | None = None) -> Self: @@ -1017,7 +1025,7 @@ def isnull(self) -> ColumnBase: categories = self.categories.isnan() if categories.any(): code = self._encode(np.nan) - result = result | (self.codes == cudf.Scalar(code)) + result = result | (self.codes == code) return result @@ -1033,7 +1041,7 @@ def notnull(self) -> ColumnBase: categories = self.categories.isnan() if categories.any(): code = self._encode(np.nan) - result = result & (self.codes != cudf.Scalar(code)) + result = result & (self.codes != code) return result diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 57d1ad56f82..f3931fbe919 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -421,7 +421,7 @@ def memory_usage(self) -> int: def 
_fill( self, - fill_value: ScalarLike, + fill_value: plc.Scalar, begin: int, end: int, inplace: bool = False, @@ -429,10 +429,6 @@ def _fill( if end <= begin or begin >= self.size: return self if inplace else self.copy() - # Constructing a cuDF scalar can cut unnecessary DtoH copy if - # the scalar is None when calling `is_valid`. - slr = cudf.Scalar(fill_value, dtype=self.dtype) - if not inplace or is_string_dtype(self.dtype): with acquire_spill_lock(): result = type(self).from_pylibcudf( @@ -440,14 +436,14 @@ def _fill( self.to_pylibcudf(mode="read"), begin, end, - slr.device_value, + fill_value, ) ) if is_string_dtype(self.dtype): return self._mimic_inplace(result, inplace=True) return result # type: ignore[return-value] - if not slr.is_valid() and not self.nullable: + if not fill_value.is_valid() and not self.nullable: mask = as_buffer( plc.null_mask.create_null_mask( self.size, plc.null_mask.MaskState.ALL_VALID @@ -460,7 +456,7 @@ def _fill( self.to_pylibcudf(mode="write"), begin, end, - slr.device_value, + fill_value, ) return self @@ -629,8 +625,8 @@ def __setitem__(self, key: Any, value: Any): """ # Normalize value to scalar/column - value_normalized: cudf.Scalar | ColumnBase = ( - cudf.Scalar(value, dtype=self.dtype) + value_normalized: plc.Scalar | ColumnBase = ( + cudf.Scalar(value, dtype=self.dtype).device_value if is_scalar(value) else as_column(value, dtype=self.dtype) ) @@ -658,7 +654,7 @@ def _wrap_binop_normalization(self, other): def _scatter_by_slice( self, key: builtins.slice, - value: cudf.core.scalar.Scalar | ColumnBase, + value: plc.Scalar | ColumnBase, ) -> Self | None: """If this function returns None, it's either a no-op (slice is empty), or the inplace replacement is already performed (fill-in-place). @@ -672,12 +668,12 @@ def _scatter_by_slice( self._check_scatter_key_length(num_keys, value) if step == 1 and not isinstance( - self, (cudf.core.column.StructColumn, cudf.core.column.ListColumn) + self.dtype, (cudf.StructDtype, cudf.ListDtype) ): # NOTE: List & Struct dtypes aren't supported by both # inplace & out-of-place fill. Hence we need to use scatter for # these two types. - if isinstance(value, cudf.core.scalar.Scalar): + if isinstance(value, plc.Scalar): return self._fill(value, start, stop, inplace=True) else: with acquire_spill_lock(): @@ -705,7 +701,7 @@ def _scatter_by_slice( def _scatter_by_column( self, key: cudf.core.column.NumericalColumn, - value: cudf.core.scalar.Scalar | ColumnBase, + value: plc.Scalar | ColumnBase, bounds_check: bool = True, ) -> Self: if key.dtype.kind == "b": @@ -738,7 +734,7 @@ def _scatter_by_column( plc_table = plc.copying.boolean_mask_scatter( plc.Table([value.to_pylibcudf(mode="read")]) if isinstance(value, Column) - else [value.device_value], + else [value], plc.Table([self.to_pylibcudf(mode="read")]), key.to_pylibcudf(mode="read"), ) @@ -753,7 +749,7 @@ def _scatter_by_column( )[0]._with_type_metadata(self.dtype) def _check_scatter_key_length( - self, num_keys: int, value: cudf.core.scalar.Scalar | ColumnBase + self, num_keys: int, value: plc.Scalar | ColumnBase ) -> None: """`num_keys` is the number of keys to scatter. Should equal to the number of rows in ``value`` if ``value`` is a column. 
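An illustrative aside before the next file's diff: the scatter and fill paths above now receive a device-side `pylibcudf.Scalar` rather than a `cudf.Scalar` wrapper. The sketch below shows the host-to-device conversion pattern these call sites rely on. The body of `pa_scalar_to_plc_scalar` is an assumption for illustration only; the real helper lives in `cudf.core.scalar` and is not shown in this patch.

```python
# Sketch, assuming pylibcudf's Arrow interop accepts scalars: convert a
# host-side pyarrow scalar into a device-side pylibcudf Scalar.
import pyarrow as pa
import pylibcudf as plc


def pa_scalar_to_plc_scalar(pa_scalar: pa.Scalar) -> plc.Scalar:
    # Hypothetical body: build the device scalar via Arrow interop.
    return plc.interop.from_arrow(pa_scalar)


# A typed null, like the NaT scalar scattered by tz_localize later in
# this patch; validity is tracked on the device scalar itself.
null_ts = pa_scalar_to_plc_scalar(pa.scalar(None, type=pa.timestamp("ns")))
assert not null_ts.is_valid()

# A plain boolean scalar, like the fill value passed to _fill above.
false_scalar = pa_scalar_to_plc_scalar(pa.scalar(False))
assert false_scalar.is_valid()
```

Because validity lives on the device scalar, `_fill` can check `fill_value.is_valid()` directly instead of constructing a `cudf.Scalar` first.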
diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 3649e9830de..0ee4da9e08f 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -30,7 +30,8 @@ from cudf.core.buffer import Buffer, acquire_spill_lock from cudf.core.column.column import ColumnBase, as_column from cudf.core.column.timedelta import _unit_to_nanoseconds_conversion -from cudf.utils.dtypes import _get_base_dtype +from cudf.core.scalar import pa_scalar_to_plc_scalar +from cudf.utils.dtypes import _get_base_dtype, cudf_dtype_to_pa_type from cudf.utils.utils import ( _all_bools_with_nulls, _datetime_timedelta_find_and_replace, @@ -949,7 +950,9 @@ def tz_localize( ) localized = self._scatter_by_column( self.isnull() | (ambiguous_col | nonexistent_col), - cudf.Scalar(cudf.NaT, dtype=self.dtype), + pa_scalar_to_plc_scalar( + pa.scalar(None, type=cudf_dtype_to_pa_type(self.dtype)) + ), ) transition_times, offsets = get_tz_data(tzname) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 361da7d3be3..837763ee30c 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -545,7 +545,8 @@ def get( # replace the value in those rows (should be NA) with `default` if out_of_bounds_mask.any(): out = out._scatter_by_column( - out_of_bounds_mask, cudf.Scalar(default) + out_of_bounds_mask, + pa_scalar_to_plc_scalar(pa.scalar(default)), ) if out.dtype != self._column.dtype.element_type: # libcudf doesn't maintain struct labels so we must transfer over diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index f901b5d735e..d4ef09e44e8 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -149,7 +149,7 @@ def __setitem__(self, key: Any, value: Any): """ # Normalize value to scalar/column - device_value: cudf.Scalar | ColumnBase = ( + value_normalized: cudf.Scalar | ColumnBase = ( cudf.Scalar( value, dtype=self.dtype @@ -160,12 +160,17 @@ def __setitem__(self, key: Any, value: Any): else as_column(value) ) - if self.dtype.kind != "b" and device_value.dtype.kind == "b": + if self.dtype.kind != "b" and value_normalized.dtype.kind == "b": raise TypeError(f"Invalid value {value} for dtype {self.dtype}") else: - device_value = device_value.astype(self.dtype) + value_normalized = value_normalized.astype(self.dtype) out: ColumnBase | None # If None, no need to perform mimic inplace. 
+ device_value = ( value_normalized.device_value if isinstance(value_normalized, cudf.Scalar) else value_normalized ) if isinstance(key, slice): out = self._scatter_by_slice(key, device_value) else: diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 8c32da43c75..742f1d43ee1 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -20,6 +20,7 @@ import cupy as cp import numpy as np import pandas as pd +import pyarrow as pa from typing_extensions import Self import pylibcudf as plc @@ -49,6 +50,7 @@ from cudf.core.missing import NA from cudf.core.multiindex import MultiIndex from cudf.core.resample import _Resampler +from cudf.core.scalar import pa_scalar_to_plc_scalar from cudf.core.udf.utils import ( _compile_or_get, _get_input_args_from_frame, @@ -3258,7 +3260,7 @@ def duplicated( True, length=len(self), dtype=bool )._scatter_by_column( distinct, - cudf.Scalar(False), + pa_scalar_to_plc_scalar(pa.scalar(False)), bounds_check=False, ) return cudf.Series._from_column(result, index=self.index, name=name) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index a6cad4cc5dc..514760d79f8 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -21,7 +21,7 @@ from cudf.api.types import is_integer, is_list_like, is_object_dtype, is_scalar from cudf.core import column from cudf.core._base_index import _return_get_indexer_result -from cudf.core._internals import copying, sorting +from cudf.core._internals import sorting from cudf.core.algorithms import factorize from cudf.core.buffer import acquire_spill_lock from cudf.core.column_accessor import ColumnAccessor @@ -1964,8 +1964,9 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): ) scatter_map = libcudf.column.Column.from_pylibcudf(left_plc) indices = libcudf.column.Column.from_pylibcudf(right_plc) - result = copying.scatter([indices], scatter_map, [result])[0] - result_series = cudf.Series._from_column(result) + result_series = cudf.Series._from_column( + result._scatter_by_column(scatter_map, indices) + ) if method in {"ffill", "bfill", "pad", "backfill"}: result_series = _get_indexer_basic( From 0ce2d2d8ea34bc68d69876efadb085ed7d7f5a41 Mon Sep 17 00:00:00 2001 From: Taurean Dyer <46935140+taureandyernv@users.noreply.github.com> Date: Thu, 30 Jan 2025 16:15:18 -0800 Subject: [PATCH 08/17] Explicitly call out that the GPU open beta runs on a single GPU (#17872) Due to questions that arose, I wanted to clarify up front that the GPU open beta works on a single GPU. Closes #17871 Authors: - Taurean Dyer (https://github.com/taureandyernv) - Matthew Murray (https://github.com/Matt711) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/17872 --- docs/cudf/source/cudf_polars/index.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/cudf/source/cudf_polars/index.rst b/docs/cudf/source/cudf_polars/index.rst index a9b4bb2dff2..d4b2a2c7b1a 100644 --- a/docs/cudf/source/cudf_polars/index.rst +++ b/docs/cudf/source/cudf_polars/index.rst @@ -5,7 +5,7 @@ cuDF provides an in-memory, GPU-accelerated execution engine for Python users of The engine supports most of the core expressions and data types as well as a growing set of more advanced dataframe manipulations and data file formats.
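An illustrative aside on the engine this documentation change describes, not part of the diff itself: the sketch below runs a lazy query through the GPU engine, assuming a Polars build with GPU support installed and a hypothetical local file `transactions.parquet`. Passing `engine="gpu"` to `collect` is shorthand for the default configuration shown here.

```python
# Hedged example: execute a lazy Polars query on the GPU engine. The
# input file name is hypothetical. With the default raise_on_fail=False,
# a plan the GPU engine cannot execute falls back to the CPU engine
# instead of raising.
import polars as pl

q = (
    pl.scan_parquet("transactions.parquet")
    .filter(pl.col("amount") > 100)
    .group_by("account")
    .agg(pl.col("amount").sum().alias("total"))
)
result = q.collect(engine=pl.GPUEngine(device=0))
```

The transparent CPU fallback this relies on is exactly the behavior the documentation text goes on to describe.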
When using the GPU engine, Polars will convert expressions into an optimized query plan and determine whether the plan is supported on the GPU. If it is not, the execution will transparently fall back to the standard Polars engine -and run on the CPU. +and run on the CPU. This functionality is available in Open Beta, is undergoing rapid development, and is currently a single GPU implementation. Benchmark --------- @@ -42,9 +42,9 @@ Launch on Google Colab .. figure:: ../_static/colab.png :width: 200px - :target: https://colab.research.google.com/github/rapidsai-community/showcase/blob/main/accelerated_data_processing_examples/polars_gpu_engine_demo.ipynb + :target: https://nvda.ws/4eKlWZW - Try out the GPU engine for Polars in a free GPU notebook environment. Sign in with your Google account and `launch the demo on Colab `__. + Try out the GPU engine for Polars in a free GPU notebook environment. Sign in with your Google account and `launch the demo on Colab `__. .. toctree:: :maxdepth: 1 From 10c1fb4335865b6ac5ccbd7edee04ae7fd52b6a6 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Thu, 30 Jan 2025 18:19:25 -0800 Subject: [PATCH 09/17] Measure the number of Parquet row groups filtered by predicate pushdown (#17594) Closes #17164 This PR adds a method to measure the number of remaining row groups after stats and bloom filtering during predicate pushdown. Authors: - Muhammad Haseeb (https://github.com/mhaseeb123) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - MithunR (https://github.com/mythrocks) URL: https://github.com/rapidsai/cudf/pull/17594 --- cpp/include/cudf/io/types.hpp | 17 +- cpp/src/io/parquet/bloom_filter_reader.cu | 36 ++--- cpp/src/io/parquet/predicate_pushdown.cpp | 75 ++++----- cpp/src/io/parquet/reader_impl.cpp | 11 ++ cpp/src/io/parquet/reader_impl_chunking.hpp | 5 + cpp/src/io/parquet/reader_impl_helpers.cpp | 97 +++++++++++- cpp/src/io/parquet/reader_impl_helpers.hpp | 61 +++++--- cpp/src/io/parquet/reader_impl_preprocess.cu | 4 +- cpp/tests/io/parquet_reader_test.cpp | 155 ++++++++++++++----- 9 files changed, 332 insertions(+), 129 deletions(-) diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp index a34881942ce..9e171a62f78 100644 --- a/cpp/include/cudf/io/types.hpp +++ b/cpp/include/cudf/io/types.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -277,13 +277,24 @@ struct column_name_info { struct table_metadata { std::vector schema_info; //!< Detailed name information for the entire output hierarchy - std::vector num_rows_per_source; //!< Number of rows read from each data source. + std::vector num_rows_per_source; //!< Number of rows read from each data source //!< Currently only computed for Parquet readers if no - //!< AST filters being used. Empty vector otherwise. + //!< AST filters being used. 
Empty vector otherwise std::map user_data; //!< Format-dependent metadata of the first input //!< file as key-values pairs (deprecated) std::vector> per_file_user_data; //!< Per file format-dependent metadata as key-values pairs + + // The following variables are currently only computed for Parquet reader + size_type num_input_row_groups{0}; //!< Total number of input row groups across all data sources + std::optional + num_row_groups_after_stats_filter; //!< Number of remaining row groups after stats filter. + //!< std::nullopt if no filtering done. Currently only + //!< reported by Parquet readers + std::optional + num_row_groups_after_bloom_filter; //!< Number of remaining row groups after bloom filter. + //!< std::nullopt if no filtering done. Currently only + //!< reported by Parquet readers }; /** diff --git a/cpp/src/io/parquet/bloom_filter_reader.cu b/cpp/src/io/parquet/bloom_filter_reader.cu index af524e1f70a..a883981a467 100644 --- a/cpp/src/io/parquet/bloom_filter_reader.cu +++ b/cpp/src/io/parquet/bloom_filter_reader.cu @@ -599,9 +599,11 @@ std::vector aggregate_reader_metadata::get_parquet_types( return parquet_types; } -std::optional>> aggregate_reader_metadata::apply_bloom_filters( +std::pair>>, bool> +aggregate_reader_metadata::apply_bloom_filters( host_span const> sources, host_span const> input_row_group_indices, + size_type total_row_groups, host_span output_dtypes, host_span output_column_schemas, std::reference_wrapper filter, @@ -610,17 +612,6 @@ std::optional>> aggregate_reader_metadata::ap // Number of input table columns auto const num_input_columns = static_cast(output_dtypes.size()); - // Total number of row groups after StatsAST filtration - auto const total_row_groups = std::accumulate( - input_row_group_indices.begin(), - input_row_group_indices.end(), - size_t{0}, - [](size_t sum, auto const& per_file_row_groups) { return sum + per_file_row_groups.size(); }); - - // Check if we have less than 2B total row groups. 
- CUDF_EXPECTS(total_row_groups <= std::numeric_limits::max(), - "Total number of row groups exceed the size_type's limit"); - // Collect equality literals for each input table column auto const equality_literals = equality_literals_collector{filter.get(), num_input_columns}.get_equality_literals(); @@ -635,7 +626,7 @@ std::optional>> aggregate_reader_metadata::ap [](auto& eq_literals) { return not eq_literals.empty(); }); // Return early if no column with equality predicate(s) - if (equality_col_schemas.empty()) { return std::nullopt; } + if (equality_col_schemas.empty()) { return {std::nullopt, false}; } // Required alignment: // https://github.com/NVIDIA/cuCollections/blob/deab5799f3e4226cb8a49acf2199c03b14941ee4/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh#L55-L67 @@ -654,8 +645,8 @@ std::optional>> aggregate_reader_metadata::ap auto bloom_filter_data = read_bloom_filters( sources, input_row_group_indices, equality_col_schemas, total_row_groups, stream, aligned_mr); - // No bloom filter buffers, return the original row group indices - if (bloom_filter_data.empty()) { return std::nullopt; } + // No bloom filter buffers, return early + if (bloom_filter_data.empty()) { return {std::nullopt, false}; } // Get parquet types for the predicate columns auto const parquet_types = get_parquet_types(input_row_group_indices, equality_col_schemas); @@ -676,8 +667,10 @@ std::optional>> aggregate_reader_metadata::ap h_bloom_filter_spans, stream, cudf::get_current_device_resource_ref()); // Create a bloom filter query table caster - bloom_filter_caster const bloom_filter_col{ - bloom_filter_spans, parquet_types, total_row_groups, equality_col_schemas.size()}; + bloom_filter_caster const bloom_filter_col{bloom_filter_spans, + parquet_types, + static_cast(total_row_groups), + equality_col_schemas.size()}; // Converts bloom filter membership for equality predicate columns to a table // containing a column for each `col[i] == literal` predicate to be evaluated. @@ -714,10 +707,11 @@ std::optional>> aggregate_reader_metadata::ap // Filter bloom filter membership table with the BloomfilterAST expression and collect // filtered row group indices - return collect_filtered_row_group_indices(bloom_filter_membership_table, - bloom_filter_expr.get_bloom_filter_expr(), - input_row_group_indices, - stream); + return {collect_filtered_row_group_indices(bloom_filter_membership_table, + bloom_filter_expr.get_bloom_filter_expr(), + input_row_group_indices, + stream), + true}; } } // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp index 0e307bac097..1508b7eef8b 100644 --- a/cpp/src/io/parquet/predicate_pushdown.cpp +++ b/cpp/src/io/parquet/predicate_pushdown.cpp @@ -388,40 +388,17 @@ class stats_expression_converter : public ast::detail::expression_transformer { }; } // namespace -std::optional>> aggregate_reader_metadata::filter_row_groups( +std::pair>>, surviving_row_group_metrics> +aggregate_reader_metadata::filter_row_groups( host_span const> sources, - host_span const> row_group_indices, + host_span const> input_row_group_indices, + size_type total_row_groups, host_span output_dtypes, host_span output_column_schemas, std::reference_wrapper filter, rmm::cuda_stream_view stream) const { auto mr = cudf::get_current_device_resource_ref(); - // Create row group indices. 
- std::vector> all_row_group_indices; - host_span const> input_row_group_indices; - if (row_group_indices.empty()) { - std::transform(per_file_metadata.cbegin(), - per_file_metadata.cend(), - std::back_inserter(all_row_group_indices), - [](auto const& file_meta) { - std::vector rg_idx(file_meta.row_groups.size()); - std::iota(rg_idx.begin(), rg_idx.end(), 0); - return rg_idx; - }); - input_row_group_indices = host_span const>(all_row_group_indices); - } else { - input_row_group_indices = row_group_indices; - } - auto const total_row_groups = std::accumulate( - input_row_group_indices.begin(), - input_row_group_indices.end(), - size_t{0}, - [](size_t sum, auto const& per_file_row_groups) { return sum + per_file_row_groups.size(); }); - - // Check if we have less than 2B total row groups. - CUDF_EXPECTS(total_row_groups <= std::numeric_limits::max(), - "Total number of row groups exceed the size_type's limit"); // Converts Column chunk statistics to a table // where min(col[i]) = columns[i*2], max(col[i])=columns[i*2+1] @@ -451,16 +428,22 @@ std::optional>> aggregate_reader_metadata::fi // Converts AST to StatsAST with reference to min, max columns in above `stats_table`. stats_expression_converter const stats_expr{filter.get(), static_cast(output_dtypes.size())}; - auto stats_ast = stats_expr.get_stats_expr(); - auto predicate_col = cudf::detail::compute_column(stats_table, stats_ast.get(), stream, mr); - auto predicate = predicate_col->view(); - CUDF_EXPECTS(predicate.type().id() == cudf::type_id::BOOL8, - "Filter expression must return a boolean column"); // Filter stats table with StatsAST expression and collect filtered row group indices auto const filtered_row_group_indices = collect_filtered_row_group_indices( stats_table, stats_expr.get_stats_expr(), input_row_group_indices, stream); + // Number of surviving row groups after applying stats filter + auto const num_stats_filtered_row_groups = + filtered_row_group_indices.has_value() + ? std::accumulate(filtered_row_group_indices.value().cbegin(), + filtered_row_group_indices.value().cend(), + size_type{0}, + [](auto& sum, auto const& per_file_row_groups) { + return sum + per_file_row_groups.size(); + }) + : total_row_groups; + // Span of row groups to apply bloom filtering on. auto const bloom_filter_input_row_groups = filtered_row_group_indices.has_value() @@ -468,12 +451,32 @@ std::optional>> aggregate_reader_metadata::fi : input_row_group_indices; // Apply bloom filtering on the bloom filter input row groups - auto const bloom_filtered_row_groups = apply_bloom_filters( - sources, bloom_filter_input_row_groups, output_dtypes, output_column_schemas, filter, stream); + auto const [bloom_filtered_row_groups, bloom_filters_exist] = + apply_bloom_filters(sources, + bloom_filter_input_row_groups, + num_stats_filtered_row_groups, + output_dtypes, + output_column_schemas, + filter, + stream); + + // Number of surviving row groups after applying bloom filter + auto const num_bloom_filtered_row_groups = + bloom_filters_exist + ? (bloom_filtered_row_groups.has_value() + ? std::make_optional(std::accumulate(bloom_filtered_row_groups.value().cbegin(), + bloom_filtered_row_groups.value().cend(), + size_type{0}, + [](auto& sum, auto const& per_file_row_groups) { + return sum + per_file_row_groups.size(); + })) + : std::make_optional(num_stats_filtered_row_groups)) + : std::nullopt; // Return bloom filtered row group indices iff collected - return bloom_filtered_row_groups.has_value() ? 
bloom_filtered_row_groups - : filtered_row_group_indices; + return { + bloom_filtered_row_groups.has_value() ? bloom_filtered_row_groups : filtered_row_group_indices, + {std::make_optional(num_stats_filtered_row_groups), num_bloom_filtered_row_groups}}; } // convert column named expression to column index reference expression diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 9dd4e19de52..87e358e89f8 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -610,6 +610,17 @@ table_with_metadata reader::impl::read_chunk_internal(read_mode mode) auto out_columns = std::vector>{}; out_columns.reserve(_output_buffers.size()); + // Copy number of total input row groups and number of surviving row groups from predicate + // pushdown. + out_metadata.num_input_row_groups = _file_itm_data.num_input_row_groups; + // Copy the number surviving row groups from each predicate pushdown only if the filter has value. + if (_expr_conv.get_converted_expr().has_value()) { + out_metadata.num_row_groups_after_stats_filter = + _file_itm_data.surviving_row_groups.after_stats_filter; + out_metadata.num_row_groups_after_bloom_filter = + _file_itm_data.surviving_row_groups.after_bloom_filter; + } + // no work to do (this can happen on the first pass if we have no rows to read) if (!has_more_work()) { // Check if number of rows per source should be included in output metadata. diff --git a/cpp/src/io/parquet/reader_impl_chunking.hpp b/cpp/src/io/parquet/reader_impl_chunking.hpp index 4a773fbced1..294eaf9ac16 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.hpp +++ b/cpp/src/io/parquet/reader_impl_chunking.hpp @@ -47,6 +47,11 @@ struct file_intermediate_data { // partial sum of the number of rows per data source std::vector exclusive_sum_num_rows_per_source{}; + size_type num_input_row_groups{0}; // total number of input row groups across all data sources + + // struct containing the number of remaining row groups after each predicate pushdown filter + surviving_row_group_metrics surviving_row_groups; + size_t _current_input_pass{0}; // current input pass index size_t _output_chunk_count{0}; // how many output chunks we have produced diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index 7d3b6a39d5b..768ca384352 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -408,10 +408,16 @@ int64_t aggregate_reader_metadata::calc_num_rows() const size_type aggregate_reader_metadata::calc_num_row_groups() const { - return std::accumulate( - per_file_metadata.cbegin(), per_file_metadata.cend(), 0, [](auto& sum, auto& pfm) { + auto const total_row_groups = std::accumulate( + per_file_metadata.cbegin(), per_file_metadata.cend(), size_t{0}, [](size_t& sum, auto& pfm) { return sum + pfm.row_groups.size(); }); + + // Check if we have less than 2B total row groups. + CUDF_EXPECTS(total_row_groups <= std::numeric_limits::max(), + "Total number of row groups exceed the size_type's limit"); + + return static_cast(total_row_groups); } // Copies info from the column and offset indexes into the passed in row_group_info. 
@@ -1029,7 +1035,12 @@ std::vector aggregate_reader_metadata::get_pandas_index_names() con return names; } -std::tuple, std::vector> +std::tuple, + std::vector, + size_type, + surviving_row_group_metrics> aggregate_reader_metadata::select_row_groups( host_span const> sources, host_span const> row_group_indices, @@ -1040,17 +1051,63 @@ aggregate_reader_metadata::select_row_groups( std::optional> filter, rmm::cuda_stream_view stream) const { + // Compute total number of input row groups + size_type total_row_groups = [&]() { + if (not row_group_indices.empty()) { + size_t const total_row_groups = + std::accumulate(row_group_indices.begin(), + row_group_indices.end(), + size_t{0}, + [](size_t& sum, auto const& pfm) { return sum + pfm.size(); }); + + // Check if we have less than 2B total row groups. + CUDF_EXPECTS(total_row_groups <= std::numeric_limits::max(), + "Total number of row groups exceed the size_type's limit"); + return static_cast(total_row_groups); + } else { + return num_row_groups; + } + }(); + + // Pair to store the number of row groups after stats and bloom filtering respectively. Initialize + // to total_row_groups. + surviving_row_group_metrics num_row_groups_after_filters{}; + std::optional>> filtered_row_group_indices; // if filter is not empty, then gather row groups to read after predicate pushdown if (filter.has_value()) { - filtered_row_group_indices = filter_row_groups( - sources, row_group_indices, output_dtypes, output_column_schemas, filter.value(), stream); + // Span of input row group indices for predicate pushdown + host_span const> input_row_group_indices; + std::vector> all_row_group_indices; + if (row_group_indices.empty()) { + std::transform(per_file_metadata.cbegin(), + per_file_metadata.cend(), + std::back_inserter(all_row_group_indices), + [](auto const& file_meta) { + std::vector rg_idx(file_meta.row_groups.size()); + std::iota(rg_idx.begin(), rg_idx.end(), 0); + return rg_idx; + }); + input_row_group_indices = host_span const>(all_row_group_indices); + } else { + input_row_group_indices = row_group_indices; + } + // Predicate pushdown: Filter row groups using stats and bloom filters + std::tie(filtered_row_group_indices, num_row_groups_after_filters) = + filter_row_groups(sources, + input_row_group_indices, + total_row_groups, + output_dtypes, + output_column_schemas, + filter.value(), + stream); if (filtered_row_group_indices.has_value()) { row_group_indices = host_span const>(filtered_row_group_indices.value()); } } - std::vector selection; + + // Compute the number of rows to read and skip auto [rows_to_skip, rows_to_read] = [&]() { if (not row_group_indices.empty()) { return std::pair{}; } auto const from_opts = cudf::io::detail::skip_rows_num_rows_from_options( @@ -1061,7 +1118,9 @@ aggregate_reader_metadata::select_row_groups( static_cast(from_opts.second)}; }(); - // Get number of rows in each data source + // Vector to hold the `row_group_info` of selected row groups + std::vector selection; + // Number of rows in each data source std::vector num_rows_per_source(per_file_metadata.size(), 0); if (!row_group_indices.empty()) { @@ -1083,6 +1142,10 @@ aggregate_reader_metadata::select_row_groups( } } } else { + // Reset and recompute input row group count to adjust for num_rows and skip_rows. Here, the + // output from predicate pushdown was empty. i.e., no row groups filtered. 
+ total_row_groups = 0; + size_type count = 0; for (size_t src_idx = 0; src_idx < per_file_metadata.size(); ++src_idx) { auto const& fmd = per_file_metadata[src_idx]; @@ -1093,6 +1156,9 @@ aggregate_reader_metadata::select_row_groups( auto const chunk_start_row = count; count += rg.num_rows; if (count > rows_to_skip || count == 0) { + // Keep this row group, increase count + total_row_groups++; + // start row of this row group adjusted with rows_to_skip num_rows_per_source[src_idx] += count; num_rows_per_source[src_idx] -= @@ -1113,9 +1179,24 @@ aggregate_reader_metadata::select_row_groups( } } } + + // If filter had a value and no row groups were filtered, set the number of row groups after + // filters to the number of adjusted input row groups + auto const after_stats_filter = num_row_groups_after_filters.after_stats_filter.has_value() + ? std::make_optional(total_row_groups) + : std::nullopt; + auto const after_bloom_filter = num_row_groups_after_filters.after_bloom_filter.has_value() + ? std::make_optional(total_row_groups) + : std::nullopt; + num_row_groups_after_filters = {after_stats_filter, after_bloom_filter}; } - return {rows_to_skip, rows_to_read, std::move(selection), std::move(num_rows_per_source)}; + return {rows_to_skip, + rows_to_read, + std::move(selection), + std::move(num_rows_per_source), + total_row_groups, + std::move(num_row_groups_after_filters)}; } std::tuple, diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp index ba5e53e3104..c4372b2c1ff 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.hpp +++ b/cpp/src/io/parquet/reader_impl_helpers.hpp @@ -125,6 +125,14 @@ struct arrow_schema_data_types { data_type type{type_id::EMPTY}; }; +/** + * @brief Struct to store the number of row groups surviving each predicate pushdown filter. 
+ */ +struct surviving_row_group_metrics { + std::optional after_stats_filter; // number of surviving row groups after stats filter + std::optional after_bloom_filter; // number of surviving row groups after bloom filter +}; + class aggregate_reader_metadata { std::vector per_file_metadata; std::vector> keyval_maps; @@ -358,40 +366,47 @@ class aggregate_reader_metadata { * @brief Filters the row groups based on predicate filter * * @param sources Lists of input datasources - * @param row_group_indices Lists of row groups to read, one per source + * @param input_row_group_indices Lists of input row groups, one per source + * @param total_row_groups Total number of row groups in `input_row_group_indices` * @param output_dtypes Datatypes of output columns * @param output_column_schemas schema indices of output columns * @param filter AST expression to filter row groups based on Column chunk statistics * @param stream CUDA stream used for device memory operations and kernel launches - * @return Filtered row group indices, if any is filtered + * @return A pair of a list of filtered row group indices if any are filtered, and a struct + * containing the number of row groups surviving each predicate pushdown filter */ - [[nodiscard]] std::optional>> filter_row_groups( - host_span const> sources, - host_span const> row_group_indices, - host_span output_dtypes, - host_span output_column_schemas, - std::reference_wrapper filter, - rmm::cuda_stream_view stream) const; + [[nodiscard]] std::pair>>, + surviving_row_group_metrics> + filter_row_groups(host_span const> sources, + host_span const> input_row_group_indices, + size_type total_row_groups, + host_span output_dtypes, + host_span output_column_schemas, + std::reference_wrapper filter, + rmm::cuda_stream_view stream) const; /** * @brief Filters the row groups using bloom filters * * @param sources Dataset sources - * @param row_group_indices Lists of input row groups to read, one per source + * @param input_row_group_indices Lists of input row groups, one per source + * @param total_row_groups Total number of row groups in `input_row_group_indices` * @param output_dtypes Datatypes of output columns * @param output_column_schemas schema indices of output columns * @param filter AST expression to filter row groups based on bloom filter membership * @param stream CUDA stream used for device memory operations and kernel launches * - * @return Filtered row group indices, if any is filtered + * @return A pair of filtered row group indices if any is filtered, and a boolean indicating if + * bloom filtering was applied */ - [[nodiscard]] std::optional>> apply_bloom_filters( - host_span const> sources, - host_span const> input_row_group_indices, - host_span output_dtypes, - host_span output_column_schemas, - std::reference_wrapper filter, - rmm::cuda_stream_view stream) const; + [[nodiscard]] std::pair>>, bool> + apply_bloom_filters(host_span const> sources, + host_span const> input_row_group_indices, + size_type total_row_groups, + host_span output_dtypes, + host_span output_column_schemas, + std::reference_wrapper filter, + rmm::cuda_stream_view stream) const; /** * @brief Filters and reduces down to a selection of row groups @@ -408,9 +423,15 @@ class aggregate_reader_metadata { * @param filter Optional AST expression to filter row groups based on Column chunk statistics * @param stream CUDA stream used for device memory operations and kernel launches * @return A tuple of corrected row_start, row_count, list of row group indexes and its - * starting row, 
and list of number of rows per source + * starting row, list of number of rows per source, number of input row groups, and a + * struct containing the number of row groups surviving each predicate pushdown filter */ - [[nodiscard]] std::tuple, std::vector> + [[nodiscard]] std::tuple, + std::vector, + size_type, + surviving_row_group_metrics> select_row_groups(host_span const> sources, host_span const> row_group_indices, int64_t row_start, diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 3874346e471..b6134947b0c 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -1285,7 +1285,9 @@ void reader::impl::preprocess_file(read_mode mode) std::tie(_file_itm_data.global_skip_rows, _file_itm_data.global_num_rows, _file_itm_data.row_groups, - _file_itm_data.num_rows_per_source) = + _file_itm_data.num_rows_per_source, + _file_itm_data.num_input_row_groups, + _file_itm_data.surviving_row_groups) = _metadata->select_row_groups(_sources, _options.row_group_indices, _options.skip_rows, diff --git a/cpp/tests/io/parquet_reader_test.cpp b/cpp/tests/io/parquet_reader_test.cpp index 177e6163d4f..b96c423917a 100644 --- a/cpp/tests/io/parquet_reader_test.cpp +++ b/cpp/tests/io/parquet_reader_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -1328,6 +1328,26 @@ TEST_F(ParquetReaderTest, ReorderedReadMultipleFiles) CUDF_TEST_EXPECT_TABLES_EQUAL(sliced[1], swapped2); } +TEST_F(ParquetReaderTest, NoFilter) +{ + srand(31337); + auto expected = create_random_fixed_table(9, 9, false); + + auto filepath = temp_env->get_temp_filepath("FilterSimple.parquet"); + cudf::io::parquet_writer_options args = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, *expected); + cudf::io::write_parquet(args); + + cudf::io::parquet_reader_options read_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_parquet(read_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *expected); + EXPECT_EQ(result.metadata.num_input_row_groups, 1); + EXPECT_FALSE(result.metadata.num_row_groups_after_stats_filter.has_value()); + EXPECT_FALSE(result.metadata.num_row_groups_after_bloom_filter.has_value()); +} + TEST_F(ParquetReaderTest, FilterSimple) { srand(31337); @@ -2681,52 +2701,107 @@ TYPED_TEST(ParquetReaderPredicatePushdownTest, FilterTyped) auto const [src, filepath] = create_parquet_typed_with_stats("FilterTyped.parquet"); auto const written_table = src.view(); + auto const col_name_0 = cudf::ast::column_name_reference("col0"); + auto const col_ref_0 = cudf::ast::column_reference(0); - // Filtering AST - auto literal_value = []() { - if constexpr (cudf::is_timestamp()) { - // table[0] < 10000 timestamp days/seconds/milliseconds/microseconds/nanoseconds - return cudf::timestamp_scalar(T(typename T::duration(10000))); // i (0-20,000) - } else if constexpr (cudf::is_duration()) { - // table[0] < 10000 day/seconds/milliseconds/microseconds/nanoseconds - return cudf::duration_scalar(T(10000)); // i (0-20,000) - } else if constexpr (std::is_same_v) { - // table[0] < "000010000" - return cudf::string_scalar("000010000"); // i (0-20,000) + auto const test_predicate_pushdown = [&](cudf::ast::operation const& filter_expression, + 
cudf::ast::operation const& ref_filter, + cudf::size_type expected_total_row_groups, + cudf::size_type expected_stats_filtered_row_groups) { + // Expected result + auto const predicate = cudf::compute_column(written_table, ref_filter); + EXPECT_EQ(predicate->view().type().id(), cudf::type_id::BOOL8) + << "Predicate filter should return a boolean"; + auto const expected = cudf::apply_boolean_mask(written_table, *predicate); + + // Reading with Predicate Pushdown + cudf::io::parquet_reader_options read_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}) + .filter(filter_expression); + auto const result = cudf::io::read_parquet(read_opts); + auto const result_table = result.tbl->view(); + + // Tests + EXPECT_EQ(static_cast(written_table.column(0).type().id()), + static_cast(result_table.column(0).type().id())) + << "col0 type mismatch"; + + // To make sure AST filters out some elements if row groups must be filtered + if (expected_stats_filtered_row_groups < expected_total_row_groups) { + EXPECT_LT(expected->num_rows(), written_table.num_rows()); } else { - // table[0] < 0 or 100u - return cudf::numeric_scalar((100 - 100 * std::is_signed_v)); // i/100 (-100-100/ 0-200) + EXPECT_LE(expected->num_rows(), written_table.num_rows()); } - }(); - auto literal = cudf::ast::literal(literal_value); - auto col_name_0 = cudf::ast::column_name_reference("col0"); - auto filter_expression = cudf::ast::operation(cudf::ast::ast_operator::LESS, col_name_0, literal); - auto col_ref_0 = cudf::ast::column_reference(0); - auto ref_filter = cudf::ast::operation(cudf::ast::ast_operator::LESS, col_ref_0, literal); - - // Expected result - auto predicate = cudf::compute_column(written_table, ref_filter); - EXPECT_EQ(predicate->view().type().id(), cudf::type_id::BOOL8) - << "Predicate filter should return a boolean"; - auto expected = cudf::apply_boolean_mask(written_table, *predicate); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result_table); + EXPECT_EQ(result.metadata.num_input_row_groups, expected_total_row_groups); + EXPECT_TRUE(result.metadata.num_row_groups_after_stats_filter.has_value()); + EXPECT_EQ(result.metadata.num_row_groups_after_stats_filter.value(), + expected_stats_filtered_row_groups); + EXPECT_FALSE(result.metadata.num_row_groups_after_bloom_filter.has_value()); + }; - // Reading with Predicate Pushdown - cudf::io::parquet_reader_options read_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}) - .filter(filter_expression); - auto result = cudf::io::read_parquet(read_opts); - auto result_table = result.tbl->view(); + // The `literal_value` and stats should filter out 2 out of 4 row groups. 
+ { + auto constexpr expected_total_row_groups = 4; + auto constexpr expected_stats_filtered_row_groups = 2; + + // Filtering AST + auto literal_value = []() { + if constexpr (cudf::is_timestamp()) { + // table[0] < 10000 timestamp days/seconds/milliseconds/microseconds/nanoseconds + return cudf::timestamp_scalar(T(typename T::duration(10000))); // i (0-20,000) + } else if constexpr (cudf::is_duration()) { + // table[0] < 10000 day/seconds/milliseconds/microseconds/nanoseconds + return cudf::duration_scalar(T(10000)); // i (0-20,000) + } else if constexpr (std::is_same_v) { + // table[0] < "000010000" + return cudf::string_scalar("000010000"); // i (0-20,000) + } else { + // table[0] < 0 or 100u + return cudf::numeric_scalar( + (100 - 100 * std::is_signed_v)); // i/100 (-100-100/ 0-200) + } + }(); + + auto const literal = cudf::ast::literal(literal_value); + auto const filter_expression = + cudf::ast::operation(cudf::ast::ast_operator::LESS, col_name_0, literal); + auto const ref_filter = cudf::ast::operation(cudf::ast::ast_operator::LESS, col_ref_0, literal); + test_predicate_pushdown( + filter_expression, ref_filter, expected_total_row_groups, expected_stats_filtered_row_groups); + } - // tests - EXPECT_EQ(int(written_table.column(0).type().id()), int(result_table.column(0).type().id())) - << "col0 type mismatch"; - // To make sure AST filters out some elements - EXPECT_LT(expected->num_rows(), written_table.num_rows()); - EXPECT_EQ(result_table.num_rows(), expected->num_rows()); - EXPECT_EQ(result_table.num_columns(), expected->num_columns()); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result_table); + // The `literal_value` and stats should not filter any of the 4 row groups. + { + auto constexpr expected_total_row_groups = 4; + auto constexpr expected_stats_filtered_row_groups = 4; + + // Filtering AST + auto literal_value = []() { + if constexpr (cudf::is_timestamp()) { + return cudf::timestamp_scalar(T(typename T::duration(20000))); + } else if constexpr (cudf::is_duration()) { + return cudf::duration_scalar(T(20000)); + } else if constexpr (std::is_same_v) { + return cudf::string_scalar("000020000"); + } else { + return cudf::numeric_scalar(std::numeric_limits::max()); + } + }(); + + auto const literal = cudf::ast::literal(literal_value); + auto const filter_expression = + cudf::ast::operation(cudf::ast::ast_operator::LESS_EQUAL, col_name_0, literal); + auto const ref_filter = + cudf::ast::operation(cudf::ast::ast_operator::LESS_EQUAL, col_ref_0, literal); + test_predicate_pushdown( + filter_expression, ref_filter, expected_total_row_groups, expected_stats_filtered_row_groups); + } } +////////////////////// +// wide tables tests + // The test below requires several minutes to complete with memcheck, thus it is disabled by // default. TEST_F(ParquetReaderTest, DISABLED_ListsWideTable) From ceb56832d15a8d35c4056173569c17f89e254628 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 31 Jan 2025 03:51:56 -0800 Subject: [PATCH 10/17] Revert CUDA 12.8 shared workflow branch changes (#17879) This PR points the shared workflow branches back to the default 25.02 branches. 
xref: https://github.com/rapidsai/build-planning/issues/139 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17879 --- .github/workflows/build.yaml | 28 +++++----- .github/workflows/pandas-tests.yaml | 2 +- .github/workflows/pr.yaml | 54 +++++++++---------- .../workflows/pr_issue_status_automation.yml | 8 +-- .github/workflows/test.yaml | 30 +++++------ .../trigger-breaking-change-alert.yaml | 2 +- 6 files changed, 62 insertions(+), 62 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index f0be4cb509d..f6b3fb83cdd 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.02 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.02 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-25.02 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -57,7 +57,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.02 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -69,7 +69,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-libcudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02 with: # build for every combination of arch and CUDA version, but only for the latest Python matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) @@ -81,7 +81,7 @@ jobs: wheel-publish-libcudf: needs: wheel-build-libcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.02 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -92,7 +92,7 @@ jobs: wheel-build-pylibcudf: needs: [wheel-build-libcudf] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -102,7 +102,7 @@ jobs: wheel-publish-pylibcudf: needs: wheel-build-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.02 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} 
@@ -113,7 +113,7 @@ jobs: wheel-build-cudf: needs: wheel-build-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -123,7 +123,7 @@ jobs: wheel-publish-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.02 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -134,7 +134,7 @@ jobs: wheel-build-dask-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -146,7 +146,7 @@ jobs: wheel-publish-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.02 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -157,7 +157,7 @@ jobs: wheel-build-cudf-polars: needs: wheel-build-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -169,7 +169,7 @@ jobs: wheel-publish-cudf-polars: needs: wheel-build-cudf-polars secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.02 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pandas-tests.yaml b/.github/workflows/pandas-tests.yaml index bd5e87a0a0b..a29babb218f 100644 --- a/.github/workflows/pandas-tests.yaml +++ b/.github/workflows/pandas-tests.yaml @@ -17,7 +17,7 @@ jobs: pandas-tests: # run the Pandas unit tests secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.02 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 6df5019b2aa..8f2aedd4f50 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -42,7 +42,7 @@ jobs: - pandas-tests-diff - telemetry-setup secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-25.02 if: always() with: needs: ${{ toJSON(needs) }} @@ -70,7 +70,7 @@ jobs: changed-files: secrets: inherit needs: telemetry-setup - uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@branch-25.02 with: files_yaml: | test_cpp: @@ -123,48 +123,48 @@ jobs: checks: secrets: inherit needs: telemetry-setup - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-25.02 with: enable_check_generated_files: false ignored_pr_jobs: "telemetry-summarize" conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.02 with: build_type: pull-request node_type: "cpu16" cpp-linters: secrets: inherit needs: checks - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.02 with: build_type: pull-request run_script: "ci/cpp_linters.sh" conda-cpp-checks: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-25.02 with: build_type: pull-request enable_check_symbols: true conda-cpp-tests: needs: [conda-cpp-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-25.02 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_cpp with: build_type: pull-request conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.02 with: build_type: pull-request conda-python-cudf-tests: needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.02 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request @@ -173,7 +173,7 @@ jobs: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.02 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request @@ -181,7 +181,7 @@ jobs: conda-java-tests: 
needs: [conda-cpp-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.02 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_java with: build_type: pull-request @@ -192,7 +192,7 @@ jobs: static-configure: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.02 with: build_type: pull-request # Use the wheel container so we can skip conda solves and since our @@ -202,7 +202,7 @@ jobs: conda-notebook-tests: needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.02 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_notebooks with: build_type: pull-request @@ -213,7 +213,7 @@ jobs: docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.02 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -223,7 +223,7 @@ jobs: wheel-build-libcudf: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02 with: # build for every combination of arch and CUDA version, but only for the latest Python matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) @@ -233,21 +233,21 @@ jobs: wheel-build-pylibcudf: needs: [checks, wheel-build-libcudf] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02 with: build_type: pull-request script: "ci/build_wheel_pylibcudf.sh" wheel-build-cudf: needs: wheel-build-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02 with: build_type: pull-request script: "ci/build_wheel_cudf.sh" wheel-tests-cudf: needs: [wheel-build-cudf, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.02 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request @@ -255,7 +255,7 @@ jobs: wheel-build-cudf-polars: needs: wheel-build-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -264,7 +264,7 @@ jobs: wheel-tests-cudf-polars: needs: [wheel-build-cudf-polars, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.02 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". @@ -274,7 +274,7 @@ jobs: cudf-polars-polars-tests: needs: wheel-build-cudf-polars secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.02 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -283,7 +283,7 @@ jobs: wheel-build-dask-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -292,7 +292,7 @@ jobs: wheel-tests-dask-cudf: needs: [wheel-build-dask-cudf, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.02 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". @@ -302,7 +302,7 @@ jobs: devcontainer: secrets: inherit needs: telemetry-setup - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-25.02 with: node_type: "cpu32" arch: '["amd64"]' @@ -314,7 +314,7 @@ jobs: unit-tests-cudf-pandas: needs: [wheel-build-cudf, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.02 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python || fromJSON(needs.changed-files.outputs.changed_file_groups).test_cudf_pandas with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". @@ -325,7 +325,7 @@ jobs: # run the Pandas unit tests using PR branch needs: [wheel-build-cudf, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.02 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python || fromJSON(needs.changed-files.outputs.changed_file_groups).test_cudf_pandas with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
@@ -337,7 +337,7 @@ jobs: pandas-tests-diff: # diff the results of running the Pandas unit tests and publish a job summary needs: pandas-tests - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.02 with: node_type: "cpu4" build_type: pull-request diff --git a/.github/workflows/pr_issue_status_automation.yml b/.github/workflows/pr_issue_status_automation.yml index 8ad6448bc27..20db9623e1b 100644 --- a/.github/workflows/pr_issue_status_automation.yml +++ b/.github/workflows/pr_issue_status_automation.yml @@ -23,7 +23,7 @@ on: jobs: get-project-id: - uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@branch-25.02 if: github.event.pull_request.state == 'open' secrets: inherit permissions: @@ -34,7 +34,7 @@ jobs: update-status: # This job sets the PR and its linked issues to "In Progress" status - uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-25.02 if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} needs: get-project-id with: @@ -50,7 +50,7 @@ jobs: update-sprint: # This job sets the PR and its linked issues to the current "Weekly Sprint" - uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@branch-25.02 if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} needs: get-project-id with: @@ -79,7 +79,7 @@ jobs: update-release: # This job sets the PR and its linked issues to the release they are targeting - uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-25.02 if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} needs: [get-project-id, process-branch-name] with: diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index d909fd4a657..dc82c17022a 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-25.02 with: build_type: nightly branch: ${{ inputs.branch }} @@ -25,7 +25,7 @@ jobs: enable_check_symbols: true conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-25.02 with: build_type: nightly branch: ${{ inputs.branch }} @@ -33,7 +33,7 @@ jobs: sha: ${{ inputs.sha }} conda-cpp-memcheck-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.02 with: build_type: nightly branch: ${{ inputs.branch }} @@ -45,7 +45,7 @@ jobs: run_script: "ci/test_cpp_memcheck.sh" static-configure: secrets: inherit - uses: 
rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.02 with: build_type: nightly branch: ${{ inputs.branch }} @@ -57,7 +57,7 @@ jobs: run_script: "ci/configure_cpp_static.sh" cpp-linters: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.02 with: build_type: nightly branch: ${{ inputs.branch }} @@ -67,7 +67,7 @@ jobs: file_to_upload: iwyu_results.txt conda-python-cudf-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.02 with: build_type: nightly branch: ${{ inputs.branch }} @@ -77,7 +77,7 @@ jobs: conda-python-other-tests: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.02 with: build_type: nightly branch: ${{ inputs.branch }} @@ -86,7 +86,7 @@ jobs: script: "ci/test_python_other.sh" conda-java-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.02 with: build_type: nightly branch: ${{ inputs.branch }} @@ -98,7 +98,7 @@ jobs: run_script: "ci/test_java.sh" conda-notebook-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.02 with: build_type: nightly branch: ${{ inputs.branch }} @@ -110,7 +110,7 @@ jobs: run_script: "ci/test_notebooks.sh" wheel-tests-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.02 with: build_type: nightly branch: ${{ inputs.branch }} @@ -119,7 +119,7 @@ jobs: script: ci/test_wheel_cudf.sh wheel-tests-dask-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.02 with: build_type: nightly branch: ${{ inputs.branch }} @@ -128,7 +128,7 @@ jobs: script: ci/test_wheel_dask_cudf.sh unit-tests-cudf-pandas: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.02 with: build_type: nightly branch: ${{ inputs.branch }} @@ -137,7 +137,7 @@ jobs: script: ci/cudf_pandas_scripts/run_tests.sh third-party-integration-tests-cudf-pandas: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.02 with: build_type: nightly branch: ${{ inputs.branch }} @@ -149,7 +149,7 @@ jobs: ci/cudf_pandas_scripts/third-party-integration/test.sh python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml wheel-tests-cudf-polars: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.02 with: build_type: nightly branch: ${{ inputs.branch 
}} @@ -158,7 +158,7 @@ jobs: script: "ci/test_wheel_cudf_polars.sh" cudf-polars-polars-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.02 with: build_type: nightly branch: ${{ inputs.branch }} diff --git a/.github/workflows/trigger-breaking-change-alert.yaml b/.github/workflows/trigger-breaking-change-alert.yaml index 07f0f83cc92..01dd2436beb 100644 --- a/.github/workflows/trigger-breaking-change-alert.yaml +++ b/.github/workflows/trigger-breaking-change-alert.yaml @@ -12,7 +12,7 @@ jobs: trigger-notifier: if: contains(github.event.pull_request.labels.*.name, 'breaking') secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@branch-25.02 with: sender_login: ${{ github.event.sender.login }} sender_avatar: ${{ github.event.sender.avatar_url }} From 51b0f9ecc975d819c5cfef70d445e8c3e69b7369 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Fri, 31 Jan 2025 10:49:18 -0500 Subject: [PATCH 11/17] Add support for unary negation operator (#17560) This PR adds support for unary negation operator in libcudf and plumbs the changes through cudf python and cudf polars. Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Basit Ayantunde (https://github.com/lamarrr) - Matthew Roeschke (https://github.com/mroeschke) - David Wendt (https://github.com/davidwendt) - Bradley Dice (https://github.com/bdice) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/17560 --- cpp/include/cudf/unary.hpp | 3 +- cpp/include/cudf/utilities/traits.hpp | 6 +- cpp/src/unary/math_ops.cu | 56 ++++++++++++++-- cpp/tests/unary/math_ops_test.cpp | 67 ++++++++++++++++++- cpp/tests/unary/unary_ops_test.cpp | 16 ++++- python/cudf/cudf/core/column/numerical.py | 29 +------- .../cudf/cudf/core/column/numerical_base.py | 35 ++++++++++ python/cudf/cudf/core/frame.py | 2 +- python/cudf/cudf/tests/test_unaops.py | 7 ++ .../cudf_polars/dsl/expressions/unary.py | 3 +- .../expressions/test_numeric_unaryops.py | 11 ++- python/pylibcudf/pylibcudf/libcudf/unary.pxd | 3 +- python/pylibcudf/pylibcudf/unary.pyi | 1 + 13 files changed, 198 insertions(+), 41 deletions(-) diff --git a/cpp/include/cudf/unary.hpp b/cpp/include/cudf/unary.hpp index 046e9745a71..6dcd84d3f4d 100644 --- a/cpp/include/cudf/unary.hpp +++ b/cpp/include/cudf/unary.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2024, NVIDIA CORPORATION. + * Copyright (c) 2018-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -139,6 +139,7 @@ enum class unary_operator : int32_t { RINT, ///< Rounds the floating-point argument arg to an integer value BIT_INVERT, ///< Bitwise Not (~) NOT, ///< Logical Not (!) + NEGATE, ///< Unary negation (-), only for signed numeric and duration types. }; /** diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp index 0f4bde204fa..168beb7fa9e 100644 --- a/cpp/include/cudf/utilities/traits.hpp +++ b/cpp/include/cudf/utilities/traits.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -94,6 +94,8 @@ constexpr inline bool has_common_type_v = detail::has_common_type_impl using is_timestamp_t = cuda::std::disjunction, + std::is_same, + std::is_same, std::is_same, std::is_same, std::is_same, @@ -102,6 +104,8 @@ using is_timestamp_t = cuda::std::disjunction /// Checks if a type is a duration type. template using is_duration_t = cuda::std::disjunction, + std::is_same, + std::is_same, std::is_same, std::is_same, std::is_same, diff --git a/cpp/src/unary/math_ops.cu b/cpp/src/unary/math_ops.cu index 1d506c59cd9..4e96f900bf3 100644 --- a/cpp/src/unary/math_ops.cu +++ b/cpp/src/unary/math_ops.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -234,6 +234,16 @@ struct DeviceNot { } }; +// negation + +struct DeviceNegate { + template + T __device__ operator()(T data) + { + return -data; + } +}; + // fixed_point ops /* @@ -278,6 +288,12 @@ struct fixed_point_abs { __device__ T operator()(T data) { return numeric::detail::abs(data); } }; +template +struct fixed_point_negate { + T n; + __device__ T operator()(T data) { return -data; } +}; + template typename FixedPointFunctor> std::unique_ptr unary_op_with(column_view const& input, rmm::cuda_stream_view stream, @@ -414,6 +430,34 @@ struct MathOpDispatcher { } }; +template +struct NegateOpDispatcher { + template + static constexpr bool is_supported() + { + return std::is_signed_v || cudf::is_duration(); + } + + template ()>* = nullptr> + std::unique_ptr operator()(cudf::column_view const& input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) + { + return transform_fn(input.begin(), + input.end(), + cudf::detail::copy_bitmask(input, stream, mr), + input.null_count(), + stream, + mr); + } + + template + std::enable_if_t(), std::unique_ptr> operator()(Args&&...) 
+ { + CUDF_FAIL("Unsupported data type for negate operation"); + } +}; + template struct BitwiseOpDispatcher { template >* = nullptr> @@ -550,9 +594,10 @@ struct FixedPointOpDispatcher { { // clang-format off switch (op) { - case cudf::unary_operator::CEIL: return unary_op_with(input, stream, mr); - case cudf::unary_operator::FLOOR: return unary_op_with(input, stream, mr); - case cudf::unary_operator::ABS: return unary_op_with(input, stream, mr); + case cudf::unary_operator::CEIL: return unary_op_with(input, stream, mr); + case cudf::unary_operator::FLOOR: return unary_op_with(input, stream, mr); + case cudf::unary_operator::ABS: return unary_op_with(input, stream, mr); + case cudf::unary_operator::NEGATE: return unary_op_with(input, stream, mr); default: CUDF_FAIL("Unsupported fixed_point unary operation"); } // clang-format on @@ -639,6 +684,9 @@ std::unique_ptr unary_operation(cudf::column_view const& input, case cudf::unary_operator::NOT: return cudf::type_dispatcher( input.type(), detail::LogicalOpDispatcher{}, input, stream, mr); + case cudf::unary_operator::NEGATE: + return cudf::type_dispatcher( + input.type(), detail::NegateOpDispatcher{}, input, stream, mr); default: CUDF_FAIL("Undefined unary operation"); } } diff --git a/cpp/tests/unary/math_ops_test.cpp b/cpp/tests/unary/math_ops_test.cpp index 663a919f3f4..bcb84d4574c 100644 --- a/cpp/tests/unary/math_ops_test.cpp +++ b/cpp/tests/unary/math_ops_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,6 +25,69 @@ #include +using TypesToNegate = cudf::test::Types; + +template +struct UnaryNegateTests : public cudf::test::BaseFixture {}; + +TYPED_TEST_SUITE(UnaryNegateTests, TypesToNegate); + +TYPED_TEST(UnaryNegateTests, SimpleNEGATE) +{ + using T = TypeParam; + cudf::test::fixed_width_column_wrapper input{{0, 1, 2, 3}}; + auto const v = cudf::test::make_type_param_vector({0, -1, -2, -3}); + cudf::test::fixed_width_column_wrapper expected(v.begin(), v.end()); + auto output = cudf::unary_operation(input, cudf::unary_operator::NEGATE); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, output->view()); +} + +using TypesNotToNegate = cudf::test::Types; + +template +struct UnaryNegateErrorTests : public cudf::test::BaseFixture {}; + +TYPED_TEST_SUITE(UnaryNegateErrorTests, TypesNotToNegate); + +TYPED_TEST(UnaryNegateErrorTests, UnsupportedTypesFail) +{ + using T = TypeParam; + cudf::test::fixed_width_column_wrapper input({1, 2, 3, 4}); + EXPECT_THROW(cudf::unary_operation(input, cudf::unary_operator::NEGATE), cudf::logic_error); +} + +struct UnaryNegateComplexTypesErrorTests : public cudf::test::BaseFixture {}; + +TEST_F(UnaryNegateComplexTypesErrorTests, NegateStringColumnFail) +{ + cudf::test::strings_column_wrapper input({"foo", "bar"}); + EXPECT_THROW(cudf::unary_operation(input, cudf::unary_operator::NEGATE), cudf::logic_error); +} + +TEST_F(UnaryNegateComplexTypesErrorTests, NegateListsColumnFail) +{ + cudf::test::lists_column_wrapper input{{1, 2}, {3, 4}}; + EXPECT_THROW(cudf::unary_operation(input, cudf::unary_operator::NEGATE), cudf::logic_error); +} + template struct UnaryLogicalOpsTest : public cudf::test::BaseFixture {}; @@ -274,7 +337,7 @@ TYPED_TEST(UnaryMathFloatOpsTest, SimpleTANH) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, output->view()); } -TYPED_TEST(UnaryMathFloatOpsTest, SimpleiASINH) 
+TYPED_TEST(UnaryMathFloatOpsTest, SimpleASINH) { cudf::test::fixed_width_column_wrapper input{{0.0}}; cudf::test::fixed_width_column_wrapper expected{{0.0}}; diff --git a/cpp/tests/unary/unary_ops_test.cpp b/cpp/tests/unary/unary_ops_test.cpp index 3c616461c74..d7989c6b053 100644 --- a/cpp/tests/unary/unary_ops_test.cpp +++ b/cpp/tests/unary/unary_ops_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -266,6 +266,20 @@ struct FixedPointUnaryTests : public cudf::test::BaseFixture {}; TYPED_TEST_SUITE(FixedPointUnaryTests, cudf::test::FixedPointTypes); +TYPED_TEST(FixedPointUnaryTests, FixedPointUnaryNegate) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepType = cudf::device_storage_type_t; + using fp_wrapper = cudf::test::fixed_point_column_wrapper; + + auto const input = fp_wrapper{{0, -1234, -3456, -6789, 1234, 3456, 6789}, scale_type{-3}}; + auto const expected = fp_wrapper{{0, 1234, 3456, 6789, -1234, -3456, -6789}, scale_type{-3}}; + auto const result = cudf::unary_operation(input, cudf::unary_operator::NEGATE); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); +} + TYPED_TEST(FixedPointUnaryTests, FixedPointUnaryAbs) { using namespace numeric; diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index f901b5d735e..1877f167a08 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -8,7 +8,6 @@ import numpy as np import pandas as pd import pyarrow as pa -from numba.np import numpy_support from typing_extensions import Self import pylibcudf as plc @@ -24,7 +23,6 @@ from cudf.core.mixins import BinaryOperand from cudf.core.scalar import pa_scalar_to_plc_scalar from cudf.errors import MixedTypeError -from cudf.utils import cudautils from cudf.utils.dtypes import ( find_common_type, min_column_type, @@ -33,7 +31,7 @@ ) if TYPE_CHECKING: - from collections.abc import Callable, Sequence + from collections.abc import Sequence from cudf._typing import ( ColumnBinaryOperand, @@ -45,13 +43,6 @@ from cudf.core.buffer import Buffer from cudf.core.column import DecimalBaseColumn -_unaryop_map = { - "ASIN": "ARCSIN", - "ACOS": "ARCCOS", - "ATAN": "ARCTAN", - "INVERT": "BIT_INVERT", -} - class NumericalColumn(NumericalBaseColumn): """ @@ -192,24 +183,6 @@ def transform(self, compiled_op, np_dtype: np.dtype) -> ColumnBase: ) return type(self).from_pylibcudf(plc_column) - def unary_operator(self, unaryop: str | Callable) -> ColumnBase: - if callable(unaryop): - nb_type = numpy_support.from_dtype(self.dtype) - nb_signature = (nb_type,) - compiled_op = cudautils.compile_udf(unaryop, nb_signature) - np_dtype = np.dtype(compiled_op[1]) - return self.transform(compiled_op, np_dtype) - - unaryop = unaryop.upper() - unaryop = _unaryop_map.get(unaryop, unaryop) - unaryop = plc.unary.UnaryOperator[unaryop] - with acquire_spill_lock(): - return type(self).from_pylibcudf( - plc.unary.unary_operation( - self.to_pylibcudf(mode="read"), unaryop - ) - ) - def __invert__(self): if self.dtype.kind in "ui": return self.unary_operator("invert") diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index d8c316a4c8f..2674b92bb21 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ 
b/python/cudf/cudf/core/column/numerical_base.py @@ -6,6 +6,7 @@ from typing import TYPE_CHECKING, Literal, cast import numpy as np +from numba.np import numpy_support import pylibcudf as plc @@ -14,12 +15,23 @@ from cudf.core.column.column import ColumnBase from cudf.core.missing import NA from cudf.core.mixins import Scannable +from cudf.utils import cudautils if TYPE_CHECKING: + from collections.abc import Callable + from cudf._typing import ScalarLike from cudf.core.column.decimal import DecimalDtype +_unaryop_map = { + "ASIN": "ARCSIN", + "ACOS": "ARCCOS", + "ATAN": "ARCTAN", + "INVERT": "BIT_INVERT", +} + + class NumericalBaseColumn(ColumnBase, Scannable): """ A column composed of numerical (bool, integer, float, decimal) data. @@ -268,3 +280,26 @@ def _scan(self, op: str) -> ColumnBase: return self.scan(op.replace("cum", ""), True)._with_type_metadata( self.dtype ) + + def unary_operator(self, unaryop: str | Callable) -> ColumnBase: + if callable(unaryop): + nb_type = numpy_support.from_dtype(self.dtype) + nb_signature = (nb_type,) + compiled_op = cudautils.compile_udf(unaryop, nb_signature) + np_dtype = np.dtype(compiled_op[1]) + return self.transform(compiled_op, np_dtype) + + unaryop = unaryop.upper() + unaryop = _unaryop_map.get(unaryop, unaryop) + unaryop = plc.unary.UnaryOperator[unaryop] + with acquire_spill_lock(): + return type(self).from_pylibcudf( + plc.unary.unary_operation( + self.to_pylibcudf(mode="read"), unaryop + ) + ) + + def transform(self, compiled_op, np_dtype: np.dtype) -> ColumnBase: + raise NotImplementedError( + "transform is not implemented for NumericalBaseColumn" + ) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 08f8e49a98c..fcf5a3cd8e9 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1644,7 +1644,7 @@ def __neg__(self): ( col.unary_operator("not") if col.dtype.kind == "b" - else -1 * col + else col.unary_operator("negate") for col in self._columns ) ) diff --git a/python/cudf/cudf/tests/test_unaops.py b/python/cudf/cudf/tests/test_unaops.py index bbd01eaa311..7ed0d370822 100644 --- a/python/cudf/cudf/tests/test_unaops.py +++ b/python/cudf/cudf/tests/test_unaops.py @@ -3,6 +3,7 @@ import itertools import operator import re +from decimal import Decimal import numpy as np import pandas as pd @@ -134,3 +135,9 @@ def test_series_bool_neg(): sr = Series([True, False, True, None, False, None, True, True]) psr = sr.to_pandas(nullable=True) assert_eq((-sr).to_pandas(nullable=True), -psr, check_dtype=True) + + +def test_series_decimal_neg(): + sr = Series([Decimal("0.0"), Decimal("1.23"), Decimal("4.567")]) + psr = sr.to_pandas() + assert_eq((-sr).to_pandas(), -psr, check_dtype=True) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/unary.py b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py index 3336c901e7f..3286c9ff8bc 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/unary.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. 
# SPDX-License-Identifier: Apache-2.0 # TODO: remove need for this """DSL nodes for unary operations.""" @@ -119,6 +119,7 @@ class UnaryFunction(Expr): "abs": plc.unary.UnaryOperator.ABS, "bit_invert": plc.unary.UnaryOperator.BIT_INVERT, "not": plc.unary.UnaryOperator.NOT, + "negate": plc.unary.UnaryOperator.NEGATE, } _supported_misc_fns = frozenset( { diff --git a/python/cudf_polars/tests/expressions/test_numeric_unaryops.py b/python/cudf_polars/tests/expressions/test_numeric_unaryops.py index ac3aecf88e6..75bf0960e10 100644 --- a/python/cudf_polars/tests/expressions/test_numeric_unaryops.py +++ b/python/cudf_polars/tests/expressions/test_numeric_unaryops.py @@ -1,7 +1,9 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations +from datetime import timedelta + import numpy as np import pytest @@ -58,6 +60,7 @@ def ldf(with_nulls, dtype): { "a": pl.Series(values, dtype=dtype), "b": pl.Series([i - 4 for i in range(len(values))], dtype=pl.Float32), + "c": pl.Series([timedelta(hours=i) for i in range(len(values))]), } ) @@ -89,3 +92,9 @@ def test_log(ldf, natural): q = ldf.select(expr) assert_gpu_result_equal(q, check_exact=False) + + +@pytest.mark.parametrize("col", ["a", "b", "c"]) +def test_negate(ldf, col): + q = ldf.select(-pl.col(col)) + assert_gpu_result_equal(q) diff --git a/python/pylibcudf/pylibcudf/libcudf/unary.pxd b/python/pylibcudf/pylibcudf/libcudf/unary.pxd index 4666012623e..802d4b392a8 100644 --- a/python/pylibcudf/pylibcudf/libcudf/unary.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/unary.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from libc.stdint cimport int32_t from libcpp cimport bool from libcpp.memory cimport unique_ptr @@ -33,6 +33,7 @@ cdef extern from "cudf/unary.hpp" namespace "cudf" nogil: RINT BIT_INVERT NOT + NEGATE cdef extern unique_ptr[column] unary_operation( column_view input, diff --git a/python/pylibcudf/pylibcudf/unary.pyi b/python/pylibcudf/pylibcudf/unary.pyi index 7aa23b618f4..4d06a51c03a 100644 --- a/python/pylibcudf/pylibcudf/unary.pyi +++ b/python/pylibcudf/pylibcudf/unary.pyi @@ -28,6 +28,7 @@ class UnaryOperator(IntEnum): RINT = ... BIT_INVERT = ... NOT = ... + NEGATE = ... def unary_operation(input: Column, op: UnaryOperator) -> Column: ... def is_null(input: Column) -> Column: ... From 03742acc9ae72a4e5a582fb3f136c9116862f062 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Fri, 31 Jan 2025 13:51:23 -0500 Subject: [PATCH 12/17] Remove extra local var declaration from cudf.pandas 3rd-party integration shell script (#17886) This PR fixes this nightly CI failure: https://github.com/rapidsai/cudf/actions/runs/13067745114/job/36478210419#step:10:16 in the cudf.pandas 3rd-party integration tests. 
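The underlying issue is a bash scoping pitfall rather than the yq query itself: declaring a variable `local` after it has already been assigned re-declares it as a new, unset local and hides the value computed just above it, so the subsequent `echo` sees either an empty string or, under `set -u`, an unbound-variable error. A minimal sketch of the pitfall (hypothetical names, not the actual CI script):

    broken() {
        libs="cudf,dask_cudf"  # assigns to `libs` before any `local` declaration
        local libs             # re-declares `libs` as a new, unset local variable
        echo "$libs"           # prints an empty string; with `set -u` in effect,
                               # this reference fails with "unbound variable"
    }
    broken

Dropping the late `local` declaration (or moving it above the assignment) keeps the computed value visible to the final `echo`.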
Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - James Lamb (https://github.com/jameslamb) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17886 --- ci/cudf_pandas_scripts/third-party-integration/test.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/ci/cudf_pandas_scripts/third-party-integration/test.sh b/ci/cudf_pandas_scripts/third-party-integration/test.sh index 43ed3594917..c6f5844427d 100755 --- a/ci/cudf_pandas_scripts/third-party-integration/test.sh +++ b/ci/cudf_pandas_scripts/third-party-integration/test.sh @@ -11,7 +11,6 @@ extract_lib_from_dependencies_yaml() { # Parse all keys in dependencies.yaml under the "files" section, # extract all the keys that start with "test_", and extract the rest extracted_libs="$(yq -o json "$file" | jq -rc '.files | with_entries(select(.key | contains("test_"))) | keys | map(sub("^test_"; ""))')" - local extracted_libs echo "$extracted_libs" } From bd76fa85d84e9130e67fc5141f79a092c32460f3 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 31 Jan 2025 14:28:02 -0800 Subject: [PATCH 13/17] Remove cudf.Scalar from .dt timedelta properties (#17863) Towards https://github.com/rapidsai/cudf/issues/17843 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/17863 --- python/cudf/cudf/core/column/timedelta.py | 126 +++++++--------------- 1 file changed, 37 insertions(+), 89 deletions(-) diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 3f5aea19307..c7243d01325 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -41,6 +41,16 @@ } +@functools.cache +def get_np_td_unit_conversion( + reso: str, dtype: None | np.dtype +) -> np.timedelta64: + td = np.timedelta64(_unit_to_nanoseconds_conversion[reso], "ns") + if dtype is not None: + return td.astype(dtype) + return td + + class TimeDeltaColumn(ColumnBase): """ Parameters @@ -483,74 +493,33 @@ def components(self) -> dict[str, ColumnBase]: 3 0 0 35 35 656 0 0 4 37 13 12 14 234 0 0 """ - date_meta = { + "hours": ["D", "h"], + "minutes": ["h", "m"], "seconds": ["m", "s"], "milliseconds": ["s", "ms"], "microseconds": ["ms", "us"], "nanoseconds": ["us", "ns"], } - data = { - "days": self - // cudf.Scalar( - np.timedelta64( - _unit_to_nanoseconds_conversion["D"], "ns" - ).astype(self.dtype) - ), - "hours": ( - self - % cudf.Scalar( - np.timedelta64( - _unit_to_nanoseconds_conversion["D"], "ns" - ).astype(self.dtype) - ) - ) - // cudf.Scalar( - np.timedelta64( - _unit_to_nanoseconds_conversion["h"], "ns" - ).astype(self.dtype) - ), - "minutes": ( - self - % cudf.Scalar( - np.timedelta64( - _unit_to_nanoseconds_conversion["h"], "ns" - ).astype(self.dtype) - ) - ) - // cudf.Scalar( - np.timedelta64( - _unit_to_nanoseconds_conversion["m"], "ns" - ).astype(self.dtype) - ), - } - keys_list = iter(date_meta.keys()) - for name in keys_list: - value = date_meta[name] - data[name] = ( - self - % cudf.Scalar( - np.timedelta64( - _unit_to_nanoseconds_conversion[value[0]], "ns" - ).astype(self.dtype) + data = {"days": self // get_np_td_unit_conversion("D", self.dtype)} + reached_self_unit = False + for result_key, (mod_unit, div_unit) in date_meta.items(): + if not reached_self_unit: + res_col = ( + self % get_np_td_unit_conversion(mod_unit, self.dtype) + ) // get_np_td_unit_conversion(div_unit, self.dtype) + 
reached_self_unit = self.time_unit == div_unit + else: + res_col = column.as_column( + 0, length=len(self), dtype=np.dtype(np.int64) ) - ) // cudf.Scalar( - np.timedelta64( - _unit_to_nanoseconds_conversion[value[1]], "ns" - ).astype(self.dtype) - ) - if self.time_unit == value[1]: - break - - for name in keys_list: - res_col = column.as_column(0, length=len(self), dtype="int64") - if self.nullable: - res_col = res_col.set_mask(self.mask) - data[name] = res_col + if self.nullable: + res_col = res_col.set_mask(self.mask) + data[result_key] = res_col return data @property - def days(self) -> "cudf.core.column.NumericalColumn": + def days(self) -> cudf.core.column.NumericalColumn: """ Number of days for each element. @@ -558,14 +527,10 @@ def days(self) -> "cudf.core.column.NumericalColumn": ------- NumericalColumn """ - return self // cudf.Scalar( - np.timedelta64(_unit_to_nanoseconds_conversion["D"], "ns").astype( - self.dtype - ) - ) + return self // get_np_td_unit_conversion("D", self.dtype) @property - def seconds(self) -> "cudf.core.column.NumericalColumn": + def seconds(self) -> cudf.core.column.NumericalColumn: """ Number of seconds (>= 0 and less than 1 day). @@ -579,18 +544,11 @@ def seconds(self) -> "cudf.core.column.NumericalColumn": # division operation to extract the number of seconds. return ( - self - % cudf.Scalar( - np.timedelta64( - _unit_to_nanoseconds_conversion["D"], "ns" - ).astype(self.dtype) - ) - ) // cudf.Scalar( - np.timedelta64(_unit_to_nanoseconds_conversion["s"], "ns") - ) + self % get_np_td_unit_conversion("D", self.dtype) + ) // get_np_td_unit_conversion("s", None) @property - def microseconds(self) -> "cudf.core.column.NumericalColumn": + def microseconds(self) -> cudf.core.column.NumericalColumn: """ Number of microseconds (>= 0 and less than 1 second). @@ -604,16 +562,11 @@ def microseconds(self) -> "cudf.core.column.NumericalColumn": # division operation to extract the number of microseconds. return ( - self - % np.timedelta64( - _unit_to_nanoseconds_conversion["s"], "ns" - ).astype(self.dtype) - ) // cudf.Scalar( - np.timedelta64(_unit_to_nanoseconds_conversion["us"], "ns") - ) + self % get_np_td_unit_conversion("s", self.dtype) + ) // get_np_td_unit_conversion("us", None) @property - def nanoseconds(self) -> "cudf.core.column.NumericalColumn": + def nanoseconds(self) -> cudf.core.column.NumericalColumn: """ Return the number of nanoseconds (n), where 0 <= n < 1 microsecond. @@ -633,13 +586,8 @@ def nanoseconds(self) -> "cudf.core.column.NumericalColumn": res_col = res_col.set_mask(self.mask) return cast("cudf.core.column.NumericalColumn", res_col) return ( - self - % cudf.Scalar( - np.timedelta64(_unit_to_nanoseconds_conversion["us"], "ns") - ) - ) // cudf.Scalar( - np.timedelta64(_unit_to_nanoseconds_conversion["ns"], "ns") - ) + self % get_np_td_unit_conversion("us", None) + ) // get_np_td_unit_conversion("ns", None) def determine_out_dtype(lhs_dtype: Dtype, rhs_dtype: Dtype) -> Dtype: From aeb90de3e2abef5f9e86ad5bc409a1bb648f0d57 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 31 Jan 2025 16:29:35 -0600 Subject: [PATCH 14/17] Migrate to NVKS for amd64 CI runners (#17877) This migrates amd64 CI jobs (PRs and nightlies) to use L4 GPUs from the NVKS cluster. 
xref: https://github.com/rapidsai/build-infra/issues/184 Authors: - Bradley Dice (https://github.com/bdice) Approvers: - James Lamb (https://github.com/jameslamb) URL: https://github.com/rapidsai/cudf/pull/17877 --- .github/workflows/build.yaml | 30 +++++----- .github/workflows/pandas-tests.yaml | 2 +- .github/workflows/pr.yaml | 60 +++++++++---------- .../workflows/pr_issue_status_automation.yml | 8 +-- .github/workflows/test.yaml | 38 ++++++------ .../trigger-breaking-change-alert.yaml | 2 +- 6 files changed, 70 insertions(+), 70 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index fd7b1fe253f..9bcd3a65a9d 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@nvks-runners with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@nvks-runners with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-25.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@nvks-runners with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -57,19 +57,19 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.04 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@nvks-runners with: arch: "amd64" branch: ${{ inputs.branch }} build_type: ${{ inputs.build_type || 'branch' }} container_image: "rapidsai/ci-conda:latest" date: ${{ inputs.date }} - node_type: "gpu-v100-latest-1" + node_type: "gpu-l4-latest-1" run_script: "ci/build_docs.sh" sha: ${{ inputs.sha }} wheel-build-libcudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@nvks-runners with: # build for every combination of arch and CUDA version, but only for the latest Python matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) @@ -81,7 +81,7 @@ jobs: wheel-publish-libcudf: needs: wheel-build-libcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@nvks-runners with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -92,7 +92,7 @@ jobs: wheel-build-pylibcudf: needs: [wheel-build-libcudf] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@nvks-runners with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -102,7 +102,7 @@ jobs: wheel-publish-pylibcudf: needs: wheel-build-pylibcudf secrets: inherit - uses: 
rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@nvks-runners with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -113,7 +113,7 @@ jobs: wheel-build-cudf: needs: wheel-build-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@nvks-runners with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -123,7 +123,7 @@ jobs: wheel-publish-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@nvks-runners with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -134,7 +134,7 @@ jobs: wheel-build-dask-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@nvks-runners with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -146,7 +146,7 @@ jobs: wheel-publish-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@nvks-runners with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -157,7 +157,7 @@ jobs: wheel-build-cudf-polars: needs: wheel-build-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@nvks-runners with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -169,7 +169,7 @@ jobs: wheel-publish-cudf-polars: needs: wheel-build-cudf-polars secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@nvks-runners with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pandas-tests.yaml b/.github/workflows/pandas-tests.yaml index fea393c549e..8730ae43ddf 100644 --- a/.github/workflows/pandas-tests.yaml +++ b/.github/workflows/pandas-tests.yaml @@ -17,7 +17,7 @@ jobs: pandas-tests: # run the Pandas unit tests secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@nvks-runners with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 65356ec8b73..34fcbc14420 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -42,7 +42,7 @@ jobs: - pandas-tests-diff - telemetry-setup secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-25.04 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@nvks-runners if: always() with: needs: ${{ toJSON(needs) }} @@ -70,7 +70,7 @@ jobs: changed-files: secrets: inherit needs: telemetry-setup - uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@branch-25.04 + uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@nvks-runners with: files_yaml: | test_cpp: @@ -123,48 +123,48 @@ jobs: checks: secrets: inherit needs: telemetry-setup - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-25.04 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@nvks-runners with: enable_check_generated_files: false ignored_pr_jobs: "telemetry-summarize" conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@nvks-runners with: build_type: pull-request node_type: "cpu16" cpp-linters: secrets: inherit needs: checks - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.04 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@nvks-runners with: build_type: pull-request run_script: "ci/cpp_linters.sh" conda-cpp-checks: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-25.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@nvks-runners with: build_type: pull-request enable_check_symbols: true conda-cpp-tests: needs: [conda-cpp-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-25.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@nvks-runners if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_cpp with: build_type: pull-request conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@nvks-runners with: build_type: pull-request conda-python-cudf-tests: needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@nvks-runners if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request @@ -173,7 +173,7 @@ jobs: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@nvks-runners if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request @@ -181,18 +181,18 @@ jobs: 
conda-java-tests: needs: [conda-cpp-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.04 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@nvks-runners if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_java with: build_type: pull-request - node_type: "gpu-v100-latest-1" + node_type: "gpu-l4-latest-1" arch: "amd64" container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_java.sh" static-configure: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.04 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@nvks-runners with: build_type: pull-request # Use the wheel container so we can skip conda solves and since our @@ -202,28 +202,28 @@ jobs: conda-notebook-tests: needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.04 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@nvks-runners if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_notebooks with: build_type: pull-request - node_type: "gpu-v100-latest-1" + node_type: "gpu-l4-latest-1" arch: "amd64" container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_notebooks.sh" docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.04 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@nvks-runners with: build_type: pull-request - node_type: "gpu-v100-latest-1" + node_type: "gpu-l4-latest-1" arch: "amd64" container_image: "rapidsai/ci-conda:latest" run_script: "ci/build_docs.sh" wheel-build-libcudf: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@nvks-runners with: # build for every combination of arch and CUDA version, but only for the latest Python matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) @@ -233,21 +233,21 @@ jobs: wheel-build-pylibcudf: needs: [checks, wheel-build-libcudf] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@nvks-runners with: build_type: pull-request script: "ci/build_wheel_pylibcudf.sh" wheel-build-cudf: needs: wheel-build-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@nvks-runners with: build_type: pull-request script: "ci/build_wheel_cudf.sh" wheel-tests-cudf: needs: [wheel-build-cudf, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@nvks-runners if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request @@ -255,7 +255,7 @@ jobs: wheel-build-cudf-polars: needs: wheel-build-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@nvks-runners with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -264,7 +264,7 @@ jobs: wheel-tests-cudf-polars: needs: [wheel-build-cudf-polars, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@nvks-runners if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". @@ -274,7 +274,7 @@ jobs: cudf-polars-polars-tests: needs: wheel-build-cudf-polars secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@nvks-runners with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -283,7 +283,7 @@ jobs: wheel-build-dask-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@nvks-runners with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -292,7 +292,7 @@ jobs: wheel-tests-dask-cudf: needs: [wheel-build-dask-cudf, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@nvks-runners if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". @@ -302,7 +302,7 @@ jobs: devcontainer: secrets: inherit needs: telemetry-setup - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-25.04 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@nvks-runners with: node_type: "cpu32" arch: '["amd64"]' @@ -314,7 +314,7 @@ jobs: unit-tests-cudf-pandas: needs: [wheel-build-cudf, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@nvks-runners if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python || fromJSON(needs.changed-files.outputs.changed_file_groups).test_cudf_pandas with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". @@ -325,7 +325,7 @@ jobs: # run the Pandas unit tests using PR branch needs: [wheel-build-cudf, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@nvks-runners if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python || fromJSON(needs.changed-files.outputs.changed_file_groups).test_cudf_pandas with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
@@ -337,7 +337,7 @@ jobs: pandas-tests-diff: # diff the results of running the Pandas unit tests and publish a job summary needs: pandas-tests - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.04 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@nvks-runners with: node_type: "cpu4" build_type: pull-request diff --git a/.github/workflows/pr_issue_status_automation.yml b/.github/workflows/pr_issue_status_automation.yml index 44e48f691a2..b1bd2d4e768 100644 --- a/.github/workflows/pr_issue_status_automation.yml +++ b/.github/workflows/pr_issue_status_automation.yml @@ -23,7 +23,7 @@ on: jobs: get-project-id: - uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@branch-25.04 + uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@nvks-runners if: github.event.pull_request.state == 'open' secrets: inherit permissions: @@ -34,7 +34,7 @@ jobs: update-status: # This job sets the PR and its linked issues to "In Progress" status - uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-25.04 + uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@nvks-runners if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} needs: get-project-id with: @@ -50,7 +50,7 @@ jobs: update-sprint: # This job sets the PR and its linked issues to the current "Weekly Sprint" - uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@branch-25.04 + uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@nvks-runners if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} needs: get-project-id with: @@ -79,7 +79,7 @@ jobs: update-release: # This job sets the PR and its linked issues to the release they are targeting - uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-25.04 + uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@nvks-runners if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} needs: [get-project-id, process-branch-name] with: diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 233d15dd145..b6b2caddeb8 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-25.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@nvks-runners with: build_type: nightly branch: ${{ inputs.branch }} @@ -25,7 +25,7 @@ jobs: enable_check_symbols: true conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-25.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@nvks-runners with: build_type: nightly branch: ${{ inputs.branch }} @@ -33,19 +33,19 @@ jobs: sha: ${{ inputs.sha }} conda-cpp-memcheck-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.04 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@nvks-runners with: build_type: nightly branch: ${{ inputs.branch }} date: ${{ inputs.date }} sha: ${{ inputs.sha }} - node_type: "gpu-v100-latest-1" + node_type: "gpu-l4-latest-1" arch: "amd64" 
container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_cpp_memcheck.sh" static-configure: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.04 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@nvks-runners with: build_type: nightly branch: ${{ inputs.branch }} @@ -57,7 +57,7 @@ jobs: run_script: "ci/configure_cpp_static.sh" cpp-linters: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.04 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@nvks-runners with: build_type: nightly branch: ${{ inputs.branch }} @@ -67,7 +67,7 @@ jobs: file_to_upload: iwyu_results.txt conda-python-cudf-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@nvks-runners with: build_type: nightly branch: ${{ inputs.branch }} @@ -77,7 +77,7 @@ jobs: conda-python-other-tests: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@nvks-runners with: build_type: nightly branch: ${{ inputs.branch }} @@ -86,31 +86,31 @@ jobs: script: "ci/test_python_other.sh" conda-java-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.04 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@nvks-runners with: build_type: nightly branch: ${{ inputs.branch }} date: ${{ inputs.date }} sha: ${{ inputs.sha }} - node_type: "gpu-v100-latest-1" + node_type: "gpu-l4-latest-1" arch: "amd64" container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_java.sh" conda-notebook-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.04 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@nvks-runners with: build_type: nightly branch: ${{ inputs.branch }} date: ${{ inputs.date }} sha: ${{ inputs.sha }} - node_type: "gpu-v100-latest-1" + node_type: "gpu-l4-latest-1" arch: "amd64" container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_notebooks.sh" wheel-tests-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@nvks-runners with: build_type: nightly branch: ${{ inputs.branch }} @@ -119,7 +119,7 @@ jobs: script: ci/test_wheel_cudf.sh wheel-tests-dask-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@nvks-runners with: build_type: nightly branch: ${{ inputs.branch }} @@ -128,7 +128,7 @@ jobs: script: ci/test_wheel_dask_cudf.sh unit-tests-cudf-pandas: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@nvks-runners with: build_type: nightly branch: ${{ inputs.branch }} @@ -137,19 +137,19 @@ jobs: script: ci/cudf_pandas_scripts/run_tests.sh third-party-integration-tests-cudf-pandas: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.04 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@nvks-runners with: build_type: nightly branch: ${{ 
inputs.branch }}
       date: ${{ inputs.date }}
       sha: ${{ inputs.sha }}
-      node_type: "gpu-v100-latest-1"
+      node_type: "gpu-l4-latest-1"
       container_image: "rapidsai/ci-conda:latest"
       run_script: |
         ci/cudf_pandas_scripts/third-party-integration/test.sh python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml
   wheel-tests-cudf-polars:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.04
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@nvks-runners
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -158,7 +158,7 @@ jobs:
       script: "ci/test_wheel_cudf_polars.sh"
   cudf-polars-polars-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.04
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@nvks-runners
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
diff --git a/.github/workflows/trigger-breaking-change-alert.yaml b/.github/workflows/trigger-breaking-change-alert.yaml
index 9764c62c15c..7b5b4810fb6 100644
--- a/.github/workflows/trigger-breaking-change-alert.yaml
+++ b/.github/workflows/trigger-breaking-change-alert.yaml
@@ -12,7 +12,7 @@ jobs:
   trigger-notifier:
     if: contains(github.event.pull_request.labels.*.name, 'breaking')
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@branch-25.04
+    uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@nvks-runners
     with:
       sender_login: ${{ github.event.sender.login }}
       sender_avatar: ${{ github.event.sender.avatar_url }}

From a6acba0eaf67edf338a54d911803a7b6075d25c7 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic
Date: Fri, 31 Jan 2025 17:26:51 -0800
Subject: [PATCH 15/17] Remove `orc::gpu` namespace (#17891)

Moving forward with the removal of the (redundant) `gpu` namespace in
cuIO. The entire ORC implementation now lives in `cudf::io::orc::detail`,
leaving only the implementation of the public API in `cudf::io::orc`.
Also removed a few unused headers and moved others into the files that
actually use them.

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - Muhammad Haseeb (https://github.com/mhaseeb123)

URL: https://github.com/rapidsai/cudf/pull/17891
---
 cpp/include/cudf/io/orc_metadata.hpp    |  10 +-
 cpp/src/io/functions.cpp                |  14 +-
 cpp/src/io/orc/dict_enc.cu              |   4 +-
 cpp/src/io/orc/orc.cpp                  |   9 +-
 cpp/src/io/orc/orc.hpp                  |  10 +-
 cpp/src/io/orc/orc_field_reader.hpp     |  12 +-
 cpp/src/io/orc/orc_field_writer.hpp     |  10 +-
 cpp/src/io/orc/orc_gpu.hpp              |  15 +-
 cpp/src/io/orc/reader_impl_chunking.cu  |  60 +++----
 cpp/src/io/orc/reader_impl_chunking.hpp |  10 +-
 cpp/src/io/orc/reader_impl_decode.cu    | 135 +++++++-------
 cpp/src/io/orc/reader_impl_helpers.cpp  |  16 +-
 cpp/src/io/orc/reader_impl_helpers.hpp  |  38 ++--
 cpp/src/io/orc/stats_enc.cu             |   4 +-
 cpp/src/io/orc/stripe_data.cu           |  11 +-
 cpp/src/io/orc/stripe_enc.cu            |  11 +-
 cpp/src/io/orc/stripe_init.cu           |  10 +-
 cpp/src/io/orc/writer_impl.cu           | 228 ++++++++++++------------
 cpp/src/io/orc/writer_impl.hpp          |   4 +-
 19 files changed, 282 insertions(+), 329 deletions(-)

diff --git a/cpp/include/cudf/io/orc_metadata.hpp b/cpp/include/cudf/io/orc_metadata.hpp
index 3c6194bb721..a9045d460b3 100644
--- a/cpp/include/cudf/io/orc_metadata.hpp
+++ b/cpp/include/cudf/io/orc_metadata.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -171,12 +171,12 @@ using statistics_type = std::variant; //! Orc I/O interfaces -namespace orc { +namespace orc::detail { // forward declare the type that ProtobufReader uses. The `cudf::io::column_statistics` objects, // returned from `read_parsed_orc_statistics`, are constructed from -// `cudf::io::orc::column_statistics` objects that `ProtobufReader` initializes. +// `cudf::io::orc::detail::column_statistics` objects that `ProtobufReader` initializes. struct column_statistics; -} // namespace orc +} // namespace orc::detail /** * @brief Contains per-column ORC statistics. @@ -194,7 +194,7 @@ struct column_statistics { * * @param detail_statistics The statistics to initialize the object with */ - column_statistics(orc::column_statistics&& detail_statistics); + column_statistics(orc::detail::column_statistics&& detail_statistics); }; /** diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 69fd4068712..0d5bb8ac191 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -292,7 +292,7 @@ raw_orc_statistics read_raw_orc_statistics(source_info const& src_info, CUDF_FAIL("Unsupported source type"); } - orc::metadata const metadata(source.get(), stream); + orc::detail::metadata const metadata(source.get(), stream); // Initialize statistics to return raw_orc_statistics result; @@ -318,7 +318,7 @@ raw_orc_statistics read_raw_orc_statistics(source_info const& src_info, return result; } -column_statistics::column_statistics(orc::column_statistics&& cs) +column_statistics::column_statistics(orc::detail::column_statistics&& cs) { number_of_values = cs.number_of_values; has_null = cs.has_null; @@ -350,9 +350,9 @@ parsed_orc_statistics read_parsed_orc_statistics(source_info const& src_info, result.column_names = raw_stats.column_names; auto parse_column_statistics = [](auto const& raw_col_stats) { - orc::column_statistics stats_internal; - orc::ProtobufReader(reinterpret_cast(raw_col_stats.c_str()), - raw_col_stats.size()) + orc::detail::column_statistics stats_internal; + orc::detail::ProtobufReader(reinterpret_cast(raw_col_stats.c_str()), + raw_col_stats.size()) .read(stats_internal); return column_statistics(std::move(stats_internal)); }; @@ -373,7 +373,7 @@ parsed_orc_statistics read_parsed_orc_statistics(source_info const& src_info, return result; } namespace { -orc_column_schema make_orc_column_schema(host_span orc_schema, +orc_column_schema make_orc_column_schema(host_span orc_schema, uint32_t column_id, std::string column_name) { @@ -400,7 +400,7 @@ orc_metadata read_orc_metadata(source_info const& src_info, rmm::cuda_stream_vie auto sources = make_datasources(src_info); CUDF_EXPECTS(sources.size() == 1, "Only a single source is currently supported."); - auto const footer = orc::metadata(sources.front().get(), stream).ff; + auto const footer = orc::detail::metadata(sources.front().get(), stream).ff; return {{make_orc_column_schema(footer.types, 0, "")}, footer.numberOfRows, diff --git a/cpp/src/io/orc/dict_enc.cu b/cpp/src/io/orc/dict_enc.cu index aeaa87e2202..be3c90a3e24 100644 --- a/cpp/src/io/orc/dict_enc.cu +++ b/cpp/src/io/orc/dict_enc.cu @@ -24,7 +24,7 @@ #include -namespace cudf::io::orc::gpu { +namespace cudf::io::orc::detail { /** * @brief Counts the number of characters in each rowgroup of each string column. 
@@ -266,4 +266,4 @@ void get_dictionary_indices(device_2dspan dictionaries, <<>>(dictionaries, columns); } -} // namespace cudf::io::orc::gpu +} // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/orc.cpp b/cpp/src/io/orc/orc.cpp index 7046b3b3f91..7ae32f3e8f8 100644 --- a/cpp/src/io/orc/orc.cpp +++ b/cpp/src/io/orc/orc.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,7 @@ #include "orc.hpp" +#include "io/comp/io_uncomp.hpp" #include "orc_field_reader.hpp" #include "orc_field_writer.hpp" @@ -25,7 +26,7 @@ #include -namespace cudf::io::orc { +namespace cudf::io::orc::detail { namespace { [[nodiscard]] constexpr uint32_t varint_size(uint64_t val) @@ -496,7 +497,7 @@ metadata::metadata(datasource* const src, rmm::cuda_stream_view stream) : source buffer = source->host_read(len - ps_length - 1 - ps.footerLength - ps.metadataLength, ps.metadataLength); auto const md_data = decompressor->decompress_blocks({buffer->data(), buffer->size()}, stream); - orc::ProtobufReader(md_data.data(), md_data.size()).read(md); + ProtobufReader(md_data.data(), md_data.size()).read(md); init_parent_descriptors(); init_column_names(); @@ -546,4 +547,4 @@ void metadata::init_parent_descriptors() } } -} // namespace cudf::io::orc +} // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/orc.hpp b/cpp/src/io/orc/orc.hpp index 8dccf65ef10..49652c9a0d2 100644 --- a/cpp/src/io/orc/orc.hpp +++ b/cpp/src/io/orc/orc.hpp @@ -16,8 +16,6 @@ #pragma once -#include "io/comp/io_uncomp.hpp" - #include #include #include @@ -34,9 +32,7 @@ #include #include -namespace cudf { -namespace io { -namespace orc { +namespace cudf::io::orc::detail { static constexpr uint32_t block_header_size = 3; // Seconds from January 1st, 1970 to January 1st, 2015 @@ -710,6 +706,4 @@ struct rowgroup_rows { [[nodiscard]] CUDF_HOST_DEVICE constexpr auto size() const noexcept { return end - begin; } }; -} // namespace orc -} // namespace io -} // namespace cudf +} // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/orc_field_reader.hpp b/cpp/src/io/orc/orc_field_reader.hpp index 3689e4d958b..797db239538 100644 --- a/cpp/src/io/orc/orc_field_reader.hpp +++ b/cpp/src/io/orc/orc_field_reader.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,7 +17,7 @@ #include "orc.hpp" -#include +#include /** * @file orc_field_reader.hpp @@ -25,9 +25,7 @@ * ProtobufReader::read(...) functions */ -namespace cudf { -namespace io { -namespace orc { +namespace cudf::io::orc::detail { /** * @brief Functor to run an operator for a specified field. 
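The orc_field_reader.hpp hunk above shows the change this patch repeats across the ORC sources: the pre-C++17 style of opening and closing one block per namespace level is collapsed into a single C++17 nested namespace definition, renaming `gpu` to `detail` along the way. A minimal, self-contained sketch of the equivalence (namespace and identifier names below are illustrative, not taken from cudf):

    // Pre-C++17: one block per level; every brace closes separately.
    namespace a { namespace b { namespace c {
    inline constexpr int depth = 3;
    } } }  // namespace a::b::c

    // C++17 nested namespace definition: the same namespace, one block.
    namespace a::b::c {
    inline constexpr int depth_again = depth;  // `depth` is found unqualified
    }  // namespace a::b::c

    static_assert(a::b::c::depth_again == 3, "both blocks contribute to a::b::c");

Because both spellings denote the same namespace, the collapse is purely mechanical, which is why most hunks in this patch touch only the lines that open and close namespaces.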
@@ -90,6 +88,4 @@ inline void ProtobufReader::function_builder(T& s, size_t maxlen, std::tuple @@ -33,10 +32,7 @@ #include -namespace cudf { -namespace io { -namespace orc { -namespace gpu { +namespace cudf::io::orc::detail { using cudf::detail::device_2dspan; using cudf::detail::host_2dspan; @@ -65,9 +61,7 @@ auto constexpr VALUE_SENTINEL = size_type{-1}; struct CompressedStreamInfo { CompressedStreamInfo() = default; explicit constexpr CompressedStreamInfo(uint8_t const* compressed_data_, size_t compressed_size_) - : compressed_data(compressed_data_), - uncompressed_data(nullptr), - compressed_data_size(compressed_size_) + : compressed_data(compressed_data_), compressed_data_size(compressed_size_) { } uint8_t const* compressed_data{}; // [in] base ptr to compressed stream data @@ -500,7 +494,4 @@ void reduce_pushdown_masks(device_span orc_columns device_2dspan set_counts, rmm::cuda_stream_view stream); -} // namespace gpu -} // namespace orc -} // namespace io -} // namespace cudf +} // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 726c79bd004..f19fb3c81d8 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,7 +14,6 @@ * limitations under the License. */ -#include "io/comp/gpuinflate.hpp" #include "io/orc/reader_impl.hpp" #include "io/orc/reader_impl_chunking.hpp" #include "io/orc/reader_impl_helpers.hpp" @@ -40,16 +39,16 @@ namespace cudf::io::orc::detail { std::size_t gather_stream_info_and_column_desc( std::size_t stripe_id, std::size_t level, - orc::StripeInformation const* stripeinfo, - orc::StripeFooter const* stripefooter, + StripeInformation const* stripeinfo, + StripeFooter const* stripefooter, host_span orc2gdf, - host_span types, + host_span types, bool use_index, bool apply_struct_map, int64_t* num_dictionary_entries, std::size_t* local_stream_order, std::vector* stream_info, - cudf::detail::hostdevice_2dvector* chunks) + cudf::detail::hostdevice_2dvector* chunks) { CUDF_EXPECTS((stream_info == nullptr) ^ (chunks == nullptr), "Either stream_info or chunks must be provided, but not both."); @@ -57,17 +56,17 @@ std::size_t gather_stream_info_and_column_desc( std::size_t src_offset = 0; std::size_t dst_offset = 0; - auto const get_stream_index_type = [](orc::StreamKind kind) { + auto const get_stream_index_type = [](StreamKind kind) { switch (kind) { - case orc::DATA: return gpu::CI_DATA; - case orc::LENGTH: - case orc::SECONDARY: return gpu::CI_DATA2; - case orc::DICTIONARY_DATA: return gpu::CI_DICTIONARY; - case orc::PRESENT: return gpu::CI_PRESENT; - case orc::ROW_INDEX: return gpu::CI_INDEX; + case DATA: return CI_DATA; + case LENGTH: + case SECONDARY: return CI_DATA2; + case DICTIONARY_DATA: return CI_DICTIONARY; + case PRESENT: return CI_PRESENT; + case ROW_INDEX: return CI_INDEX; default: // Skip this stream as it's not strictly required - return gpu::CI_NUM_STREAMS; + return CI_NUM_STREAMS; } }; @@ -87,16 +86,15 @@ std::size_t gather_stream_info_and_column_desc( // for each of its fields. There is only a PRESENT stream, which // needs to be included for the reader. 
auto const schema_type = types[column_id]; - if (!schema_type.subtypes.empty() && schema_type.kind == orc::STRUCT && - stream.kind == orc::PRESENT) { + if (!schema_type.subtypes.empty() && schema_type.kind == STRUCT && stream.kind == PRESENT) { for (auto const& idx : schema_type.subtypes) { auto const child_idx = (idx < orc2gdf.size()) ? orc2gdf[idx] : -1; if (child_idx >= 0) { col = child_idx; if (chunks) { - auto& chunk = (*chunks)[stripe_id][col]; - chunk.strm_id[gpu::CI_PRESENT] = *local_stream_order; - chunk.strm_len[gpu::CI_PRESENT] = stream.length; + auto& chunk = (*chunks)[stripe_id][col]; + chunk.strm_id[CI_PRESENT] = *local_stream_order; + chunk.strm_len[CI_PRESENT] = stream.length; } } } @@ -105,14 +103,14 @@ std::size_t gather_stream_info_and_column_desc( if (chunks) { if (src_offset >= stripeinfo->indexLength || use_index) { auto const index_type = get_stream_index_type(stream.kind); - if (index_type < gpu::CI_NUM_STREAMS) { + if (index_type < CI_NUM_STREAMS) { auto& chunk = (*chunks)[stripe_id][col]; chunk.strm_id[index_type] = *local_stream_order; chunk.strm_len[index_type] = stream.length; // NOTE: skip_count field is temporarily used to track the presence of index streams chunk.skip_count |= 1 << index_type; - if (index_type == gpu::CI_DICTIONARY) { + if (index_type == CI_DICTIONARY) { chunk.dictionary_start = *num_dictionary_entries; chunk.dict_len = stripefooter->columns[column_id].dictionarySize; *num_dictionary_entries += @@ -643,7 +641,7 @@ void reader_impl::load_next_stripe_data(read_mode mode) // memory once. auto hd_compinfo = [&] { std::size_t max_num_streams{0}; - if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { + if (_metadata.per_file_metadata[0].ps.compression != NONE) { // Find the maximum number of streams in all levels of the loaded stripes. for (std::size_t level = 0; level < num_levels; ++level) { auto const stream_range = @@ -651,7 +649,7 @@ void reader_impl::load_next_stripe_data(read_mode mode) max_num_streams = std::max(max_num_streams, stream_range.size()); } } - return cudf::detail::hostdevice_vector(max_num_streams, _stream); + return cudf::detail::hostdevice_vector(max_num_streams, _stream); }(); for (std::size_t level = 0; level < num_levels; ++level) { @@ -665,26 +663,26 @@ void reader_impl::load_next_stripe_data(read_mode mode) auto const stream_range = merge_selected_ranges(_file_itm_data.lvl_stripe_stream_ranges[level], load_stripe_range); - if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { + if (_metadata.per_file_metadata[0].ps.compression != NONE) { auto const& decompressor = *_metadata.per_file_metadata[0].decompressor; - auto compinfo = cudf::detail::hostdevice_span{hd_compinfo}.subspan( + auto compinfo = cudf::detail::hostdevice_span{hd_compinfo}.subspan( 0, stream_range.size()); for (auto stream_idx = stream_range.begin; stream_idx < stream_range.end; ++stream_idx) { auto const& info = stream_info[stream_idx]; auto const dst_base = static_cast(stripe_data[info.source.stripe_idx - stripe_start].data()); compinfo[stream_idx - stream_range.begin] = - gpu::CompressedStreamInfo(dst_base + info.dst_pos, info.length); + CompressedStreamInfo(dst_base + info.dst_pos, info.length); } // Estimate the uncompressed data. 
compinfo.host_to_device_async(_stream); - gpu::ParseCompressedStripeData(compinfo.device_ptr(), - compinfo.size(), - decompressor.GetBlockSize(), - decompressor.GetLog2MaxCompressionRatio(), - _stream); + ParseCompressedStripeData(compinfo.device_ptr(), + compinfo.size(), + decompressor.GetBlockSize(), + decompressor.GetLog2MaxCompressionRatio(), + _stream); compinfo.device_to_host_sync(_stream); for (auto stream_idx = stream_range.begin; stream_idx < stream_range.end; ++stream_idx) { diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index 4ef68ee8d86..cb66edf3c98 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -307,15 +307,15 @@ std::vector find_splits(host_span cumulative_sizes, std::size_t gather_stream_info_and_column_desc( std::size_t stripe_id, std::size_t level, - orc::StripeInformation const* stripeinfo, - orc::StripeFooter const* stripefooter, + StripeInformation const* stripeinfo, + StripeFooter const* stripefooter, host_span orc2gdf, - host_span types, + host_span types, bool use_index, bool apply_struct_map, int64_t* num_dictionary_entries, std::size_t* local_stream_order, std::vector* stream_info, - cudf::detail::hostdevice_2dvector* chunks); + cudf::detail::hostdevice_2dvector* chunks); } // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index b661bb4ff90..586c07cbc16 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
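The hunks above read back sizes computed on the device through a mirrored buffer: `compinfo` is filled on the host, pushed to the GPU with `host_to_device_async`, consumed by `ParseCompressedStripeData`, and pulled back with `device_to_host_sync`. A rough sketch of the round trip that `cudf::detail::hostdevice_vector` wraps, written here with plain CUDA runtime calls (illustrative only; `round_trip` and the buffer contents are invented for the example):

    #include <cuda_runtime.h>

    #include <vector>

    void round_trip(cudaStream_t stream)
    {
      std::vector<int> host(128, 0);  // host mirror, filled by CPU-side setup
      int* device = nullptr;
      cudaMalloc(reinterpret_cast<void**>(&device), host.size() * sizeof(int));

      // host_to_device_async: publish the host contents to the GPU.
      cudaMemcpyAsync(
        device, host.data(), host.size() * sizeof(int), cudaMemcpyHostToDevice, stream);

      // ... kernels (ParseCompressedStripeData in the patch) update the device copy ...

      // device_to_host_sync: copy back, then block until the copy has landed.
      cudaMemcpyAsync(
        host.data(), device, host.size() * sizeof(int), cudaMemcpyDeviceToHost, stream);
      cudaStreamSynchronize(stream);

      cudaFree(device);  // after the sync, `host` holds the device-side results
    }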
@@ -77,13 +77,13 @@ rmm::device_buffer decompress_stripe_data( range const& loaded_stripe_range, range const& stream_range, std::size_t num_decode_stripes, - cudf::detail::hostdevice_span compinfo, + cudf::detail::hostdevice_span compinfo, stream_source_map const& compinfo_map, OrcDecompressor const& decompressor, host_span stripe_data, host_span stream_info, - cudf::detail::hostdevice_2dvector& chunks, - cudf::detail::hostdevice_2dvector& row_groups, + cudf::detail::hostdevice_2dvector& chunks, + cudf::detail::hostdevice_2dvector& row_groups, size_type row_index_stride, bool use_base_stride, rmm::cuda_stream_view stream) @@ -100,7 +100,7 @@ rmm::device_buffer decompress_stripe_data( auto const& info = stream_info[stream_idx]; auto& stream_comp_info = compinfo[stream_idx - stream_range.begin]; - stream_comp_info = gpu::CompressedStreamInfo( + stream_comp_info = CompressedStreamInfo( static_cast( stripe_data[info.source.stripe_idx - loaded_stripe_range.begin].data()) + info.dst_pos, @@ -120,11 +120,11 @@ rmm::device_buffer decompress_stripe_data( if (!compinfo_ready) { compinfo.host_to_device_async(stream); - gpu::ParseCompressedStripeData(compinfo.device_ptr(), - compinfo.size(), - decompressor.GetBlockSize(), - decompressor.GetLog2MaxCompressionRatio(), - stream); + ParseCompressedStripeData(compinfo.device_ptr(), + compinfo.size(), + decompressor.GetBlockSize(), + decompressor.GetLog2MaxCompressionRatio(), + stream); compinfo.device_to_host_sync(stream); for (std::size_t i = 0; i < compinfo.size(); ++i) { @@ -178,11 +178,11 @@ rmm::device_buffer decompress_stripe_data( } compinfo.host_to_device_async(stream); - gpu::ParseCompressedStripeData(compinfo.device_ptr(), - compinfo.size(), - decompressor.GetBlockSize(), - decompressor.GetLog2MaxCompressionRatio(), - stream); + ParseCompressedStripeData(compinfo.device_ptr(), + compinfo.size(), + decompressor.GetBlockSize(), + decompressor.GetLog2MaxCompressionRatio(), + stream); // Value for checking whether we decompress successfully. // It doesn't need to be atomic as there is no race condition: we only write `true` if needed. @@ -275,7 +275,7 @@ rmm::device_buffer decompress_stripe_data( // Copy without stream sync, thus need to wait for stream sync below to access. any_block_failure.device_to_host_async(stream); - gpu::PostDecompressionReassemble(compinfo.device_ptr(), compinfo.size(), stream); + PostDecompressionReassemble(compinfo.device_ptr(), compinfo.size(), stream); compinfo.device_to_host_sync(stream); // This also sync stream for `any_block_failure`. 
// We can check on host after stream synchronize @@ -291,7 +291,7 @@ rmm::device_buffer decompress_stripe_data( for (std::size_t i = 0; i < num_decode_stripes; ++i) { for (std::size_t j = 0; j < num_columns; ++j) { auto& chunk = chunks[i][j]; - for (int k = 0; k < gpu::CI_NUM_STREAMS; ++k) { + for (int k = 0; k < CI_NUM_STREAMS; ++k) { if (chunk.strm_len[k] > 0 && chunk.strm_id[k] < compinfo.size()) { chunk.streams[k] = compinfo[chunk.strm_id[k]].uncompressed_data; chunk.strm_len[k] = compinfo[chunk.strm_id[k]].max_uncompressed_size; @@ -303,14 +303,14 @@ rmm::device_buffer decompress_stripe_data( if (row_groups.size().first) { chunks.host_to_device_async(stream); row_groups.host_to_device_async(stream); - gpu::ParseRowGroupIndex(row_groups.base_device_ptr(), - compinfo.device_ptr(), - chunks.base_device_ptr(), - num_columns, - num_decode_stripes, - row_index_stride, - use_base_stride, - stream); + ParseRowGroupIndex(row_groups.base_device_ptr(), + compinfo.device_ptr(), + chunks.base_device_ptr(), + num_columns, + num_decode_stripes, + row_index_stride, + use_base_stride, + stream); } return decomp_data; @@ -329,7 +329,7 @@ rmm::device_buffer decompress_stripe_data( * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource to use for device memory allocation */ -void update_null_mask(cudf::detail::hostdevice_2dvector& chunks, +void update_null_mask(cudf::detail::hostdevice_2dvector& chunks, host_span out_buffers, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) @@ -419,8 +419,8 @@ void decode_stream_data(int64_t num_dicts, size_type row_index_stride, std::size_t level, table_device_view const& d_tz_table, - cudf::detail::hostdevice_2dvector& chunks, - cudf::detail::device_2dspan row_groups, + cudf::detail::hostdevice_2dvector& chunks, + cudf::detail::device_2dspan row_groups, std::vector& out_buffers, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) @@ -441,10 +441,10 @@ void decode_stream_data(int64_t num_dicts, }); // Allocate global dictionary for deserializing - rmm::device_uvector global_dict(num_dicts, stream); + rmm::device_uvector global_dict(num_dicts, stream); chunks.host_to_device_async(stream); - gpu::DecodeNullsAndStringDictionaries( + DecodeNullsAndStringDictionaries( chunks.base_device_ptr(), global_dict.data(), num_columns, num_stripes, skip_rows, stream); if (level > 0) { @@ -453,18 +453,18 @@ void decode_stream_data(int64_t num_dicts, } cudf::detail::device_scalar error_count(0, stream); - gpu::DecodeOrcColumnData(chunks.base_device_ptr(), - global_dict.data(), - row_groups, - num_columns, - num_stripes, - skip_rows, - d_tz_table, - row_groups.size().first, - row_index_stride, - level, - error_count.data(), - stream); + DecodeOrcColumnData(chunks.base_device_ptr(), + global_dict.data(), + row_groups, + num_columns, + num_stripes, + skip_rows, + d_tz_table, + row_groups.size().first, + row_index_stride, + level, + error_count.data(), + stream); chunks.device_to_host_async(stream); // `value` synchronizes auto const num_errors = error_count.value(stream); @@ -485,7 +485,7 @@ void decode_stream_data(int64_t num_dicts, * @brief Compute the per-stripe prefix sum of null count, for each struct column in the current * layer. 
*/ -void scan_null_counts(cudf::detail::hostdevice_2dvector const& chunks, +void scan_null_counts(cudf::detail::hostdevice_2dvector const& chunks, uint32_t* d_prefix_sums, rmm::cuda_stream_view stream) { @@ -531,9 +531,9 @@ void scan_null_counts(cudf::detail::hostdevice_2dvector const& * @brief Aggregate child metadata from parent column chunks. */ void aggregate_child_meta(std::size_t level, - cudf::io::orc::detail::column_hierarchy const& selected_columns, - cudf::detail::host_2dspan chunks, - cudf::detail::host_2dspan row_groups, + column_hierarchy const& selected_columns, + cudf::detail::host_2dspan chunks, + cudf::detail::host_2dspan row_groups, host_span nested_cols, host_span out_buffers, reader_column_meta& col_meta) @@ -766,7 +766,7 @@ void reader_impl::decompress_and_decode_stripes(read_mode mode) // Each 'chunk' of data here corresponds to an orc column, in a stripe, at a nested level. // Unfortunately we cannot create one hostdevice_vector to use for all levels because // currently we do not have a hostdevice_2dspan class. - std::vector> lvl_chunks(num_levels); + std::vector> lvl_chunks(num_levels); // For computing null count. auto null_count_prefix_sums = [&] { @@ -787,7 +787,7 @@ void reader_impl::decompress_and_decode_stripes(read_mode mode) // thus only need to allocate memory once. auto hd_compinfo = [&] { std::size_t max_num_streams{0}; - if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { + if (_metadata.per_file_metadata[0].ps.compression != NONE) { // Find the maximum number of streams in all levels of the decoding stripes. for (std::size_t level = 0; level < num_levels; ++level) { auto const stream_range = @@ -795,7 +795,7 @@ void reader_impl::decompress_and_decode_stripes(read_mode mode) max_num_streams = std::max(max_num_streams, stream_range.size()); } } - return cudf::detail::hostdevice_vector{max_num_streams, _stream}; + return cudf::detail::hostdevice_vector{max_num_streams, _stream}; }(); auto& col_meta = *_col_meta; @@ -812,8 +812,7 @@ void reader_impl::decompress_and_decode_stripes(read_mode mode) auto& chunks = lvl_chunks[level]; auto const num_lvl_columns = columns_level.size(); - chunks = - cudf::detail::hostdevice_2dvector(stripe_count, num_lvl_columns, _stream); + chunks = cudf::detail::hostdevice_2dvector(stripe_count, num_lvl_columns, _stream); memset(chunks.base_host_ptr(), 0, chunks.size_bytes()); bool const use_index = @@ -897,7 +896,7 @@ void reader_impl::decompress_and_decode_stripes(read_mode mode) // num_child_rows for a struct column will be same, for other nested types it will be // calculated. - chunk.num_child_rows = (chunk.type_kind != orc::STRUCT) ? 0 : chunk.num_rows; + chunk.num_child_rows = (chunk.type_kind != STRUCT) ? 
0 : chunk.num_rows; chunk.dtype_id = column_types[col_idx].id(); chunk.decimal_scale = _metadata.per_file_metadata[stripe.source_idx] .ff.types[columns_level[col_idx].id] @@ -912,11 +911,11 @@ void reader_impl::decompress_and_decode_stripes(read_mode mode) : cudf::size_of(column_types[col_idx]); chunk.num_rowgroups = stripe_num_rowgroups; - if (chunk.type_kind == orc::TIMESTAMP) { + if (chunk.type_kind == TIMESTAMP) { chunk.timestamp_type_id = _options.timestamp_type.id(); } if (not is_stripe_data_empty) { - for (int k = 0; k < gpu::CI_NUM_STREAMS; k++) { + for (int k = 0; k < CI_NUM_STREAMS; k++) { chunk.streams[k] = dst_base + stream_info[chunk.strm_id[k] + stream_range.begin].dst_pos; } @@ -931,10 +930,10 @@ void reader_impl::decompress_and_decode_stripes(read_mode mode) // Process dataset chunks into output columns. auto row_groups = - cudf::detail::hostdevice_2dvector(num_rowgroups, num_lvl_columns, _stream); + cudf::detail::hostdevice_2dvector(num_rowgroups, num_lvl_columns, _stream); if (level > 0 and row_groups.size().first) { - cudf::host_span row_groups_span(row_groups.base_host_ptr(), - num_rowgroups * num_lvl_columns); + cudf::host_span row_groups_span(row_groups.base_host_ptr(), + num_rowgroups * num_lvl_columns); auto& rw_grp_meta = col_meta.rwgrp_meta; // Update start row and num rows per row group @@ -950,9 +949,9 @@ void reader_impl::decompress_and_decode_stripes(read_mode mode) } // Setup row group descriptors if using indexes. - if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { + if (_metadata.per_file_metadata[0].ps.compression != NONE) { auto const compinfo = - cudf::detail::hostdevice_span{hd_compinfo}.subspan( + cudf::detail::hostdevice_span{hd_compinfo}.subspan( 0, stream_range.size()); auto decomp_data = decompress_stripe_data(load_stripe_range, stream_range, @@ -979,14 +978,14 @@ void reader_impl::decompress_and_decode_stripes(read_mode mode) chunks.host_to_device_async(_stream); row_groups.host_to_device_async(_stream); row_groups.host_to_device_async(_stream); - gpu::ParseRowGroupIndex(row_groups.base_device_ptr(), - nullptr, - chunks.base_device_ptr(), - num_lvl_columns, - stripe_count, - _metadata.get_row_index_stride(), - level == 0, - _stream); + ParseRowGroupIndex(row_groups.base_device_ptr(), + nullptr, + chunks.base_device_ptr(), + num_lvl_columns, + stripe_count, + _metadata.get_row_index_stride(), + level == 0, + _stream); } } @@ -995,7 +994,7 @@ void reader_impl::decompress_and_decode_stripes(read_mode mode) for (std::size_t i = 0; i < column_types.size(); ++i) { bool is_nullable = false; for (std::size_t j = 0; j < stripe_count; ++j) { - if (chunks[j][i].strm_len[gpu::CI_PRESENT] != 0) { + if (chunks[j][i].strm_len[CI_PRESENT] != 0) { is_nullable = true; break; } diff --git a/cpp/src/io/orc/reader_impl_helpers.cpp b/cpp/src/io/orc/reader_impl_helpers.cpp index 7e5db4b7617..1d4aaaf51ef 100644 --- a/cpp/src/io/orc/reader_impl_helpers.cpp +++ b/cpp/src/io/orc/reader_impl_helpers.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -34,7 +34,7 @@ std::unique_ptr create_empty_column(size_type orc_col_id, to_cudf_decimal_type(decimal128_columns, metadata, orc_col_id)); switch (kind) { - case orc::LIST: { + case LIST: { schema_info.children.emplace_back("offsets"); schema_info.children.emplace_back(""); return make_lists_column(0, @@ -50,7 +50,7 @@ std::unique_ptr create_empty_column(size_type orc_col_id, rmm::device_buffer{0, stream}, stream); } - case orc::MAP: { + case MAP: { schema_info.children.emplace_back("offsets"); schema_info.children.emplace_back("struct"); auto const child_column_ids = metadata.get_col_type(orc_col_id).subtypes; @@ -76,7 +76,7 @@ std::unique_ptr create_empty_column(size_type orc_col_id, stream); } - case orc::STRUCT: { + case STRUCT: { std::vector> child_columns; for (auto const col : metadata.get_col_type(orc_col_id).subtypes) { schema_info.children.emplace_back(""); @@ -92,7 +92,7 @@ std::unique_ptr create_empty_column(size_type orc_col_id, 0, std::move(child_columns), 0, rmm::device_buffer{0, stream}, stream); } - case orc::DECIMAL: { + case DECIMAL: { int32_t scale = 0; if (type == type_id::DECIMAL32 or type == type_id::DECIMAL64 or type == type_id::DECIMAL128) { scale = -static_cast(metadata.get_types()[orc_col_id].scale.value_or(0)); @@ -119,8 +119,8 @@ column_buffer assemble_buffer(size_type orc_col_id, col_buffer.name = metadata.column_name(0, orc_col_id); auto kind = metadata.get_col_type(orc_col_id).kind; switch (kind) { - case orc::LIST: - case orc::STRUCT: { + case LIST: + case STRUCT: { auto const& children_indices = selected_columns.children.at(orc_col_id); for (auto const child_id : children_indices) { col_buffer.children.emplace_back(assemble_buffer( @@ -128,7 +128,7 @@ column_buffer assemble_buffer(size_type orc_col_id, } } break; - case orc::MAP: { + case MAP: { std::vector child_col_buffers; // Get child buffers auto const& children_indices = selected_columns.children.at(orc_col_id); diff --git a/cpp/src/io/orc/reader_impl_helpers.hpp b/cpp/src/io/orc/reader_impl_helpers.hpp index 4cded30d89b..f2e746b312f 100644 --- a/cpp/src/io/orc/reader_impl_helpers.hpp +++ b/cpp/src/io/orc/reader_impl_helpers.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
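The `to_cudf_type` hunk below is representative of the most common mechanical edit in this patch: once a function is defined inside `cudf::io::orc::detail`, unqualified name lookup proceeds outward through the enclosing namespaces, so enumerators of the unscoped `TypeKind` enum, declared in the enclosing ORC namespace, no longer need an `orc::` prefix. A small standalone illustration (all names invented for the example):

    namespace demo {
    enum TypeKind { BOOLEAN, BYTE };  // unscoped enum: enumerators live in demo

    namespace detail {
    // Lookup searches demo::detail first, then demo, so BOOLEAN needs no prefix.
    constexpr bool is_boolean(TypeKind kind) { return kind == BOOLEAN; }
    }  // namespace detail
    }  // namespace demo

    static_assert(demo::detail::is_boolean(demo::BOOLEAN), "unqualified lookup works");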
@@ -61,36 +61,36 @@ struct reader_column_meta { /** * @brief Function that translates ORC data kind to cuDF type enum */ -inline constexpr type_id to_cudf_type(orc::TypeKind kind, +inline constexpr type_id to_cudf_type(TypeKind kind, bool use_np_dtypes, type_id timestamp_type_id, type_id decimal_type_id) { switch (kind) { - case orc::BOOLEAN: return type_id::BOOL8; - case orc::BYTE: return type_id::INT8; - case orc::SHORT: return type_id::INT16; - case orc::INT: return type_id::INT32; - case orc::LONG: return type_id::INT64; - case orc::FLOAT: return type_id::FLOAT32; - case orc::DOUBLE: return type_id::FLOAT64; - case orc::STRING: - case orc::BINARY: - case orc::VARCHAR: - case orc::CHAR: + case BOOLEAN: return type_id::BOOL8; + case BYTE: return type_id::INT8; + case SHORT: return type_id::INT16; + case INT: return type_id::INT32; + case LONG: return type_id::INT64; + case FLOAT: return type_id::FLOAT32; + case DOUBLE: return type_id::FLOAT64; + case STRING: + case BINARY: + case VARCHAR: + case CHAR: // Variable-length types can all be mapped to STRING return type_id::STRING; - case orc::TIMESTAMP: + case TIMESTAMP: return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id : type_id::TIMESTAMP_NANOSECONDS; - case orc::DATE: + case DATE: // There isn't a (DAYS -> np.dtype) mapping return (use_np_dtypes) ? type_id::TIMESTAMP_MILLISECONDS : type_id::TIMESTAMP_DAYS; - case orc::DECIMAL: return decimal_type_id; + case DECIMAL: return decimal_type_id; // Need to update once cuDF plans to support map type - case orc::MAP: - case orc::LIST: return type_id::LIST; - case orc::STRUCT: return type_id::STRUCT; + case MAP: + case LIST: return type_id::LIST; + case STRUCT: return type_id::STRUCT; default: break; } diff --git a/cpp/src/io/orc/stats_enc.cu b/cpp/src/io/orc/stats_enc.cu index 5f4c1e0696d..e81c74ae1a6 100644 --- a/cpp/src/io/orc/stats_enc.cu +++ b/cpp/src/io/orc/stats_enc.cu @@ -24,7 +24,7 @@ #include -namespace cudf::io::orc::gpu { +namespace cudf::io::orc::detail { using strings::detail::fixed_point_string_size; @@ -502,4 +502,4 @@ void orc_encode_statistics(uint8_t* blob_bfr, blob_bfr, groups, chunks, statistics_count); } -} // namespace cudf::io::orc::gpu +} // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index f560b806894..c7947b0e4c9 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -15,6 +15,7 @@ */ #include "io/utilities/block_utils.cuh" +#include "io/utilities/column_buffer.hpp" #include "orc_gpu.hpp" #include @@ -23,10 +24,7 @@ #include -namespace cudf { -namespace io { -namespace orc { -namespace gpu { +namespace cudf::io::orc::detail { using cudf::io::detail::string_index_pair; @@ -2096,7 +2094,4 @@ void __host__ DecodeOrcColumnData(ColumnDesc* chunks, chunks, global_dictionary, tz_table, row_groups, first_row, rowidx_stride, level, error_count); } -} // namespace gpu -} // namespace orc -} // namespace io -} // namespace cudf +} // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index 857daeb5856..15ce1aadb17 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -14,7 +14,6 @@ * limitations under the License. 
*/ -#include "io/comp/gpuinflate.hpp" #include "io/utilities/block_utils.cuh" #include "io/utilities/time_utils.cuh" #include "orc_gpu.hpp" @@ -40,10 +39,7 @@ #include #include -namespace cudf { -namespace io { -namespace orc { -namespace gpu { +namespace cudf::io::orc::detail { using cudf::detail::device_2dspan; using cudf::io::detail::compression_result; @@ -1421,7 +1417,4 @@ void decimal_sizes_to_offsets(device_2dspan rg_bounds, <<>>(rg_bounds, d_sizes); } -} // namespace gpu -} // namespace orc -} // namespace io -} // namespace cudf +} // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/stripe_init.cu b/cpp/src/io/orc/stripe_init.cu index 5e23bc5adcc..a72b71a83ca 100644 --- a/cpp/src/io/orc/stripe_init.cu +++ b/cpp/src/io/orc/stripe_init.cu @@ -26,10 +26,7 @@ #include #include -namespace cudf { -namespace io { -namespace orc { -namespace gpu { +namespace cudf::io::orc::detail { struct comp_in_out { uint8_t const* in_ptr{}; @@ -605,7 +602,4 @@ void __host__ reduce_pushdown_masks(device_span co <<>>(columns, rowgroups, valid_counts); } -} // namespace gpu -} // namespace orc -} // namespace io -} // namespace cudf +} // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 5c3377a1aeb..ed900105968 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -21,7 +21,6 @@ #include "io/orc/orc_gpu.hpp" #include "io/statistics/column_statistics.cuh" -#include "io/utilities/column_utils.cuh" #include "writer_impl.hpp" #include @@ -40,6 +39,7 @@ #include #include #include +#include #include #include @@ -92,15 +92,15 @@ namespace { /** * @brief Translates cuDF compression to ORC compression. */ -orc::CompressionKind to_orc_compression(compression_type compression) +CompressionKind to_orc_compression(compression_type compression) { switch (compression) { case compression_type::AUTO: - case compression_type::SNAPPY: return orc::CompressionKind::SNAPPY; - case compression_type::ZLIB: return orc::CompressionKind::ZLIB; - case compression_type::ZSTD: return orc::CompressionKind::ZSTD; - case compression_type::LZ4: return orc::CompressionKind::LZ4; - case compression_type::NONE: return orc::CompressionKind::NONE; + case compression_type::SNAPPY: return CompressionKind::SNAPPY; + case compression_type::ZLIB: return CompressionKind::ZLIB; + case compression_type::ZSTD: return CompressionKind::ZSTD; + case compression_type::LZ4: return CompressionKind::LZ4; + case compression_type::NONE: return CompressionKind::NONE; default: CUDF_FAIL("Unsupported compression type"); } } @@ -119,7 +119,7 @@ size_t compression_block_size(compression_type compression) /** * @brief Translates cuDF dtype to ORC datatype */ -constexpr orc::TypeKind to_orc_type(cudf::type_id id, bool list_column_as_map) +constexpr TypeKind to_orc_type(cudf::type_id id, bool list_column_as_map) { switch (id) { case cudf::type_id::INT8: return TypeKind::BYTE; @@ -237,8 +237,8 @@ class orc_column_view { [[nodiscard]] auto const& decimal_offsets() const { return d_decimal_offsets; } void attach_decimal_offsets(uint32_t* sizes_ptr) { d_decimal_offsets = sizes_ptr; } - void attach_stripe_dicts(host_span host_stripe_dicts, - device_span dev_stripe_dicts) + void attach_stripe_dicts(host_span host_stripe_dicts, + device_span dev_stripe_dicts) { stripe_dicts = host_stripe_dicts; d_stripe_dicts = dev_stripe_dicts; @@ -301,8 +301,8 @@ class orc_column_view { host_span rowgroup_char_counts; - host_span stripe_dicts; - device_span d_stripe_dicts; + host_span 
stripe_dicts; + device_span d_stripe_dicts; // Offsets for encoded decimal elements. Used to enable direct writing of encoded decimal elements // into the output stream. @@ -498,17 +498,17 @@ size_t RLE_stream_size(TypeKind kind, size_t count) case TypeKind::BYTE: return div_rounding_up_unsafe(count, byte_rle_max_len) * (byte_rle_max_len + 1); case TypeKind::SHORT: - return div_rounding_up_unsafe(count, gpu::encode_block_size) * - (gpu::encode_block_size * max_varint_size() + 2); + return div_rounding_up_unsafe(count, encode_block_size) * + (encode_block_size * max_varint_size() + 2); case TypeKind::FLOAT: case TypeKind::INT: case TypeKind::DATE: - return div_rounding_up_unsafe(count, gpu::encode_block_size) * - (gpu::encode_block_size * max_varint_size() + 2); + return div_rounding_up_unsafe(count, encode_block_size) * + (encode_block_size * max_varint_size() + 2); case TypeKind::LONG: case TypeKind::DOUBLE: - return div_rounding_up_unsafe(count, gpu::encode_block_size) * - (gpu::encode_block_size * max_varint_size() + 2); + return div_rounding_up_unsafe(count, encode_block_size) * + (encode_block_size * max_varint_size() + 2); default: CUDF_FAIL("Unsupported ORC type for RLE stream size: " + std::to_string(kind)); } } @@ -536,7 +536,7 @@ orc_streams create_streams(host_span columns, return Stream{ROW_INDEX, col.id()}; }); - std::vector ids(columns.size() * gpu::CI_NUM_STREAMS, -1); + std::vector ids(columns.size() * CI_NUM_STREAMS, -1); std::vector types(streams.size(), INVALID_TYPE_KIND); for (auto& column : columns) { @@ -568,41 +568,39 @@ orc_streams create_streams(host_span columns, auto const kind = column.orc_kind(); auto add_stream = - [&](gpu::StreamIndexType index_type, StreamKind kind, TypeKind type_kind, size_t size) { + [&](StreamIndexType index_type, StreamKind kind, TypeKind type_kind, size_t size) { auto const max_alignment_padding = compress_required_chunk_alignment(compression) - 1; - const auto base = column.index() * gpu::CI_NUM_STREAMS; + const auto base = column.index() * CI_NUM_STREAMS; ids[base + index_type] = streams.size(); - streams.push_back(orc::Stream{ - kind, - column.id(), - (size == 0) ? 0 : size + max_alignment_padding * segmentation.num_rowgroups()}); + streams.push_back( + Stream{kind, + column.id(), + (size == 0) ? 0 : size + max_alignment_padding * segmentation.num_rowgroups()}); types.push_back(type_kind); }; - auto add_RLE_stream = [&]( - gpu::StreamIndexType index_type, StreamKind kind, TypeKind type_kind) { + auto add_RLE_stream = [&](StreamIndexType index_type, StreamKind kind, TypeKind type_kind) { add_stream(index_type, kind, type_kind, RLE_column_size(type_kind)); }; - if (is_nullable) { add_RLE_stream(gpu::CI_PRESENT, PRESENT, TypeKind::BOOLEAN); } + if (is_nullable) { add_RLE_stream(CI_PRESENT, PRESENT, TypeKind::BOOLEAN); } switch (kind) { case TypeKind::BOOLEAN: case TypeKind::BYTE: - add_RLE_stream(gpu::CI_DATA, DATA, kind); + add_RLE_stream(CI_DATA, DATA, kind); column.set_orc_encoding(DIRECT); break; case TypeKind::SHORT: case TypeKind::INT: case TypeKind::LONG: case TypeKind::DATE: - add_RLE_stream(gpu::CI_DATA, DATA, kind); + add_RLE_stream(CI_DATA, DATA, kind); column.set_orc_encoding(DIRECT_V2); break; case TypeKind::FLOAT: case TypeKind::DOUBLE: // Pass through if no nulls (no RLE encoding for floating point) - add_stream( - gpu::CI_DATA, DATA, kind, (column.null_count() != 0) ? RLE_column_size(kind) : 0); + add_stream(CI_DATA, DATA, kind, (column.null_count() != 0) ? 
RLE_column_size(kind) : 0); column.set_orc_encoding(DIRECT); break; case TypeKind::STRING: { @@ -632,35 +630,34 @@ orc_streams create_streams(host_span columns, // Decide between direct or dictionary encoding if (enable_dict && dict_data_size < direct_data_size) { - add_RLE_stream(gpu::CI_DATA, DATA, TypeKind::INT); - add_stream(gpu::CI_DATA2, LENGTH, TypeKind::INT, dict_lengths_div512 * (512 * 4 + 2)); - add_stream( - gpu::CI_DICTIONARY, DICTIONARY_DATA, TypeKind::CHAR, std::max(dict_data_size, 1ul)); + add_RLE_stream(CI_DATA, DATA, TypeKind::INT); + add_stream(CI_DATA2, LENGTH, TypeKind::INT, dict_lengths_div512 * (512 * 4 + 2)); + add_stream(CI_DICTIONARY, DICTIONARY_DATA, TypeKind::CHAR, std::max(dict_data_size, 1ul)); column.set_orc_encoding(DICTIONARY_V2); } else { - add_stream(gpu::CI_DATA, DATA, TypeKind::CHAR, std::max(direct_data_size, 1)); - add_RLE_stream(gpu::CI_DATA2, LENGTH, TypeKind::INT); + add_stream(CI_DATA, DATA, TypeKind::CHAR, std::max(direct_data_size, 1)); + add_RLE_stream(CI_DATA2, LENGTH, TypeKind::INT); column.set_orc_encoding(DIRECT_V2); } break; } case TypeKind::TIMESTAMP: - add_RLE_stream(gpu::CI_DATA, DATA, TypeKind::LONG); - add_RLE_stream(gpu::CI_DATA2, SECONDARY, TypeKind::LONG); + add_RLE_stream(CI_DATA, DATA, TypeKind::LONG); + add_RLE_stream(CI_DATA2, SECONDARY, TypeKind::LONG); column.set_orc_encoding(DIRECT_V2); break; case TypeKind::DECIMAL: // varint values (NO RLE) // data_stream_size = decimal_column_sizes.at(column.index()); - add_stream(gpu::CI_DATA, DATA, TypeKind::DECIMAL, decimal_column_sizes.at(column.index())); + add_stream(CI_DATA, DATA, TypeKind::DECIMAL, decimal_column_sizes.at(column.index())); // scale stream TODO: compute exact size since all elems are equal - add_RLE_stream(gpu::CI_DATA2, SECONDARY, TypeKind::INT); + add_RLE_stream(CI_DATA2, SECONDARY, TypeKind::INT); column.set_orc_encoding(DIRECT_V2); break; case TypeKind::LIST: case TypeKind::MAP: // no data stream, only lengths - add_RLE_stream(gpu::CI_DATA2, LENGTH, TypeKind::INT); + add_RLE_stream(CI_DATA2, LENGTH, TypeKind::INT); column.set_orc_encoding(DIRECT_V2); break; case TypeKind::STRUCT: @@ -683,7 +680,7 @@ std::vector> calculate_aligned_rowgroup_bounds( orc_table.num_columns() * segmentation.num_rowgroups(), stream); auto const d_pd_set_counts = device_2dspan{d_pd_set_counts_data, orc_table.num_columns()}; - gpu::reduce_pushdown_masks(orc_table.d_columns, segmentation.rowgroups, d_pd_set_counts, stream); + reduce_pushdown_masks(orc_table.d_columns, segmentation.rowgroups, d_pd_set_counts, stream); auto aligned_rgs = hostdevice_2dvector( segmentation.num_rowgroups(), orc_table.num_columns(), stream); @@ -838,7 +835,7 @@ encoded_data encode_columns(orc_table_view const& orc_table, rmm::cuda_stream_view stream) { auto const num_columns = orc_table.num_columns(); - hostdevice_2dvector chunks(num_columns, segmentation.num_rowgroups(), stream); + hostdevice_2dvector chunks(num_columns, segmentation.num_rowgroups(), stream); auto const aligned_rowgroups = calculate_aligned_rowgroup_bounds(orc_table, segmentation, stream); @@ -911,7 +908,7 @@ encoded_data encode_columns(orc_table_view const& orc_table, " Please see https://github.com/rapidsai/cudf/issues/6763 for more information."); } - hostdevice_2dvector chunk_streams( + hostdevice_2dvector chunk_streams( num_columns, segmentation.num_rowgroups(), stream); // per-stripe, per-stream owning buffers std::vector>> encoded_data(segmentation.num_stripes()); @@ -921,10 +918,10 @@ encoded_data encode_columns(orc_table_view const& 
orc_table, }); for (size_t col_idx = 0; col_idx < num_columns; col_idx++) { - for (int strm_type = 0; strm_type < gpu::CI_NUM_STREAMS; ++strm_type) { + for (int strm_type = 0; strm_type < CI_NUM_STREAMS; ++strm_type) { auto const& column = orc_table.column(col_idx); auto col_streams = chunk_streams[col_idx]; - auto const strm_id = streams.id(col_idx * gpu::CI_NUM_STREAMS + strm_type); + auto const strm_id = streams.id(col_idx * CI_NUM_STREAMS + strm_type); std::for_each(stripe.cbegin(), stripe.cend(), [&](auto rg_idx) { col_streams[rg_idx].ids[strm_type] = strm_id; @@ -938,25 +935,25 @@ encoded_data encode_columns(orc_table_view const& orc_table, auto const& ck = chunks[col_idx][rg_idx]; auto& strm = col_streams[rg_idx]; - if ((strm_type == gpu::CI_DICTIONARY) || - (strm_type == gpu::CI_DATA2 && ck.encoding_kind == DICTIONARY_V2)) { + if ((strm_type == CI_DICTIONARY) || + (strm_type == CI_DATA2 && ck.encoding_kind == DICTIONARY_V2)) { if (rg_idx == *stripe.cbegin()) { auto const stripe_dict = column.host_stripe_dict(stripe.id); strm.lengths[strm_type] = - (strm_type == gpu::CI_DICTIONARY) + (strm_type == CI_DICTIONARY) ? stripe_dict.char_count : (((stripe_dict.entry_count + 0x1ff) >> 9) * (512 * 4 + 2)); } else { strm.lengths[strm_type] = 0; } - } else if (strm_type == gpu::CI_DATA && ck.type_kind == TypeKind::STRING && + } else if (strm_type == CI_DATA && ck.type_kind == TypeKind::STRING && ck.encoding_kind == DIRECT_V2) { strm.lengths[strm_type] = std::max(column.rowgroup_char_count(rg_idx), 1); - } else if (strm_type == gpu::CI_DATA && streams[strm_id].length == 0 && + } else if (strm_type == CI_DATA && streams[strm_id].length == 0 && (ck.type_kind == DOUBLE || ck.type_kind == FLOAT)) { // Pass-through strm.lengths[strm_type] = ck.num_rows * ck.dtype_len; - } else if (ck.type_kind == DECIMAL && strm_type == gpu::CI_DATA) { + } else if (ck.type_kind == DECIMAL && strm_type == CI_DATA) { strm.lengths[strm_type] = dec_chunk_sizes.rg_sizes.at(col_idx)[rg_idx]; } else { strm.lengths[strm_type] = RLE_stream_size(streams.type(strm_id), ck.num_rows); @@ -974,12 +971,12 @@ encoded_data encode_columns(orc_table_view const& orc_table, auto const& ck = chunks[col_idx][rg_idx]; auto& strm = col_streams[rg_idx]; - if (strm_id < 0 or (strm_type == gpu::CI_DATA && streams[strm_id].length == 0 && + if (strm_id < 0 or (strm_type == CI_DATA && streams[strm_id].length == 0 && (ck.type_kind == DOUBLE || ck.type_kind == FLOAT))) { strm.data_ptrs[strm_type] = nullptr; } else { - if ((strm_type == gpu::CI_DICTIONARY) || - (strm_type == gpu::CI_DATA2 && ck.encoding_kind == DICTIONARY_V2)) { + if ((strm_type == CI_DICTIONARY) || + (strm_type == CI_DATA2 && ck.encoding_kind == DICTIONARY_V2)) { strm.data_ptrs[strm_type] = encoded_data[stripe.id][strm_id].data(); } else { strm.data_ptrs[strm_type] = (rg_idx_it == stripe.cbegin()) @@ -1003,16 +1000,16 @@ encoded_data encode_columns(orc_table_view const& orc_table, if (orc_table.num_rows() > 0) { if (orc_table.num_string_columns() != 0) { auto d_stripe_dict = orc_table.string_column(0).device_stripe_dicts(); - gpu::EncodeStripeDictionaries(d_stripe_dict.data(), - orc_table.d_columns, - chunks, - orc_table.num_string_columns(), - segmentation.num_stripes(), - chunk_streams, - stream); + EncodeStripeDictionaries(d_stripe_dict.data(), + orc_table.d_columns, + chunks, + orc_table.num_string_columns(), + segmentation.num_stripes(), + chunk_streams, + stream); } - gpu::EncodeOrcColumnData(chunks, chunk_streams, stream); + EncodeOrcColumnData(chunks, chunk_streams, 
stream); } chunk_streams.device_to_host_sync(stream); @@ -1034,7 +1031,7 @@ encoded_data encode_columns(orc_table_view const& orc_table, std::vector gather_stripes(size_t num_index_streams, file_segmentation const& segmentation, encoded_data* enc_data, - hostdevice_2dvector* strm_desc, + hostdevice_2dvector* strm_desc, rmm::cuda_stream_view stream) { if (segmentation.num_stripes() == 0) { return {}; } @@ -1051,7 +1048,7 @@ std::vector gather_stripes(size_t num_index_streams, for (size_t col_idx = 0; col_idx < enc_data->streams.size().first; col_idx++) { auto const& col_streams = (enc_data->streams)[col_idx]; // Assign stream data of column data stream(s) - for (int k = 0; k < gpu::CI_INDEX; k++) { + for (int k = 0; k < CI_INDEX; k++) { auto const stream_id = col_streams[0].ids[k]; if (stream_id != -1) { auto const actual_stripe_size = std::accumulate( @@ -1091,7 +1088,7 @@ std::vector gather_stripes(size_t num_index_streams, strm_desc->host_to_device_async(stream); // TODO: use cub::DeviceMemcpy::Batched - gpu::CompactOrcDataStreams(*strm_desc, enc_data->streams, stream); + CompactOrcDataStreams(*strm_desc, enc_data->streams, stream); strm_desc->device_to_host_async(stream); enc_data->streams.device_to_host_sync(stream); @@ -1123,17 +1120,17 @@ cudf::detail::hostdevice_vector allocate_and_encode_blobs( rmm::cuda_stream_view stream) { // figure out the buffer size needed for protobuf format - gpu::orc_init_statistics_buffersize( + orc_init_statistics_buffersize( stats_merge_groups.device_ptr(), stat_chunks.data(), num_stat_blobs, stream); auto max_blobs = stats_merge_groups.element(num_stat_blobs - 1, stream); cudf::detail::hostdevice_vector blobs(max_blobs.start_chunk + max_blobs.num_chunks, stream); - gpu::orc_encode_statistics(blobs.device_ptr(), - stats_merge_groups.device_ptr(), - stat_chunks.data(), - num_stat_blobs, - stream); + orc_encode_statistics(blobs.device_ptr(), + stats_merge_groups.device_ptr(), + stat_chunks.data(), + num_stat_blobs, + stream); stats_merge_groups.device_to_host_async(stream); blobs.device_to_host_sync(stream); return blobs; @@ -1238,7 +1235,7 @@ intermediate_statistics gather_statistic_blobs(statistics_freq const stats_freq, auto stripe_stat_chunks = stripe_chunks.data(); rmm::device_uvector rowgroup_groups(num_rowgroup_blobs, stream); - gpu::orc_init_statistics_groups( + orc_init_statistics_groups( rowgroup_groups.data(), stat_desc.device_ptr(), segmentation.rowgroups, stream); detail::calculate_group_statistics( @@ -1440,8 +1437,8 @@ void write_index_stream(int32_t stripe_id, int32_t stream_id, host_span columns, file_segmentation const& segmentation, - host_2dspan enc_streams, - host_2dspan strm_desc, + host_2dspan enc_streams, + host_2dspan strm_desc, host_span comp_res, host_span rg_stats, StripeInformation* stripe, @@ -1455,8 +1452,7 @@ void write_index_stream(int32_t stripe_id, row_group_index_info data2; auto const column_id = stream_id - 1; - auto find_record = [=, &strm_desc](gpu::encoder_chunk_streams const& stream, - gpu::StreamIndexType type) { + auto find_record = [=, &strm_desc](encoder_chunk_streams const& stream, StreamIndexType type) { row_group_index_info record; if (stream.ids[type] > 0) { record.pos = 0; @@ -1469,8 +1465,8 @@ void write_index_stream(int32_t stripe_id, } return record; }; - auto scan_record = [=, &comp_res](gpu::encoder_chunk_streams const& stream, - gpu::StreamIndexType type, + auto scan_record = [=, &comp_res](encoder_chunk_streams const& stream, + StreamIndexType type, row_group_index_info& record) { if (record.pos >= 
     if (record.pos >= 0) {
       record.pos += stream.lengths[type];
@@ -1489,9 +1485,9 @@ void write_index_stream(int32_t stripe_id,
   // TBD: Not sure we need an empty index stream for column 0
   if (stream_id != 0) {
     auto const& strm = enc_streams[column_id][0];
-    present = find_record(strm, gpu::CI_PRESENT);
-    data    = find_record(strm, gpu::CI_DATA);
-    data2   = find_record(strm, gpu::CI_DATA2);
+    present = find_record(strm, CI_PRESENT);
+    data    = find_record(strm, CI_DATA);
+    data2   = find_record(strm, CI_DATA2);

     // Change string dictionary to int from index point of view
     kind = columns[column_id].orc_kind();
@@ -1518,9 +1514,9 @@
     if (stream_id != 0) {
       const auto& strm = enc_streams[column_id][rowgroup];
-      scan_record(strm, gpu::CI_PRESENT, present);
-      scan_record(strm, gpu::CI_DATA, data);
-      scan_record(strm, gpu::CI_DATA2, data2);
+      scan_record(strm, CI_PRESENT, present);
+      scan_record(strm, CI_DATA, data);
+      scan_record(strm, CI_DATA2, data2);
     }
   });
@@ -1549,8 +1545,8 @@
  * @param[in] stream CUDA stream used for device memory operations and kernel launches
  * @return An std::future that should be synchronized to ensure the writing is complete
  */
-std::future<void> write_data_stream(gpu::StripeStream const& strm_desc,
-                                    gpu::encoder_chunk_streams const& enc_stream,
+std::future<void> write_data_stream(StripeStream const& strm_desc,
+                                    encoder_chunk_streams const& enc_stream,
                                     uint8_t const* compressed_data,
                                     host_span<uint8_t> bounce_buffer,
                                     StripeInformation* stripe,
@@ -1944,7 +1940,7 @@ encoder_decimal_info decimal_chunk_sizes(orc_table_view& orc_table,
   if (elem_sizes.empty()) return {};

   // Compute element offsets within each row group
-  gpu::decimal_sizes_to_offsets(segmentation.rowgroups, elem_sizes, stream);
+  decimal_sizes_to_offsets(segmentation.rowgroups, elem_sizes, stream);

   // Gather the row group sizes and copy to host
   auto d_tmp_rowgroup_sizes = rmm::device_uvector<size_type>(segmentation.num_rowgroups(), stream);
@@ -2011,11 +2007,11 @@ auto set_rowgroup_char_counts(orc_table_view& orc_table,
   auto counts         = rmm::device_uvector<size_type>(num_str_cols * num_rowgroups, stream);
   auto counts_2d_view = device_2dspan<size_type>(counts, num_rowgroups);
-  gpu::rowgroup_char_counts(counts_2d_view,
-                            orc_table.d_columns,
-                            rowgroup_bounds,
-                            orc_table.d_string_column_indices,
-                            stream);
+  rowgroup_char_counts(counts_2d_view,
+                       orc_table.d_columns,
+                       rowgroup_bounds,
+                       orc_table.d_string_column_indices,
+                       stream);

   auto const h_counts = cudf::detail::make_host_vector_sync(counts, stream);
@@ -2030,7 +2026,7 @@ auto set_rowgroup_char_counts(orc_table_view& orc_table,
 // Holds the stripe dictionary descriptors and dictionary buffers.
 struct stripe_dictionaries {
-  hostdevice_2dvector<gpu::stripe_dictionary> views;  // descriptors [string_column][stripe]
+  hostdevice_2dvector<stripe_dictionary> views;  // descriptors [string_column][stripe]
   std::vector<rmm::device_uvector<uint32_t>> data_owner;   // dictionary data owner, per stripe
   std::vector<rmm::device_uvector<uint32_t>> index_owner;  // dictionary index owner, per stripe
   std::vector<rmm::device_uvector<uint32_t>> order_owner;  // dictionary order owner, per stripe
@@ -2082,17 +2078,17 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table,
                              : segmentation.rowgroups[stripe.first + stripe.size - 1][col_idx].end -
                                segmentation.rowgroups[stripe.first][col_idx].begin;
       hash_maps_storage_offsets[str_column.str_index()].emplace_back(total_map_storage_size);
-      total_map_storage_size += stripe_num_rows * gpu::occupancy_factor;
+      total_map_storage_size += stripe_num_rows * occupancy_factor;
     }
     hash_maps_storage_offsets[str_column.str_index()].emplace_back(total_map_storage_size);
   }

-  hostdevice_2dvector<gpu::stripe_dictionary> stripe_dicts(
+  hostdevice_2dvector<stripe_dictionary> stripe_dicts(
     orc_table.num_string_columns(), segmentation.num_stripes(), stream);
   if (stripe_dicts.count() == 0) return {std::move(stripe_dicts), {}, {}};

   // Create a single bulk storage to use for all sub-dictionaries
-  auto map_storage = std::make_unique<gpu::storage_type>(
+  auto map_storage = std::make_unique<storage_type>(
     total_map_storage_size,
     cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream});
@@ -2121,8 +2117,8 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table,
   }
   stripe_dicts.host_to_device_async(stream);

-  map_storage->initialize_async({gpu::KEY_SENTINEL, gpu::VALUE_SENTINEL}, {stream.value()});
-  gpu::populate_dictionary_hash_maps(stripe_dicts, orc_table.d_columns, stream);
+  map_storage->initialize_async({KEY_SENTINEL, VALUE_SENTINEL}, {stream.value()});
+  populate_dictionary_hash_maps(stripe_dicts, orc_table.d_columns, stream);
   // Copy the entry counts and char counts from the device to the host
   stripe_dicts.device_to_host_sync(stream);
@@ -2169,8 +2165,8 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table,
   // Synchronize to ensure the copy is complete before we clear `map_slots`
   stripe_dicts.host_to_device_sync(stream);

-  gpu::collect_map_entries(stripe_dicts, stream);
-  gpu::get_dictionary_indices(stripe_dicts, orc_table.d_columns, stream);
+  collect_map_entries(stripe_dicts, stream);
+  get_dictionary_indices(stripe_dicts, orc_table.d_columns, stream);

   // deallocate hash map storage, unused after this point
   map_storage.reset();
@@ -2299,7 +2295,7 @@ auto convert_table_to_orc_data(table_view const& input,
   // Assemble individual disparate column chunks into contiguous data streams
   size_type const num_index_streams = (orc_table.num_columns() + 1);
   auto const num_data_streams       = streams.size() - num_index_streams;
-  hostdevice_2dvector<gpu::StripeStream> strm_descs(
+  hostdevice_2dvector<StripeStream> strm_descs(
     segmentation.num_stripes(), num_data_streams, stream);
   auto stripes = gather_stripes(num_index_streams, segmentation, &enc_data, &strm_descs, stream);
@@ -2353,17 +2349,17 @@
                                          compression_result{0, compression_status::FAILURE});
   if (compression != compression_type::NONE) {
     strm_descs.host_to_device_async(stream);
-    compression_stats = gpu::CompressOrcDataStreams(compressed_data,
-                                                    num_compressed_blocks,
-                                                    compression,
-                                                    compression_blocksize,
-                                                    max_compressed_block_size,
-                                                    block_align,
-                                                    collect_compression_stats,
-                                                    strm_descs,
-                                                    enc_data.streams,
-                                                    comp_results,
-                                                    stream);
+    compression_stats = CompressOrcDataStreams(compressed_data,
+                                               num_compressed_blocks,
+                                               compression,
+                                               compression_blocksize,
+                                               max_compressed_block_size,
+                                               block_align,
+                                               collect_compression_stats,
+                                               strm_descs,
+                                               enc_data.streams,
+                                               comp_results,
+                                               stream);

     // deallocate encoded data as it is not needed anymore
     enc_data.data.clear();
@@ -2535,7 +2531,7 @@ void writer::impl::write_orc_data_to_sink(encoded_data const& enc_data,
                                           orc_table_view const& orc_table,
                                           device_span<uint8_t const> compressed_data,
                                           host_span<compression_result const> comp_results,
-                                          host_2dspan<gpu::StripeStream const> strm_descs,
+                                          host_2dspan<StripeStream const> strm_descs,
                                           host_span<ColStatsBlob const> rg_stats,
                                           orc_streams& streams,
                                           host_span<StripeInformation> stripes,
diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp
index 7d23482cb17..b6a27d5a6c5 100644
--- a/cpp/src/io/orc/writer_impl.hpp
+++ b/cpp/src/io/orc/writer_impl.hpp
@@ -136,7 +136,7 @@ struct file_segmentation {
  */
 struct encoded_data {
   std::vector<std::vector<rmm::device_uvector<uint8_t>>> data;  // Owning array of the encoded data
-  hostdevice_2dvector<gpu::encoder_chunk_streams> streams;      // streams of encoded data, per chunk
+  hostdevice_2dvector<encoder_chunk_streams> streams;           // streams of encoded data, per chunk
 };

 /**
@@ -309,7 +309,7 @@ class writer::impl {
                               orc_table_view const& orc_table,
                               device_span<uint8_t const> compressed_data,
                               host_span<compression_result const> comp_results,
-                              host_2dspan<gpu::StripeStream const> strm_descs,
+                              host_2dspan<StripeStream const> strm_descs,
                               host_span<ColStatsBlob const> rg_stats,
                               orc_streams& streams,
                               host_span<StripeInformation> stripes,

From f1c2f2a679403a796e1da28c9b436f3fe37c84a9 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR
Date: Mon, 3 Feb 2025 09:05:44 -0600
Subject: [PATCH 16/17] Fix third-party `cudf.pandas` tests (#17900)

## Description

This PR fixes the cudf CI nightly test failures seen in
https://github.com/rapidsai/cudf/actions/runs/13097249137/job/36541039646.
The third-party integration tests now key recorded results on the full test
node ID, read all matching result pickle files via a glob instead of a single
hard-coded file, match the `cudf.pandas` plugin name exactly, and pass the
full CUDA version (rather than just the major version) to the dependency
matrix.
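As a reviewer aid, here is a minimal, self-contained sketch of the result-file round-trip this PR fixes. The file names are hypothetical, and this `read_results` is a stand-in for the existing conftest.py helper (assumed to yield `(node_id, result)` tuples from a pickle stream); the point is that the compare step now merges every matching result file rather than reading one hard-coded path:

```python
import glob
import pickle

def read_results(f):
    # Stand-in for the conftest.py helper: yield (node_id, result) tuples
    # until the pickle stream is exhausted.
    while True:
        try:
            yield pickle.load(f)
        except EOFError:
            return

def read_all_results(pattern):
    # Merge every result file matching the pattern into one dict, mirroring
    # the new conftest.py behavior.
    results = {}
    for filepath in glob.glob(pattern):
        with open(filepath, "rb") as f:
            results.update(dict(read_results(f)))
    return results

# Two hypothetical per-run result files...
with open("results-gold-dask.pickle", "wb") as f:
    pickle.dump(("tests/test_dask.py::test_sum", 6.0), f)
with open("results-gold-cuml.pickle", "wb") as f:
    pickle.dump(("tests/test_cuml.py::test_fit", 0.99), f)

# ...are all picked up by the glob, where a single-file read would miss one.
merged = read_all_results("results-gold-*.pickle")
assert len(merged) == 2
```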
## Checklist

- [x] I am familiar with the [Contributing Guidelines](https://github.com/rapidsai/cudf/blob/HEAD/CONTRIBUTING.md).
- [x] New or existing tests cover these changes.
- [x] The documentation is up to date with these changes.
---
 .../run-library-tests.sh                      |  1 +
 .../third-party-integration/test.sh           |  4 +-
 .../tests/conftest.py                         | 40 ++++++++++++++-----
 3 files changed, 32 insertions(+), 13 deletions(-)

diff --git a/ci/cudf_pandas_scripts/third-party-integration/run-library-tests.sh b/ci/cudf_pandas_scripts/third-party-integration/run-library-tests.sh
index ce3291b864a..ed564a39745 100755
--- a/ci/cudf_pandas_scripts/third-party-integration/run-library-tests.sh
+++ b/ci/cudf_pandas_scripts/third-party-integration/run-library-tests.sh
@@ -13,6 +13,7 @@ runtest() {
   local lib=$1
   local mode=$2

+  echo "Running tests for $lib in $mode mode"
   local plugin=""
   if [ "$mode" = "cudf" ]; then
     plugin="-p cudf.pandas"
diff --git a/ci/cudf_pandas_scripts/third-party-integration/test.sh b/ci/cudf_pandas_scripts/third-party-integration/test.sh
index c6f5844427d..cf0a16fb3cb 100755
--- a/ci/cudf_pandas_scripts/third-party-integration/test.sh
+++ b/ci/cudf_pandas_scripts/third-party-integration/test.sh
@@ -27,7 +27,7 @@ main() {
     lib=$(echo "$lib" | tr -d '"')
     echo "Running tests for library $lib"

-    CUDA_MAJOR=$(if [ "$lib" = "tensorflow" ]; then echo "11"; else echo "12"; fi)
+    CUDA_VERSION=$(if [ "$lib" = "tensorflow" ]; then echo "11.8"; else echo "${RAPIDS_CUDA_VERSION%.*}"; fi)

     . /opt/conda/etc/profile.d/conda.sh
@@ -36,7 +36,7 @@ main() {
       --config "$dependencies_yaml" \
       --output conda \
       --file-key "test_${lib}" \
-      --matrix "cuda=${CUDA_MAJOR};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml
+      --matrix "cuda=${CUDA_VERSION};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml

     rapids-mamba-retry env create --yes -f env.yaml -n test
diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/conftest.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/conftest.py
index 33b6ffdbd5c..553d9c4459e 100644
--- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/conftest.py
+++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/conftest.py
@@ -1,7 +1,8 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+# Copyright (c) 2023-2025, NVIDIA CORPORATION.

 from __future__ import annotations

+import glob
 import os
 import pickle
 from typing import TYPE_CHECKING, BinaryIO
@@ -75,23 +76,40 @@ def swap_xfail(item: _pytest.nodes.Item, name: str):
         swap_xfail(item, "xfail_compare")


+def get_full_nodeid(pyfuncitem):
+    # Get the full path to the test file
+    filepath = pyfuncitem.path
+    # Get the test name and any parameters
+    test_name = "::".join(pyfuncitem.nodeid.split("::")[1:])
+    # Combine the full file path with the test name
+    full_nodeid = f"{filepath}::{test_name}"
+    return full_nodeid
+
+
+def read_all_results(pattern):
+    results = {}
+    for filepath in glob.glob(pattern):
+        with open(filepath, "rb") as f:
+            results.update(dict(read_results(f)))
+    return results
+
+
 def pytest_configure(config: _pytest.config.Config):
     gold_basename = "results-gold"
     cudf_basename = "results-cudf-pandas"
     test_folder = os.path.join(os.path.dirname(__file__))

     if config.getoption("--compare"):
-        # Everyone reads everything
-        gold_path = os.path.join(test_folder, f"{gold_basename}.pickle")
-        cudf_path = os.path.join(test_folder, f"{cudf_basename}.pickle")
+        gold_path = os.path.join(test_folder, f"{gold_basename}*.pickle")
+        cudf_path = os.path.join(test_folder, f"{cudf_basename}*.pickle")
         with disable_module_accelerator():
-            with open(gold_path, "rb") as f:
-                gold_results = dict(read_results(f))
-            with open(cudf_path, "rb") as f:
-                cudf_results = dict(read_results(f))
+            gold_results = read_all_results(gold_path)
+            cudf_results = read_all_results(cudf_path)
         config.stash[results] = (gold_results, cudf_results)
     else:
-        if "cudf.pandas" in config.option.plugins:
+        if any(
+            plugin.strip() == "cudf.pandas" for plugin in config.option.plugins
+        ):
             basename = cudf_basename
         else:
             basename = gold_basename
@@ -112,7 +130,7 @@ def pytest_configure(config: _pytest.config.Config):
 def pytest_pyfunc_call(pyfuncitem: _pytest.python.Function):
     if pyfuncitem.config.getoption("--compare"):
         gold_results, cudf_results = pyfuncitem.config.stash[results]
-        key = pyfuncitem.nodeid
+        key = get_full_nodeid(pyfuncitem)
         try:
             gold = gold_results[key]
         except KeyError:
@@ -140,7 +158,7 @@ def pytest_pyfunc_call(pyfuncitem: _pytest.python.Function):
         # Tuple-based key-value pairs, key is the node-id
         try:
             pickle.dump(
-                (pyfuncitem.nodeid, result),
+                (get_full_nodeid(pyfuncitem), result),
                 pyfuncitem.config.stash[file_handle_key],
             )
         except pickle.PicklingError:

From d4bda07fee6280d8454c9f318b0e28e61782559c Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR
Date: Mon, 3 Feb 2025 15:36:12 -0600
Subject: [PATCH 17/17] Fix `numpy` data access by making attribute private (#17890)

The `data` attribute of a NumPy array should be marked private because it
points to the array's underlying memory buffer, which will be distinct for a
CuPy array.
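A minimal sketch of the mismatch being hidden here (the CuPy half is commented
out since it requires a GPU; `cp.asarray` is standard CuPy, not part of this
change):

```python
import numpy as np

arr = np.array([1, 2, 3])

# On a NumPy array, `.data` is a memoryview over the host buffer backing
# the array, so it is tied to that specific allocation.
assert isinstance(arr.data, memoryview)

# On a CuPy array, the same attribute is a cupy.cuda.MemoryPointer rather
# than a memoryview, so publicly forwarding `data` would expose whichever
# backing array the proxy currently holds.
# import cupy as cp
# assert not isinstance(cp.asarray(arr).data, memoryview)
```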
Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/17890
---
 python/cudf/cudf/pandas/_wrappers/numpy.py        | 3 ++-
 python/cudf/cudf_pandas_tests/test_cudf_pandas.py | 9 +++++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/pandas/_wrappers/numpy.py b/python/cudf/cudf/pandas/_wrappers/numpy.py
index d5e669cb58f..1fc53bbbaae 100644
--- a/python/cudf/cudf/pandas/_wrappers/numpy.py
+++ b/python/cudf/cudf/pandas/_wrappers/numpy.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES.
 # All rights reserved.
 # SPDX-License-Identifier: Apache-2.0

@@ -149,6 +149,7 @@ def ndarray__array_ufunc__(self, ufunc, method, *inputs, **kwargs):
         # Special wrapping to handle scalar values
         "_fsproxy_wrap": classmethod(wrap_ndarray),
         "base": _FastSlowAttribute("base", private=True),
+        "data": _FastSlowAttribute("data", private=True),
     },
 )

diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
index 938d22de076..3e8b6d5786c 100644
--- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
+++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
@@ -1927,3 +1927,12 @@ def test_series_dtype_property():
     expected = np.dtype(s)
     actual = np.dtype(xs)
     assert expected == actual
+
+
+def test_numpy_data_access():
+    s = pd.Series([1, 2, 3])
+    xs = xpd.Series([1, 2, 3])
+    expected = s.values.data
+    actual = xs.values.data
+
+    assert type(expected) is type(actual)