diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json index e793dda3823..8c2226bfa98 100644 --- a/.devcontainer/cuda11.8-conda/devcontainer.json +++ b/.devcontainer/cuda11.8-conda/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:25.02-cpp-cuda11.8-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:25.04-cpp-cuda11.8-mambaforge-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.02-cuda11.8-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.04-cuda11.8-conda" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.2": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.4": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json index ba959f2bc27..a0c2e65b337 100644 --- a/.devcontainer/cuda11.8-pip/devcontainer.json +++ b/.devcontainer/cuda11.8-pip/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:25.02-cpp-cuda11.8-ubuntu22.04" + "BASE": "rapidsai/devcontainers:25.04-cpp-cuda11.8-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.02-cuda11.8-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.04-cuda11.8-pip" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.2": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.4": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda12.8-conda/devcontainer.json b/.devcontainer/cuda12.8-conda/devcontainer.json index 5a544e26145..3977a1c5f86 100644 --- a/.devcontainer/cuda12.8-conda/devcontainer.json +++ b/.devcontainer/cuda12.8-conda/devcontainer.json @@ -5,45 +5,55 @@ "args": { "CUDA": "12.8", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:25.02-cpp-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:25.04-cpp-mambaforge-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.02-cuda12.8-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.04-cuda12.8-conda" ], - "hostRequirements": {"gpu": "optional"}, + "hostRequirements": { + "gpu": "optional" + }, "features": { - "ghcr.io/rapidsai/devcontainers/features/cuda:25.2": { - "version": "12.8", - "installCompilers": false, - "installProfilers": true, - "installDevPackages": false, - "installcuDNN": false, - "installcuTensor": false, - "installNCCL": false, - "installCUDARuntime": false, - "installNVRTC": false, - "installOpenCL": false, - "installcuBLAS": false, - "installcuSPARSE": false, - "installcuFFT": false, - "installcuFile": false, - "installcuRAND": false, - "installcuSOLVER": false, - "installNPP": false, - "installnvJPEG": false, - "pruneStaticLibs": true - }, - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.2": {} + "ghcr.io/rapidsai/devcontainers/features/cuda:25.4": { + "version": "12.8", + "installCompilers": false, + "installProfilers": true, + "installDevPackages": false, + "installcuDNN": 
false, + "installcuTensor": false, + "installNCCL": false, + "installCUDARuntime": false, + "installNVRTC": false, + "installOpenCL": false, + "installcuBLAS": false, + "installcuSPARSE": false, + "installcuFFT": false, + "installcuFile": false, + "installcuRAND": false, + "installcuSOLVER": false, + "installNPP": false, + "installnvJPEG": false, + "pruneStaticLibs": true + }, + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.4": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/cuda", "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" ], - "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.8-envs}"], - "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi"], + "initializeCommand": [ + "/bin/bash", + "-c", + "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.8-envs}" + ], + "postAttachCommand": [ + "/bin/bash", + "-c", + "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi" + ], "workspaceFolder": "/home/coder", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cudf,type=bind,consistency=consistent", "mounts": [ diff --git a/.devcontainer/cuda12.8-pip/devcontainer.json b/.devcontainer/cuda12.8-pip/devcontainer.json index e9ffa620bb0..ab83fa88300 100644 --- a/.devcontainer/cuda12.8-pip/devcontainer.json +++ b/.devcontainer/cuda12.8-pip/devcontainer.json @@ -5,23 +5,33 @@ "args": { "CUDA": "12.8", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:25.02-cpp-cuda12.8-ubuntu22.04" + "BASE": "rapidsai/devcontainers:25.04-cpp-cuda12.8-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.02-cuda12.8-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.04-cuda12.8-pip" ], - "hostRequirements": {"gpu": "optional"}, + "hostRequirements": { + "gpu": "optional" + }, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.2": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.4": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" ], - "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda12.8-venvs}"], - "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi"], + "initializeCommand": [ + "/bin/bash", + "-c", + "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda12.8-venvs}" + ], + "postAttachCommand": [ + "/bin/bash", + "-c", + "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . 
rapids-post-attach-command; fi" + ], "workspaceFolder": "/home/coder", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cudf,type=bind,consistency=consistent", "mounts": [ diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index f0be4cb509d..9bcd3a65a9d 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@nvks-runners with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@nvks-runners with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@nvks-runners with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -57,19 +57,19 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@nvks-runners with: arch: "amd64" branch: ${{ inputs.branch }} build_type: ${{ inputs.build_type || 'branch' }} container_image: "rapidsai/ci-conda:latest" date: ${{ inputs.date }} - node_type: "gpu-v100-latest-1" + node_type: "gpu-l4-latest-1" run_script: "ci/build_docs.sh" sha: ${{ inputs.sha }} wheel-build-libcudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@nvks-runners with: # build for every combination of arch and CUDA version, but only for the latest Python matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) @@ -81,7 +81,7 @@ jobs: wheel-publish-libcudf: needs: wheel-build-libcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@nvks-runners with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -92,7 +92,7 @@ jobs: wheel-build-pylibcudf: needs: [wheel-build-libcudf] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@nvks-runners with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -102,7 +102,7 @@ jobs: wheel-publish-pylibcudf: needs: wheel-build-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@nvks-runners with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -113,7 +113,7 @@ jobs: wheel-build-cudf: needs: wheel-build-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.8.0 + 
uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@nvks-runners with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -123,7 +123,7 @@ jobs: wheel-publish-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@nvks-runners with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -134,7 +134,7 @@ jobs: wheel-build-dask-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@nvks-runners with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -146,7 +146,7 @@ jobs: wheel-publish-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@nvks-runners with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -157,7 +157,7 @@ jobs: wheel-build-cudf-polars: needs: wheel-build-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@nvks-runners with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -169,7 +169,7 @@ jobs: wheel-publish-cudf-polars: needs: wheel-build-cudf-polars secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@nvks-runners with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pandas-tests.yaml b/.github/workflows/pandas-tests.yaml index bd5e87a0a0b..8730ae43ddf 100644 --- a/.github/workflows/pandas-tests.yaml +++ b/.github/workflows/pandas-tests.yaml @@ -17,7 +17,7 @@ jobs: pandas-tests: # run the Pandas unit tests secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@nvks-runners with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 6df5019b2aa..34fcbc14420 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -42,7 +42,7 @@ jobs: - pandas-tests-diff - telemetry-setup secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@nvks-runners if: always() with: needs: ${{ toJSON(needs) }} @@ -70,7 +70,7 @@ jobs: changed-files: secrets: inherit needs: telemetry-setup - uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@nvks-runners with: files_yaml: | test_cpp: @@ -123,48 +123,48 @@ jobs: checks: secrets: inherit needs: telemetry-setup - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@nvks-runners with: enable_check_generated_files: false ignored_pr_jobs: "telemetry-summarize" conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@nvks-runners with: build_type: pull-request node_type: "cpu16" cpp-linters: secrets: inherit needs: checks - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@nvks-runners with: build_type: pull-request run_script: "ci/cpp_linters.sh" conda-cpp-checks: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@nvks-runners with: build_type: pull-request enable_check_symbols: true conda-cpp-tests: needs: [conda-cpp-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@nvks-runners if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_cpp with: build_type: pull-request conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@nvks-runners with: build_type: pull-request conda-python-cudf-tests: needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@nvks-runners if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request @@ -173,7 +173,7 @@ jobs: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@nvks-runners if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request @@ -181,18 +181,18 @@ jobs: conda-java-tests: 
needs: [conda-cpp-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@nvks-runners if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_java with: build_type: pull-request - node_type: "gpu-v100-latest-1" + node_type: "gpu-l4-latest-1" arch: "amd64" container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_java.sh" static-configure: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@nvks-runners with: build_type: pull-request # Use the wheel container so we can skip conda solves and since our @@ -202,28 +202,28 @@ jobs: conda-notebook-tests: needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@nvks-runners if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_notebooks with: build_type: pull-request - node_type: "gpu-v100-latest-1" + node_type: "gpu-l4-latest-1" arch: "amd64" container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_notebooks.sh" docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@nvks-runners with: build_type: pull-request - node_type: "gpu-v100-latest-1" + node_type: "gpu-l4-latest-1" arch: "amd64" container_image: "rapidsai/ci-conda:latest" run_script: "ci/build_docs.sh" wheel-build-libcudf: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@nvks-runners with: # build for every combination of arch and CUDA version, but only for the latest Python matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) @@ -233,21 +233,21 @@ jobs: wheel-build-pylibcudf: needs: [checks, wheel-build-libcudf] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@nvks-runners with: build_type: pull-request script: "ci/build_wheel_pylibcudf.sh" wheel-build-cudf: needs: wheel-build-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@nvks-runners with: build_type: pull-request script: "ci/build_wheel_cudf.sh" wheel-tests-cudf: needs: [wheel-build-cudf, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@nvks-runners if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request @@ -255,7 +255,7 @@ jobs: wheel-build-cudf-polars: needs: wheel-build-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@nvks-runners with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -264,7 +264,7 @@ jobs: wheel-tests-cudf-polars: needs: [wheel-build-cudf-polars, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@nvks-runners if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". @@ -274,7 +274,7 @@ jobs: cudf-polars-polars-tests: needs: wheel-build-cudf-polars secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@nvks-runners with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -283,7 +283,7 @@ jobs: wheel-build-dask-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@nvks-runners with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -292,7 +292,7 @@ jobs: wheel-tests-dask-cudf: needs: [wheel-build-dask-cudf, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@nvks-runners if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". @@ -302,7 +302,7 @@ jobs: devcontainer: secrets: inherit needs: telemetry-setup - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@nvks-runners with: node_type: "cpu32" arch: '["amd64"]' @@ -314,7 +314,7 @@ jobs: unit-tests-cudf-pandas: needs: [wheel-build-cudf, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@nvks-runners if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python || fromJSON(needs.changed-files.outputs.changed_file_groups).test_cudf_pandas with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". @@ -325,7 +325,7 @@ jobs: # run the Pandas unit tests using PR branch needs: [wheel-build-cudf, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@nvks-runners if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python || fromJSON(needs.changed-files.outputs.changed_file_groups).test_cudf_pandas with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
@@ -337,7 +337,7 @@ jobs: pandas-tests-diff: # diff the results of running the Pandas unit tests and publish a job summary needs: pandas-tests - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@nvks-runners with: node_type: "cpu4" build_type: pull-request diff --git a/.github/workflows/pr_issue_status_automation.yml b/.github/workflows/pr_issue_status_automation.yml index 8ad6448bc27..b1bd2d4e768 100644 --- a/.github/workflows/pr_issue_status_automation.yml +++ b/.github/workflows/pr_issue_status_automation.yml @@ -23,7 +23,7 @@ on: jobs: get-project-id: - uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@nvks-runners if: github.event.pull_request.state == 'open' secrets: inherit permissions: @@ -34,7 +34,7 @@ jobs: update-status: # This job sets the PR and its linked issues to "In Progress" status - uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@nvks-runners if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} needs: get-project-id with: @@ -50,7 +50,7 @@ jobs: update-sprint: # This job sets the PR and its linked issues to the current "Weekly Sprint" - uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@nvks-runners if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} needs: get-project-id with: @@ -79,7 +79,7 @@ jobs: update-release: # This job sets the PR and its linked issues to the release they are targeting - uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@nvks-runners if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} needs: [get-project-id, process-branch-name] with: diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index d909fd4a657..b6b2caddeb8 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@nvks-runners with: build_type: nightly branch: ${{ inputs.branch }} @@ -25,7 +25,7 @@ jobs: enable_check_symbols: true conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@nvks-runners with: build_type: nightly branch: ${{ inputs.branch }} @@ -33,19 +33,19 @@ jobs: sha: ${{ inputs.sha }} conda-cpp-memcheck-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@nvks-runners with: build_type: nightly branch: ${{ inputs.branch }} date: ${{ inputs.date }} sha: ${{ inputs.sha }} - node_type: "gpu-v100-latest-1" + node_type: "gpu-l4-latest-1" arch: "amd64" 
container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_cpp_memcheck.sh" static-configure: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@nvks-runners with: build_type: nightly branch: ${{ inputs.branch }} @@ -57,7 +57,7 @@ jobs: run_script: "ci/configure_cpp_static.sh" cpp-linters: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@nvks-runners with: build_type: nightly branch: ${{ inputs.branch }} @@ -67,7 +67,7 @@ jobs: file_to_upload: iwyu_results.txt conda-python-cudf-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@nvks-runners with: build_type: nightly branch: ${{ inputs.branch }} @@ -77,7 +77,7 @@ jobs: conda-python-other-tests: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@nvks-runners with: build_type: nightly branch: ${{ inputs.branch }} @@ -86,31 +86,31 @@ jobs: script: "ci/test_python_other.sh" conda-java-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@nvks-runners with: build_type: nightly branch: ${{ inputs.branch }} date: ${{ inputs.date }} sha: ${{ inputs.sha }} - node_type: "gpu-v100-latest-1" + node_type: "gpu-l4-latest-1" arch: "amd64" container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_java.sh" conda-notebook-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@nvks-runners with: build_type: nightly branch: ${{ inputs.branch }} date: ${{ inputs.date }} sha: ${{ inputs.sha }} - node_type: "gpu-v100-latest-1" + node_type: "gpu-l4-latest-1" arch: "amd64" container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_notebooks.sh" wheel-tests-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@nvks-runners with: build_type: nightly branch: ${{ inputs.branch }} @@ -119,7 +119,7 @@ jobs: script: ci/test_wheel_cudf.sh wheel-tests-dask-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@nvks-runners with: build_type: nightly branch: ${{ inputs.branch }} @@ -128,7 +128,7 @@ jobs: script: ci/test_wheel_dask_cudf.sh unit-tests-cudf-pandas: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@nvks-runners with: build_type: nightly branch: ${{ inputs.branch }} @@ -137,19 +137,19 @@ jobs: script: ci/cudf_pandas_scripts/run_tests.sh third-party-integration-tests-cudf-pandas: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@nvks-runners with: build_type: nightly branch: ${{ inputs.branch 
}} date: ${{ inputs.date }} sha: ${{ inputs.sha }} - node_type: "gpu-v100-latest-1" + node_type: "gpu-l4-latest-1" container_image: "rapidsai/ci-conda:latest" run_script: | ci/cudf_pandas_scripts/third-party-integration/test.sh python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml wheel-tests-cudf-polars: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@nvks-runners with: build_type: nightly branch: ${{ inputs.branch }} @@ -158,7 +158,7 @@ jobs: script: "ci/test_wheel_cudf_polars.sh" cudf-polars-polars-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@nvks-runners with: build_type: nightly branch: ${{ inputs.branch }} diff --git a/.github/workflows/trigger-breaking-change-alert.yaml b/.github/workflows/trigger-breaking-change-alert.yaml index 07f0f83cc92..7b5b4810fb6 100644 --- a/.github/workflows/trigger-breaking-change-alert.yaml +++ b/.github/workflows/trigger-breaking-change-alert.yaml @@ -12,7 +12,7 @@ jobs: trigger-notifier: if: contains(github.event.pull_request.labels.*.name, 'breaking') secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@nvks-runners with: sender_login: ${{ github.event.sender.login }} sender_avatar: ${{ github.event.sender.avatar_url }} diff --git a/README.md b/README.md index 4e917e15b4e..a240d6c2aa9 100644 --- a/README.md +++ b/README.md @@ -83,7 +83,7 @@ cuDF can be installed with conda (via [miniforge](https://github.com/conda-forge ```bash conda install -c rapidsai -c conda-forge -c nvidia \ - cudf=25.02 python=3.12 cuda-version=12.8 + cudf=25.04 python=3.12 cuda-version=12.8 ``` We also provide [nightly Conda packages](https://anaconda.org/rapidsai-nightly) built from the HEAD diff --git a/VERSION b/VERSION index 72eefaf7c79..b922658ff3f 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -25.02.00 +25.04.00 diff --git a/ci/cudf_pandas_scripts/third-party-integration/run-library-tests.sh b/ci/cudf_pandas_scripts/third-party-integration/run-library-tests.sh index ce3291b864a..ed564a39745 100755 --- a/ci/cudf_pandas_scripts/third-party-integration/run-library-tests.sh +++ b/ci/cudf_pandas_scripts/third-party-integration/run-library-tests.sh @@ -13,6 +13,7 @@ runtest() { local lib=$1 local mode=$2 + echo "Running tests for $lib in $mode mode" local plugin="" if [ "$mode" = "cudf" ]; then plugin="-p cudf.pandas" diff --git a/ci/cudf_pandas_scripts/third-party-integration/test.sh b/ci/cudf_pandas_scripts/third-party-integration/test.sh index 43ed3594917..cf0a16fb3cb 100755 --- a/ci/cudf_pandas_scripts/third-party-integration/test.sh +++ b/ci/cudf_pandas_scripts/third-party-integration/test.sh @@ -11,7 +11,6 @@ extract_lib_from_dependencies_yaml() { # Parse all keys in dependencies.yaml under the "files" section, # extract all the keys that start with "test_", and extract the rest extracted_libs="$(yq -o json "$file" | jq -rc '.files | with_entries(select(.key | contains("test_"))) | keys | map(sub("^test_"; ""))')" - local extracted_libs echo "$extracted_libs" } @@ -28,7 +27,7 @@ main() { lib=$(echo "$lib" | tr -d '""') echo "Running tests for library $lib" - CUDA_MAJOR=$(if [ "$lib" = "tensorflow" ]; then echo "11"; else echo "12"; fi) + CUDA_VERSION=$(if [ "$lib" = 
"tensorflow" ]; then echo "11.8"; else echo "${RAPIDS_CUDA_VERSION%.*}"; fi) . /opt/conda/etc/profile.d/conda.sh @@ -37,7 +36,7 @@ main() { --config "$dependencies_yaml" \ --output conda \ --file-key "test_${lib}" \ - --matrix "cuda=${CUDA_MAJOR};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml + --matrix "cuda=${CUDA_VERSION};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml rapids-mamba-retry env create --yes -f env.yaml -n test diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index cc01f5286ef..190533abc51 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -26,7 +26,7 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.3 -- dask-cuda==25.2.*,>=0.0.0a0 +- dask-cuda==25.4.*,>=0.0.0a0 - dlpack>=0.8,<1.0 - doxygen=1.9.1 - fastavro>=0.22.9 @@ -42,9 +42,9 @@ dependencies: - libcufile=1.4.0.31 - libcurand-dev=10.3.0.86 - libcurand=10.3.0.86 -- libkvikio==25.2.*,>=0.0.0a0 +- libkvikio==25.4.*,>=0.0.0a0 - librdkafka>=2.5.0,<2.6.0a0 -- librmm==25.2.*,>=0.0.0a0 +- librmm==25.4.*,>=0.0.0a0 - make - mmh3 - moto>=4.0.8 @@ -82,9 +82,9 @@ dependencies: - python-xxhash - python>=3.10,<3.13 - rapids-build-backend>=0.3.0,<0.4.0.dev0 -- rapids-dask-dependency==25.2.*,>=0.0.0a0 +- rapids-dask-dependency==25.4.*,>=0.0.0a0 - rich -- rmm==25.2.*,>=0.0.0a0 +- rmm==25.4.*,>=0.0.0a0 - s3fs>=2022.3.0 - scikit-build-core>=0.10.0 - scipy diff --git a/conda/environments/all_cuda-128_arch-x86_64.yaml b/conda/environments/all_cuda-128_arch-x86_64.yaml index f4cdbed9be6..e719fd51573 100644 --- a/conda/environments/all_cuda-128_arch-x86_64.yaml +++ b/conda/environments/all_cuda-128_arch-x86_64.yaml @@ -27,7 +27,7 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.3 -- dask-cuda==25.2.*,>=0.0.0a0 +- dask-cuda==25.4.*,>=0.0.0a0 - dlpack>=0.8,<1.0 - doxygen=1.9.1 - fastavro>=0.22.9 @@ -41,9 +41,9 @@ dependencies: - jupyter_client - libcufile-dev - libcurand-dev -- libkvikio==25.2.*,>=0.0.0a0 +- libkvikio==25.4.*,>=0.0.0a0 - librdkafka>=2.5.0,<2.6.0a0 -- librmm==25.2.*,>=0.0.0a0 +- librmm==25.4.*,>=0.0.0a0 - make - mmh3 - moto>=4.0.8 @@ -81,9 +81,9 @@ dependencies: - python>=3.10,<3.13 - pytorch>=2.4.0 - rapids-build-backend>=0.3.0,<0.4.0.dev0 -- rapids-dask-dependency==25.2.*,>=0.0.0a0 +- rapids-dask-dependency==25.4.*,>=0.0.0a0 - rich -- rmm==25.2.*,>=0.0.0a0 +- rmm==25.4.*,>=0.0.0a0 - s3fs>=2022.3.0 - scikit-build-core>=0.10.0 - scipy diff --git a/cpp/examples/versions.cmake b/cpp/examples/versions.cmake index 13e0cf81625..c6c07dbc150 100644 --- a/cpp/examples/versions.cmake +++ b/cpp/examples/versions.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -12,4 +12,4 @@ # the License. # ============================================================================= -set(CUDF_TAG branch-25.02) +set(CUDF_TAG branch-25.04) diff --git a/cpp/include/cudf/io/orc_metadata.hpp b/cpp/include/cudf/io/orc_metadata.hpp index 3c6194bb721..a9045d460b3 100644 --- a/cpp/include/cudf/io/orc_metadata.hpp +++ b/cpp/include/cudf/io/orc_metadata.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -171,12 +171,12 @@ using statistics_type = std::variant<no_statistics, integer_statistics, double_statistics, string_statistics, bucket_statistics, decimal_statistics, date_statistics, binary_statistics, timestamp_statistics>; //! Orc I/O interfaces -namespace orc { +namespace orc::detail { // forward declare the type that ProtobufReader uses. The `cudf::io::column_statistics` objects, // returned from `read_parsed_orc_statistics`, are constructed from -// `cudf::io::orc::column_statistics` objects that `ProtobufReader` initializes. +// `cudf::io::orc::detail::column_statistics` objects that `ProtobufReader` initializes. struct column_statistics; -} // namespace orc +} // namespace orc::detail /** * @brief Contains per-column ORC statistics. @@ -194,7 +194,7 @@ struct column_statistics { * * @param detail_statistics The statistics to initialize the object with */ - column_statistics(orc::column_statistics&& detail_statistics); + column_statistics(orc::detail::column_statistics&& detail_statistics); }; /** diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp index a34881942ce..9e171a62f78 100644 --- a/cpp/include/cudf/io/types.hpp +++ b/cpp/include/cudf/io/types.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -277,13 +277,24 @@ struct column_name_info { struct table_metadata { std::vector<column_name_info> schema_info; //!< Detailed name information for the entire output hierarchy - std::vector<size_t> num_rows_per_source; //!< Number of rows read from each data source. + std::vector<size_t> num_rows_per_source; //!< Number of rows read from each data source //!< Currently only computed for Parquet readers if no - //!< AST filters being used. Empty vector otherwise. + //!< AST filters being used. Empty vector otherwise std::map<std::string, std::string> user_data; //!< Format-dependent metadata of the first input //!< file as key-values pairs (deprecated) std::vector<std::unordered_map<std::string, std::string>> per_file_user_data; //!< Per file format-dependent metadata as key-values pairs + + // The following variables are currently only computed for Parquet reader + size_type num_input_row_groups{0}; //!< Total number of input row groups across all data sources + std::optional<size_type> + num_row_groups_after_stats_filter; //!< Number of remaining row groups after stats filter. + //!< std::nullopt if no filtering done. Currently only + //!< reported by Parquet readers + std::optional<size_type> + num_row_groups_after_bloom_filter; //!< Number of remaining row groups after bloom filter. + //!< std::nullopt if no filtering done. Currently only + //!< reported by Parquet readers }; /** diff --git a/cpp/include/cudf/unary.hpp b/cpp/include/cudf/unary.hpp index 046e9745a71..6dcd84d3f4d 100644 --- a/cpp/include/cudf/unary.hpp +++ b/cpp/include/cudf/unary.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2024, NVIDIA CORPORATION. + * Copyright (c) 2018-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -139,6 +139,7 @@ enum class unary_operator : int32_t { RINT, ///< Rounds the floating-point argument arg to an integer value BIT_INVERT, ///< Bitwise Not (~) NOT, ///< Logical Not (!) + NEGATE, ///< Unary negation (-), only for signed numeric and duration types.
}; /** diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp index 0f4bde204fa..168beb7fa9e 100644 --- a/cpp/include/cudf/utilities/traits.hpp +++ b/cpp/include/cudf/utilities/traits.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -94,6 +94,8 @@ constexpr inline bool has_common_type_v = detail::has_common_type_impl using is_timestamp_t = cuda::std::disjunction, + std::is_same, + std::is_same, std::is_same, std::is_same, std::is_same, @@ -102,6 +104,8 @@ using is_timestamp_t = cuda::std::disjunction /// Checks if a type is a duration type. template using is_duration_t = cuda::std::disjunction, + std::is_same, + std::is_same, std::is_same, std::is_same, std::is_same, diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp index ac81dd421fa..7c191b03350 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cpp +++ b/cpp/src/io/comp/nvcomp_adapter.cpp @@ -33,6 +33,49 @@ namespace cudf::io::detail::nvcomp { namespace { +[[nodiscard]] std::string nvcomp_status_to_string(nvcompStatus_t status) +{ + switch (status) { + case nvcompStatus_t::nvcompSuccess: return "nvcompSuccess"; + case nvcompStatus_t::nvcompErrorInvalidValue: return "nvcompErrorInvalidValue"; + case nvcompStatus_t::nvcompErrorNotSupported: return "nvcompErrorNotSupported"; + case nvcompStatus_t::nvcompErrorCannotDecompress: return "nvcompErrorCannotDecompress"; + case nvcompStatus_t::nvcompErrorBadChecksum: return "nvcompErrorBadChecksum"; + case nvcompStatus_t::nvcompErrorCannotVerifyChecksums: + return "nvcompErrorCannotVerifyChecksums"; + case nvcompStatus_t::nvcompErrorOutputBufferTooSmall: return "nvcompErrorOutputBufferTooSmall"; + case nvcompStatus_t::nvcompErrorWrongHeaderLength: return "nvcompErrorWrongHeaderLength"; + case nvcompStatus_t::nvcompErrorAlignment: return "nvcompErrorAlignment"; + case nvcompStatus_t::nvcompErrorChunkSizeTooLarge: return "nvcompErrorChunkSizeTooLarge"; + case nvcompStatus_t::nvcompErrorCudaError: return "nvcompErrorCudaError"; + case nvcompStatus_t::nvcompErrorInternal: return "nvcompErrorInternal"; + } + return "nvcompStatus_t(" + std::to_string(static_cast(status)) + ")"; +} + +[[nodiscard]] std::string compression_type_name(compression_type compression) +{ + switch (compression) { + case compression_type::SNAPPY: return "Snappy"; + case compression_type::ZSTD: return "Zstandard"; + case compression_type::DEFLATE: return "Deflate"; + case compression_type::LZ4: return "LZ4"; + case compression_type::GZIP: return "GZIP"; + } + return "compression_type(" + std::to_string(static_cast(compression)) + ")"; +} + +#define CHECK_NVCOMP_STATUS(status) \ + do { \ + CUDF_EXPECTS(status == nvcompStatus_t::nvcompSuccess, \ + "nvCOMP error: " + nvcomp_status_to_string(status)); \ + } while (0) + +#define UNSUPPORTED_COMPRESSION(compression) \ + do { \ + CUDF_FAIL("Unsupported compression type: " + compression_type_name(compression)); \ + } while (0) + // Dispatcher for nvcompBatchedDecompressGetTempSizeEx template auto batched_decompress_get_temp_size_ex(compression_type compression, Args&&... args) @@ -48,7 +91,7 @@ auto batched_decompress_get_temp_size_ex(compression_type compression, Args&&... 
return nvcompBatchedDeflateDecompressGetTempSizeEx(std::forward<Args>(args)...); case compression_type::GZIP: return nvcompBatchedGzipDecompressGetTempSizeEx(std::forward<Args>(args)...); - default: CUDF_FAIL("Unsupported compression type"); + default: UNSUPPORTED_COMPRESSION(compression); } } @@ -66,22 +109,10 @@ auto batched_decompress_async(compression_type compression, Args&&... args) { case compression_type::LZ4: return nvcompBatchedLZ4DecompressAsync(std::forward<Args>(args)...); case compression_type::GZIP: return nvcompBatchedGzipDecompressAsync(std::forward<Args>(args)...); - default: CUDF_FAIL("Unsupported compression type"); + default: UNSUPPORTED_COMPRESSION(compression); } } -[[maybe_unused]] std::string compression_type_name(compression_type compression) -{ - switch (compression) { - case compression_type::SNAPPY: return "Snappy"; - case compression_type::ZSTD: return "Zstandard"; - case compression_type::DEFLATE: return "Deflate"; - case compression_type::LZ4: return "LZ4"; - case compression_type::GZIP: return "GZIP"; - } - return "compression_type(" + std::to_string(static_cast<int>(compression)) + ")"; -} - size_t batched_compress_temp_size(compression_type compression, size_t batch_size, size_t max_uncompressed_chunk_bytes, @@ -118,11 +149,9 @@ size_t batched_compress_temp_size(compression_type compression, &temp_size, max_total_uncompressed_bytes); break; - default: CUDF_FAIL("Unsupported compression type"); + default: UNSUPPORTED_COMPRESSION(compression); } - - CUDF_EXPECTS(nvcomp_status == nvcompStatus_t::nvcompSuccess, - "Unable to get scratch size for compression"); + CHECK_NVCOMP_STATUS(nvcomp_status); return temp_size; } @@ -188,9 +217,9 @@ void batched_compress_async(compression_type compression, nvcompBatchedLZ4DefaultOpts, stream.value()); break; - default: CUDF_FAIL("Unsupported compression type"); + default: UNSUPPORTED_COMPRESSION(compression); } - CUDF_EXPECTS(nvcomp_status == nvcompStatus_t::nvcompSuccess, "Error in compression"); + CHECK_NVCOMP_STATUS(nvcomp_status); } bool is_aligned(void const* ptr, std::uintptr_t alignment) noexcept @@ -254,9 +283,7 @@ size_t batched_decompress_temp_size(compression_type compression, size_t temp_size = 0; nvcompStatus_t const nvcomp_status = batched_decompress_get_temp_size_ex( compression, num_chunks, max_uncomp_chunk_size, &temp_size, max_total_uncomp_size); - - CUDF_EXPECTS(nvcomp_status == nvcompStatus_t::nvcompSuccess, - "Unable to get scratch size for decompression"); + CHECK_NVCOMP_STATUS(nvcomp_status); return temp_size; } @@ -289,7 +316,7 @@ void batched_decompress(compression_type compression, nvcomp_args.output_data_ptrs.data(), nvcomp_statuses.data(), stream.value()); - CUDF_EXPECTS(nvcomp_status == nvcompStatus_t::nvcompSuccess, "unable to perform decompression"); + CHECK_NVCOMP_STATUS(nvcomp_status); update_compression_results(nvcomp_statuses, actual_uncompressed_data_sizes, results, stream); } @@ -321,11 +348,9 @@ size_t compress_max_output_chunk_size(compression_type compression, status = nvcompBatchedLZ4CompressGetMaxOutputChunkSize( capped_uncomp_bytes, nvcompBatchedLZ4DefaultOpts, &max_comp_chunk_size); break; - default: CUDF_FAIL("Unsupported compression type"); + default: UNSUPPORTED_COMPRESSION(compression); } - - CUDF_EXPECTS(status == nvcompStatus_t::nvcompSuccess, - "failed to get max uncompressed chunk size"); + CHECK_NVCOMP_STATUS(status); return max_comp_chunk_size; } @@ -463,7 +488,7 @@ size_t required_alignment(compression_type compression) case compression_type::SNAPPY: return nvcompSnappyRequiredAlignment; case 
compression_type::ZSTD: return nvcompZstdRequiredAlignment; case compression_type::LZ4: return nvcompLZ4RequiredAlignment; - default: CUDF_FAIL("Unsupported compression type"); + default: UNSUPPORTED_COMPRESSION(compression); } } @@ -474,7 +499,7 @@ std::optional<size_t> compress_max_allowed_chunk_size(compression_type compression) case compression_type::SNAPPY: return nvcompSnappyCompressionMaxAllowedChunkSize; case compression_type::ZSTD: return nvcompZstdCompressionMaxAllowedChunkSize; case compression_type::LZ4: return nvcompLZ4CompressionMaxAllowedChunkSize; - default: CUDF_FAIL("Unsupported compression type"); + default: UNSUPPORTED_COMPRESSION(compression); } } diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 69fd4068712..0d5bb8ac191 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -292,7 +292,7 @@ raw_orc_statistics read_raw_orc_statistics(source_info const& src_info, CUDF_FAIL("Unsupported source type"); } - orc::metadata const metadata(source.get(), stream); + orc::detail::metadata const metadata(source.get(), stream); // Initialize statistics to return raw_orc_statistics result; @@ -318,7 +318,7 @@ raw_orc_statistics read_raw_orc_statistics(source_info const& src_info, return result; } -column_statistics::column_statistics(orc::column_statistics&& cs) +column_statistics::column_statistics(orc::detail::column_statistics&& cs) { number_of_values = cs.number_of_values; has_null = cs.has_null; @@ -350,9 +350,9 @@ parsed_orc_statistics read_parsed_orc_statistics(source_info const& src_info, result.column_names = raw_stats.column_names; auto parse_column_statistics = [](auto const& raw_col_stats) { - orc::column_statistics stats_internal; - orc::ProtobufReader(reinterpret_cast<uint8_t const*>(raw_col_stats.c_str()), - raw_col_stats.size()) + orc::detail::column_statistics stats_internal; + orc::detail::ProtobufReader(reinterpret_cast<uint8_t const*>(raw_col_stats.c_str()), + raw_col_stats.size()) .read(stats_internal); return column_statistics(std::move(stats_internal)); }; @@ -373,7 +373,7 @@ parsed_orc_statistics read_parsed_orc_statistics(source_info const& src_info, return result; } namespace { -orc_column_schema make_orc_column_schema(host_span<orc::SchemaType const> orc_schema, +orc_column_schema make_orc_column_schema(host_span<orc::detail::SchemaType const> orc_schema, uint32_t column_id, std::string column_name) { @@ -400,7 +400,7 @@ orc_metadata read_orc_metadata(source_info const& src_info, rmm::cuda_stream_vie auto sources = make_datasources(src_info); CUDF_EXPECTS(sources.size() == 1, "Only a single source is currently supported."); - auto const footer = orc::metadata(sources.front().get(), stream).ff; + auto const footer = orc::detail::metadata(sources.front().get(), stream).ff; return {{make_orc_column_schema(footer.types, 0, "")}, footer.numberOfRows, diff --git a/cpp/src/io/orc/dict_enc.cu b/cpp/src/io/orc/dict_enc.cu index aeaa87e2202..be3c90a3e24 100644 --- a/cpp/src/io/orc/dict_enc.cu +++ b/cpp/src/io/orc/dict_enc.cu @@ -24,7 +24,7 @@ #include -namespace cudf::io::orc::gpu { +namespace cudf::io::orc::detail { /** * @brief Counts the number of characters in each rowgroup of each string column. @@ -266,4 +266,4 @@ void get_dictionary_indices(device_2dspan<stripe_dictionary const> dictionaries, <<<dim_grid, dim_block, 0, stream.value()>>>(dictionaries, columns); } -} // namespace cudf::io::orc::gpu +} // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/orc.cpp b/cpp/src/io/orc/orc.cpp index 7046b3b3f91..7ae32f3e8f8 100644 --- a/cpp/src/io/orc/orc.cpp +++ b/cpp/src/io/orc/orc.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. 
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,7 @@ #include "orc.hpp" +#include "io/comp/io_uncomp.hpp" #include "orc_field_reader.hpp" #include "orc_field_writer.hpp" @@ -25,7 +26,7 @@ #include -namespace cudf::io::orc { +namespace cudf::io::orc::detail { namespace { [[nodiscard]] constexpr uint32_t varint_size(uint64_t val) @@ -496,7 +497,7 @@ metadata::metadata(datasource* const src, rmm::cuda_stream_view stream) : source buffer = source->host_read(len - ps_length - 1 - ps.footerLength - ps.metadataLength, ps.metadataLength); auto const md_data = decompressor->decompress_blocks({buffer->data(), buffer->size()}, stream); - orc::ProtobufReader(md_data.data(), md_data.size()).read(md); + ProtobufReader(md_data.data(), md_data.size()).read(md); init_parent_descriptors(); init_column_names(); @@ -546,4 +547,4 @@ void metadata::init_parent_descriptors() } } -} // namespace cudf::io::orc +} // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/orc.hpp b/cpp/src/io/orc/orc.hpp index 8dccf65ef10..49652c9a0d2 100644 --- a/cpp/src/io/orc/orc.hpp +++ b/cpp/src/io/orc/orc.hpp @@ -16,8 +16,6 @@ #pragma once -#include "io/comp/io_uncomp.hpp" - #include #include #include @@ -34,9 +32,7 @@ #include #include -namespace cudf { -namespace io { -namespace orc { +namespace cudf::io::orc::detail { static constexpr uint32_t block_header_size = 3; // Seconds from January 1st, 1970 to January 1st, 2015 @@ -710,6 +706,4 @@ struct rowgroup_rows { [[nodiscard]] CUDF_HOST_DEVICE constexpr auto size() const noexcept { return end - begin; } }; -} // namespace orc -} // namespace io -} // namespace cudf +} // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/orc_field_reader.hpp b/cpp/src/io/orc/orc_field_reader.hpp index 3689e4d958b..797db239538 100644 --- a/cpp/src/io/orc/orc_field_reader.hpp +++ b/cpp/src/io/orc/orc_field_reader.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,7 +17,7 @@ #include "orc.hpp" -#include +#include /** * @file orc_field_reader.hpp @@ -25,9 +25,7 @@ * ProtobufReader::read(...) functions */ -namespace cudf { -namespace io { -namespace orc { +namespace cudf::io::orc::detail { /** * @brief Functor to run an operator for a specified field. 
@@ -90,6 +88,4 @@ inline void ProtobufReader::function_builder(T& s, size_t maxlen, std::tuple<Operator...>& op) } -} // namespace orc -} // namespace io -} // namespace cudf +} // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/orc_gpu.hpp b/cpp/src/io/orc/orc_gpu.hpp --- a/cpp/src/io/orc/orc_gpu.hpp +++ b/cpp/src/io/orc/orc_gpu.hpp @@ -33,10 +32,7 @@ #include -namespace cudf { -namespace io { -namespace orc { -namespace gpu { +namespace cudf::io::orc::detail { using cudf::detail::device_2dspan; using cudf::detail::host_2dspan; @@ -65,9 +61,7 @@ auto constexpr VALUE_SENTINEL = size_type{-1}; struct CompressedStreamInfo { CompressedStreamInfo() = default; explicit constexpr CompressedStreamInfo(uint8_t const* compressed_data_, size_t compressed_size_) - : compressed_data(compressed_data_), - uncompressed_data(nullptr), - compressed_data_size(compressed_size_) + : compressed_data(compressed_data_), compressed_data_size(compressed_size_) { } uint8_t const* compressed_data{}; // [in] base ptr to compressed stream data @@ -500,7 +494,4 @@ void reduce_pushdown_masks(device_span<orc_column_device_view const> orc_columns, device_2dspan<cudf::size_type> set_counts, rmm::cuda_stream_view stream); -} // namespace gpu -} // namespace orc -} // namespace io -} // namespace cudf +} // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 726c79bd004..f19fb3c81d8 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,7 +14,6 @@ * limitations under the License. */ -#include "io/comp/gpuinflate.hpp" #include "io/orc/reader_impl.hpp" #include "io/orc/reader_impl_chunking.hpp" #include "io/orc/reader_impl_helpers.hpp" @@ -40,16 +39,16 @@ namespace cudf::io::orc::detail { std::size_t gather_stream_info_and_column_desc( std::size_t stripe_id, std::size_t level, - orc::StripeInformation const* stripeinfo, - orc::StripeFooter const* stripefooter, + StripeInformation const* stripeinfo, + StripeFooter const* stripefooter, host_span<int const> orc2gdf, - host_span<orc::SchemaType const> types, + host_span<SchemaType const> types, bool use_index, bool apply_struct_map, int64_t* num_dictionary_entries, std::size_t* local_stream_order, std::vector<orc_stream_info>* stream_info, - cudf::detail::hostdevice_2dvector<gpu::ColumnDesc>* chunks) + cudf::detail::hostdevice_2dvector<ColumnDesc>* chunks) { CUDF_EXPECTS((stream_info == nullptr) ^ (chunks == nullptr), "Either stream_info or chunks must be provided, but not both."); @@ -57,17 +56,17 @@ std::size_t gather_stream_info_and_column_desc( std::size_t src_offset = 0; std::size_t dst_offset = 0; - auto const get_stream_index_type = [](orc::StreamKind kind) { + auto const get_stream_index_type = [](StreamKind kind) { switch (kind) { - case orc::DATA: return gpu::CI_DATA; - case orc::LENGTH: - case orc::SECONDARY: return gpu::CI_DATA2; - case orc::DICTIONARY_DATA: return gpu::CI_DICTIONARY; - case orc::PRESENT: return gpu::CI_PRESENT; - case orc::ROW_INDEX: return gpu::CI_INDEX; + case DATA: return CI_DATA; + case LENGTH: + case SECONDARY: return CI_DATA2; + case DICTIONARY_DATA: return CI_DICTIONARY; + case PRESENT: return CI_PRESENT; + case ROW_INDEX: return CI_INDEX; default: // Skip this stream as it's not strictly required - return gpu::CI_NUM_STREAMS; + return CI_NUM_STREAMS; } }; @@ -87,16 +86,15 @@ std::size_t gather_stream_info_and_column_desc( // for each of its fields. There is only a PRESENT stream, which // needs to be included for the reader. 
auto const schema_type = types[column_id]; - if (!schema_type.subtypes.empty() && schema_type.kind == orc::STRUCT && - stream.kind == orc::PRESENT) { + if (!schema_type.subtypes.empty() && schema_type.kind == STRUCT && stream.kind == PRESENT) { for (auto const& idx : schema_type.subtypes) { auto const child_idx = (idx < orc2gdf.size()) ? orc2gdf[idx] : -1; if (child_idx >= 0) { col = child_idx; if (chunks) { - auto& chunk = (*chunks)[stripe_id][col]; - chunk.strm_id[gpu::CI_PRESENT] = *local_stream_order; - chunk.strm_len[gpu::CI_PRESENT] = stream.length; + auto& chunk = (*chunks)[stripe_id][col]; + chunk.strm_id[CI_PRESENT] = *local_stream_order; + chunk.strm_len[CI_PRESENT] = stream.length; } } } @@ -105,14 +103,14 @@ std::size_t gather_stream_info_and_column_desc( if (chunks) { if (src_offset >= stripeinfo->indexLength || use_index) { auto const index_type = get_stream_index_type(stream.kind); - if (index_type < gpu::CI_NUM_STREAMS) { + if (index_type < CI_NUM_STREAMS) { auto& chunk = (*chunks)[stripe_id][col]; chunk.strm_id[index_type] = *local_stream_order; chunk.strm_len[index_type] = stream.length; // NOTE: skip_count field is temporarily used to track the presence of index streams chunk.skip_count |= 1 << index_type; - if (index_type == gpu::CI_DICTIONARY) { + if (index_type == CI_DICTIONARY) { chunk.dictionary_start = *num_dictionary_entries; chunk.dict_len = stripefooter->columns[column_id].dictionarySize; *num_dictionary_entries += @@ -643,7 +641,7 @@ void reader_impl::load_next_stripe_data(read_mode mode) // memory once. auto hd_compinfo = [&] { std::size_t max_num_streams{0}; - if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { + if (_metadata.per_file_metadata[0].ps.compression != NONE) { // Find the maximum number of streams in all levels of the loaded stripes. for (std::size_t level = 0; level < num_levels; ++level) { auto const stream_range = @@ -651,7 +649,7 @@ void reader_impl::load_next_stripe_data(read_mode mode) max_num_streams = std::max(max_num_streams, stream_range.size()); } } - return cudf::detail::hostdevice_vector(max_num_streams, _stream); + return cudf::detail::hostdevice_vector(max_num_streams, _stream); }(); for (std::size_t level = 0; level < num_levels; ++level) { @@ -665,26 +663,26 @@ void reader_impl::load_next_stripe_data(read_mode mode) auto const stream_range = merge_selected_ranges(_file_itm_data.lvl_stripe_stream_ranges[level], load_stripe_range); - if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { + if (_metadata.per_file_metadata[0].ps.compression != NONE) { auto const& decompressor = *_metadata.per_file_metadata[0].decompressor; - auto compinfo = cudf::detail::hostdevice_span{hd_compinfo}.subspan( + auto compinfo = cudf::detail::hostdevice_span{hd_compinfo}.subspan( 0, stream_range.size()); for (auto stream_idx = stream_range.begin; stream_idx < stream_range.end; ++stream_idx) { auto const& info = stream_info[stream_idx]; auto const dst_base = static_cast(stripe_data[info.source.stripe_idx - stripe_start].data()); compinfo[stream_idx - stream_range.begin] = - gpu::CompressedStreamInfo(dst_base + info.dst_pos, info.length); + CompressedStreamInfo(dst_base + info.dst_pos, info.length); } // Estimate the uncompressed data. 
compinfo.host_to_device_async(_stream); - gpu::ParseCompressedStripeData(compinfo.device_ptr(), - compinfo.size(), - decompressor.GetBlockSize(), - decompressor.GetLog2MaxCompressionRatio(), - _stream); + ParseCompressedStripeData(compinfo.device_ptr(), + compinfo.size(), + decompressor.GetBlockSize(), + decompressor.GetLog2MaxCompressionRatio(), + _stream); compinfo.device_to_host_sync(_stream); for (auto stream_idx = stream_range.begin; stream_idx < stream_range.end; ++stream_idx) { diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index 4ef68ee8d86..cb66edf3c98 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -307,15 +307,15 @@ std::vector find_splits(host_span cumulative_sizes, std::size_t gather_stream_info_and_column_desc( std::size_t stripe_id, std::size_t level, - orc::StripeInformation const* stripeinfo, - orc::StripeFooter const* stripefooter, + StripeInformation const* stripeinfo, + StripeFooter const* stripefooter, host_span orc2gdf, - host_span types, + host_span types, bool use_index, bool apply_struct_map, int64_t* num_dictionary_entries, std::size_t* local_stream_order, std::vector* stream_info, - cudf::detail::hostdevice_2dvector* chunks); + cudf::detail::hostdevice_2dvector* chunks); } // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index b661bb4ff90..586c07cbc16 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
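Context for the decompression hunks that follow: ORC prefixes every compression block with a 3-byte little-endian header (block_header_size = 3 in orc.hpp) that packs (length << 1) | is_original. A hedged host-side sketch of the decoding that ParseCompressedStripeData performs per block on the device:

#include <cstdint>

struct block_header {
  uint32_t length;   // stored block length in bytes
  bool is_original;  // true: block bytes are raw, not compressed
};

inline block_header decode_block_header(uint8_t const* p)
{
  uint32_t const raw = static_cast<uint32_t>(p[0]) |
                       (static_cast<uint32_t>(p[1]) << 8) |
                       (static_cast<uint32_t>(p[2]) << 16);
  return {raw >> 1, (raw & 1u) != 0};
}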
@@ -77,13 +77,13 @@ rmm::device_buffer decompress_stripe_data( range const& loaded_stripe_range, range const& stream_range, std::size_t num_decode_stripes, - cudf::detail::hostdevice_span compinfo, + cudf::detail::hostdevice_span compinfo, stream_source_map const& compinfo_map, OrcDecompressor const& decompressor, host_span stripe_data, host_span stream_info, - cudf::detail::hostdevice_2dvector& chunks, - cudf::detail::hostdevice_2dvector& row_groups, + cudf::detail::hostdevice_2dvector& chunks, + cudf::detail::hostdevice_2dvector& row_groups, size_type row_index_stride, bool use_base_stride, rmm::cuda_stream_view stream) @@ -100,7 +100,7 @@ rmm::device_buffer decompress_stripe_data( auto const& info = stream_info[stream_idx]; auto& stream_comp_info = compinfo[stream_idx - stream_range.begin]; - stream_comp_info = gpu::CompressedStreamInfo( + stream_comp_info = CompressedStreamInfo( static_cast( stripe_data[info.source.stripe_idx - loaded_stripe_range.begin].data()) + info.dst_pos, @@ -120,11 +120,11 @@ rmm::device_buffer decompress_stripe_data( if (!compinfo_ready) { compinfo.host_to_device_async(stream); - gpu::ParseCompressedStripeData(compinfo.device_ptr(), - compinfo.size(), - decompressor.GetBlockSize(), - decompressor.GetLog2MaxCompressionRatio(), - stream); + ParseCompressedStripeData(compinfo.device_ptr(), + compinfo.size(), + decompressor.GetBlockSize(), + decompressor.GetLog2MaxCompressionRatio(), + stream); compinfo.device_to_host_sync(stream); for (std::size_t i = 0; i < compinfo.size(); ++i) { @@ -178,11 +178,11 @@ rmm::device_buffer decompress_stripe_data( } compinfo.host_to_device_async(stream); - gpu::ParseCompressedStripeData(compinfo.device_ptr(), - compinfo.size(), - decompressor.GetBlockSize(), - decompressor.GetLog2MaxCompressionRatio(), - stream); + ParseCompressedStripeData(compinfo.device_ptr(), + compinfo.size(), + decompressor.GetBlockSize(), + decompressor.GetLog2MaxCompressionRatio(), + stream); // Value for checking whether we decompress successfully. // It doesn't need to be atomic as there is no race condition: we only write `true` if needed. @@ -275,7 +275,7 @@ rmm::device_buffer decompress_stripe_data( // Copy without stream sync, thus need to wait for stream sync below to access. any_block_failure.device_to_host_async(stream); - gpu::PostDecompressionReassemble(compinfo.device_ptr(), compinfo.size(), stream); + PostDecompressionReassemble(compinfo.device_ptr(), compinfo.size(), stream); compinfo.device_to_host_sync(stream); // This also sync stream for `any_block_failure`. 
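The call ordering here is the usual single-sync pattern: queue the device-to-host copy of the failure flag asynchronously, queue the remaining work, and let one synchronize cover both. A standalone sketch in plain CUDA (names hypothetical):

#include <cuda_runtime.h>

// `h_flag` must be pinned host memory (cudaMallocHost) for the copy to
// be truly asynchronous; the single synchronize below then covers the
// copy and any kernels enqueued after it.
void check_failure_flag(bool const* d_flag, bool* h_flag, cudaStream_t stream)
{
  cudaMemcpyAsync(h_flag, d_flag, sizeof(bool), cudaMemcpyDeviceToHost, stream);
  // ... enqueue further kernels on `stream` here ...
  cudaStreamSynchronize(stream);
  if (*h_flag) { /* handle per-block decompression failure */ }
}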
// We can check on host after stream synchronize @@ -291,7 +291,7 @@ rmm::device_buffer decompress_stripe_data( for (std::size_t i = 0; i < num_decode_stripes; ++i) { for (std::size_t j = 0; j < num_columns; ++j) { auto& chunk = chunks[i][j]; - for (int k = 0; k < gpu::CI_NUM_STREAMS; ++k) { + for (int k = 0; k < CI_NUM_STREAMS; ++k) { if (chunk.strm_len[k] > 0 && chunk.strm_id[k] < compinfo.size()) { chunk.streams[k] = compinfo[chunk.strm_id[k]].uncompressed_data; chunk.strm_len[k] = compinfo[chunk.strm_id[k]].max_uncompressed_size; @@ -303,14 +303,14 @@ rmm::device_buffer decompress_stripe_data( if (row_groups.size().first) { chunks.host_to_device_async(stream); row_groups.host_to_device_async(stream); - gpu::ParseRowGroupIndex(row_groups.base_device_ptr(), - compinfo.device_ptr(), - chunks.base_device_ptr(), - num_columns, - num_decode_stripes, - row_index_stride, - use_base_stride, - stream); + ParseRowGroupIndex(row_groups.base_device_ptr(), + compinfo.device_ptr(), + chunks.base_device_ptr(), + num_columns, + num_decode_stripes, + row_index_stride, + use_base_stride, + stream); } return decomp_data; @@ -329,7 +329,7 @@ rmm::device_buffer decompress_stripe_data( * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource to use for device memory allocation */ -void update_null_mask(cudf::detail::hostdevice_2dvector& chunks, +void update_null_mask(cudf::detail::hostdevice_2dvector& chunks, host_span out_buffers, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) @@ -419,8 +419,8 @@ void decode_stream_data(int64_t num_dicts, size_type row_index_stride, std::size_t level, table_device_view const& d_tz_table, - cudf::detail::hostdevice_2dvector& chunks, - cudf::detail::device_2dspan row_groups, + cudf::detail::hostdevice_2dvector& chunks, + cudf::detail::device_2dspan row_groups, std::vector& out_buffers, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) @@ -441,10 +441,10 @@ void decode_stream_data(int64_t num_dicts, }); // Allocate global dictionary for deserializing - rmm::device_uvector global_dict(num_dicts, stream); + rmm::device_uvector global_dict(num_dicts, stream); chunks.host_to_device_async(stream); - gpu::DecodeNullsAndStringDictionaries( + DecodeNullsAndStringDictionaries( chunks.base_device_ptr(), global_dict.data(), num_columns, num_stripes, skip_rows, stream); if (level > 0) { @@ -453,18 +453,18 @@ void decode_stream_data(int64_t num_dicts, } cudf::detail::device_scalar error_count(0, stream); - gpu::DecodeOrcColumnData(chunks.base_device_ptr(), - global_dict.data(), - row_groups, - num_columns, - num_stripes, - skip_rows, - d_tz_table, - row_groups.size().first, - row_index_stride, - level, - error_count.data(), - stream); + DecodeOrcColumnData(chunks.base_device_ptr(), + global_dict.data(), + row_groups, + num_columns, + num_stripes, + skip_rows, + d_tz_table, + row_groups.size().first, + row_index_stride, + level, + error_count.data(), + stream); chunks.device_to_host_async(stream); // `value` synchronizes auto const num_errors = error_count.value(stream); @@ -485,7 +485,7 @@ void decode_stream_data(int64_t num_dicts, * @brief Compute the per-stripe prefix sum of null count, for each struct column in the current * layer. 
*/ -void scan_null_counts(cudf::detail::hostdevice_2dvector const& chunks, +void scan_null_counts(cudf::detail::hostdevice_2dvector const& chunks, uint32_t* d_prefix_sums, rmm::cuda_stream_view stream) { @@ -531,9 +531,9 @@ void scan_null_counts(cudf::detail::hostdevice_2dvector const& * @brief Aggregate child metadata from parent column chunks. */ void aggregate_child_meta(std::size_t level, - cudf::io::orc::detail::column_hierarchy const& selected_columns, - cudf::detail::host_2dspan chunks, - cudf::detail::host_2dspan row_groups, + column_hierarchy const& selected_columns, + cudf::detail::host_2dspan chunks, + cudf::detail::host_2dspan row_groups, host_span nested_cols, host_span out_buffers, reader_column_meta& col_meta) @@ -766,7 +766,7 @@ void reader_impl::decompress_and_decode_stripes(read_mode mode) // Each 'chunk' of data here corresponds to an orc column, in a stripe, at a nested level. // Unfortunately we cannot create one hostdevice_vector to use for all levels because // currently we do not have a hostdevice_2dspan class. - std::vector> lvl_chunks(num_levels); + std::vector> lvl_chunks(num_levels); // For computing null count. auto null_count_prefix_sums = [&] { @@ -787,7 +787,7 @@ void reader_impl::decompress_and_decode_stripes(read_mode mode) // thus only need to allocate memory once. auto hd_compinfo = [&] { std::size_t max_num_streams{0}; - if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { + if (_metadata.per_file_metadata[0].ps.compression != NONE) { // Find the maximum number of streams in all levels of the decoding stripes. for (std::size_t level = 0; level < num_levels; ++level) { auto const stream_range = @@ -795,7 +795,7 @@ void reader_impl::decompress_and_decode_stripes(read_mode mode) max_num_streams = std::max(max_num_streams, stream_range.size()); } } - return cudf::detail::hostdevice_vector{max_num_streams, _stream}; + return cudf::detail::hostdevice_vector{max_num_streams, _stream}; }(); auto& col_meta = *_col_meta; @@ -812,8 +812,7 @@ void reader_impl::decompress_and_decode_stripes(read_mode mode) auto& chunks = lvl_chunks[level]; auto const num_lvl_columns = columns_level.size(); - chunks = - cudf::detail::hostdevice_2dvector(stripe_count, num_lvl_columns, _stream); + chunks = cudf::detail::hostdevice_2dvector(stripe_count, num_lvl_columns, _stream); memset(chunks.base_host_ptr(), 0, chunks.size_bytes()); bool const use_index = @@ -897,7 +896,7 @@ void reader_impl::decompress_and_decode_stripes(read_mode mode) // num_child_rows for a struct column will be same, for other nested types it will be // calculated. - chunk.num_child_rows = (chunk.type_kind != orc::STRUCT) ? 0 : chunk.num_rows; + chunk.num_child_rows = (chunk.type_kind != STRUCT) ? 
0 : chunk.num_rows; chunk.dtype_id = column_types[col_idx].id(); chunk.decimal_scale = _metadata.per_file_metadata[stripe.source_idx] .ff.types[columns_level[col_idx].id] @@ -912,11 +911,11 @@ void reader_impl::decompress_and_decode_stripes(read_mode mode) : cudf::size_of(column_types[col_idx]); chunk.num_rowgroups = stripe_num_rowgroups; - if (chunk.type_kind == orc::TIMESTAMP) { + if (chunk.type_kind == TIMESTAMP) { chunk.timestamp_type_id = _options.timestamp_type.id(); } if (not is_stripe_data_empty) { - for (int k = 0; k < gpu::CI_NUM_STREAMS; k++) { + for (int k = 0; k < CI_NUM_STREAMS; k++) { chunk.streams[k] = dst_base + stream_info[chunk.strm_id[k] + stream_range.begin].dst_pos; } @@ -931,10 +930,10 @@ void reader_impl::decompress_and_decode_stripes(read_mode mode) // Process dataset chunks into output columns. auto row_groups = - cudf::detail::hostdevice_2dvector(num_rowgroups, num_lvl_columns, _stream); + cudf::detail::hostdevice_2dvector(num_rowgroups, num_lvl_columns, _stream); if (level > 0 and row_groups.size().first) { - cudf::host_span row_groups_span(row_groups.base_host_ptr(), - num_rowgroups * num_lvl_columns); + cudf::host_span row_groups_span(row_groups.base_host_ptr(), + num_rowgroups * num_lvl_columns); auto& rw_grp_meta = col_meta.rwgrp_meta; // Update start row and num rows per row group @@ -950,9 +949,9 @@ void reader_impl::decompress_and_decode_stripes(read_mode mode) } // Setup row group descriptors if using indexes. - if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { + if (_metadata.per_file_metadata[0].ps.compression != NONE) { auto const compinfo = - cudf::detail::hostdevice_span{hd_compinfo}.subspan( + cudf::detail::hostdevice_span{hd_compinfo}.subspan( 0, stream_range.size()); auto decomp_data = decompress_stripe_data(load_stripe_range, stream_range, @@ -979,14 +978,14 @@ void reader_impl::decompress_and_decode_stripes(read_mode mode) chunks.host_to_device_async(_stream); row_groups.host_to_device_async(_stream); row_groups.host_to_device_async(_stream); - gpu::ParseRowGroupIndex(row_groups.base_device_ptr(), - nullptr, - chunks.base_device_ptr(), - num_lvl_columns, - stripe_count, - _metadata.get_row_index_stride(), - level == 0, - _stream); + ParseRowGroupIndex(row_groups.base_device_ptr(), + nullptr, + chunks.base_device_ptr(), + num_lvl_columns, + stripe_count, + _metadata.get_row_index_stride(), + level == 0, + _stream); } } @@ -995,7 +994,7 @@ void reader_impl::decompress_and_decode_stripes(read_mode mode) for (std::size_t i = 0; i < column_types.size(); ++i) { bool is_nullable = false; for (std::size_t j = 0; j < stripe_count; ++j) { - if (chunks[j][i].strm_len[gpu::CI_PRESENT] != 0) { + if (chunks[j][i].strm_len[CI_PRESENT] != 0) { is_nullable = true; break; } diff --git a/cpp/src/io/orc/reader_impl_helpers.cpp b/cpp/src/io/orc/reader_impl_helpers.cpp index 7e5db4b7617..1d4aaaf51ef 100644 --- a/cpp/src/io/orc/reader_impl_helpers.cpp +++ b/cpp/src/io/orc/reader_impl_helpers.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
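scan_null_counts above is, per struct column, a running total over per-stripe null counts that feeds null_count_prefix_sums. A reduced single-column sketch with Thrust (assumed semantics, not the kernel actually used):

#include <thrust/scan.h>
#include <thrust/system/cuda/execution_policy.h>

#include <cstddef>
#include <cstdint>

// In-place running total of per-stripe null counts for one column, so a
// stripe can look up the null count accumulated up to and including it.
void prefix_sum_null_counts(uint32_t* d_null_counts,
                            std::size_t num_stripes,
                            cudaStream_t stream)
{
  thrust::inclusive_scan(thrust::cuda::par.on(stream),
                         d_null_counts,
                         d_null_counts + num_stripes,
                         d_null_counts);
}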
@@ -34,7 +34,7 @@ std::unique_ptr create_empty_column(size_type orc_col_id, to_cudf_decimal_type(decimal128_columns, metadata, orc_col_id)); switch (kind) { - case orc::LIST: { + case LIST: { schema_info.children.emplace_back("offsets"); schema_info.children.emplace_back(""); return make_lists_column(0, @@ -50,7 +50,7 @@ std::unique_ptr create_empty_column(size_type orc_col_id, rmm::device_buffer{0, stream}, stream); } - case orc::MAP: { + case MAP: { schema_info.children.emplace_back("offsets"); schema_info.children.emplace_back("struct"); auto const child_column_ids = metadata.get_col_type(orc_col_id).subtypes; @@ -76,7 +76,7 @@ std::unique_ptr create_empty_column(size_type orc_col_id, stream); } - case orc::STRUCT: { + case STRUCT: { std::vector> child_columns; for (auto const col : metadata.get_col_type(orc_col_id).subtypes) { schema_info.children.emplace_back(""); @@ -92,7 +92,7 @@ std::unique_ptr create_empty_column(size_type orc_col_id, 0, std::move(child_columns), 0, rmm::device_buffer{0, stream}, stream); } - case orc::DECIMAL: { + case DECIMAL: { int32_t scale = 0; if (type == type_id::DECIMAL32 or type == type_id::DECIMAL64 or type == type_id::DECIMAL128) { scale = -static_cast(metadata.get_types()[orc_col_id].scale.value_or(0)); @@ -119,8 +119,8 @@ column_buffer assemble_buffer(size_type orc_col_id, col_buffer.name = metadata.column_name(0, orc_col_id); auto kind = metadata.get_col_type(orc_col_id).kind; switch (kind) { - case orc::LIST: - case orc::STRUCT: { + case LIST: + case STRUCT: { auto const& children_indices = selected_columns.children.at(orc_col_id); for (auto const child_id : children_indices) { col_buffer.children.emplace_back(assemble_buffer( @@ -128,7 +128,7 @@ column_buffer assemble_buffer(size_type orc_col_id, } } break; - case orc::MAP: { + case MAP: { std::vector child_col_buffers; // Get child buffers auto const& children_indices = selected_columns.children.at(orc_col_id); diff --git a/cpp/src/io/orc/reader_impl_helpers.hpp b/cpp/src/io/orc/reader_impl_helpers.hpp index 4cded30d89b..f2e746b312f 100644 --- a/cpp/src/io/orc/reader_impl_helpers.hpp +++ b/cpp/src/io/orc/reader_impl_helpers.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
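The negation in create_empty_column's DECIMAL case above is the sign flip between two conventions: ORC records a non-negative scale (digits after the decimal point) while cudf's fixed_point scale is a power-of-ten exponent. An illustration with cudf's public fixed-point type:

#include <cudf/fixed_point/fixed_point.hpp>

// ORC decimal(precision = 9, scale = 2) stores 1.23 as unscaled 123;
// the equivalent cudf value carries the negated scale.
numeric::decimal32 const value{
  numeric::scaled_integer<int32_t>{123, numeric::scale_type{-2}}};  // 1.23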
@@ -61,36 +61,36 @@ struct reader_column_meta { /** * @brief Function that translates ORC data kind to cuDF type enum */ -inline constexpr type_id to_cudf_type(orc::TypeKind kind, +inline constexpr type_id to_cudf_type(TypeKind kind, bool use_np_dtypes, type_id timestamp_type_id, type_id decimal_type_id) { switch (kind) { - case orc::BOOLEAN: return type_id::BOOL8; - case orc::BYTE: return type_id::INT8; - case orc::SHORT: return type_id::INT16; - case orc::INT: return type_id::INT32; - case orc::LONG: return type_id::INT64; - case orc::FLOAT: return type_id::FLOAT32; - case orc::DOUBLE: return type_id::FLOAT64; - case orc::STRING: - case orc::BINARY: - case orc::VARCHAR: - case orc::CHAR: + case BOOLEAN: return type_id::BOOL8; + case BYTE: return type_id::INT8; + case SHORT: return type_id::INT16; + case INT: return type_id::INT32; + case LONG: return type_id::INT64; + case FLOAT: return type_id::FLOAT32; + case DOUBLE: return type_id::FLOAT64; + case STRING: + case BINARY: + case VARCHAR: + case CHAR: // Variable-length types can all be mapped to STRING return type_id::STRING; - case orc::TIMESTAMP: + case TIMESTAMP: return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id : type_id::TIMESTAMP_NANOSECONDS; - case orc::DATE: + case DATE: // There isn't a (DAYS -> np.dtype) mapping return (use_np_dtypes) ? type_id::TIMESTAMP_MILLISECONDS : type_id::TIMESTAMP_DAYS; - case orc::DECIMAL: return decimal_type_id; + case DECIMAL: return decimal_type_id; // Need to update once cuDF plans to support map type - case orc::MAP: - case orc::LIST: return type_id::LIST; - case orc::STRUCT: return type_id::STRUCT; + case MAP: + case LIST: return type_id::LIST; + case STRUCT: return type_id::STRUCT; default: break; } diff --git a/cpp/src/io/orc/stats_enc.cu b/cpp/src/io/orc/stats_enc.cu index 5f4c1e0696d..e81c74ae1a6 100644 --- a/cpp/src/io/orc/stats_enc.cu +++ b/cpp/src/io/orc/stats_enc.cu @@ -24,7 +24,7 @@ #include -namespace cudf::io::orc::gpu { +namespace cudf::io::orc::detail { using strings::detail::fixed_point_string_size; @@ -502,4 +502,4 @@ void orc_encode_statistics(uint8_t* blob_bfr, blob_bfr, groups, chunks, statistics_count); } -} // namespace cudf::io::orc::gpu +} // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index f560b806894..c7947b0e4c9 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -15,6 +15,7 @@ */ #include "io/utilities/block_utils.cuh" +#include "io/utilities/column_buffer.hpp" #include "orc_gpu.hpp" #include @@ -23,10 +24,7 @@ #include -namespace cudf { -namespace io { -namespace orc { -namespace gpu { +namespace cudf::io::orc::detail { using cudf::io::detail::string_index_pair; @@ -2096,7 +2094,4 @@ void __host__ DecodeOrcColumnData(ColumnDesc* chunks, chunks, global_dictionary, tz_table, row_groups, first_row, rowidx_stride, level, error_count); } -} // namespace gpu -} // namespace orc -} // namespace io -} // namespace cudf +} // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index 857daeb5856..15ce1aadb17 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -14,7 +14,6 @@ * limitations under the License. 
*/ -#include "io/comp/gpuinflate.hpp" #include "io/utilities/block_utils.cuh" #include "io/utilities/time_utils.cuh" #include "orc_gpu.hpp" @@ -40,10 +39,7 @@ #include #include -namespace cudf { -namespace io { -namespace orc { -namespace gpu { +namespace cudf::io::orc::detail { using cudf::detail::device_2dspan; using cudf::io::detail::compression_result; @@ -1421,7 +1417,4 @@ void decimal_sizes_to_offsets(device_2dspan rg_bounds, <<>>(rg_bounds, d_sizes); } -} // namespace gpu -} // namespace orc -} // namespace io -} // namespace cudf +} // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/stripe_init.cu b/cpp/src/io/orc/stripe_init.cu index 5e23bc5adcc..a72b71a83ca 100644 --- a/cpp/src/io/orc/stripe_init.cu +++ b/cpp/src/io/orc/stripe_init.cu @@ -26,10 +26,7 @@ #include #include -namespace cudf { -namespace io { -namespace orc { -namespace gpu { +namespace cudf::io::orc::detail { struct comp_in_out { uint8_t const* in_ptr{}; @@ -605,7 +602,4 @@ void __host__ reduce_pushdown_masks(device_span co <<>>(columns, rowgroups, valid_counts); } -} // namespace gpu -} // namespace orc -} // namespace io -} // namespace cudf +} // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 5c3377a1aeb..ed900105968 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -21,7 +21,6 @@ #include "io/orc/orc_gpu.hpp" #include "io/statistics/column_statistics.cuh" -#include "io/utilities/column_utils.cuh" #include "writer_impl.hpp" #include @@ -40,6 +39,7 @@ #include #include #include +#include #include #include @@ -92,15 +92,15 @@ namespace { /** * @brief Translates cuDF compression to ORC compression. */ -orc::CompressionKind to_orc_compression(compression_type compression) +CompressionKind to_orc_compression(compression_type compression) { switch (compression) { case compression_type::AUTO: - case compression_type::SNAPPY: return orc::CompressionKind::SNAPPY; - case compression_type::ZLIB: return orc::CompressionKind::ZLIB; - case compression_type::ZSTD: return orc::CompressionKind::ZSTD; - case compression_type::LZ4: return orc::CompressionKind::LZ4; - case compression_type::NONE: return orc::CompressionKind::NONE; + case compression_type::SNAPPY: return CompressionKind::SNAPPY; + case compression_type::ZLIB: return CompressionKind::ZLIB; + case compression_type::ZSTD: return CompressionKind::ZSTD; + case compression_type::LZ4: return CompressionKind::LZ4; + case compression_type::NONE: return CompressionKind::NONE; default: CUDF_FAIL("Unsupported compression type"); } } @@ -119,7 +119,7 @@ size_t compression_block_size(compression_type compression) /** * @brief Translates cuDF dtype to ORC datatype */ -constexpr orc::TypeKind to_orc_type(cudf::type_id id, bool list_column_as_map) +constexpr TypeKind to_orc_type(cudf::type_id id, bool list_column_as_map) { switch (id) { case cudf::type_id::INT8: return TypeKind::BYTE; @@ -237,8 +237,8 @@ class orc_column_view { [[nodiscard]] auto const& decimal_offsets() const { return d_decimal_offsets; } void attach_decimal_offsets(uint32_t* sizes_ptr) { d_decimal_offsets = sizes_ptr; } - void attach_stripe_dicts(host_span host_stripe_dicts, - device_span dev_stripe_dicts) + void attach_stripe_dicts(host_span host_stripe_dicts, + device_span dev_stripe_dicts) { stripe_dicts = host_stripe_dicts; d_stripe_dicts = dev_stripe_dicts; @@ -301,8 +301,8 @@ class orc_column_view { host_span rowgroup_char_counts; - host_span stripe_dicts; - device_span d_stripe_dicts; + host_span 
stripe_dicts; + device_span d_stripe_dicts; // Offsets for encoded decimal elements. Used to enable direct writing of encoded decimal elements // into the output stream. @@ -498,17 +498,17 @@ size_t RLE_stream_size(TypeKind kind, size_t count) case TypeKind::BYTE: return div_rounding_up_unsafe(count, byte_rle_max_len) * (byte_rle_max_len + 1); case TypeKind::SHORT: - return div_rounding_up_unsafe(count, gpu::encode_block_size) * - (gpu::encode_block_size * max_varint_size() + 2); + return div_rounding_up_unsafe(count, encode_block_size) * + (encode_block_size * max_varint_size() + 2); case TypeKind::FLOAT: case TypeKind::INT: case TypeKind::DATE: - return div_rounding_up_unsafe(count, gpu::encode_block_size) * - (gpu::encode_block_size * max_varint_size() + 2); + return div_rounding_up_unsafe(count, encode_block_size) * + (encode_block_size * max_varint_size() + 2); case TypeKind::LONG: case TypeKind::DOUBLE: - return div_rounding_up_unsafe(count, gpu::encode_block_size) * - (gpu::encode_block_size * max_varint_size() + 2); + return div_rounding_up_unsafe(count, encode_block_size) * + (encode_block_size * max_varint_size() + 2); default: CUDF_FAIL("Unsupported ORC type for RLE stream size: " + std::to_string(kind)); } } @@ -536,7 +536,7 @@ orc_streams create_streams(host_span columns, return Stream{ROW_INDEX, col.id()}; }); - std::vector ids(columns.size() * gpu::CI_NUM_STREAMS, -1); + std::vector ids(columns.size() * CI_NUM_STREAMS, -1); std::vector types(streams.size(), INVALID_TYPE_KIND); for (auto& column : columns) { @@ -568,41 +568,39 @@ orc_streams create_streams(host_span columns, auto const kind = column.orc_kind(); auto add_stream = - [&](gpu::StreamIndexType index_type, StreamKind kind, TypeKind type_kind, size_t size) { + [&](StreamIndexType index_type, StreamKind kind, TypeKind type_kind, size_t size) { auto const max_alignment_padding = compress_required_chunk_alignment(compression) - 1; - const auto base = column.index() * gpu::CI_NUM_STREAMS; + const auto base = column.index() * CI_NUM_STREAMS; ids[base + index_type] = streams.size(); - streams.push_back(orc::Stream{ - kind, - column.id(), - (size == 0) ? 0 : size + max_alignment_padding * segmentation.num_rowgroups()}); + streams.push_back( + Stream{kind, + column.id(), + (size == 0) ? 0 : size + max_alignment_padding * segmentation.num_rowgroups()}); types.push_back(type_kind); }; - auto add_RLE_stream = [&]( - gpu::StreamIndexType index_type, StreamKind kind, TypeKind type_kind) { + auto add_RLE_stream = [&](StreamIndexType index_type, StreamKind kind, TypeKind type_kind) { add_stream(index_type, kind, type_kind, RLE_column_size(type_kind)); }; - if (is_nullable) { add_RLE_stream(gpu::CI_PRESENT, PRESENT, TypeKind::BOOLEAN); } + if (is_nullable) { add_RLE_stream(CI_PRESENT, PRESENT, TypeKind::BOOLEAN); } switch (kind) { case TypeKind::BOOLEAN: case TypeKind::BYTE: - add_RLE_stream(gpu::CI_DATA, DATA, kind); + add_RLE_stream(CI_DATA, DATA, kind); column.set_orc_encoding(DIRECT); break; case TypeKind::SHORT: case TypeKind::INT: case TypeKind::LONG: case TypeKind::DATE: - add_RLE_stream(gpu::CI_DATA, DATA, kind); + add_RLE_stream(CI_DATA, DATA, kind); column.set_orc_encoding(DIRECT_V2); break; case TypeKind::FLOAT: case TypeKind::DOUBLE: // Pass through if no nulls (no RLE encoding for floating point) - add_stream( - gpu::CI_DATA, DATA, kind, (column.null_count() != 0) ? RLE_column_size(kind) : 0); + add_stream(CI_DATA, DATA, kind, (column.null_count() != 0) ? 
RLE_column_size(kind) : 0); column.set_orc_encoding(DIRECT); break; case TypeKind::STRING: { @@ -632,35 +630,34 @@ orc_streams create_streams(host_span columns, // Decide between direct or dictionary encoding if (enable_dict && dict_data_size < direct_data_size) { - add_RLE_stream(gpu::CI_DATA, DATA, TypeKind::INT); - add_stream(gpu::CI_DATA2, LENGTH, TypeKind::INT, dict_lengths_div512 * (512 * 4 + 2)); - add_stream( - gpu::CI_DICTIONARY, DICTIONARY_DATA, TypeKind::CHAR, std::max(dict_data_size, 1ul)); + add_RLE_stream(CI_DATA, DATA, TypeKind::INT); + add_stream(CI_DATA2, LENGTH, TypeKind::INT, dict_lengths_div512 * (512 * 4 + 2)); + add_stream(CI_DICTIONARY, DICTIONARY_DATA, TypeKind::CHAR, std::max(dict_data_size, 1ul)); column.set_orc_encoding(DICTIONARY_V2); } else { - add_stream(gpu::CI_DATA, DATA, TypeKind::CHAR, std::max(direct_data_size, 1)); - add_RLE_stream(gpu::CI_DATA2, LENGTH, TypeKind::INT); + add_stream(CI_DATA, DATA, TypeKind::CHAR, std::max(direct_data_size, 1)); + add_RLE_stream(CI_DATA2, LENGTH, TypeKind::INT); column.set_orc_encoding(DIRECT_V2); } break; } case TypeKind::TIMESTAMP: - add_RLE_stream(gpu::CI_DATA, DATA, TypeKind::LONG); - add_RLE_stream(gpu::CI_DATA2, SECONDARY, TypeKind::LONG); + add_RLE_stream(CI_DATA, DATA, TypeKind::LONG); + add_RLE_stream(CI_DATA2, SECONDARY, TypeKind::LONG); column.set_orc_encoding(DIRECT_V2); break; case TypeKind::DECIMAL: // varint values (NO RLE) // data_stream_size = decimal_column_sizes.at(column.index()); - add_stream(gpu::CI_DATA, DATA, TypeKind::DECIMAL, decimal_column_sizes.at(column.index())); + add_stream(CI_DATA, DATA, TypeKind::DECIMAL, decimal_column_sizes.at(column.index())); // scale stream TODO: compute exact size since all elems are equal - add_RLE_stream(gpu::CI_DATA2, SECONDARY, TypeKind::INT); + add_RLE_stream(CI_DATA2, SECONDARY, TypeKind::INT); column.set_orc_encoding(DIRECT_V2); break; case TypeKind::LIST: case TypeKind::MAP: // no data stream, only lengths - add_RLE_stream(gpu::CI_DATA2, LENGTH, TypeKind::INT); + add_RLE_stream(CI_DATA2, LENGTH, TypeKind::INT); column.set_orc_encoding(DIRECT_V2); break; case TypeKind::STRUCT: @@ -683,7 +680,7 @@ std::vector> calculate_aligned_rowgroup_bounds( orc_table.num_columns() * segmentation.num_rowgroups(), stream); auto const d_pd_set_counts = device_2dspan{d_pd_set_counts_data, orc_table.num_columns()}; - gpu::reduce_pushdown_masks(orc_table.d_columns, segmentation.rowgroups, d_pd_set_counts, stream); + reduce_pushdown_masks(orc_table.d_columns, segmentation.rowgroups, d_pd_set_counts, stream); auto aligned_rgs = hostdevice_2dvector( segmentation.num_rowgroups(), orc_table.num_columns(), stream); @@ -838,7 +835,7 @@ encoded_data encode_columns(orc_table_view const& orc_table, rmm::cuda_stream_view stream) { auto const num_columns = orc_table.num_columns(); - hostdevice_2dvector chunks(num_columns, segmentation.num_rowgroups(), stream); + hostdevice_2dvector chunks(num_columns, segmentation.num_rowgroups(), stream); auto const aligned_rowgroups = calculate_aligned_rowgroup_bounds(orc_table, segmentation, stream); @@ -911,7 +908,7 @@ encoded_data encode_columns(orc_table_view const& orc_table, " Please see https://github.com/rapidsai/cudf/issues/6763 for more information."); } - hostdevice_2dvector chunk_streams( + hostdevice_2dvector chunk_streams( num_columns, segmentation.num_rowgroups(), stream); // per-stripe, per-stream owning buffers std::vector>> encoded_data(segmentation.num_stripes()); @@ -921,10 +918,10 @@ encoded_data encode_columns(orc_table_view const& 
orc_table, }); for (size_t col_idx = 0; col_idx < num_columns; col_idx++) { - for (int strm_type = 0; strm_type < gpu::CI_NUM_STREAMS; ++strm_type) { + for (int strm_type = 0; strm_type < CI_NUM_STREAMS; ++strm_type) { auto const& column = orc_table.column(col_idx); auto col_streams = chunk_streams[col_idx]; - auto const strm_id = streams.id(col_idx * gpu::CI_NUM_STREAMS + strm_type); + auto const strm_id = streams.id(col_idx * CI_NUM_STREAMS + strm_type); std::for_each(stripe.cbegin(), stripe.cend(), [&](auto rg_idx) { col_streams[rg_idx].ids[strm_type] = strm_id; @@ -938,25 +935,25 @@ encoded_data encode_columns(orc_table_view const& orc_table, auto const& ck = chunks[col_idx][rg_idx]; auto& strm = col_streams[rg_idx]; - if ((strm_type == gpu::CI_DICTIONARY) || - (strm_type == gpu::CI_DATA2 && ck.encoding_kind == DICTIONARY_V2)) { + if ((strm_type == CI_DICTIONARY) || + (strm_type == CI_DATA2 && ck.encoding_kind == DICTIONARY_V2)) { if (rg_idx == *stripe.cbegin()) { auto const stripe_dict = column.host_stripe_dict(stripe.id); strm.lengths[strm_type] = - (strm_type == gpu::CI_DICTIONARY) + (strm_type == CI_DICTIONARY) ? stripe_dict.char_count : (((stripe_dict.entry_count + 0x1ff) >> 9) * (512 * 4 + 2)); } else { strm.lengths[strm_type] = 0; } - } else if (strm_type == gpu::CI_DATA && ck.type_kind == TypeKind::STRING && + } else if (strm_type == CI_DATA && ck.type_kind == TypeKind::STRING && ck.encoding_kind == DIRECT_V2) { strm.lengths[strm_type] = std::max(column.rowgroup_char_count(rg_idx), 1); - } else if (strm_type == gpu::CI_DATA && streams[strm_id].length == 0 && + } else if (strm_type == CI_DATA && streams[strm_id].length == 0 && (ck.type_kind == DOUBLE || ck.type_kind == FLOAT)) { // Pass-through strm.lengths[strm_type] = ck.num_rows * ck.dtype_len; - } else if (ck.type_kind == DECIMAL && strm_type == gpu::CI_DATA) { + } else if (ck.type_kind == DECIMAL && strm_type == CI_DATA) { strm.lengths[strm_type] = dec_chunk_sizes.rg_sizes.at(col_idx)[rg_idx]; } else { strm.lengths[strm_type] = RLE_stream_size(streams.type(strm_id), ck.num_rows); @@ -974,12 +971,12 @@ encoded_data encode_columns(orc_table_view const& orc_table, auto const& ck = chunks[col_idx][rg_idx]; auto& strm = col_streams[rg_idx]; - if (strm_id < 0 or (strm_type == gpu::CI_DATA && streams[strm_id].length == 0 && + if (strm_id < 0 or (strm_type == CI_DATA && streams[strm_id].length == 0 && (ck.type_kind == DOUBLE || ck.type_kind == FLOAT))) { strm.data_ptrs[strm_type] = nullptr; } else { - if ((strm_type == gpu::CI_DICTIONARY) || - (strm_type == gpu::CI_DATA2 && ck.encoding_kind == DICTIONARY_V2)) { + if ((strm_type == CI_DICTIONARY) || + (strm_type == CI_DATA2 && ck.encoding_kind == DICTIONARY_V2)) { strm.data_ptrs[strm_type] = encoded_data[stripe.id][strm_id].data(); } else { strm.data_ptrs[strm_type] = (rg_idx_it == stripe.cbegin()) @@ -1003,16 +1000,16 @@ encoded_data encode_columns(orc_table_view const& orc_table, if (orc_table.num_rows() > 0) { if (orc_table.num_string_columns() != 0) { auto d_stripe_dict = orc_table.string_column(0).device_stripe_dicts(); - gpu::EncodeStripeDictionaries(d_stripe_dict.data(), - orc_table.d_columns, - chunks, - orc_table.num_string_columns(), - segmentation.num_stripes(), - chunk_streams, - stream); + EncodeStripeDictionaries(d_stripe_dict.data(), + orc_table.d_columns, + chunks, + orc_table.num_string_columns(), + segmentation.num_stripes(), + chunk_streams, + stream); } - gpu::EncodeOrcColumnData(chunks, chunk_streams, stream); + EncodeOrcColumnData(chunks, chunk_streams, 
stream); } chunk_streams.device_to_host_sync(stream); @@ -1034,7 +1031,7 @@ encoded_data encode_columns(orc_table_view const& orc_table, std::vector gather_stripes(size_t num_index_streams, file_segmentation const& segmentation, encoded_data* enc_data, - hostdevice_2dvector* strm_desc, + hostdevice_2dvector* strm_desc, rmm::cuda_stream_view stream) { if (segmentation.num_stripes() == 0) { return {}; } @@ -1051,7 +1048,7 @@ std::vector gather_stripes(size_t num_index_streams, for (size_t col_idx = 0; col_idx < enc_data->streams.size().first; col_idx++) { auto const& col_streams = (enc_data->streams)[col_idx]; // Assign stream data of column data stream(s) - for (int k = 0; k < gpu::CI_INDEX; k++) { + for (int k = 0; k < CI_INDEX; k++) { auto const stream_id = col_streams[0].ids[k]; if (stream_id != -1) { auto const actual_stripe_size = std::accumulate( @@ -1091,7 +1088,7 @@ std::vector gather_stripes(size_t num_index_streams, strm_desc->host_to_device_async(stream); // TODO: use cub::DeviceMemcpy::Batched - gpu::CompactOrcDataStreams(*strm_desc, enc_data->streams, stream); + CompactOrcDataStreams(*strm_desc, enc_data->streams, stream); strm_desc->device_to_host_async(stream); enc_data->streams.device_to_host_sync(stream); @@ -1123,17 +1120,17 @@ cudf::detail::hostdevice_vector allocate_and_encode_blobs( rmm::cuda_stream_view stream) { // figure out the buffer size needed for protobuf format - gpu::orc_init_statistics_buffersize( + orc_init_statistics_buffersize( stats_merge_groups.device_ptr(), stat_chunks.data(), num_stat_blobs, stream); auto max_blobs = stats_merge_groups.element(num_stat_blobs - 1, stream); cudf::detail::hostdevice_vector blobs(max_blobs.start_chunk + max_blobs.num_chunks, stream); - gpu::orc_encode_statistics(blobs.device_ptr(), - stats_merge_groups.device_ptr(), - stat_chunks.data(), - num_stat_blobs, - stream); + orc_encode_statistics(blobs.device_ptr(), + stats_merge_groups.device_ptr(), + stat_chunks.data(), + num_stat_blobs, + stream); stats_merge_groups.device_to_host_async(stream); blobs.device_to_host_sync(stream); return blobs; @@ -1238,7 +1235,7 @@ intermediate_statistics gather_statistic_blobs(statistics_freq const stats_freq, auto stripe_stat_chunks = stripe_chunks.data(); rmm::device_uvector rowgroup_groups(num_rowgroup_blobs, stream); - gpu::orc_init_statistics_groups( + orc_init_statistics_groups( rowgroup_groups.data(), stat_desc.device_ptr(), segmentation.rowgroups, stream); detail::calculate_group_statistics( @@ -1440,8 +1437,8 @@ void write_index_stream(int32_t stripe_id, int32_t stream_id, host_span columns, file_segmentation const& segmentation, - host_2dspan enc_streams, - host_2dspan strm_desc, + host_2dspan enc_streams, + host_2dspan strm_desc, host_span comp_res, host_span rg_stats, StripeInformation* stripe, @@ -1455,8 +1452,7 @@ void write_index_stream(int32_t stripe_id, row_group_index_info data2; auto const column_id = stream_id - 1; - auto find_record = [=, &strm_desc](gpu::encoder_chunk_streams const& stream, - gpu::StreamIndexType type) { + auto find_record = [=, &strm_desc](encoder_chunk_streams const& stream, StreamIndexType type) { row_group_index_info record; if (stream.ids[type] > 0) { record.pos = 0; @@ -1469,8 +1465,8 @@ void write_index_stream(int32_t stripe_id, } return record; }; - auto scan_record = [=, &comp_res](gpu::encoder_chunk_streams const& stream, - gpu::StreamIndexType type, + auto scan_record = [=, &comp_res](encoder_chunk_streams const& stream, + StreamIndexType type, row_group_index_info& record) { if (record.pos >= 
0) { record.pos += stream.lengths[type]; @@ -1489,9 +1485,9 @@ void write_index_stream(int32_t stripe_id, // TBD: Not sure we need an empty index stream for column 0 if (stream_id != 0) { auto const& strm = enc_streams[column_id][0]; - present = find_record(strm, gpu::CI_PRESENT); - data = find_record(strm, gpu::CI_DATA); - data2 = find_record(strm, gpu::CI_DATA2); + present = find_record(strm, CI_PRESENT); + data = find_record(strm, CI_DATA); + data2 = find_record(strm, CI_DATA2); // Change string dictionary to int from index point of view kind = columns[column_id].orc_kind(); @@ -1518,9 +1514,9 @@ void write_index_stream(int32_t stripe_id, if (stream_id != 0) { const auto& strm = enc_streams[column_id][rowgroup]; - scan_record(strm, gpu::CI_PRESENT, present); - scan_record(strm, gpu::CI_DATA, data); - scan_record(strm, gpu::CI_DATA2, data2); + scan_record(strm, CI_PRESENT, present); + scan_record(strm, CI_DATA, data); + scan_record(strm, CI_DATA2, data2); } }); @@ -1549,8 +1545,8 @@ void write_index_stream(int32_t stripe_id, * @param[in] stream CUDA stream used for device memory operations and kernel launches * @return An std::future that should be synchronized to ensure the writing is complete */ -std::future write_data_stream(gpu::StripeStream const& strm_desc, - gpu::encoder_chunk_streams const& enc_stream, +std::future write_data_stream(StripeStream const& strm_desc, + encoder_chunk_streams const& enc_stream, uint8_t const* compressed_data, host_span bounce_buffer, StripeInformation* stripe, @@ -1944,7 +1940,7 @@ encoder_decimal_info decimal_chunk_sizes(orc_table_view& orc_table, if (elem_sizes.empty()) return {}; // Compute element offsets within each row group - gpu::decimal_sizes_to_offsets(segmentation.rowgroups, elem_sizes, stream); + decimal_sizes_to_offsets(segmentation.rowgroups, elem_sizes, stream); // Gather the row group sizes and copy to host auto d_tmp_rowgroup_sizes = rmm::device_uvector(segmentation.num_rowgroups(), stream); @@ -2011,11 +2007,11 @@ auto set_rowgroup_char_counts(orc_table_view& orc_table, auto counts = rmm::device_uvector(num_str_cols * num_rowgroups, stream); auto counts_2d_view = device_2dspan(counts, num_rowgroups); - gpu::rowgroup_char_counts(counts_2d_view, - orc_table.d_columns, - rowgroup_bounds, - orc_table.d_string_column_indices, - stream); + rowgroup_char_counts(counts_2d_view, + orc_table.d_columns, + rowgroup_bounds, + orc_table.d_string_column_indices, + stream); auto const h_counts = cudf::detail::make_host_vector_sync(counts, stream); @@ -2030,7 +2026,7 @@ auto set_rowgroup_char_counts(orc_table_view& orc_table, // Holds the stripe dictionary descriptors and dictionary buffers. 
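One sizing rule from RLE_stream_size earlier in this file is worth spelling out (the stripe_dictionaries descriptor struct follows right after this note): the writer reserves worst-case output per RLE block, i.e. every value at maximum varint width plus a two-byte block header. A sketch of that arithmetic, names hypothetical:

#include <cstddef>

constexpr std::size_t max_varint_bytes = 10;  // ceil(64 / 7)

constexpr std::size_t rle_upper_bound(std::size_t count, std::size_t block_size)
{
  std::size_t const num_blocks = (count + block_size - 1) / block_size;
  return num_blocks * (block_size * max_varint_bytes + 2);
}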
struct stripe_dictionaries { - hostdevice_2dvector views; // descriptors [string_column][stripe] + hostdevice_2dvector views; // descriptors [string_column][stripe] std::vector> data_owner; // dictionary data owner, per stripe std::vector> index_owner; // dictionary index owner, per stripe std::vector> order_owner; // dictionary order owner, per stripe @@ -2082,17 +2078,17 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table, : segmentation.rowgroups[stripe.first + stripe.size - 1][col_idx].end - segmentation.rowgroups[stripe.first][col_idx].begin; hash_maps_storage_offsets[str_column.str_index()].emplace_back(total_map_storage_size); - total_map_storage_size += stripe_num_rows * gpu::occupancy_factor; + total_map_storage_size += stripe_num_rows * occupancy_factor; } hash_maps_storage_offsets[str_column.str_index()].emplace_back(total_map_storage_size); } - hostdevice_2dvector stripe_dicts( + hostdevice_2dvector stripe_dicts( orc_table.num_string_columns(), segmentation.num_stripes(), stream); if (stripe_dicts.count() == 0) return {std::move(stripe_dicts), {}, {}}; // Create a single bulk storage to use for all sub-dictionaries - auto map_storage = std::make_unique( + auto map_storage = std::make_unique( total_map_storage_size, cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}); @@ -2121,8 +2117,8 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table, } stripe_dicts.host_to_device_async(stream); - map_storage->initialize_async({gpu::KEY_SENTINEL, gpu::VALUE_SENTINEL}, {stream.value()}); - gpu::populate_dictionary_hash_maps(stripe_dicts, orc_table.d_columns, stream); + map_storage->initialize_async({KEY_SENTINEL, VALUE_SENTINEL}, {stream.value()}); + populate_dictionary_hash_maps(stripe_dicts, orc_table.d_columns, stream); // Copy the entry counts and char counts from the device to the host stripe_dicts.device_to_host_sync(stream); @@ -2169,8 +2165,8 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table, // Synchronize to ensure the copy is complete before we clear `map_slots` stripe_dicts.host_to_device_sync(stream); - gpu::collect_map_entries(stripe_dicts, stream); - gpu::get_dictionary_indices(stripe_dicts, orc_table.d_columns, stream); + collect_map_entries(stripe_dicts, stream); + get_dictionary_indices(stripe_dicts, orc_table.d_columns, stream); // deallocate hash map storage, unused after this point map_storage.reset(); @@ -2299,7 +2295,7 @@ auto convert_table_to_orc_data(table_view const& input, // Assemble individual disparate column chunks into contiguous data streams size_type const num_index_streams = (orc_table.num_columns() + 1); auto const num_data_streams = streams.size() - num_index_streams; - hostdevice_2dvector strm_descs( + hostdevice_2dvector strm_descs( segmentation.num_stripes(), num_data_streams, stream); auto stripes = gather_stripes(num_index_streams, segmentation, &enc_data, &strm_descs, stream); @@ -2353,17 +2349,17 @@ auto convert_table_to_orc_data(table_view const& input, compression_result{0, compression_status::FAILURE}); if (compression != compression_type::NONE) { strm_descs.host_to_device_async(stream); - compression_stats = gpu::CompressOrcDataStreams(compressed_data, - num_compressed_blocks, - compression, - compression_blocksize, - max_compressed_block_size, - block_align, - collect_compression_stats, - strm_descs, - enc_data.streams, - comp_results, - stream); + compression_stats = CompressOrcDataStreams(compressed_data, + num_compressed_blocks, + compression, + compression_blocksize, + 
max_compressed_block_size, + block_align, + collect_compression_stats, + strm_descs, + enc_data.streams, + comp_results, + stream); // deallocate encoded data as it is not needed anymore enc_data.data.clear(); @@ -2535,7 +2531,7 @@ void writer::impl::write_orc_data_to_sink(encoded_data const& enc_data, orc_table_view const& orc_table, device_span compressed_data, host_span comp_results, - host_2dspan strm_descs, + host_2dspan strm_descs, host_span rg_stats, orc_streams& streams, host_span stripes, diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp index 7d23482cb17..b6a27d5a6c5 100644 --- a/cpp/src/io/orc/writer_impl.hpp +++ b/cpp/src/io/orc/writer_impl.hpp @@ -136,7 +136,7 @@ struct file_segmentation { */ struct encoded_data { std::vector>> data; // Owning array of the encoded data - hostdevice_2dvector streams; // streams of encoded data, per chunk + hostdevice_2dvector streams; // streams of encoded data, per chunk }; /** @@ -309,7 +309,7 @@ class writer::impl { orc_table_view const& orc_table, device_span compressed_data, host_span comp_results, - host_2dspan strm_descs, + host_2dspan strm_descs, host_span rg_stats, orc_streams& streams, host_span stripes, diff --git a/cpp/src/io/parquet/bloom_filter_reader.cu b/cpp/src/io/parquet/bloom_filter_reader.cu index af524e1f70a..a883981a467 100644 --- a/cpp/src/io/parquet/bloom_filter_reader.cu +++ b/cpp/src/io/parquet/bloom_filter_reader.cu @@ -599,9 +599,11 @@ std::vector aggregate_reader_metadata::get_parquet_types( return parquet_types; } -std::optional>> aggregate_reader_metadata::apply_bloom_filters( +std::pair>>, bool> +aggregate_reader_metadata::apply_bloom_filters( host_span const> sources, host_span const> input_row_group_indices, + size_type total_row_groups, host_span output_dtypes, host_span output_column_schemas, std::reference_wrapper filter, @@ -610,17 +612,6 @@ std::optional>> aggregate_reader_metadata::ap // Number of input table columns auto const num_input_columns = static_cast(output_dtypes.size()); - // Total number of row groups after StatsAST filtration - auto const total_row_groups = std::accumulate( - input_row_group_indices.begin(), - input_row_group_indices.end(), - size_t{0}, - [](size_t sum, auto const& per_file_row_groups) { return sum + per_file_row_groups.size(); }); - - // Check if we have less than 2B total row groups. 
- CUDF_EXPECTS(total_row_groups <= std::numeric_limits::max(), - "Total number of row groups exceed the size_type's limit"); - // Collect equality literals for each input table column auto const equality_literals = equality_literals_collector{filter.get(), num_input_columns}.get_equality_literals(); @@ -635,7 +626,7 @@ std::optional>> aggregate_reader_metadata::ap [](auto& eq_literals) { return not eq_literals.empty(); }); // Return early if no column with equality predicate(s) - if (equality_col_schemas.empty()) { return std::nullopt; } + if (equality_col_schemas.empty()) { return {std::nullopt, false}; } // Required alignment: // https://github.com/NVIDIA/cuCollections/blob/deab5799f3e4226cb8a49acf2199c03b14941ee4/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh#L55-L67 @@ -654,8 +645,8 @@ std::optional>> aggregate_reader_metadata::ap auto bloom_filter_data = read_bloom_filters( sources, input_row_group_indices, equality_col_schemas, total_row_groups, stream, aligned_mr); - // No bloom filter buffers, return the original row group indices - if (bloom_filter_data.empty()) { return std::nullopt; } + // No bloom filter buffers, return early + if (bloom_filter_data.empty()) { return {std::nullopt, false}; } // Get parquet types for the predicate columns auto const parquet_types = get_parquet_types(input_row_group_indices, equality_col_schemas); @@ -676,8 +667,10 @@ std::optional>> aggregate_reader_metadata::ap h_bloom_filter_spans, stream, cudf::get_current_device_resource_ref()); // Create a bloom filter query table caster - bloom_filter_caster const bloom_filter_col{ - bloom_filter_spans, parquet_types, total_row_groups, equality_col_schemas.size()}; + bloom_filter_caster const bloom_filter_col{bloom_filter_spans, + parquet_types, + static_cast(total_row_groups), + equality_col_schemas.size()}; // Converts bloom filter membership for equality predicate columns to a table // containing a column for each `col[i] == literal` predicate to be evaluated. @@ -714,10 +707,11 @@ std::optional>> aggregate_reader_metadata::ap // Filter bloom filter membership table with the BloomfilterAST expression and collect // filtered row group indices - return collect_filtered_row_group_indices(bloom_filter_membership_table, - bloom_filter_expr.get_bloom_filter_expr(), - input_row_group_indices, - stream); + return {collect_filtered_row_group_indices(bloom_filter_membership_table, + bloom_filter_expr.get_bloom_filter_expr(), + input_row_group_indices, + stream), + true}; } } // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp index 0e307bac097..1508b7eef8b 100644 --- a/cpp/src/io/parquet/predicate_pushdown.cpp +++ b/cpp/src/io/parquet/predicate_pushdown.cpp @@ -388,40 +388,17 @@ class stats_expression_converter : public ast::detail::expression_transformer { }; } // namespace -std::optional>> aggregate_reader_metadata::filter_row_groups( +std::pair>>, surviving_row_group_metrics> +aggregate_reader_metadata::filter_row_groups( host_span const> sources, - host_span const> row_group_indices, + host_span const> input_row_group_indices, + size_type total_row_groups, host_span output_dtypes, host_span output_column_schemas, std::reference_wrapper filter, rmm::cuda_stream_view stream) const { auto mr = cudf::get_current_device_resource_ref(); - // Create row group indices. 
- std::vector> all_row_group_indices; - host_span const> input_row_group_indices; - if (row_group_indices.empty()) { - std::transform(per_file_metadata.cbegin(), - per_file_metadata.cend(), - std::back_inserter(all_row_group_indices), - [](auto const& file_meta) { - std::vector rg_idx(file_meta.row_groups.size()); - std::iota(rg_idx.begin(), rg_idx.end(), 0); - return rg_idx; - }); - input_row_group_indices = host_span const>(all_row_group_indices); - } else { - input_row_group_indices = row_group_indices; - } - auto const total_row_groups = std::accumulate( - input_row_group_indices.begin(), - input_row_group_indices.end(), - size_t{0}, - [](size_t sum, auto const& per_file_row_groups) { return sum + per_file_row_groups.size(); }); - - // Check if we have less than 2B total row groups. - CUDF_EXPECTS(total_row_groups <= std::numeric_limits::max(), - "Total number of row groups exceed the size_type's limit"); // Converts Column chunk statistics to a table // where min(col[i]) = columns[i*2], max(col[i])=columns[i*2+1] @@ -451,16 +428,22 @@ std::optional>> aggregate_reader_metadata::fi // Converts AST to StatsAST with reference to min, max columns in above `stats_table`. stats_expression_converter const stats_expr{filter.get(), static_cast(output_dtypes.size())}; - auto stats_ast = stats_expr.get_stats_expr(); - auto predicate_col = cudf::detail::compute_column(stats_table, stats_ast.get(), stream, mr); - auto predicate = predicate_col->view(); - CUDF_EXPECTS(predicate.type().id() == cudf::type_id::BOOL8, - "Filter expression must return a boolean column"); // Filter stats table with StatsAST expression and collect filtered row group indices auto const filtered_row_group_indices = collect_filtered_row_group_indices( stats_table, stats_expr.get_stats_expr(), input_row_group_indices, stream); + // Number of surviving row groups after applying stats filter + auto const num_stats_filtered_row_groups = + filtered_row_group_indices.has_value() + ? std::accumulate(filtered_row_group_indices.value().cbegin(), + filtered_row_group_indices.value().cend(), + size_type{0}, + [](auto& sum, auto const& per_file_row_groups) { + return sum + per_file_row_groups.size(); + }) + : total_row_groups; + // Span of row groups to apply bloom filtering on. auto const bloom_filter_input_row_groups = filtered_row_group_indices.has_value() @@ -468,12 +451,32 @@ std::optional>> aggregate_reader_metadata::fi : input_row_group_indices; // Apply bloom filtering on the bloom filter input row groups - auto const bloom_filtered_row_groups = apply_bloom_filters( - sources, bloom_filter_input_row_groups, output_dtypes, output_column_schemas, filter, stream); + auto const [bloom_filtered_row_groups, bloom_filters_exist] = + apply_bloom_filters(sources, + bloom_filter_input_row_groups, + num_stats_filtered_row_groups, + output_dtypes, + output_column_schemas, + filter, + stream); + + // Number of surviving row groups after applying bloom filter + auto const num_bloom_filtered_row_groups = + bloom_filters_exist + ? (bloom_filtered_row_groups.has_value() + ? std::make_optional(std::accumulate(bloom_filtered_row_groups.value().cbegin(), + bloom_filtered_row_groups.value().cend(), + size_type{0}, + [](auto& sum, auto const& per_file_row_groups) { + return sum + per_file_row_groups.size(); + })) + : std::make_optional(num_stats_filtered_row_groups)) + : std::nullopt; // Return bloom filtered row group indices iff collected - return bloom_filtered_row_groups.has_value() ? 
bloom_filtered_row_groups - : filtered_row_group_indices; + return { + bloom_filtered_row_groups.has_value() ? bloom_filtered_row_groups : filtered_row_group_indices, + {std::make_optional(num_stats_filtered_row_groups), num_bloom_filtered_row_groups}}; } // convert column named expression to column index reference expression diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 9dd4e19de52..87e358e89f8 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -610,6 +610,17 @@ table_with_metadata reader::impl::read_chunk_internal(read_mode mode) auto out_columns = std::vector>{}; out_columns.reserve(_output_buffers.size()); + // Copy number of total input row groups and number of surviving row groups from predicate + // pushdown. + out_metadata.num_input_row_groups = _file_itm_data.num_input_row_groups; + // Copy the number surviving row groups from each predicate pushdown only if the filter has value. + if (_expr_conv.get_converted_expr().has_value()) { + out_metadata.num_row_groups_after_stats_filter = + _file_itm_data.surviving_row_groups.after_stats_filter; + out_metadata.num_row_groups_after_bloom_filter = + _file_itm_data.surviving_row_groups.after_bloom_filter; + } + // no work to do (this can happen on the first pass if we have no rows to read) if (!has_more_work()) { // Check if number of rows per source should be included in output metadata. diff --git a/cpp/src/io/parquet/reader_impl_chunking.hpp b/cpp/src/io/parquet/reader_impl_chunking.hpp index 4a773fbced1..294eaf9ac16 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.hpp +++ b/cpp/src/io/parquet/reader_impl_chunking.hpp @@ -47,6 +47,11 @@ struct file_intermediate_data { // partial sum of the number of rows per data source std::vector exclusive_sum_num_rows_per_source{}; + size_type num_input_row_groups{0}; // total number of input row groups across all data sources + + // struct containing the number of remaining row groups after each predicate pushdown filter + surviving_row_group_metrics surviving_row_groups; + size_t _current_input_pass{0}; // current input pass index size_t _output_chunk_count{0}; // how many output chunks we have produced diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index 7d3b6a39d5b..768ca384352 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -408,10 +408,16 @@ int64_t aggregate_reader_metadata::calc_num_rows() const size_type aggregate_reader_metadata::calc_num_row_groups() const { - return std::accumulate( - per_file_metadata.cbegin(), per_file_metadata.cend(), 0, [](auto& sum, auto& pfm) { + auto const total_row_groups = std::accumulate( + per_file_metadata.cbegin(), per_file_metadata.cend(), size_t{0}, [](size_t& sum, auto& pfm) { return sum + pfm.row_groups.size(); }); + + // Check if we have less than 2B total row groups. + CUDF_EXPECTS(total_row_groups <= std::numeric_limits::max(), + "Total number of row groups exceed the size_type's limit"); + + return static_cast(total_row_groups); } // Copies info from the column and offset indexes into the passed in row_group_info. 
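calc_num_row_groups above shows the pattern this change applies to every row-group total: accumulate in size_t, validate against the size_type limit (cudf::size_type is a 32-bit int), then narrow once. The same check in isolation:

#include <cstddef>
#include <cstdint>
#include <limits>
#include <numeric>
#include <stdexcept>
#include <vector>

// Accumulate in a wide type, check the bound, then narrow exactly once.
int32_t checked_total(std::vector<std::size_t> const& counts)
{
  auto const total = std::accumulate(counts.begin(), counts.end(), std::size_t{0});
  if (total > static_cast<std::size_t>(std::numeric_limits<int32_t>::max())) {
    throw std::overflow_error("total number of row groups exceeds the size_type limit");
  }
  return static_cast<int32_t>(total);
}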
@@ -1029,7 +1035,12 @@ std::vector<std::string> aggregate_reader_metadata::get_pandas_index_names() con
   return names;
 }
 
-std::tuple<int64_t, size_type, std::vector<row_group_info>, std::vector<size_t>>
+std::tuple<int64_t,
+           size_type,
+           std::vector<row_group_info>,
+           std::vector<size_t>,
+           size_type,
+           surviving_row_group_metrics>
 aggregate_reader_metadata::select_row_groups(
   host_span<std::unique_ptr<datasource> const> sources,
   host_span<std::vector<size_type> const> row_group_indices,
@@ -1040,17 +1051,63 @@ aggregate_reader_metadata::select_row_groups(
   std::optional<std::reference_wrapper<ast::expression const>> filter,
   rmm::cuda_stream_view stream) const
 {
+  // Compute total number of input row groups
+  size_type total_row_groups = [&]() {
+    if (not row_group_indices.empty()) {
+      size_t const total_row_groups =
+        std::accumulate(row_group_indices.begin(),
+                        row_group_indices.end(),
+                        size_t{0},
+                        [](size_t& sum, auto const& pfm) { return sum + pfm.size(); });
+
+      // Check if we have less than 2B total row groups.
+      CUDF_EXPECTS(total_row_groups <= std::numeric_limits<size_type>::max(),
+                   "Total number of row groups exceed the size_type's limit");
+      return static_cast<size_type>(total_row_groups);
+    } else {
+      return num_row_groups;
+    }
+  }();
+
+  // Pair to store the number of row groups after stats and bloom filtering respectively. Initialize
+  // to total_row_groups.
+  surviving_row_group_metrics num_row_groups_after_filters{};
+
   std::optional<std::vector<std::vector<size_type>>> filtered_row_group_indices;
   // if filter is not empty, then gather row groups to read after predicate pushdown
   if (filter.has_value()) {
-    filtered_row_group_indices = filter_row_groups(
-      sources, row_group_indices, output_dtypes, output_column_schemas, filter.value(), stream);
+    // Span of input row group indices for predicate pushdown
+    host_span<std::vector<size_type> const> input_row_group_indices;
+    std::vector<std::vector<size_type>> all_row_group_indices;
+    if (row_group_indices.empty()) {
+      std::transform(per_file_metadata.cbegin(),
+                     per_file_metadata.cend(),
+                     std::back_inserter(all_row_group_indices),
+                     [](auto const& file_meta) {
+                       std::vector<size_type> rg_idx(file_meta.row_groups.size());
+                       std::iota(rg_idx.begin(), rg_idx.end(), 0);
+                       return rg_idx;
+                     });
+      input_row_group_indices = host_span<std::vector<size_type> const>(all_row_group_indices);
+    } else {
+      input_row_group_indices = row_group_indices;
+    }
+    // Predicate pushdown: Filter row groups using stats and bloom filters
+    std::tie(filtered_row_group_indices, num_row_groups_after_filters) =
+      filter_row_groups(sources,
+                        input_row_group_indices,
+                        total_row_groups,
+                        output_dtypes,
+                        output_column_schemas,
+                        filter.value(),
+                        stream);
     if (filtered_row_group_indices.has_value()) {
       row_group_indices =
         host_span<std::vector<size_type> const>(filtered_row_group_indices.value());
     }
   }
-  std::vector<row_group_info> selection;
+
+  // Compute the number of rows to read and skip
   auto [rows_to_skip, rows_to_read] = [&]() {
     if (not row_group_indices.empty()) { return std::pair<int64_t, size_type>{}; }
     auto const from_opts = cudf::io::detail::skip_rows_num_rows_from_options(
@@ -1061,7 +1118,9 @@ aggregate_reader_metadata::select_row_groups(
                      static_cast<size_type>(from_opts.second)};
   }();
 
-  // Get number of rows in each data source
+  // Vector to hold the `row_group_info` of selected row groups
+  std::vector<row_group_info> selection;
+  // Number of rows in each data source
   std::vector<size_t> num_rows_per_source(per_file_metadata.size(), 0);
 
   if (!row_group_indices.empty()) {
@@ -1083,6 +1142,10 @@ aggregate_reader_metadata::select_row_groups(
       }
     }
   } else {
+    // Reset and recompute input row group count to adjust for num_rows and skip_rows. Here, the
+    // output from predicate pushdown was empty. i.e., no row groups filtered.
+    total_row_groups = 0;
+
     size_type count = 0;
     for (size_t src_idx = 0; src_idx < per_file_metadata.size(); ++src_idx) {
       auto const& fmd = per_file_metadata[src_idx];
@@ -1093,6 +1156,9 @@ aggregate_reader_metadata::select_row_groups(
         auto const chunk_start_row = count;
         count += rg.num_rows;
         if (count > rows_to_skip || count == 0) {
+          // Keep this row group, increase count
+          total_row_groups++;
+
           // start row of this row group adjusted with rows_to_skip
           num_rows_per_source[src_idx] += count;
           num_rows_per_source[src_idx] -=
@@ -1113,9 +1179,24 @@ aggregate_reader_metadata::select_row_groups(
         }
       }
     }
+
+    // If filter had a value and no row groups were filtered, set the number of row groups after
+    // filters to the number of adjusted input row groups
+    auto const after_stats_filter = num_row_groups_after_filters.after_stats_filter.has_value()
+                                      ? std::make_optional(total_row_groups)
+                                      : std::nullopt;
+    auto const after_bloom_filter = num_row_groups_after_filters.after_bloom_filter.has_value()
+                                      ? std::make_optional(total_row_groups)
+                                      : std::nullopt;
+    num_row_groups_after_filters = {after_stats_filter, after_bloom_filter};
   }
 
-  return {rows_to_skip, rows_to_read, std::move(selection), std::move(num_rows_per_source)};
+  return {rows_to_skip,
+          rows_to_read,
+          std::move(selection),
+          std::move(num_rows_per_source),
+          total_row_groups,
+          std::move(num_row_groups_after_filters)};
 }
 
 std::tuple<std::vector<input_column_info>,
diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp
index ba5e53e3104..c4372b2c1ff 100644
--- a/cpp/src/io/parquet/reader_impl_helpers.hpp
+++ b/cpp/src/io/parquet/reader_impl_helpers.hpp
@@ -125,6 +125,14 @@ struct arrow_schema_data_types {
   data_type type{type_id::EMPTY};
 };
 
+/**
+ * @brief Struct to store the number of row groups surviving each predicate pushdown filter.
+ */
+struct surviving_row_group_metrics {
+  std::optional<size_type> after_stats_filter;  // number of surviving row groups after stats filter
+  std::optional<size_type> after_bloom_filter;  // number of surviving row groups after bloom filter
+};
+
 class aggregate_reader_metadata {
   std::vector<metadata> per_file_metadata;
   std::vector<std::unordered_map<std::string, std::string>> keyval_maps;
@@ -358,40 +366,47 @@ class aggregate_reader_metadata {
    * @brief Filters the row groups based on predicate filter
    *
    * @param sources Lists of input datasources
-   * @param row_group_indices Lists of row groups to read, one per source
+   * @param input_row_group_indices Lists of input row groups, one per source
+   * @param total_row_groups Total number of row groups in `input_row_group_indices`
    * @param output_dtypes Datatypes of output columns
    * @param output_column_schemas schema indices of output columns
    * @param filter AST expression to filter row groups based on Column chunk statistics
    * @param stream CUDA stream used for device memory operations and kernel launches
-   * @return Filtered row group indices, if any is filtered
+   * @return A pair of a list of filtered row group indices if any are filtered, and a struct
+   *         containing the number of row groups surviving each predicate pushdown filter
    */
-  [[nodiscard]] std::optional<std::vector<std::vector<size_type>>> filter_row_groups(
-    host_span<std::unique_ptr<datasource> const> sources,
-    host_span<std::vector<size_type> const> row_group_indices,
-    host_span<data_type const> output_dtypes,
-    host_span<int const> output_column_schemas,
-    std::reference_wrapper<ast::expression const> filter,
-    rmm::cuda_stream_view stream) const;
+  [[nodiscard]] std::pair<std::optional<std::vector<std::vector<size_type>>>,
+                          surviving_row_group_metrics>
+  filter_row_groups(host_span<std::unique_ptr<datasource> const> sources,
+                    host_span<std::vector<size_type> const> input_row_group_indices,
+                    size_type total_row_groups,
+                    host_span<data_type const> output_dtypes,
+                    host_span<int const> output_column_schemas,
+                    std::reference_wrapper<ast::expression const> filter,
+                    rmm::cuda_stream_view stream) const;
 
   /**
    * @brief Filters the row groups using bloom filters
    *
    * @param sources Dataset sources
-   * @param row_group_indices Lists of input row groups to read, one per source
+   * @param input_row_group_indices Lists of input row groups, one per source
+   * @param total_row_groups Total number of row groups in `input_row_group_indices`
    * @param output_dtypes Datatypes of output columns
    * @param output_column_schemas schema indices of output columns
    * @param filter AST expression to filter row groups based on bloom filter membership
    * @param stream CUDA stream used for device memory operations and kernel launches
    *
-   * @return Filtered row group indices, if any is filtered
+   * @return A pair of filtered row group indices if any is filtered, and a boolean indicating if
+   *         bloom filtering was applied
    */
-  [[nodiscard]] std::optional<std::vector<std::vector<size_type>>> apply_bloom_filters(
-    host_span<std::unique_ptr<datasource> const> sources,
-    host_span<std::vector<size_type> const> input_row_group_indices,
-    host_span<data_type const> output_dtypes,
-    host_span<int const> output_column_schemas,
-    std::reference_wrapper<ast::expression const> filter,
-    rmm::cuda_stream_view stream) const;
+  [[nodiscard]] std::pair<std::optional<std::vector<std::vector<size_type>>>, bool>
+  apply_bloom_filters(host_span<std::unique_ptr<datasource> const> sources,
+                      host_span<std::vector<size_type> const> input_row_group_indices,
+                      size_type total_row_groups,
+                      host_span<data_type const> output_dtypes,
+                      host_span<int const> output_column_schemas,
+                      std::reference_wrapper<ast::expression const> filter,
+                      rmm::cuda_stream_view stream) const;
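To make the optional semantics concrete, a standalone sketch (not part of the patch; `size_type` mirrors `cudf::size_type`, which is `int32_t`). A disengaged optional means that filter stage never ran, so each stage falls back to the previous surviving count:

```cpp
#include <cstdint>
#include <iostream>
#include <optional>

using size_type = int32_t;  // mirrors cudf::size_type

struct surviving_row_group_metrics {
  std::optional<size_type> after_stats_filter;
  std::optional<size_type> after_bloom_filter;
};

void print_pruning_summary(size_type num_input_row_groups, surviving_row_group_metrics const& m)
{
  // A missing value means the stage was skipped (no filter expression, or no
  // bloom filters present in the file), so the prior count carries through.
  auto const after_stats = m.after_stats_filter.value_or(num_input_row_groups);
  auto const after_bloom = m.after_bloom_filter.value_or(after_stats);
  std::cout << num_input_row_groups << " -> " << after_stats << " (stats) -> " << after_bloom
            << " (bloom)\n";
}

int main()
{
  // 8 input row groups, stats pruned down to 3, bloom filters absent.
  print_pruning_summary(8, {std::make_optional<size_type>(3), std::nullopt});
  return 0;
}
```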
  /**
   * @brief Filters and reduces down to a selection of row groups
   *
@@ -408,9 +423,15 @@ class aggregate_reader_metadata {
   * @param filter Optional AST expression to filter row groups based on Column chunk statistics
   * @param stream CUDA stream used for device memory operations and kernel launches
   * @return A tuple of corrected row_start, row_count, list of row group indexes and its
-   *         starting row, and list of number of rows per source
+   *         starting row, list of number of rows per source, number of input row groups, and a
+   *         struct containing the number of row groups surviving each predicate pushdown filter
   */
-  [[nodiscard]] std::tuple<int64_t, size_type, std::vector<row_group_info>, std::vector<size_t>>
+  [[nodiscard]] std::tuple<int64_t,
+                           size_type,
+                           std::vector<row_group_info>,
+                           std::vector<size_t>,
+                           size_type,
+                           surviving_row_group_metrics>
   select_row_groups(host_span<std::unique_ptr<datasource> const> sources,
                     host_span<std::vector<size_type> const> row_group_indices,
                     int64_t row_start,
diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu
index 3874346e471..b6134947b0c 100644
--- a/cpp/src/io/parquet/reader_impl_preprocess.cu
+++ b/cpp/src/io/parquet/reader_impl_preprocess.cu
@@ -1285,7 +1285,9 @@ void reader::impl::preprocess_file(read_mode mode)
   std::tie(_file_itm_data.global_skip_rows,
            _file_itm_data.global_num_rows,
            _file_itm_data.row_groups,
-           _file_itm_data.num_rows_per_source) =
+           _file_itm_data.num_rows_per_source,
+           _file_itm_data.num_input_row_groups,
+           _file_itm_data.surviving_row_groups) =
     _metadata->select_row_groups(_sources,
                                  _options.row_group_indices,
                                  _options.skip_rows,
diff --git a/cpp/src/unary/math_ops.cu b/cpp/src/unary/math_ops.cu
index 1d506c59cd9..4e96f900bf3 100644
--- a/cpp/src/unary/math_ops.cu
+++ b/cpp/src/unary/math_ops.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -234,6 +234,16 @@ struct DeviceNot {
   }
 };
 
+// negation
+
+struct DeviceNegate {
+  template <typename T>
+  T __device__ operator()(T data)
+  {
+    return -data;
+  }
+};
+
 // fixed_point ops
 
 /*
@@ -278,6 +288,12 @@ struct fixed_point_abs {
   __device__ T operator()(T data) { return numeric::detail::abs(data); }
 };
 
+template <typename T>
+struct fixed_point_negate {
+  T n;
+  __device__ T operator()(T data) { return -data; }
+};
+
 template <typename T, template <typename> typename FixedPointFunctor>
 std::unique_ptr<column> unary_op_with(column_view const& input,
                                       rmm::cuda_stream_view stream,
@@ -414,6 +430,34 @@ struct MathOpDispatcher {
   }
 };
 
+template <typename UFN>
+struct NegateOpDispatcher {
+  template <typename T>
+  static constexpr bool is_supported()
+  {
+    return std::is_signed_v<T> || cudf::is_duration<T>();
+  }
+
+  template <typename T, std::enable_if_t<is_supported<T>()>* = nullptr>
+  std::unique_ptr<cudf::column> operator()(cudf::column_view const& input,
+                                           rmm::cuda_stream_view stream,
+                                           rmm::device_async_resource_ref mr)
+  {
+    return transform_fn<T, UFN>(input.begin<T>(),
+                                input.end<T>(),
+                                cudf::detail::copy_bitmask(input, stream, mr),
+                                input.null_count(),
+                                stream,
+                                mr);
+  }
+
+  template <typename T, typename... Args>
+  std::enable_if_t<!is_supported<T>(), std::unique_ptr<cudf::column>> operator()(Args&&...)
+  {
+    CUDF_FAIL("Unsupported data type for negate operation");
+  }
+};
+
 template <typename UFN>
 struct BitwiseOpDispatcher {
   template <typename T, std::enable_if_t<std::is_integral_v<T>>* = nullptr>
@@ -550,9 +594,10 @@ struct FixedPointOpDispatcher {
   {
     // clang-format off
     switch (op) {
-      case cudf::unary_operator::CEIL:   return unary_op_with<T, fixed_point_ceil>(input, stream, mr);
-      case cudf::unary_operator::FLOOR:  return unary_op_with<T, fixed_point_floor>(input, stream, mr);
-      case cudf::unary_operator::ABS:    return unary_op_with<T, fixed_point_abs>(input, stream, mr);
+      case cudf::unary_operator::CEIL:   return unary_op_with<T, fixed_point_ceil>(input, stream, mr);
+      case cudf::unary_operator::FLOOR:  return unary_op_with<T, fixed_point_floor>(input, stream, mr);
+      case cudf::unary_operator::ABS:    return unary_op_with<T, fixed_point_abs>(input, stream, mr);
+      case cudf::unary_operator::NEGATE: return unary_op_with<T, fixed_point_negate>(input, stream, mr);
       default: CUDF_FAIL("Unsupported fixed_point unary operation");
     }
     // clang-format on
@@ -639,6 +684,9 @@ std::unique_ptr<cudf::column> unary_operation(cudf::column_view const& input,
     case cudf::unary_operator::NOT:
       return cudf::type_dispatcher(
         input.type(), detail::LogicalOpDispatcher<DeviceNot>{}, input, stream, mr);
+    case cudf::unary_operator::NEGATE:
+      return cudf::type_dispatcher(
+        input.type(), detail::NegateOpDispatcher<DeviceNegate>{}, input, stream, mr);
     default: CUDF_FAIL("Undefined unary operation");
   }
 }
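A quick usage sketch for the new operator (an illustration, not part of the patch): NEGATE dispatches through `NegateOpDispatcher` for signed numeric and duration columns, through `FixedPointOpDispatcher` for decimals, and throws for everything else:

```cpp
#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/unary.hpp>

#include <memory>

// Elementwise `-x`; the input's null mask is copied to the result.
std::unique_ptr<cudf::column> negate(cudf::column_view const& input)
{
  // Throws cudf::logic_error for unsigned integer, timestamp, string, ... inputs.
  return cudf::unary_operation(input, cudf::unary_operator::NEGATE);
}
```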
diff --git a/cpp/tests/io/parquet_reader_test.cpp b/cpp/tests/io/parquet_reader_test.cpp
index 177e6163d4f..b96c423917a 100644
--- a/cpp/tests/io/parquet_reader_test.cpp
+++ b/cpp/tests/io/parquet_reader_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -1328,6 +1328,26 @@ TEST_F(ParquetReaderTest, ReorderedReadMultipleFiles)
   CUDF_TEST_EXPECT_TABLES_EQUAL(sliced[1], swapped2);
 }
 
+TEST_F(ParquetReaderTest, NoFilter)
+{
+  srand(31337);
+  auto expected = create_random_fixed_table<int>(9, 9, false);
+
+  auto filepath = temp_env->get_temp_filepath("FilterSimple.parquet");
+  cudf::io::parquet_writer_options args =
+    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, *expected);
+  cudf::io::write_parquet(args);
+
+  cudf::io::parquet_reader_options read_opts =
+    cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath});
+  auto result = cudf::io::read_parquet(read_opts);
+
+  CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *expected);
+  EXPECT_EQ(result.metadata.num_input_row_groups, 1);
+  EXPECT_FALSE(result.metadata.num_row_groups_after_stats_filter.has_value());
+  EXPECT_FALSE(result.metadata.num_row_groups_after_bloom_filter.has_value());
+}
+
 TEST_F(ParquetReaderTest, FilterSimple)
 {
   srand(31337);
@@ -2681,52 +2701,107 @@ TYPED_TEST(ParquetReaderPredicatePushdownTest, FilterTyped)
   auto const [src, filepath] = create_parquet_typed_with_stats<TypeParam>("FilterTyped.parquet");
   auto const written_table   = src.view();
 
+  auto const col_name_0 = cudf::ast::column_name_reference("col0");
+  auto const col_ref_0  = cudf::ast::column_reference(0);
 
-  // Filtering AST
-  auto literal_value = []() {
-    if constexpr (cudf::is_timestamp<T>()) {
-      // table[0] < 10000 timestamp days/seconds/milliseconds/microseconds/nanoseconds
-      return cudf::timestamp_scalar<T>(T(typename T::duration(10000)));  // i (0-20,000)
-    } else if constexpr (cudf::is_duration<T>()) {
-      // table[0] < 10000 day/seconds/milliseconds/microseconds/nanoseconds
-      return cudf::duration_scalar<T>(T(10000));  // i (0-20,000)
-    } else if constexpr (std::is_same_v<T, cudf::string_view>) {
-      // table[0] < "000010000"
-      return cudf::string_scalar("000010000");  // i (0-20,000)
+  auto const test_predicate_pushdown = [&](cudf::ast::operation const& filter_expression,
+                                           cudf::ast::operation const& ref_filter,
+                                           cudf::size_type expected_total_row_groups,
+                                           cudf::size_type expected_stats_filtered_row_groups) {
+    // Expected result
+    auto const predicate = cudf::compute_column(written_table, ref_filter);
+    EXPECT_EQ(predicate->view().type().id(), cudf::type_id::BOOL8)
+      << "Predicate filter should return a boolean";
+    auto const expected = cudf::apply_boolean_mask(written_table, *predicate);
+
+    // Reading with Predicate Pushdown
+    cudf::io::parquet_reader_options read_opts =
+      cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath})
+        .filter(filter_expression);
+    auto const result       = cudf::io::read_parquet(read_opts);
+    auto const result_table = result.tbl->view();
+
+    // Tests
+    EXPECT_EQ(static_cast<int>(written_table.column(0).type().id()),
+              static_cast<int>(result_table.column(0).type().id()))
+      << "col0 type mismatch";
+
+    // To make sure AST filters out some elements if row groups must be filtered
+    if (expected_stats_filtered_row_groups < expected_total_row_groups) {
+      EXPECT_LT(expected->num_rows(), written_table.num_rows());
     } else {
-      // table[0] < 0 or 100u
-      return cudf::numeric_scalar<T>((100 - 100 * std::is_signed_v<T>));  // i/100 (-100-100/ 0-200)
+      EXPECT_LE(expected->num_rows(), written_table.num_rows());
     }
-  }();
-  auto literal           = cudf::ast::literal(literal_value);
-  auto col_name_0        = cudf::ast::column_name_reference("col0");
-  auto filter_expression = cudf::ast::operation(cudf::ast::ast_operator::LESS, col_name_0, literal);
-  auto col_ref_0         = cudf::ast::column_reference(0);
-  auto ref_filter        = cudf::ast::operation(cudf::ast::ast_operator::LESS, col_ref_0, literal);
-
-  // Expected result
-  auto predicate = cudf::compute_column(written_table, ref_filter);
-  EXPECT_EQ(predicate->view().type().id(), cudf::type_id::BOOL8)
-    << "Predicate filter should return a boolean";
-  auto expected = cudf::apply_boolean_mask(written_table, *predicate);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result_table);
+    EXPECT_EQ(result.metadata.num_input_row_groups, expected_total_row_groups);
+    EXPECT_TRUE(result.metadata.num_row_groups_after_stats_filter.has_value());
+    EXPECT_EQ(result.metadata.num_row_groups_after_stats_filter.value(),
+              expected_stats_filtered_row_groups);
+    EXPECT_FALSE(result.metadata.num_row_groups_after_bloom_filter.has_value());
+  };
 
-  // Reading with Predicate Pushdown
-  cudf::io::parquet_reader_options read_opts =
-    cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath})
-      .filter(filter_expression);
-  auto result       = cudf::io::read_parquet(read_opts);
-  auto result_table = result.tbl->view();
+  // The `literal_value` and stats should filter out 2 out of 4 row groups.
+  {
+    auto constexpr expected_total_row_groups          = 4;
+    auto constexpr expected_stats_filtered_row_groups = 2;
+
+    // Filtering AST
+    auto literal_value = []() {
+      if constexpr (cudf::is_timestamp<T>()) {
+        // table[0] < 10000 timestamp days/seconds/milliseconds/microseconds/nanoseconds
+        return cudf::timestamp_scalar<T>(T(typename T::duration(10000)));  // i (0-20,000)
+      } else if constexpr (cudf::is_duration<T>()) {
+        // table[0] < 10000 day/seconds/milliseconds/microseconds/nanoseconds
+        return cudf::duration_scalar<T>(T(10000));  // i (0-20,000)
+      } else if constexpr (std::is_same_v<T, cudf::string_view>) {
+        // table[0] < "000010000"
+        return cudf::string_scalar("000010000");  // i (0-20,000)
+      } else {
+        // table[0] < 0 or 100u
+        return cudf::numeric_scalar<T>(
+          (100 - 100 * std::is_signed_v<T>));  // i/100 (-100-100/ 0-200)
+      }
+    }();
+
+    auto const literal = cudf::ast::literal(literal_value);
+    auto const filter_expression =
+      cudf::ast::operation(cudf::ast::ast_operator::LESS, col_name_0, literal);
+    auto const ref_filter = cudf::ast::operation(cudf::ast::ast_operator::LESS, col_ref_0, literal);
+    test_predicate_pushdown(
+      filter_expression, ref_filter, expected_total_row_groups, expected_stats_filtered_row_groups);
+  }
 
-  // tests
-  EXPECT_EQ(int(written_table.column(0).type().id()), int(result_table.column(0).type().id()))
-    << "col0 type mismatch";
-  // To make sure AST filters out some elements
-  EXPECT_LT(expected->num_rows(), written_table.num_rows());
-  EXPECT_EQ(result_table.num_rows(), expected->num_rows());
-  EXPECT_EQ(result_table.num_columns(), expected->num_columns());
-  CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result_table);
+  // The `literal_value` and stats should not filter any of the 4 row groups.
+  {
+    auto constexpr expected_total_row_groups          = 4;
+    auto constexpr expected_stats_filtered_row_groups = 4;
+
+    // Filtering AST
+    auto literal_value = []() {
+      if constexpr (cudf::is_timestamp<T>()) {
+        return cudf::timestamp_scalar<T>(T(typename T::duration(20000)));
+      } else if constexpr (cudf::is_duration<T>()) {
+        return cudf::duration_scalar<T>(T(20000));
+      } else if constexpr (std::is_same_v<T, cudf::string_view>) {
+        return cudf::string_scalar("000020000");
+      } else {
+        return cudf::numeric_scalar<T>(std::numeric_limits<T>::max());
+      }
+    }();
+
+    auto const literal = cudf::ast::literal(literal_value);
+    auto const filter_expression =
+      cudf::ast::operation(cudf::ast::ast_operator::LESS_EQUAL, col_name_0, literal);
+    auto const ref_filter =
+      cudf::ast::operation(cudf::ast::ast_operator::LESS_EQUAL, col_ref_0, literal);
+    test_predicate_pushdown(
+      filter_expression, ref_filter, expected_total_row_groups, expected_stats_filtered_row_groups);
+  }
 }
 
+//////////////////////
+// wide tables tests
+
 // The test below requires several minutes to complete with memcheck, thus it is disabled by
 // default.
 TEST_F(ParquetReaderTest, DISABLED_ListsWideTable)
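These tests depend on the writer splitting `FilterTyped.parquet` into four row groups so that statistics can prune at row-group granularity. A hedged sketch of the knob involved (hypothetical table and sizing; `row_group_size_rows` is a pre-existing writer option, not something this patch adds):

```cpp
#include <cudf/io/parquet.hpp>
#include <cudf/table/table_view.hpp>

#include <string>

void write_with_small_row_groups(cudf::table_view const& table, std::string const& path)
{
  auto const opts =
    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{path}, table)
      // e.g. 20'000 input rows -> 4 row groups of 5'000 rows each
      .row_group_size_rows(5000)
      .build();
  cudf::io::write_parquet(opts);
}
```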
@@ -25,6 +25,69 @@
 #include <cmath>
 
+using TypesToNegate = cudf::test::Types<int8_t,
+                                        int16_t,
+                                        int32_t,
+                                        int64_t,
+                                        float,
+                                        double,
+                                        cudf::duration_D,
+                                        cudf::duration_s,
+                                        cudf::duration_ms,
+                                        cudf::duration_us,
+                                        cudf::duration_ns>;
+
+template <typename T>
+struct UnaryNegateTests : public cudf::test::BaseFixture {};
+
+TYPED_TEST_SUITE(UnaryNegateTests, TypesToNegate);
+
+TYPED_TEST(UnaryNegateTests, SimpleNEGATE)
+{
+  using T = TypeParam;
+  cudf::test::fixed_width_column_wrapper<T, int32_t> input{{0, 1, 2, 3}};
+  auto const v = cudf::test::make_type_param_vector<T>({0, -1, -2, -3});
+  cudf::test::fixed_width_column_wrapper<T> expected(v.begin(), v.end());
+  auto output = cudf::unary_operation(input, cudf::unary_operator::NEGATE);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, output->view());
+}
+
+using TypesNotToNegate = cudf::test::Types<uint8_t,
+                                           uint16_t,
+                                           uint32_t,
+                                           uint64_t,
+                                           cudf::timestamp_D,
+                                           cudf::timestamp_s,
+                                           cudf::timestamp_ms,
+                                           cudf::timestamp_us,
+                                           cudf::timestamp_ns>;
+
+template <typename T>
+struct UnaryNegateErrorTests : public cudf::test::BaseFixture {};
+
+TYPED_TEST_SUITE(UnaryNegateErrorTests, TypesNotToNegate);
+
+TYPED_TEST(UnaryNegateErrorTests, UnsupportedTypesFail)
+{
+  using T = TypeParam;
+  cudf::test::fixed_width_column_wrapper<T, int32_t> input({1, 2, 3, 4});
+  EXPECT_THROW(cudf::unary_operation(input, cudf::unary_operator::NEGATE), cudf::logic_error);
+}
+
+struct UnaryNegateComplexTypesErrorTests : public cudf::test::BaseFixture {};
+
+TEST_F(UnaryNegateComplexTypesErrorTests, NegateStringColumnFail)
+{
+  cudf::test::strings_column_wrapper input({"foo", "bar"});
+  EXPECT_THROW(cudf::unary_operation(input, cudf::unary_operator::NEGATE), cudf::logic_error);
+}
+
+TEST_F(UnaryNegateComplexTypesErrorTests, NegateListsColumnFail)
+{
+  cudf::test::lists_column_wrapper<int32_t> input{{1, 2}, {3, 4}};
+  EXPECT_THROW(cudf::unary_operation(input, cudf::unary_operator::NEGATE), cudf::logic_error);
+}
+
 template <typename T>
 struct UnaryLogicalOpsTest : public cudf::test::BaseFixture {};
 
@@ -274,7 +337,7 @@ TYPED_TEST(UnaryMathFloatOpsTest, SimpleTANH)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, output->view());
 }
 
-TYPED_TEST(UnaryMathFloatOpsTest, SimpleiASINH)
+TYPED_TEST(UnaryMathFloatOpsTest, SimpleASINH)
 {
   cudf::test::fixed_width_column_wrapper<TypeParam> input{{0.0}};
   cudf::test::fixed_width_column_wrapper<TypeParam> expected{{0.0}};
diff --git a/cpp/tests/unary/unary_ops_test.cpp b/cpp/tests/unary/unary_ops_test.cpp
index 3c616461c74..d7989c6b053 100644
--- a/cpp/tests/unary/unary_ops_test.cpp
+++ b/cpp/tests/unary/unary_ops_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -266,6 +266,20 @@ struct FixedPointUnaryTests : public cudf::test::BaseFixture {};
 
 TYPED_TEST_SUITE(FixedPointUnaryTests, cudf::test::FixedPointTypes);
 
+TYPED_TEST(FixedPointUnaryTests, FixedPointUnaryNegate)
+{
+  using namespace numeric;
+  using decimalXX  = TypeParam;
+  using RepType    = cudf::device_storage_type_t<decimalXX>;
+  using fp_wrapper = cudf::test::fixed_point_column_wrapper<RepType>;
+
+  auto const input    = fp_wrapper{{0, -1234, -3456, -6789, 1234, 3456, 6789}, scale_type{-3}};
+  auto const expected = fp_wrapper{{0, 1234, 3456, 6789, -1234, -3456, -6789}, scale_type{-3}};
+  auto const result   = cudf::unary_operation(input, cudf::unary_operator::NEGATE);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+}
+
 TYPED_TEST(FixedPointUnaryTests, FixedPointUnaryAbs)
 {
   using namespace numeric;
diff --git a/dependencies.yaml b/dependencies.yaml
index 501128d278e..b1378fae6d7 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -638,7 +638,7 @@ dependencies:
       - output_types: [conda]
        packages:
          - breathe>=4.35.0
-          - dask-cuda==25.2.*,>=0.0.0a0
+          - dask-cuda==25.4.*,>=0.0.0a0
          - *doxygen
          - make
          - myst-nb
@@ -786,13 +786,13 @@ dependencies:
    common:
      - output_types: [conda, requirements, pyproject]
        packages:
-          - rapids-dask-dependency==25.2.*,>=0.0.0a0
+          - rapids-dask-dependency==25.4.*,>=0.0.0a0
  run_dask_cudf:
    common:
      - output_types: [conda, requirements, pyproject]
        packages:
          - pynvml>=12.0.0,<13.0.0a0
-          - rapids-dask-dependency==25.2.*,>=0.0.0a0
+          - rapids-dask-dependency==25.4.*,>=0.0.0a0
  run_custreamz:
    common:
      - output_types: conda
@@ -930,7 +930,7 @@ dependencies:
    common:
      - output_types: [conda, requirements, pyproject]
        packages:
-          - dask-cuda==25.2.*,>=0.0.0a0
+          - dask-cuda==25.4.*,>=0.0.0a0
          - *numba-cuda-dep
          - *numba-dep
    specific:
@@ -951,7 +951,7 @@ dependencies:
    common:
      - output_types: conda
        packages:
-          - &libcudf_unsuffixed libcudf==25.2.*,>=0.0.0a0
+          - &libcudf_unsuffixed libcudf==25.4.*,>=0.0.0a0
      - output_types: requirements
        packages:
          # pip recognizes the index as a global option for the requirements.txt file
@@ -965,18 +965,18 @@ dependencies:
              cuda: "12.*"
              cuda_suffixed: "true"
            packages:
-              - libcudf-cu12==25.2.*,>=0.0.0a0
+              - libcudf-cu12==25.4.*,>=0.0.0a0
          - matrix:
              cuda: "11.*"
              cuda_suffixed: "true"
            packages:
-              - libcudf-cu11==25.2.*,>=0.0.0a0
+              - libcudf-cu11==25.4.*,>=0.0.0a0
          - {matrix: null, packages: [*libcudf_unsuffixed]}
  depends_on_pylibcudf:
    common:
      - output_types: conda
        packages:
-          - &pylibcudf_unsuffixed pylibcudf==25.2.*,>=0.0.0a0
+          - &pylibcudf_unsuffixed pylibcudf==25.4.*,>=0.0.0a0
      - output_types: requirements
        packages:
          # pip recognizes the index as a global option for the requirements.txt file
@@ -990,18 +990,18 @@ dependencies:
              cuda: "12.*"
              cuda_suffixed: "true"
            packages:
-              - pylibcudf-cu12==25.2.*,>=0.0.0a0
+              - pylibcudf-cu12==25.4.*,>=0.0.0a0
          - matrix:
              cuda: "11.*"
              cuda_suffixed: "true"
            packages:
-              - pylibcudf-cu11==25.2.*,>=0.0.0a0
+              - pylibcudf-cu11==25.4.*,>=0.0.0a0
          - {matrix: null, packages: [*pylibcudf_unsuffixed]}
  depends_on_cudf:
    common:
      - output_types: conda
        packages:
-          - &cudf_unsuffixed cudf==25.2.*,>=0.0.0a0
+          - &cudf_unsuffixed cudf==25.4.*,>=0.0.0a0
      - output_types: requirements
        packages:
          # pip recognizes the index as a global option for the requirements.txt file
@@ -1015,18 +1015,18 @@ dependencies:
              cuda: "12.*"
              cuda_suffixed: "true"
            packages:
-              - cudf-cu12==25.2.*,>=0.0.0a0
+              - cudf-cu12==25.4.*,>=0.0.0a0
          - matrix:
              cuda: "11.*"
              cuda_suffixed: "true"
            packages:
-              - cudf-cu11==25.2.*,>=0.0.0a0
+              - cudf-cu11==25.4.*,>=0.0.0a0
          - {matrix: null, packages:
[*cudf_unsuffixed]} depends_on_cudf_kafka: common: - output_types: conda packages: - - &cudf_kafka_unsuffixed cudf_kafka==25.2.*,>=0.0.0a0 + - &cudf_kafka_unsuffixed cudf_kafka==25.4.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -1040,12 +1040,12 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - cudf_kafka-cu12==25.2.*,>=0.0.0a0 + - cudf_kafka-cu12==25.4.*,>=0.0.0a0 - matrix: cuda: "11.*" cuda_suffixed: "true" packages: - - cudf_kafka-cu11==25.2.*,>=0.0.0a0 + - cudf_kafka-cu11==25.4.*,>=0.0.0a0 - {matrix: null, packages: [*cudf_kafka_unsuffixed]} depends_on_cupy: common: @@ -1066,7 +1066,7 @@ dependencies: common: - output_types: conda packages: - - &libkvikio_unsuffixed libkvikio==25.2.*,>=0.0.0a0 + - &libkvikio_unsuffixed libkvikio==25.4.*,>=0.0.0a0 - output_types: requirements packages: - --extra-index-url=https://pypi.nvidia.com @@ -1078,12 +1078,12 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - libkvikio-cu12==25.2.*,>=0.0.0a0 + - libkvikio-cu12==25.4.*,>=0.0.0a0 - matrix: cuda: "11.*" cuda_suffixed: "true" packages: - - libkvikio-cu11==25.2.*,>=0.0.0a0 + - libkvikio-cu11==25.4.*,>=0.0.0a0 - matrix: packages: - *libkvikio_unsuffixed @@ -1091,7 +1091,7 @@ dependencies: common: - output_types: conda packages: - - &librmm_unsuffixed librmm==25.2.*,>=0.0.0a0 + - &librmm_unsuffixed librmm==25.4.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -1105,12 +1105,12 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - librmm-cu12==25.2.*,>=0.0.0a0 + - librmm-cu12==25.4.*,>=0.0.0a0 - matrix: cuda: "11.*" cuda_suffixed: "true" packages: - - librmm-cu11==25.2.*,>=0.0.0a0 + - librmm-cu11==25.4.*,>=0.0.0a0 - matrix: packages: - *librmm_unsuffixed @@ -1118,7 +1118,7 @@ dependencies: common: - output_types: conda packages: - - &rmm_unsuffixed rmm==25.2.*,>=0.0.0a0 + - &rmm_unsuffixed rmm==25.4.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -1132,12 +1132,12 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - rmm-cu12==25.2.*,>=0.0.0a0 + - rmm-cu12==25.4.*,>=0.0.0a0 - matrix: cuda: "11.*" cuda_suffixed: "true" packages: - - rmm-cu11==25.2.*,>=0.0.0a0 + - rmm-cu11==25.4.*,>=0.0.0a0 - matrix: packages: - *rmm_unsuffixed diff --git a/java/ci/README.md b/java/ci/README.md index bfb35bc1d23..cc8ab77bf6c 100644 --- a/java/ci/README.md +++ b/java/ci/README.md @@ -34,7 +34,7 @@ nvidia-docker run -it cudf-build:11.8.0-devel-rocky8 bash You can download the cuDF repo in the docker container or you can mount it into the container. Here I choose to download again in the container. ```bash -git clone --recursive https://github.com/rapidsai/cudf.git -b branch-25.02 +git clone --recursive https://github.com/rapidsai/cudf.git -b branch-25.04 ``` ### Build cuDF jar with devtoolset @@ -47,4 +47,4 @@ scl enable gcc-toolset-11 "java/ci/build-in-docker.sh" ### The output -You can find the cuDF jar in java/target/ like cudf-25.02.0-SNAPSHOT-cuda11.jar. +You can find the cuDF jar in java/target/ like cudf-25.04.0-SNAPSHOT-cuda11.jar. 
diff --git a/java/pom.xml b/java/pom.xml
index 8bbeac20c99..1f80381dd7e 100644
--- a/java/pom.xml
+++ b/java/pom.xml
@@ -21,7 +21,7 @@
   <groupId>ai.rapids</groupId>
   <artifactId>cudf</artifactId>
-  <version>25.02.0-SNAPSHOT</version>
+  <version>25.04.0-SNAPSHOT</version>
   <name>cudfjni</name>
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index d4ef09e44e8..e24cf72bab3 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -8,7 +8,6 @@
 import numpy as np
 import pandas as pd
 import pyarrow as pa
-from numba.np import numpy_support
 from typing_extensions import Self
 
 import pylibcudf as plc
@@ -24,7 +23,6 @@
 from cudf.core.mixins import BinaryOperand
 from cudf.core.scalar import pa_scalar_to_plc_scalar
 from cudf.errors import MixedTypeError
-from cudf.utils import cudautils
 from cudf.utils.dtypes import (
     find_common_type,
     min_column_type,
@@ -33,7 +31,7 @@
 )
 
 if TYPE_CHECKING:
-    from collections.abc import Callable, Sequence
+    from collections.abc import Sequence
 
     from cudf._typing import (
         ColumnBinaryOperand,
@@ -45,13 +43,6 @@
     from cudf.core.buffer import Buffer
     from cudf.core.column import DecimalBaseColumn
 
-_unaryop_map = {
-    "ASIN": "ARCSIN",
-    "ACOS": "ARCCOS",
-    "ATAN": "ARCTAN",
-    "INVERT": "BIT_INVERT",
-}
-
 
 class NumericalColumn(NumericalBaseColumn):
     """
@@ -197,24 +188,6 @@ def transform(self, compiled_op, np_dtype: np.dtype) -> ColumnBase:
         )
         return type(self).from_pylibcudf(plc_column)
 
-    def unary_operator(self, unaryop: str | Callable) -> ColumnBase:
-        if callable(unaryop):
-            nb_type = numpy_support.from_dtype(self.dtype)
-            nb_signature = (nb_type,)
-            compiled_op = cudautils.compile_udf(unaryop, nb_signature)
-            np_dtype = np.dtype(compiled_op[1])
-            return self.transform(compiled_op, np_dtype)
-
-        unaryop = unaryop.upper()
-        unaryop = _unaryop_map.get(unaryop, unaryop)
-        unaryop = plc.unary.UnaryOperator[unaryop]
-        with acquire_spill_lock():
-            return type(self).from_pylibcudf(
-                plc.unary.unary_operation(
-                    self.to_pylibcudf(mode="read"), unaryop
-                )
-            )
-
     def __invert__(self):
         if self.dtype.kind in "ui":
             return self.unary_operator("invert")
diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py
index d8c316a4c8f..2674b92bb21 100644
--- a/python/cudf/cudf/core/column/numerical_base.py
+++ b/python/cudf/cudf/core/column/numerical_base.py
@@ -6,6 +6,7 @@
 from typing import TYPE_CHECKING, Literal, cast
 
 import numpy as np
+from numba.np import numpy_support
 
 import pylibcudf as plc
 
@@ -14,12 +15,23 @@
 from cudf.core.column.column import ColumnBase
 from cudf.core.missing import NA
 from cudf.core.mixins import Scannable
+from cudf.utils import cudautils
 
 if TYPE_CHECKING:
+    from collections.abc import Callable
+
     from cudf._typing import ScalarLike
     from cudf.core.column.decimal import DecimalDtype
 
+_unaryop_map = {
+    "ASIN": "ARCSIN",
+    "ACOS": "ARCCOS",
+    "ATAN": "ARCTAN",
+    "INVERT": "BIT_INVERT",
+}
+
+
 class NumericalBaseColumn(ColumnBase, Scannable):
     """
     A column composed of numerical (bool, integer, float, decimal) data.
@@ -268,3 +280,26 @@ def _scan(self, op: str) -> ColumnBase: return self.scan(op.replace("cum", ""), True)._with_type_metadata( self.dtype ) + + def unary_operator(self, unaryop: str | Callable) -> ColumnBase: + if callable(unaryop): + nb_type = numpy_support.from_dtype(self.dtype) + nb_signature = (nb_type,) + compiled_op = cudautils.compile_udf(unaryop, nb_signature) + np_dtype = np.dtype(compiled_op[1]) + return self.transform(compiled_op, np_dtype) + + unaryop = unaryop.upper() + unaryop = _unaryop_map.get(unaryop, unaryop) + unaryop = plc.unary.UnaryOperator[unaryop] + with acquire_spill_lock(): + return type(self).from_pylibcudf( + plc.unary.unary_operation( + self.to_pylibcudf(mode="read"), unaryop + ) + ) + + def transform(self, compiled_op, np_dtype: np.dtype) -> ColumnBase: + raise NotImplementedError( + "transform is not implemented for NumericalBaseColumn" + ) diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 3f5aea19307..c7243d01325 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -41,6 +41,16 @@ } +@functools.cache +def get_np_td_unit_conversion( + reso: str, dtype: None | np.dtype +) -> np.timedelta64: + td = np.timedelta64(_unit_to_nanoseconds_conversion[reso], "ns") + if dtype is not None: + return td.astype(dtype) + return td + + class TimeDeltaColumn(ColumnBase): """ Parameters @@ -483,74 +493,33 @@ def components(self) -> dict[str, ColumnBase]: 3 0 0 35 35 656 0 0 4 37 13 12 14 234 0 0 """ - date_meta = { + "hours": ["D", "h"], + "minutes": ["h", "m"], "seconds": ["m", "s"], "milliseconds": ["s", "ms"], "microseconds": ["ms", "us"], "nanoseconds": ["us", "ns"], } - data = { - "days": self - // cudf.Scalar( - np.timedelta64( - _unit_to_nanoseconds_conversion["D"], "ns" - ).astype(self.dtype) - ), - "hours": ( - self - % cudf.Scalar( - np.timedelta64( - _unit_to_nanoseconds_conversion["D"], "ns" - ).astype(self.dtype) - ) - ) - // cudf.Scalar( - np.timedelta64( - _unit_to_nanoseconds_conversion["h"], "ns" - ).astype(self.dtype) - ), - "minutes": ( - self - % cudf.Scalar( - np.timedelta64( - _unit_to_nanoseconds_conversion["h"], "ns" - ).astype(self.dtype) - ) - ) - // cudf.Scalar( - np.timedelta64( - _unit_to_nanoseconds_conversion["m"], "ns" - ).astype(self.dtype) - ), - } - keys_list = iter(date_meta.keys()) - for name in keys_list: - value = date_meta[name] - data[name] = ( - self - % cudf.Scalar( - np.timedelta64( - _unit_to_nanoseconds_conversion[value[0]], "ns" - ).astype(self.dtype) + data = {"days": self // get_np_td_unit_conversion("D", self.dtype)} + reached_self_unit = False + for result_key, (mod_unit, div_unit) in date_meta.items(): + if not reached_self_unit: + res_col = ( + self % get_np_td_unit_conversion(mod_unit, self.dtype) + ) // get_np_td_unit_conversion(div_unit, self.dtype) + reached_self_unit = self.time_unit == div_unit + else: + res_col = column.as_column( + 0, length=len(self), dtype=np.dtype(np.int64) ) - ) // cudf.Scalar( - np.timedelta64( - _unit_to_nanoseconds_conversion[value[1]], "ns" - ).astype(self.dtype) - ) - if self.time_unit == value[1]: - break - - for name in keys_list: - res_col = column.as_column(0, length=len(self), dtype="int64") - if self.nullable: - res_col = res_col.set_mask(self.mask) - data[name] = res_col + if self.nullable: + res_col = res_col.set_mask(self.mask) + data[result_key] = res_col return data @property - def days(self) -> "cudf.core.column.NumericalColumn": + def days(self) -> 
cudf.core.column.NumericalColumn: """ Number of days for each element. @@ -558,14 +527,10 @@ def days(self) -> "cudf.core.column.NumericalColumn": ------- NumericalColumn """ - return self // cudf.Scalar( - np.timedelta64(_unit_to_nanoseconds_conversion["D"], "ns").astype( - self.dtype - ) - ) + return self // get_np_td_unit_conversion("D", self.dtype) @property - def seconds(self) -> "cudf.core.column.NumericalColumn": + def seconds(self) -> cudf.core.column.NumericalColumn: """ Number of seconds (>= 0 and less than 1 day). @@ -579,18 +544,11 @@ def seconds(self) -> "cudf.core.column.NumericalColumn": # division operation to extract the number of seconds. return ( - self - % cudf.Scalar( - np.timedelta64( - _unit_to_nanoseconds_conversion["D"], "ns" - ).astype(self.dtype) - ) - ) // cudf.Scalar( - np.timedelta64(_unit_to_nanoseconds_conversion["s"], "ns") - ) + self % get_np_td_unit_conversion("D", self.dtype) + ) // get_np_td_unit_conversion("s", None) @property - def microseconds(self) -> "cudf.core.column.NumericalColumn": + def microseconds(self) -> cudf.core.column.NumericalColumn: """ Number of microseconds (>= 0 and less than 1 second). @@ -604,16 +562,11 @@ def microseconds(self) -> "cudf.core.column.NumericalColumn": # division operation to extract the number of microseconds. return ( - self - % np.timedelta64( - _unit_to_nanoseconds_conversion["s"], "ns" - ).astype(self.dtype) - ) // cudf.Scalar( - np.timedelta64(_unit_to_nanoseconds_conversion["us"], "ns") - ) + self % get_np_td_unit_conversion("s", self.dtype) + ) // get_np_td_unit_conversion("us", None) @property - def nanoseconds(self) -> "cudf.core.column.NumericalColumn": + def nanoseconds(self) -> cudf.core.column.NumericalColumn: """ Return the number of nanoseconds (n), where 0 <= n < 1 microsecond. @@ -633,13 +586,8 @@ def nanoseconds(self) -> "cudf.core.column.NumericalColumn": res_col = res_col.set_mask(self.mask) return cast("cudf.core.column.NumericalColumn", res_col) return ( - self - % cudf.Scalar( - np.timedelta64(_unit_to_nanoseconds_conversion["us"], "ns") - ) - ) // cudf.Scalar( - np.timedelta64(_unit_to_nanoseconds_conversion["ns"], "ns") - ) + self % get_np_td_unit_conversion("us", None) + ) // get_np_td_unit_conversion("ns", None) def determine_out_dtype(lhs_dtype: Dtype, rhs_dtype: Dtype) -> Dtype: diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 08f8e49a98c..fcf5a3cd8e9 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1644,7 +1644,7 @@ def __neg__(self): ( col.unary_operator("not") if col.dtype.kind == "b" - else -1 * col + else col.unary_operator("negate") for col in self._columns ) ) diff --git a/python/cudf/cudf/pandas/_wrappers/numpy.py b/python/cudf/cudf/pandas/_wrappers/numpy.py index d5e669cb58f..1fc53bbbaae 100644 --- a/python/cudf/cudf/pandas/_wrappers/numpy.py +++ b/python/cudf/cudf/pandas/_wrappers/numpy.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES. # All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 @@ -149,6 +149,7 @@ def ndarray__array_ufunc__(self, ufunc, method, *inputs, **kwargs): # Special wrapping to handle scalar values "_fsproxy_wrap": classmethod(wrap_ndarray), "base": _FastSlowAttribute("base", private=True), + "data": _FastSlowAttribute("data", private=True), }, ) diff --git a/python/cudf/cudf/tests/test_unaops.py b/python/cudf/cudf/tests/test_unaops.py index bbd01eaa311..7ed0d370822 100644 --- a/python/cudf/cudf/tests/test_unaops.py +++ b/python/cudf/cudf/tests/test_unaops.py @@ -3,6 +3,7 @@ import itertools import operator import re +from decimal import Decimal import numpy as np import pandas as pd @@ -134,3 +135,9 @@ def test_series_bool_neg(): sr = Series([True, False, True, None, False, None, True, True]) psr = sr.to_pandas(nullable=True) assert_eq((-sr).to_pandas(nullable=True), -psr, check_dtype=True) + + +def test_series_decimal_neg(): + sr = Series([Decimal("0.0"), Decimal("1.23"), Decimal("4.567")]) + psr = sr.to_pandas() + assert_eq((-sr).to_pandas(), -psr, check_dtype=True) diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index 938d22de076..3e8b6d5786c 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -1927,3 +1927,12 @@ def test_series_dtype_property(): expected = np.dtype(s) actual = np.dtype(xs) assert expected == actual + + +def test_numpy_data_access(): + s = pd.Series([1, 2, 3]) + xs = xpd.Series([1, 2, 3]) + expected = s.values.data + actual = xs.values.data + + assert type(expected) is type(actual) diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml b/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml index 03068d2268a..977d25184b5 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml @@ -182,7 +182,7 @@ dependencies: common: - output_types: conda packages: - - cudf==25.2.*,>=0.0.0a0 + - cudf==25.4.*,>=0.0.0a0 - pandas - pytest - pytest-xdist @@ -248,13 +248,13 @@ dependencies: common: - output_types: conda packages: - - cuml==25.2.*,>=0.0.0a0 + - cuml==25.4.*,>=0.0.0a0 - scikit-learn test_cugraph: common: - output_types: conda packages: - - cugraph==25.2.*,>=0.0.0a0 + - cugraph==25.4.*,>=0.0.0a0 - networkx test_ibis: common: diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/conftest.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/conftest.py index 33b6ffdbd5c..553d9c4459e 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/conftest.py +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/conftest.py @@ -1,7 +1,8 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. 
from __future__ import annotations +import glob import os import pickle from typing import TYPE_CHECKING, BinaryIO @@ -75,23 +76,40 @@ def swap_xfail(item: _pytest.nodes.Item, name: str): swap_xfail(item, "xfail_compare") +def get_full_nodeid(pyfuncitem): + # Get the full path to the test file + filepath = pyfuncitem.path + # Get the test name and any parameters + test_name = "::".join(pyfuncitem.nodeid.split("::")[1:]) + # Combine the full file path with the test name + full_nodeid = f"{filepath}::{test_name}" + return full_nodeid + + +def read_all_results(pattern): + results = {} + for filepath in glob.glob(pattern): + with open(filepath, "rb") as f: + results.update(dict(read_results(f))) + return results + + def pytest_configure(config: _pytest.config.Config): gold_basename = "results-gold" cudf_basename = "results-cudf-pandas" test_folder = os.path.join(os.path.dirname(__file__)) if config.getoption("--compare"): - # Everyone reads everything - gold_path = os.path.join(test_folder, f"{gold_basename}.pickle") - cudf_path = os.path.join(test_folder, f"{cudf_basename}.pickle") + gold_path = os.path.join(test_folder, f"{gold_basename}*.pickle") + cudf_path = os.path.join(test_folder, f"{cudf_basename}*.pickle") with disable_module_accelerator(): - with open(gold_path, "rb") as f: - gold_results = dict(read_results(f)) - with open(cudf_path, "rb") as f: - cudf_results = dict(read_results(f)) + gold_results = read_all_results(gold_path) + cudf_results = read_all_results(cudf_path) config.stash[results] = (gold_results, cudf_results) else: - if "cudf.pandas" in config.option.plugins: + if any( + plugin.strip() == "cudf.pandas" for plugin in config.option.plugins + ): basename = cudf_basename else: basename = gold_basename @@ -112,7 +130,7 @@ def pytest_configure(config: _pytest.config.Config): def pytest_pyfunc_call(pyfuncitem: _pytest.python.Function): if pyfuncitem.config.getoption("--compare"): gold_results, cudf_results = pyfuncitem.config.stash[results] - key = pyfuncitem.nodeid + key = get_full_nodeid(pyfuncitem) try: gold = gold_results[key] except KeyError: @@ -140,7 +158,7 @@ def pytest_pyfunc_call(pyfuncitem: _pytest.python.Function): # Tuple-based key-value pairs, key is the node-id try: pickle.dump( - (pyfuncitem.nodeid, result), + (get_full_nodeid(pyfuncitem), result), pyfuncitem.config.stash[file_handle_key], ) except pickle.PicklingError: diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index bd2a710e84a..33c8e041b88 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -23,7 +23,7 @@ dependencies = [ "cuda-python>=11.8.5,<12.0a0", "cupy-cuda11x>=12.0.0", "fsspec>=0.6.0", - "libcudf==25.2.*,>=0.0.0a0", + "libcudf==25.4.*,>=0.0.0a0", "numba-cuda>=0.2.0,<0.3.0a0", "numba>=0.59.1,<0.61.0a0", "numpy>=1.23,<3.0a0", @@ -33,9 +33,9 @@ dependencies = [ "ptxcompiler", "pyarrow>=14.0.0,<20.0.0a0,!=17.0.0; platform_machine=='aarch64'", "pyarrow>=14.0.0,<20.0.0a0; platform_machine=='x86_64'", - "pylibcudf==25.2.*,>=0.0.0a0", + "pylibcudf==25.4.*,>=0.0.0a0", "rich", - "rmm==25.2.*,>=0.0.0a0", + "rmm==25.4.*,>=0.0.0a0", "typing_extensions>=4.0.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
classifiers = [ @@ -120,11 +120,11 @@ matrix-entry = "cuda_suffixed=true" requires = [ "cmake>=3.26.4,!=3.30.0", "cython>=3.0.3", - "libcudf==25.2.*,>=0.0.0a0", - "librmm==25.2.*,>=0.0.0a0", + "libcudf==25.4.*,>=0.0.0a0", + "librmm==25.4.*,>=0.0.0a0", "ninja", - "pylibcudf==25.2.*,>=0.0.0a0", - "rmm==25.2.*,>=0.0.0a0", + "pylibcudf==25.4.*,>=0.0.0a0", + "rmm==25.4.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [tool.scikit-build] diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index a9d937435e9..a1f15574d2d 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. [build-system] build-backend = "rapids_build_backend.build" @@ -18,7 +18,7 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.10" dependencies = [ - "cudf==25.2.*,>=0.0.0a0", + "cudf==25.4.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project.optional-dependencies] diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/unary.py b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py index 3336c901e7f..3286c9ff8bc 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/unary.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. # SPDX-License-Identifier: Apache-2.0 # TODO: remove need for this """DSL nodes for unary operations.""" @@ -119,6 +119,7 @@ class UnaryFunction(Expr): "abs": plc.unary.UnaryOperator.ABS, "bit_invert": plc.unary.UnaryOperator.BIT_INVERT, "not": plc.unary.UnaryOperator.NOT, + "negate": plc.unary.UnaryOperator.NEGATE, } _supported_misc_fns = frozenset( { diff --git a/python/cudf_polars/cudf_polars/experimental/base.py b/python/cudf_polars/cudf_polars/experimental/base.py index 8f660632df2..36c7745c3f4 100644 --- a/python/cudf_polars/cudf_polars/experimental/base.py +++ b/python/cudf_polars/cudf_polars/experimental/base.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. # SPDX-License-Identifier: Apache-2.0 """Multi-partition base classes.""" @@ -12,20 +12,26 @@ from collections.abc import Iterator, Sequence from cudf_polars.containers import DataFrame + from cudf_polars.dsl.expr import NamedExpr from cudf_polars.dsl.nodebase import Node class PartitionInfo: - """ - Partitioning information. - - This class only tracks the partition count (for now). - """ - - __slots__ = ("count",) - - def __init__(self, count: int): + """Partitioning information.""" + + __slots__ = ("count", "partitioned_on") + count: int + """Partition count.""" + partitioned_on: tuple[NamedExpr, ...] + """Columns the data is hash-partitioned on.""" + + def __init__( + self, + count: int, + partitioned_on: tuple[NamedExpr, ...] 
= (), + ): self.count = count + self.partitioned_on = partitioned_on def keys(self, node: Node) -> Iterator[tuple[str, int]]: """Return the partitioned keys for a given node.""" diff --git a/python/cudf_polars/cudf_polars/experimental/parallel.py b/python/cudf_polars/cudf_polars/experimental/parallel.py index 6843ed9ee2e..5a5eaab8b2f 100644 --- a/python/cudf_polars/cudf_polars/experimental/parallel.py +++ b/python/cudf_polars/cudf_polars/experimental/parallel.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. # SPDX-License-Identifier: Apache-2.0 """Multi-partition Dask execution.""" @@ -10,7 +10,8 @@ from typing import TYPE_CHECKING, Any import cudf_polars.experimental.io -import cudf_polars.experimental.select # noqa: F401 +import cudf_polars.experimental.select +import cudf_polars.experimental.shuffle # noqa: F401 from cudf_polars.dsl.ir import IR, Cache, Filter, HStack, Projection, Select, Union from cudf_polars.dsl.traversal import CachingVisitor, traversal from cudf_polars.experimental.base import PartitionInfo, _concat, get_key_name diff --git a/python/cudf_polars/cudf_polars/experimental/shuffle.py b/python/cudf_polars/cudf_polars/experimental/shuffle.py new file mode 100644 index 00000000000..d49f13375ed --- /dev/null +++ b/python/cudf_polars/cudf_polars/experimental/shuffle.py @@ -0,0 +1,204 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +"""Shuffle Logic.""" + +from __future__ import annotations + +import json +import operator +from typing import TYPE_CHECKING, Any + +import pyarrow as pa + +import pylibcudf as plc + +from cudf_polars.containers import DataFrame +from cudf_polars.dsl.ir import IR +from cudf_polars.experimental.base import _concat, get_key_name +from cudf_polars.experimental.dispatch import generate_ir_tasks, lower_ir_node + +if TYPE_CHECKING: + from collections.abc import Hashable, MutableMapping + + from cudf_polars.dsl.expr import NamedExpr + from cudf_polars.experimental.dispatch import LowerIRTransformer + from cudf_polars.experimental.parallel import PartitionInfo + from cudf_polars.typing import Schema + + +class Shuffle(IR): + """ + Shuffle multi-partition data. + + Notes + ----- + Only hash-based partitioning is supported (for now). + """ + + __slots__ = ("keys", "options") + _non_child = ("schema", "keys", "options") + keys: tuple[NamedExpr, ...] + """Keys to shuffle on.""" + options: dict[str, Any] + """Shuffling options.""" + + def __init__( + self, + schema: Schema, + keys: tuple[NamedExpr, ...], + options: dict[str, Any], + df: IR, + ): + self.schema = schema + self.keys = keys + self.options = options + self._non_child_args = (schema, keys, options) + self.children = (df,) + + def get_hashable(self) -> Hashable: + """Hashable representation of the node.""" + return ( + type(self), + tuple(self.schema.items()), + self.keys, + json.dumps(self.options), + self.children, + ) + + @classmethod + def do_evaluate( + cls, + schema: Schema, + keys: tuple[NamedExpr, ...], + options: dict[str, Any], + df: DataFrame, + ): # pragma: no cover + """Evaluate and return a dataframe.""" + # Single-partition Shuffle evaluation is a no-op + return df + + +def _partition_dataframe( + df: DataFrame, + keys: tuple[NamedExpr, ...], + count: int, +) -> dict[int, DataFrame]: + """ + Partition an input DataFrame for shuffling. 
+ + Notes + ----- + This utility only supports hash partitioning (for now). + + Parameters + ---------- + df + DataFrame to partition. + keys + Shuffle key(s). + count + Total number of output partitions. + + Returns + ------- + A dictionary mapping between int partition indices and + DataFrame fragments. + """ + # Hash the specified keys to calculate the output + # partition for each row + partition_map = plc.binaryop.binary_operation( + plc.hashing.murmurhash3_x86_32( + DataFrame([expr.evaluate(df) for expr in keys]).table + ), + plc.interop.from_arrow(pa.scalar(count, type="uint32")), + plc.binaryop.BinaryOperator.PYMOD, + plc.types.DataType(plc.types.TypeId.UINT32), + ) + + # Apply partitioning + t, offsets = plc.partitioning.partition( + df.table, + partition_map, + count, + ) + + # Split and return the partitioned result + return { + i: DataFrame.from_table( + split, + df.column_names, + ) + for i, split in enumerate(plc.copying.split(t, offsets[1:-1])) + } + + +def _simple_shuffle_graph( + name_out: str, + name_in: str, + keys: tuple[NamedExpr, ...], + count_in: int, + count_out: int, +) -> MutableMapping[Any, Any]: + """Make a simple all-to-all shuffle graph.""" + split_name = f"split-{name_out}" + inter_name = f"inter-{name_out}" + + graph: MutableMapping[Any, Any] = {} + for part_out in range(count_out): + _concat_list = [] + for part_in in range(count_in): + graph[(split_name, part_in)] = ( + _partition_dataframe, + (name_in, part_in), + keys, + count_out, + ) + _concat_list.append((inter_name, part_out, part_in)) + graph[_concat_list[-1]] = ( + operator.getitem, + (split_name, part_in), + part_out, + ) + graph[(name_out, part_out)] = (_concat, _concat_list) + return graph + + +@lower_ir_node.register(Shuffle) +def _( + ir: Shuffle, rec: LowerIRTransformer +) -> tuple[IR, MutableMapping[IR, PartitionInfo]]: + # Simple lower_ir_node handling for the default hash-based shuffle. + # More-complex logic (e.g. joining and sorting) should + # be handled separately. + from cudf_polars.experimental.parallel import PartitionInfo + + (child,) = ir.children + + new_child, pi = rec(child) + if pi[new_child].count == 1 or ir.keys == pi[new_child].partitioned_on: + # Already shuffled + return new_child, pi + new_node = ir.reconstruct([new_child]) + pi[new_node] = PartitionInfo( + # Default shuffle preserves partition count + count=pi[new_child].count, + # Add partitioned_on info + partitioned_on=ir.keys, + ) + return new_node, pi + + +@generate_ir_tasks.register(Shuffle) +def _( + ir: Shuffle, partition_info: MutableMapping[IR, PartitionInfo] +) -> MutableMapping[Any, Any]: + # Use a simple all-to-all shuffle graph. + + # TODO: Optionally use rapidsmp. + return _simple_shuffle_graph( + get_key_name(ir), + get_key_name(ir.children[0]), + ir.keys, + partition_info[ir.children[0]].count, + partition_info[ir].count, + ) diff --git a/python/cudf_polars/docs/overview.md b/python/cudf_polars/docs/overview.md index a8cad5622fb..be48d500a36 100644 --- a/python/cudf_polars/docs/overview.md +++ b/python/cudf_polars/docs/overview.md @@ -8,7 +8,7 @@ You will need: preferred configuration. Or else, use [rustup](https://www.rust-lang.org/tools/install) 2. A [cudf development - environment](https://github.com/rapidsai/cudf/blob/branch-25.02/CONTRIBUTING.md#setting-up-your-build-environment). + environment](https://github.com/rapidsai/cudf/blob/branch-25.04/CONTRIBUTING.md#setting-up-your-build-environment). The combined devcontainer works, or whatever your favourite approach is. 
diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml
index 15547f85d56..805d7925bb4 100644
--- a/python/cudf_polars/pyproject.toml
+++ b/python/cudf_polars/pyproject.toml
@@ -20,7 +20,7 @@ license = { text = "Apache 2.0" }
 requires-python = ">=3.10"
 dependencies = [
     "polars>=1.20,<1.22",
-    "pylibcudf==25.2.*,>=0.0.0a0",
+    "pylibcudf==25.4.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
     "Intended Audience :: Developers",
@@ -41,7 +41,7 @@ test = [
     "pytest<8",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 experimental = [
-    "rapids-dask-dependency==25.2.*,>=0.0.0a0",
+    "rapids-dask-dependency==25.4.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 
 [project.urls]
diff --git a/python/cudf_polars/tests/experimental/test_shuffle.py b/python/cudf_polars/tests/experimental/test_shuffle.py
new file mode 100644
index 00000000000..294557fd0d6
--- /dev/null
+++ b/python/cudf_polars/tests/experimental/test_shuffle.py
@@ -0,0 +1,66 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+import pytest
+
+import polars as pl
+from polars.testing import assert_frame_equal
+
+from cudf_polars import Translator
+from cudf_polars.dsl.expr import Col, NamedExpr
+from cudf_polars.experimental.parallel import evaluate_dask, lower_ir_graph
+from cudf_polars.experimental.shuffle import Shuffle
+
+
+@pytest.fixture(scope="module")
+def engine():
+    return pl.GPUEngine(
+        raise_on_fail=True,
+        executor="dask-experimental",
+        executor_options={"max_rows_per_partition": 4},
+    )
+
+
+@pytest.fixture(scope="module")
+def df():
+    return pl.LazyFrame(
+        {
+            "x": [1, 2, 3, 4, 5, 6, 7],
+            "y": [1, 1, 1, 1, 1, 1, 1],
+            "z": ["a", "b", "c", "d", "e", "f", "g"],
+        }
+    )
+
+
+def test_hash_shuffle(df, engine):
+    # Extract translated IR
+    qir = Translator(df._ldf.visit(), engine).translate_ir()
+
+    # Add first Shuffle node
+    keys = (NamedExpr("x", Col(qir.schema["x"], "x")),)
+    options = {}
+    qir1 = Shuffle(qir.schema, keys, options, qir)
+
+    # Add second Shuffle node (on the same keys)
+    qir2 = Shuffle(qir.schema, keys, options, qir1)
+
+    # Check that sequential shuffles on the same keys
+    # are replaced with a single shuffle node
+    partition_info = lower_ir_graph(qir2)[1]
+    assert len([node for node in partition_info if isinstance(node, Shuffle)]) == 1
+
+    # Add a third Shuffle node (on different keys)
+    keys2 = (NamedExpr("z", Col(qir.schema["z"], "z")),)
+    qir3 = Shuffle(qir2.schema, keys2, options, qir2)
+
+    # Check that we have an additional shuffle
+    # node after shuffling on different keys
+    partition_info = lower_ir_graph(qir3)[1]
+    assert len([node for node in partition_info if isinstance(node, Shuffle)]) == 2
+
+    # Check that Dask evaluation works
+    result = evaluate_dask(qir3).to_polars()
+    expect = df.collect(engine="cpu")
+    assert_frame_equal(result, expect, check_row_order=False)
diff --git a/python/cudf_polars/tests/expressions/test_numeric_unaryops.py b/python/cudf_polars/tests/expressions/test_numeric_unaryops.py
index ac3aecf88e6..75bf0960e10 100644
--- a/python/cudf_polars/tests/expressions/test_numeric_unaryops.py
+++ b/python/cudf_polars/tests/expressions/test_numeric_unaryops.py
@@ -1,7 +1,9 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 
 from __future__ import annotations
 
+from datetime import timedelta
+
 import numpy as np
 import pytest
 
@@ -58,6 +60,7 @@ def ldf(with_nulls, dtype):
         {
             "a": pl.Series(values, dtype=dtype),
             "b": pl.Series([i - 4 for i in range(len(values))], dtype=pl.Float32),
+            "c": pl.Series([timedelta(hours=i) for i in range(len(values))]),
         }
     )
 
@@ -89,3 +92,9 @@ def test_log(ldf, natural):
 
     q = ldf.select(expr)
     assert_gpu_result_equal(q, check_exact=False)
+
+
+@pytest.mark.parametrize("col", ["a", "b", "c"])
+def test_negate(ldf, col):
+    q = ldf.select(-pl.col(col))
+    assert_gpu_result_equal(q)
diff --git a/python/custreamz/pyproject.toml b/python/custreamz/pyproject.toml
index 7820157d89b..665b0a76ecf 100644
--- a/python/custreamz/pyproject.toml
+++ b/python/custreamz/pyproject.toml
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2024, NVIDIA CORPORATION.
+# Copyright (c) 2021-2025, NVIDIA CORPORATION.
 
 [build-system]
 build-backend = "rapids_build_backend.build"
@@ -20,8 +20,8 @@ license = { text = "Apache 2.0" }
 requires-python = ">=3.10"
 dependencies = [
     "confluent-kafka>=2.5.0,<2.6.0a0",
-    "cudf==25.2.*,>=0.0.0a0",
-    "cudf_kafka==25.2.*,>=0.0.0a0",
+    "cudf==25.4.*,>=0.0.0a0",
+    "cudf_kafka==25.4.*,>=0.0.0a0",
     "streamz",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml
index 3725722a8ae..87bf282f376 100644
--- a/python/dask_cudf/pyproject.toml
+++ b/python/dask_cudf/pyproject.toml
@@ -19,13 +19,13 @@ authors = [
 license = { text = "Apache 2.0" }
 requires-python = ">=3.10"
 dependencies = [
-    "cudf==25.2.*,>=0.0.0a0",
+    "cudf==25.4.*,>=0.0.0a0",
     "cupy-cuda11x>=12.0.0",
     "fsspec>=0.6.0",
     "numpy>=1.23,<3.0a0",
     "pandas>=2.0,<2.2.4dev0",
     "pynvml>=12.0.0,<13.0.0a0",
-    "rapids-dask-dependency==25.2.*,>=0.0.0a0",
+    "rapids-dask-dependency==25.4.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
     "Intended Audience :: Developers",
@@ -46,7 +46,7 @@ cudf = "dask_cudf.backends:CudfBackendEntrypoint"
 
 [project.optional-dependencies]
 test = [
-    "dask-cuda==25.2.*,>=0.0.0a0",
+    "dask-cuda==25.4.*,>=0.0.0a0",
     "numba-cuda>=0.2.0,<0.3.0a0",
     "numba>=0.59.1,<0.61.0a0",
     "pytest-cov",
diff --git a/python/libcudf/pyproject.toml b/python/libcudf/pyproject.toml
index d16ad97ec54..7d3b6c09c61 100644
--- a/python/libcudf/pyproject.toml
+++ b/python/libcudf/pyproject.toml
@@ -38,7 +38,7 @@ classifiers = [
     "Environment :: GPU :: NVIDIA CUDA",
 ]
 dependencies = [
-    "libkvikio==25.2.*,>=0.0.0a0",
+    "libkvikio==25.4.*,>=0.0.0a0",
     "nvidia-nvcomp==4.1.0.6",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
@@ -78,7 +78,7 @@ dependencies-file = "../../dependencies.yaml"
 matrix-entry = "cuda_suffixed=true;use_cuda_wheels=true"
 requires = [
     "cmake>=3.26.4,!=3.30.0",
-    "libkvikio==25.2.*,>=0.0.0a0",
-    "librmm==25.2.*,>=0.0.0a0",
+    "libkvikio==25.4.*,>=0.0.0a0",
+    "librmm==25.4.*,>=0.0.0a0",
     "ninja",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
diff --git a/python/pylibcudf/pylibcudf/libcudf/unary.pxd b/python/pylibcudf/pylibcudf/libcudf/unary.pxd
index 4666012623e..802d4b392a8 100644
--- a/python/pylibcudf/pylibcudf/libcudf/unary.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/unary.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+# Copyright (c) 2020-2025, NVIDIA CORPORATION.
 from libc.stdint cimport int32_t
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
@@ -33,6 +33,7 @@ cdef extern from "cudf/unary.hpp" namespace "cudf" nogil:
         RINT
         BIT_INVERT
         NOT
+        NEGATE
 
     cdef extern unique_ptr[column] unary_operation(
         column_view input,
diff --git a/python/pylibcudf/pylibcudf/unary.pyi b/python/pylibcudf/pylibcudf/unary.pyi
index 7aa23b618f4..4d06a51c03a 100644
--- a/python/pylibcudf/pylibcudf/unary.pyi
+++ b/python/pylibcudf/pylibcudf/unary.pyi
@@ -28,6 +28,7 @@ class UnaryOperator(IntEnum):
     RINT = ...
     BIT_INVERT = ...
     NOT = ...
+    NEGATE = ...
 
 def unary_operation(input: Column, op: UnaryOperator) -> Column: ...
 def is_null(input: Column) -> Column: ...
diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml
index efa3d301334..300138c9b4a 100644
--- a/python/pylibcudf/pyproject.toml
+++ b/python/pylibcudf/pyproject.toml
@@ -19,12 +19,12 @@ license = { text = "Apache 2.0" }
 requires-python = ">=3.10"
 dependencies = [
     "cuda-python>=11.8.5,<12.0a0",
-    "libcudf==25.2.*,>=0.0.0a0",
+    "libcudf==25.4.*,>=0.0.0a0",
     "nvtx>=0.2.1",
     "packaging",
     "pyarrow>=14.0.0,<20.0.0a0,!=17.0.0; platform_machine=='aarch64'",
     "pyarrow>=14.0.0,<20.0.0a0; platform_machine=='x86_64'",
-    "rmm==25.2.*,>=0.0.0a0",
+    "rmm==25.4.*,>=0.0.0a0",
     "typing_extensions>=4.0.0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
@@ -111,10 +111,10 @@ matrix-entry = "cuda_suffixed=true"
 requires = [
     "cmake>=3.26.4,!=3.30.0",
     "cython>=3.0.3",
-    "libcudf==25.2.*,>=0.0.0a0",
-    "librmm==25.2.*,>=0.0.0a0",
+    "libcudf==25.4.*,>=0.0.0a0",
+    "librmm==25.4.*,>=0.0.0a0",
     "ninja",
-    "rmm==25.2.*,>=0.0.0a0",
+    "rmm==25.4.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 
 [tool.scikit-build]
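Editor's note (illustrative, not part of the patch): once the new `NEGATE` enum value is exposed, it flows through the existing `unary_operation` entry point shown in the `.pyi` stub above. A rough usage sketch, assuming a GPU and that `pylibcudf.interop` round-trips through pyarrow as in current releases:

```python
import pyarrow as pa
import pylibcudf as plc

# Negate a small INT64 column with the newly exposed operator.
col = plc.interop.from_arrow(pa.array([1, -2, 3]))
out = plc.unary.unary_operation(col, plc.unary.UnaryOperator.NEGATE)
print(plc.interop.to_arrow(out))  # expected: [-1, 2, -3]
```

On the polars side, this is the operator that backs `-pl.col(...)`, as exercised by the new `test_negate` test (including the timedelta column `"c"` added to the fixture).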