diff --git a/.devcontainer/cuda12.5-conda/devcontainer.json b/.devcontainer/cuda12.8-conda/devcontainer.json
similarity index 57%
rename from .devcontainer/cuda12.5-conda/devcontainer.json
rename to .devcontainer/cuda12.8-conda/devcontainer.json
index 244c624b871..3977a1c5f86 100644
--- a/.devcontainer/cuda12.5-conda/devcontainer.json
+++ b/.devcontainer/cuda12.8-conda/devcontainer.json
@@ -3,7 +3,7 @@
     "context": "${localWorkspaceFolder}/.devcontainer",
     "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile",
     "args": {
-      "CUDA": "12.5",
+      "CUDA": "12.8",
       "PYTHON_PACKAGE_MANAGER": "conda",
       "BASE": "rapidsai/devcontainers:25.04-cpp-mambaforge-ubuntu22.04"
     }
@@ -11,39 +11,49 @@
   "runArgs": [
     "--rm",
     "--name",
-    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.04-cuda12.5-conda"
+    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.04-cuda12.8-conda"
   ],
-  "hostRequirements": {"gpu": "optional"},
+  "hostRequirements": {
+    "gpu": "optional"
+  },
   "features": {
     "ghcr.io/rapidsai/devcontainers/features/cuda:25.4": {
-      "version": "12.5",
-      "installCompilers": false,
-      "installProfilers": true,
-      "installDevPackages": false,
-      "installcuDNN": false,
-      "installcuTensor": false,
-      "installNCCL": false,
-      "installCUDARuntime": false,
-      "installNVRTC": false,
-      "installOpenCL": false,
-      "installcuBLAS": false,
-      "installcuSPARSE": false,
-      "installcuFFT": false,
-      "installcuFile": false,
-      "installcuRAND": false,
-      "installcuSOLVER": false,
-      "installNPP": false,
-      "installnvJPEG": false,
-      "pruneStaticLibs": true
-    },
+      "version": "12.8",
+      "installCompilers": false,
+      "installProfilers": true,
+      "installDevPackages": false,
+      "installcuDNN": false,
+      "installcuTensor": false,
+      "installNCCL": false,
+      "installCUDARuntime": false,
+      "installNVRTC": false,
+      "installOpenCL": false,
+      "installcuBLAS": false,
+      "installcuSPARSE": false,
+      "installcuFFT": false,
+      "installcuFile": false,
+      "installcuRAND": false,
+      "installcuSOLVER": false,
+      "installNPP": false,
+      "installnvJPEG": false,
+      "pruneStaticLibs": true
+    },
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.4": {}
   },
   "overrideFeatureInstallOrder": [
     "ghcr.io/rapidsai/devcontainers/features/cuda",
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
   ],
-  "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.5-envs}"],
-  "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi"],
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.8-envs}"
+  ],
+  "postAttachCommand": [
+    "/bin/bash",
+    "-c",
+    "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi"
rapids-post-attach-command; fi" + ], "workspaceFolder": "/home/coder", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cudf,type=bind,consistency=consistent", "mounts": [ @@ -51,7 +61,7 @@ "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/../.conda/pkgs,target=/home/coder/.conda/pkgs,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda12.5-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda12.8-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.5-pip/devcontainer.json b/.devcontainer/cuda12.8-pip/devcontainer.json similarity index 67% rename from .devcontainer/cuda12.5-pip/devcontainer.json rename to .devcontainer/cuda12.8-pip/devcontainer.json index a4b0c1acb7d..ab83fa88300 100644 --- a/.devcontainer/cuda12.5-pip/devcontainer.json +++ b/.devcontainer/cuda12.8-pip/devcontainer.json @@ -3,32 +3,42 @@ "context": "${localWorkspaceFolder}/.devcontainer", "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile", "args": { - "CUDA": "12.5", + "CUDA": "12.8", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:25.04-cpp-cuda12.5-ubuntu22.04" + "BASE": "rapidsai/devcontainers:25.04-cpp-cuda12.8-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.04-cuda12.5-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.04-cuda12.8-pip" ], - "hostRequirements": {"gpu": "optional"}, + "hostRequirements": { + "gpu": "optional" + }, "features": { "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.4": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" ], - "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda12.5-venvs}"], - "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi"], + "initializeCommand": [ + "/bin/bash", + "-c", + "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda12.8-venvs}" + ], + "postAttachCommand": [ + "/bin/bash", + "-c", + "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . 
rapids-post-attach-command; fi" + ], "workspaceFolder": "/home/coder", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cudf,type=bind,consistency=consistent", "mounts": [ "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda12.5-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda12.8-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent" ], "customizations": { "vscode": { diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 9c0bb7d6840..65356ec8b73 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -133,6 +133,7 @@ jobs: uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.04 with: build_type: pull-request + node_type: "cpu16" cpp-linters: secrets: inherit needs: checks @@ -227,6 +228,7 @@ jobs: # build for every combination of arch and CUDA version, but only for the latest Python matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) build_type: pull-request + node_type: "cpu16" script: "ci/build_wheel_libcudf.sh" wheel-build-pylibcudf: needs: [checks, wheel-build-libcudf] @@ -302,8 +304,9 @@ jobs: needs: telemetry-setup uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-25.04 with: + node_type: "cpu32" arch: '["amd64"]' - cuda: '["12.5"]' + cuda: '["12.8"]' build_command: | sccache -z; build-all -DBUILD_BENCHMARKS=ON --verbose; @@ -336,7 +339,7 @@ jobs: needs: pandas-tests uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.04 with: - node_type: cpu4 + node_type: "cpu4" build_type: pull-request run_script: "ci/cudf_pandas_scripts/pandas-tests/diff.sh" diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 3db1ed35294..a6790032017 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -105,7 +105,7 @@ Instructions for a minimal build environment without conda are included below. 
 # create the conda environment (assuming in base `cudf` directory)
 # note: RAPIDS currently doesn't support `channel_priority: strict`;
 # use `channel_priority: flexible` instead
-conda env create --name cudf_dev --file conda/environments/all_cuda-125_arch-x86_64.yaml
+conda env create --name cudf_dev --file conda/environments/all_cuda-128_arch-x86_64.yaml
 # activate the environment
 conda activate cudf_dev
 ```
diff --git a/README.md b/README.md
index 20b1d64a5e0..a240d6c2aa9 100644
--- a/README.md
+++ b/README.md
@@ -83,7 +83,7 @@ cuDF can be installed with conda (via [miniforge](https://github.com/conda-forge
 
 ```bash
 conda install -c rapidsai -c conda-forge -c nvidia \
-    cudf=25.04 python=3.12 cuda-version=12.5
+    cudf=25.04 python=3.12 cuda-version=12.8
 ```
 
 We also provide [nightly Conda packages](https://anaconda.org/rapidsai-nightly) built from the HEAD
diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-128_arch-x86_64.yaml
similarity index 97%
rename from conda/environments/all_cuda-125_arch-x86_64.yaml
rename to conda/environments/all_cuda-128_arch-x86_64.yaml
index f822169990f..e719fd51573 100644
--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-128_arch-x86_64.yaml
@@ -23,7 +23,7 @@ dependencies:
 - cuda-nvtx-dev
 - cuda-python>=12.6.2,<13.0a0
 - cuda-sanitizer-api
-- cuda-version=12.5
+- cuda-version=12.8
 - cupy>=12.0.0
 - cxx-compiler
 - cython>=3.0.3
@@ -99,4 +99,4 @@ dependencies:
 - transformers==4.39.3
 - typing_extensions>=4.0.0
 - zlib>=1.2.13
-name: all_cuda-125_arch-x86_64
+name: all_cuda-128_arch-x86_64
diff --git a/cpp/src/groupby/hash/compute_aggregations.cuh b/cpp/src/groupby/hash/compute_aggregations.cuh
index 9c9a4c97bff..df8fcf4690f 100644
--- a/cpp/src/groupby/hash/compute_aggregations.cuh
+++ b/cpp/src/groupby/hash/compute_aggregations.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -67,13 +67,17 @@ rmm::device_uvector compute_aggregations(
 
   auto const grid_size = max_occupancy_grid_size>(num_rows);
   auto const available_shmem_size = get_available_shared_memory_size(grid_size);
-  auto const has_sufficient_shmem =
-    available_shmem_size > (compute_shmem_offsets_size(flattened_values.num_columns()) * 2);
-  auto const has_dictionary_request = std::any_of(
-    requests.begin(), requests.end(), [](cudf::groupby::aggregation_request const& request) {
-      return cudf::is_dictionary(request.values.type());
+  auto const offsets_buffer_size = compute_shmem_offsets_size(flattened_values.num_columns()) * 2;
+  auto const data_buffer_size    = available_shmem_size - offsets_buffer_size;
+  auto const is_shared_memory_compatible = std::all_of(
+    requests.begin(), requests.end(), [&](cudf::groupby::aggregation_request const& request) {
+      if (cudf::is_dictionary(request.values.type())) { return false; }
+      // Ensure there is enough buffer space to store local aggregations up to the max cardinality
+      // for shared memory aggregations
+      auto const size = cudf::type_dispatcher(request.values.type(),
+                                              size_of_functor{});
+      return static_cast(data_buffer_size) >= (size * GROUPBY_CARDINALITY_THRESHOLD);
     });
-  auto const is_shared_memory_compatible = !has_dictionary_request and has_sufficient_shmem;
 
   // Performs naive global memory aggregations when the workload is not compatible with shared
   // memory, such as when aggregating dictionary columns or when there is insufficient dynamic
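
Note on the hunk above: the old all-or-nothing shared-memory check is replaced by a per-request budget test. The two shared-memory offset buffers are subtracted from the available dynamic shared memory first, and each aggregation request must then fit one local aggregation slot per distinct key, up to the cardinality threshold that gates the shared-memory path; dictionary columns always fall back to the global-memory path. The standalone sketch below only illustrates that arithmetic — it is not the cudf implementation, and the ~47 KiB budget, the 4-column offsets estimate, and the 128-entry threshold are hypothetical stand-ins for get_available_shared_memory_size, compute_shmem_offsets_size, and GROUPBY_CARDINALITY_THRESHOLD.

```cpp
#include <cstddef>
#include <cstdio>
#include <vector>

int main()
{
  // Hypothetical dynamic shared memory available per block (~47 KiB).
  std::size_t const available_shmem_size = 47 * 1024;

  // Two offset buffers are carved out first (stand-in for
  // compute_shmem_offsets_size(num_columns) * 2, assuming 4 value columns).
  std::size_t const offsets_buffer_size = 2 * 4 * sizeof(int);
  std::size_t const data_buffer_size    = available_shmem_size - offsets_buffer_size;

  // Stand-in for the cardinality threshold: the maximum number of distinct keys
  // a block may accumulate locally on the shared-memory path.
  std::size_t const cardinality_threshold = 128;

  // Element sizes a size-of functor would report for each request's value column.
  std::vector<std::size_t> const request_value_sizes{sizeof(double), sizeof(long long)};

  bool is_shared_memory_compatible = true;
  for (auto const size : request_value_sizes) {
    // Every request must be able to hold one local aggregation per key in the
    // remaining data buffer; otherwise fall back to global-memory aggregation.
    if (data_buffer_size < size * cardinality_threshold) { is_shared_memory_compatible = false; }
  }

  std::printf("use shared-memory aggregation: %s\n", is_shared_memory_compatible ? "yes" : "no");
  return 0;
}
```
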
diff --git a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu
index f0361ccced2..ae7584da483 100644
--- a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu
+++ b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -35,15 +35,6 @@ namespace cudf::groupby::detail::hash {
 
 namespace {
 
-/// Functor used by type dispatcher returning the size of the underlying C++ type
-struct size_of_functor {
-  template <typename T>
-  __device__ constexpr cudf::size_type operator()()
-  {
-    return sizeof(T);
-  }
-};
-
 /// Shared memory data alignment
 CUDF_HOST_DEVICE cudf::size_type constexpr ALIGNMENT = 8;
 
diff --git a/cpp/src/groupby/hash/single_pass_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh
index 048c9252773..c02087072a0 100644
--- a/cpp/src/groupby/hash/single_pass_functors.cuh
+++ b/cpp/src/groupby/hash/single_pass_functors.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -25,6 +25,15 @@
 #include
 
 namespace cudf::groupby::detail::hash {
+/// Functor used by type dispatcher returning the size of the underlying C++ type
+struct size_of_functor {
+  template <typename T>
+  CUDF_HOST_DEVICE constexpr cudf::size_type operator()()
+  {
+    return sizeof(T);
+  }
+};
+
 // TODO: TO BE REMOVED issue tracked via #17171
 template
 __device__ constexpr bool is_supported()
diff --git a/dependencies.yaml b/dependencies.yaml
index ebc2284048f..b1378fae6d7 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -3,7 +3,7 @@ files:
   all:
     output: conda
     matrix:
-      cuda: ["11.8", "12.5"]
+      cuda: ["11.8", "12.8"]
       arch: [x86_64]
     includes:
       # Note that clang-tidy is not included here because cudf's preferred
@@ -525,6 +525,10 @@ dependencies:
               cuda: "12.5"
             packages:
               - cuda-version=12.5
+          - matrix:
+              cuda: "12.8"
+            packages:
+              - cuda-version=12.8
   cuda:
     specific:
       - output_types: conda
diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml b/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml
index 56f0586f89a..977d25184b5 100644
--- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml
+++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml
@@ -156,9 +156,9 @@ dependencies:
             packages:
               - cuda-version=12.5
           - matrix:
-              cuda: "12"
+              cuda: "12.8"
             packages:
-              - cuda-version=12.5
+              - cuda-version=12.8
   py_version:
     specific:
       - output_types: conda
diff --git a/python/libcudf/pyproject.toml b/python/libcudf/pyproject.toml
index 9c54864ccc8..7d3b6c09c61 100644
--- a/python/libcudf/pyproject.toml
+++ b/python/libcudf/pyproject.toml
@@ -54,7 +54,7 @@ select = [
 ]
 
 # PyPI limit is 600 MiB, fail CI before we get too close to that
-max_allowed_size_compressed = '525M'
+max_allowed_size_compressed = '575M'
 
 [tool.scikit-build]
 build-dir = "build/{wheel_tag}"
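
Note on the size_of_functor relocation above (compute_shared_memory_aggs.cu → single_pass_functors.cuh): the functor was previously __device__-only, but the new compatibility check in compute_aggregations.cuh also invokes it from host code through cudf::type_dispatcher, hence the CUDF_HOST_DEVICE annotation. The toy dispatcher below sketches the same pattern — mapping a runtime type tag to a compile-time type and asking a templated functor for sizeof(T). It is illustrative only: the enum, dispatch_type helper, and plain constexpr qualifier are stand-ins, not cudf's type_id, type_dispatcher, or CUDF_HOST_DEVICE.

```cpp
#include <cstddef>
#include <cstdint>
#include <cstdio>

enum class type_id { INT32, INT64, FLOAT64 };

// Same shape as the size_of_functor in the diff: report sizeof(T) for a
// dispatched type. Plain constexpr here stands in for CUDF_HOST_DEVICE.
struct size_of_functor {
  template <typename T>
  constexpr std::size_t operator()() const
  {
    return sizeof(T);
  }
};

// Toy dispatcher: map a runtime type tag to a compile-time type and invoke the
// functor's templated call operator.
template <typename Functor>
constexpr std::size_t dispatch_type(type_id id, Functor f)
{
  switch (id) {
    case type_id::INT32: return f.template operator()<std::int32_t>();
    case type_id::INT64: return f.template operator()<std::int64_t>();
    case type_id::FLOAT64: return f.template operator()<double>();
  }
  return 0;
}

int main()
{
  std::printf("INT64 element size: %zu bytes\n", dispatch_type(type_id::INT64, size_of_functor{}));
  return 0;
}
```
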