14 changes: 9 additions & 5 deletions python/pylibcudf/pylibcudf/contiguous_split.pxd
@@ -1,12 +1,14 @@
# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
# SPDX-License-Identifier: Apache-2.0

from libc.stdint cimport uint8_t
from libc.stdint cimport uint8_t, uintptr_t
from libc.stddef cimport size_t
from libcpp cimport bool
from libcpp.memory cimport unique_ptr
from libcpp.vector cimport vector
from pylibcudf.libcudf.contiguous_split cimport packed_columns, chunked_pack
from pylibcudf.libcudf.types cimport bitmask_type
from pylibcudf.libcudf.utilities.span cimport device_span
from rmm.pylibrmm.device_buffer cimport DeviceBuffer
from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource
from rmm.pylibrmm.stream cimport Stream
@@ -15,6 +17,8 @@ from .gpumemoryview cimport gpumemoryview
from .table cimport Table


cdef device_span[uint8_t] _get_device_span(object obj) except *
Contributor Author
I want this to be in pylibcudf.utils and be capable of creating different types of device_spans (e.g. device_span<byte>, which I'll need for cudf::table_to_array). I tried to use a fused type as the template type, but Cython cannot infer the type I want. Here is the compile error I get:

`Return type is not specified as argument type`

Since the input here is just `object`, Cython cannot infer the return type. Any ideas here @vyasr?
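For reference, a minimal sketch of the fused-type attempt that produces that error (names are illustrative; the `byte` member is commented out since its cimport is part of the follow-up):

```cython
from libc.stdint cimport uint8_t, uintptr_t
from libc.stddef cimport size_t
from pylibcudf.libcudf.utilities.span cimport device_span

ctypedef fused span_type:
    uint8_t
    # byte  # would be added for device_span[byte]

# Does not compile: Cython resolves a fused specialization from the
# argument types, and `object obj` carries no type information, so the
# fused return type cannot be chosen:
#   "Return type is not specified as argument type"
cdef device_span[span_type] _get_device_span(object obj) except *:
    return device_span[span_type](
        <span_type*><uintptr_t>obj.ptr, <size_t>obj.size
    )
```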

Contributor

Cython doesn't support this natively. The options are either to write separate Cython functions that return each concrete type and then have a wrapper function that dispatches to the right one (e.g. based on an enum value), or to use inline C++ to define the function so that you can export a templated version directly.
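For the inline-C++ route, a rough sketch (untested; the helper name is an assumption) that exports a templated factory so each call site names the element type explicitly and nothing has to be inferred:

```cython
from libc.stdint cimport uintptr_t
from libc.stddef cimport size_t
from pylibcudf.libcudf.utilities.span cimport device_span

cdef extern from *:
    """
    #include <cudf/utilities/span.hpp>
    #include <cstddef>
    #include <cstdint>

    template <typename T>
    cudf::device_span<T> make_device_span(std::uintptr_t ptr, std::size_t size)
    {
      return cudf::device_span<T>(reinterpret_cast<T*>(ptr), size);
    }
    """
    device_span[T] make_device_span[T](uintptr_t ptr, size_t size) except +

# Callers pick the specialization, so `object` inputs are fine:
#   cdef device_span[uint8_t] s = make_device_span[uint8_t](obj.ptr, obj.size)
```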

Contributor Author
I think the only APIs that are really relevant for #21069 are in contiguous split and reshape (please correct me if I'm missing something). Since it's only two, it's probably okay to have two separate functions to create a device_span[uint8_t] and a device_span[byte].

I'm going to punt the reshape API (i.e. pylibcudf.reshape.table_to_array) to a follow-up PR because it would be a breaking change. Currently the API takes a ptr and a size; the new API would take a Span, so we'd probably have to deprecate the old signature first.

Aside: there are other places, like in I/O types and null mask, but they really only need type checks using is_span (I've added them in this PR).
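Concretely, the two helpers could look roughly like this (a sketch; the `byte` variant, its cimport, and the shared `_check_span` helper are assumptions for the follow-up):

```cython
from libc.stdint cimport uint8_t, uintptr_t
from libc.stddef cimport size_t
from pylibcudf.libcudf.utilities.span cimport device_span
from .span import is_span

cdef inline void _check_span(object obj) except *:
    # Shared validation; the protocol only requires `.ptr` and `.size`.
    if not is_span(obj):
        raise TypeError(
            f"Object of type {type(obj)} does not implement the Span protocol"
        )

cdef device_span[uint8_t] _get_device_span_uint8(object obj) except *:
    _check_span(obj)
    return device_span[uint8_t](<uint8_t*><uintptr_t>obj.ptr, <size_t>obj.size)

# `byte` assumed cimported from the matching libcudf declaration.
cdef device_span[byte] _get_device_span_byte(object obj) except *:
    _check_span(obj)
    return device_span[byte](<byte*><uintptr_t>obj.ptr, <size_t>obj.size)
```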


cdef class HostBuffer:
cdef unique_ptr[vector[uint8_t]] c_obj
cdef size_t nbytes
@@ -46,10 +50,10 @@ cdef class ChunkedPack:
cdef Stream stream

cpdef bool has_next(self)
cpdef size_t next(self, DeviceBuffer buf)
cpdef size_t next(self, object buf)
cpdef size_t get_total_contiguous_size(self)
cpdef memoryview build_metadata(self)
cpdef tuple pack_to_host(self, DeviceBuffer buf)
cpdef tuple pack_to_host(self, object buf)


cpdef PackedColumns pack(Table input)
@@ -58,6 +62,6 @@ cpdef Table unpack(PackedColumns input, Stream stream=*)

cpdef Table unpack_from_memoryviews(
memoryview metadata,
gpumemoryview gpu_data,
object gpu_data,
Stream stream=*,
)
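(For context: the Span protocol assumed throughout only requires an integer device pointer `.ptr` and a byte count `.size`, which is exactly what `_get_device_span` reads; `rmm.DeviceBuffer` exposes both, so existing callers keep working. A minimal sketch, assuming `is_span` accepts any object with these attributes:)

```cython
import rmm

# rmm.DeviceBuffer already satisfies the assumed protocol.
buf = rmm.DeviceBuffer(size=1024)
assert hasattr(buf, "ptr") and hasattr(buf, "size")

# As would any lightweight wrapper around device memory:
class MyDeviceSpan:
    def __init__(self, ptr, size):
        self.ptr = ptr    # device pointer as a Python int
        self.size = size  # length in bytes
```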
10 changes: 5 additions & 5 deletions python/pylibcudf/pylibcudf/contiguous_split.pyi
@@ -1,11 +1,11 @@
# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
# SPDX-License-Identifier: Apache-2.0

from rmm import DeviceBuffer
from rmm.mr import DeviceMemoryResource
from rmm.pylibrmm.stream import Stream

from pylibcudf.gpumemoryview import gpumemoryview
from pylibcudf.span import Span
from pylibcudf.table import Table

class PackedColumns:
@@ -18,7 +18,7 @@ def pack(input: Table, stream: Stream | None = None) -> PackedColumns: ...
def unpack(input: PackedColumns, stream: Stream | None = None) -> Table: ...
def unpack_from_memoryviews(
metadata: memoryview[bytes],
gpu_data: gpumemoryview,
gpu_data: Span,
stream: Stream | None = None,
) -> Table: ...

@@ -33,8 +33,8 @@ class ChunkedPack:
) -> ChunkedPack: ...
def has_next(self) -> bool: ...
def get_total_contiguous_size(self) -> int: ...
def next(self, buf: DeviceBuffer) -> int: ...
def next(self, buf: Span) -> int: ...
def build_metadata(self) -> memoryview[bytes]: ...
def pack_to_host(
self, buf: DeviceBuffer
self, buf: Span
) -> tuple[memoryview[bytes], memoryview[bytes]]: ...
46 changes: 27 additions & 19 deletions python/pylibcudf/pylibcudf/contiguous_split.pyx
@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
# SPDX-License-Identifier: Apache-2.0

from cython.operator cimport dereference
@@ -33,6 +33,7 @@ from rmm.pylibrmm.stream cimport Stream

from .gpumemoryview cimport gpumemoryview
from .table cimport Table
from .span import is_span
from .utils cimport _get_stream, _get_memory_resource


@@ -44,6 +45,14 @@ __all__ = [
"unpack_from_memoryviews",
]

cdef device_span[uint8_t] _get_device_span(object obj) except *:
if not is_span(obj):
raise TypeError(
f"Object of type {type(obj)} does not implement the Span protocol"
)

return device_span[uint8_t](<uint8_t*><uintptr_t>obj.ptr, <size_t>obj.size)

cdef class HostBuffer:
"""Owning host buffer that implements the buffer protocol"""
@staticmethod
@@ -212,13 +221,13 @@ cdef class ChunkedPack:
with nogil:
return dereference(self.c_obj).get_total_contiguous_size()

cpdef size_t next(self, DeviceBuffer buf):
cpdef size_t next(self, object buf):
"""
Pack the next chunk into the provided device buffer.

Parameters
----------
buf
buf : Span-like object
The device buffer to use as a staging buffer, must be at
least as large as the `user_buffer_size` used to construct the
packer.
@@ -232,9 +241,8 @@
This is stream-ordered with respect to the stream used when
creating the `ChunkedPack`.
"""
cdef device_span[uint8_t] d_span = device_span[uint8_t](
<uint8_t *>buf.c_data(), buf.c_size()
)
cdef device_span[uint8_t] d_span = _get_device_span(buf)

with nogil:
return dereference(self.c_obj).next(d_span)

@@ -251,16 +259,16 @@
metadata = move(dereference(self.c_obj).build_metadata())
return memoryview(HostBuffer.from_unique_ptr(move(metadata)))

cpdef tuple pack_to_host(self, DeviceBuffer buf):
cpdef tuple pack_to_host(self, object buf):
"""
Pack the entire table into a host buffer.

Parameters
----------
buf
The device buffer to use as a staging buffer, must be at
least as large as the `user_buffer_size` used to construct the
packer.
buf : Span-like object
The device buffer to use as a staging buffer, must be at
least as large as the `user_buffer_size` used to construct the
packer.

Returns
-------
@@ -279,10 +287,8 @@
"""
cdef size_t offset = 0
cdef size_t size
cdef device_span[uint8_t] d_span = device_span[uint8_t](
<uint8_t *>buf.c_data(), buf.c_size()
)
cdef cudaError_t err
cdef device_span[uint8_t] d_span = _get_device_span(buf)
cdef cudaError_t err = cudaError.cudaSuccess
cdef unique_ptr[vector[uint8_t]] h_buf = (
make_unique[vector[uint8_t]](
dereference(self.c_obj).get_total_contiguous_size()
@@ -379,7 +385,7 @@ cpdef Table unpack(PackedColumns input, Stream stream=None):

cpdef Table unpack_from_memoryviews(
memoryview metadata,
gpumemoryview gpu_data,
object gpu_data,
Stream stream=None,
):
"""Deserialize the result of `pack`.
@@ -392,7 +398,7 @@
----------
metadata : memoryview
The packed metadata to unpack.
gpu_data : gpumemoryview
gpu_data : Span-like object
The packed gpu_data to unpack.
stream : Stream | None
CUDA stream on which to perform the operation.
Expand All @@ -403,8 +409,10 @@ cpdef Table unpack_from_memoryviews(
Copy of the packed columns.
"""
stream = _get_stream(stream)
cdef device_span[uint8_t] d_span = _get_device_span(gpu_data)

if metadata.nbytes == 0:
if gpu_data.__cuda_array_interface__["data"][0] != 0:
if d_span.data() != NULL:
raise ValueError("Expected an empty gpu_data from unpacking an empty table")
# For an empty table we just attach the default mr since it will not be
# used for any operations.
@@ -417,7 +425,7 @@
# Extract the raw data pointers
cdef const uint8_t[::1] _metadata = metadata
cdef const uint8_t* metadata_ptr = &_metadata[0]
cdef const uint8_t* gpu_data_ptr = <uint8_t*>gpu_data.ptr
cdef const uint8_t* gpu_data_ptr = d_span.data()

cdef table_view v
with nogil: