Add fast paths for DataFrame.to_cupy (#18801)

Matt711 · web-flow · commit 7694248ec1f1 · 2025-05-15T04:53:50.000Z
Contributes to #16483 by adding fast paths to `DataFrame.to_cupy` (which is called when `DataFrame.values` is accessed). This PR follows up on #18450 by adding Cython bindings for `cudf::table_to_array` to pylibcudf and plumbing those changes through cudf classic. I benchmarked the fast (True) and slow (False) paths when the DataFrame has 1, 6, 20, and 100 columns. The fast paths use `cudf::table_to_array` if the number of columns is greater than 1 and `cp.asarray` directly if the DataFrame has only one column. The slow path uses a [raw Python loop + assignment](https://github.com/rapidsai/cudf/blob/35d58394e7fb5a090ff3cda351403ec092476af5/python/cudf/cudf/core/frame.py#L520) to create the cupy array. ![image](https://github.com/user-attachments/assets/4c9edfa0-e15d-4902-b597-675cfb02343d) I used the median because the CUDA overhead of calling `cudf::table_to_array` is large (so there are outliers in the times). Here is a profile of calling `to_cupy` twice for both the slow and fast paths. ![Screenshot from 2025-05-13 12-23-46](https://github.com/user-attachments/assets/d84fdfa3-3696-4df8-91b6-3eb9dde65430) In the first calls, the fast path takes 7.3 ms vs 4.8 ms for the slow path. The first call to `cudf::table_to_array` is the bottleneck. But if you compare the second calls, the fast path is much faster (79 us vs 2.3 ms). Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Bradley Dice (https://github.com/bdice) - Matthew Roeschke (https://github.com/mroeschke) URL: #18801
diff --git a/docs/cudf/source/user_guide/api_docs/index_objects.rst b/docs/cudf/source/user_guide/api_docs/index_objects.rst
@@ -110,6 +110,8 @@ Conversion
    Index.to_frame
    Index.to_pandas
    Index.to_dlpack
+   Index.to_pylibcudf
+   Index.from_pylibcudf
    Index.from_pandas
    Index.from_arrow
 
diff --git a/python/cudf/benchmarks/API/bench_dataframe.py b/python/cudf/benchmarks/API/bench_dataframe.py
@@ -349,3 +349,11 @@ def bench_nsmallest(benchmark, dataframe, num_cols_to_sort, n):
 )
 def bench_where(benchmark, dataframe, cond, other):
     benchmark(dataframe.where, cond, other)
+
+
+@benchmark_with_object(
+    cls="dataframe", dtype="float", nulls=False, cols=20, rows=20
+)
+@pytest.mark.pandas_incompatible
+def bench_to_cupy(benchmark, dataframe):
+    benchmark(dataframe.to_cupy)
diff --git a/python/cudf/benchmarks/API/bench_series.py b/python/cudf/benchmarks/API/bench_series.py
@@ -23,13 +23,17 @@ def bench_series_nsmallest(benchmark, series, n):
     benchmark(series.nsmallest, n)
 
 
-@benchmark_with_object(cls="series", dtype="int")
+@benchmark_with_object(cls="series", dtype="int", nulls=False)
 def bench_series_cp_asarray(benchmark, series):
-    series = series.dropna()
     benchmark(cupy.asarray, series)
 
 
-@benchmark_with_object(cls="series", dtype="int")
+@benchmark_with_object(cls="series", dtype="int", nulls=False)
+@pytest.mark.pandas_incompatible
+def bench_to_cupy(benchmark, series):
+    benchmark(series.to_cupy)  # .values is covered by bench_series_values
+
+
+@benchmark_with_object(cls="series", dtype="int", nulls=False)
 def bench_series_values(benchmark, series):
-    series = series.dropna()
     benchmark(lambda: series.values)
diff --git a/python/cudf/benchmarks/common/config.py b/python/cudf/benchmarks/common/config.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022-2024, NVIDIA CORPORATION.
+# Copyright (c) 2022-2025, NVIDIA CORPORATION.
 
 """Module used for global configuration of benchmarks.
 
@@ -64,7 +64,7 @@ def pytest_sessionfinish(session, exitstatus):
 # Constants used to define benchmarking standards.
 if "CUDF_BENCHMARKS_DEBUG_ONLY" in os.environ:
     NUM_ROWS = [10, 20]
-    NUM_COLS = [1, 6]
+    NUM_COLS = [1, 6, 20]
 else:
     NUM_ROWS = [100, 10_000, 1_000_000]
-    NUM_COLS = [1, 6]
+    NUM_COLS = [1, 6, 20]
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
@@ -587,6 +587,15 @@ def to_array(
                 matrix[:, i] = to_array(col, dtype)
             return matrix
 
+    @_performance_tracking
+    def to_pylibcudf(self) -> tuple[plc.Table, dict[str, Any]]:
+        """
+        Convert this Frame to a pylibcudf.Table plus a metadata dict.
+        Note: This base implementation always raises NotImplementedError;
+        subclasses like DataFrame/Series must provide the conversion.
+        """
+        raise NotImplementedError(f"{type(self)} must implement to_pylibcudf")
+
     @_performance_tracking
     def to_cupy(
         self,
@@ -613,6 +622,51 @@ def to_cupy(
         -------
         cupy.ndarray
         """
+        if (
+            self._num_columns > 1
+            and na_value is None
+            and self._columns[0].dtype.kind in {"i", "u", "f", "b"}
+            and all(
+                not col.nullable and col.dtype == self._columns[0].dtype
+                for col in self._columns
+            )
+        ):
+            # table_to_array copies raw column bytes, so the buffer must use
+            # the columns' own dtype; cast afterwards if another was requested.
+            shape = (len(self), self._num_columns)
+            out = cupy.empty(shape, dtype=self._columns[0].dtype, order="F")
+            table = plc.Table(
+                [col.to_pylibcudf(mode="read") for col in self._columns]
+            )
+            plc.reshape.table_to_array(
+                table,
+                out.data.ptr,
+                out.nbytes,
+            )
+            if dtype is not None:
+                out = out.astype(dtype, copy=False)
+            return out
+        elif self._num_columns == 1:
+            col = self._columns[0]
+            final_dtype = col.dtype if dtype is None else dtype
+
+            if (
+                not copy
+                and col.dtype.kind in {"i", "u", "f", "b"}
+                and cupy.can_cast(col.dtype, final_dtype)
+            ):
+                if col.has_nulls():
+                    if na_value is not None:
+                        col = col.fillna(na_value)
+                    else:
+                        return self._to_array(
+                            lambda col: col.values,
+                            cupy,
+                            copy,
+                            dtype,
+                            na_value,
+                        )
+                return cupy.asarray(col, dtype=final_dtype).reshape((-1, 1))
         return self._to_array(
             lambda col: col.values,
             cupy,
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
@@ -323,6 +323,74 @@ def _from_data(cls, data: MutableMapping, name: Any = no_default) -> Self:
     def _from_data_like_self(self, data: MutableMapping) -> Self:
         return _index_from_data(data, self.name)
 
+    @_performance_tracking
+    def to_pylibcudf(self, copy=False) -> tuple[plc.Column, dict]:
+        """
+        Convert this Index to a pylibcudf.Column.
+
+        Parameters
+        ----------
+        copy : bool, default False
+            Whether to copy the device data. Only False is supported.
+
+        Returns
+        -------
+        pylibcudf.Column
+            A new pylibcudf.Column referencing the same data.
+        dict
+            Metadata dict with a single "name" entry (the Index name).
+
+        Notes
+        -----
+        User requests to convert to pylibcudf must assume that the
+        data may be modified afterwards.
+        """
+        if copy:
+            raise NotImplementedError("copy=True is not supported")
+        metadata = {"name": self.name}
+        return self._column.to_pylibcudf(mode="write"), metadata
+
+    @classmethod
+    @_performance_tracking
+    def from_pylibcudf(
+        cls, col: plc.Column, metadata: dict | None = None
+    ) -> Self:
+        """
+        Create an Index from a pylibcudf.Column.
+
+        Parameters
+        ----------
+        col : pylibcudf.Column
+            The input Column.
+        metadata : dict | None
+            The Index metadata; if given, it must contain only a "name" key.
+
+        Returns
+        -------
+        Index
+            A new Index referencing the same data.
+
+        Notes
+        -----
+        This function will generate an Index which contains a Column
+        pointing to the provided pylibcudf Column.  It will directly access
+        the data and mask buffers of the pylibcudf Column, so the newly created
+        object is not tied to the lifetime of the original pylibcudf.Column.
+        """
+        name = None
+        if metadata is not None:
+            if not (
+                isinstance(metadata, dict)
+                and len(metadata) == 1
+                and set(metadata) == {"name"}
+            ):
+                raise ValueError("Metadata dict must only contain a name")
+            name = metadata.get("name")
+        return cls._from_column(
+            ColumnBase.from_pylibcudf(col, data_ptr_exposed=True),
+            name=name,
+        )
+
     @classmethod
     @_performance_tracking
     def from_arrow(cls, obj: pa.Array) -> Index | cudf.MultiIndex:
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
@@ -3820,6 +3820,8 @@ def from_pylibcudf(
         ----------
         col : pylibcudf.Column
             The input Column.
+        metadata : dict | None
+            The Series metadata.
 
         Returns
         -------
diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py
@@ -139,26 +139,11 @@ def to_cupy(
         -------
         cupy.ndarray
         """
-        col = self._column
-        final_dtype = (
-            col.dtype if dtype is None else dtype
-        )  # some types do not support | operator
-        if (
-            not copy
-            and col.dtype.kind in {"i", "u", "f", "b"}
-            and cp.can_cast(col.dtype, final_dtype)
-            and not col.has_nulls()
-        ):
-            if col.has_nulls():
-                if na_value is not None:
-                    col = col.fillna(na_value)
-                else:
-                    return super().to_cupy(
-                        dtype=dtype, copy=copy, na_value=na_value
-                    )
-            return cp.asarray(col, dtype=final_dtype)
-
-        return super().to_cupy(dtype=dtype, copy=copy, na_value=na_value)
+        return (
+            super()
+            .to_cupy(dtype=dtype, copy=copy, na_value=na_value)
+            .reshape(len(self), order="F")
+        )
 
     @property  # type: ignore
     @_performance_tracking
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
@@ -1270,6 +1270,34 @@ def test_dataframe_to_cupy():
         np.testing.assert_array_equal(df[k].to_numpy(), mat[:, i])
 
 
+@pytest.mark.parametrize("has_nulls", [False, True])
+@pytest.mark.parametrize("use_na_value", [False, True])
+def test_dataframe_to_cupy_single_column(has_nulls, use_na_value):
+    nelem = 10
+    data = np.arange(nelem, dtype=np.float64)
+
+    if has_nulls:
+        data = data.astype("object")
+        data[::2] = None
+
+    df = cudf.DataFrame({"a": data})
+
+    if has_nulls and not use_na_value:
+        with pytest.raises(ValueError, match="Column must have no nulls"):
+            df.to_cupy()
+        return
+
+    na_value = 0.0 if use_na_value else None
+    expected = (
+        cupy.asarray(df["a"].fillna(na_value))
+        if has_nulls
+        else cupy.asarray(df["a"])
+    )
+    result = df.to_cupy(na_value=na_value)
+    assert result.shape == (nelem, 1)
+    assert_eq(result.ravel(), expected)
+
+
 def test_dataframe_to_cupy_null_values():
     df = cudf.DataFrame()
 
diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py
@@ -3343,3 +3343,10 @@ def test_categoricalindex_from_codes(ordered, name):
         name=name,
     )
     assert_eq(result, expected)
+
+
+def test_roundtrip_index_plc_column():
+    index = cudf.Index([1])
+    expect = cudf.Index(index)
+    actual = cudf.Index.from_pylibcudf(*expect.to_pylibcudf())
+    assert_eq(expect, actual)
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
@@ -3091,6 +3091,7 @@ def test_series_to_cupy(dtype, has_nulls, use_na_value):
 
     if not has_nulls:
         assert_eq(sr.values, cp.asarray(sr))
+        return
 
     if has_nulls and not use_na_value:
         with pytest.raises(ValueError, match="Column must have no nulls"):
diff --git a/python/pylibcudf/pylibcudf/libcudf/reshape.pxd b/python/pylibcudf/pylibcudf/libcudf/reshape.pxd
@@ -1,10 +1,17 @@
-# Copyright (c) 2019-2024, NVIDIA CORPORATION.
+# Copyright (c) 2019-2025, NVIDIA CORPORATION.
 from libcpp.memory cimport unique_ptr
 from pylibcudf.exception_handler cimport libcudf_exception_handler
 from pylibcudf.libcudf.column.column cimport column
 from pylibcudf.libcudf.table.table cimport table
 from pylibcudf.libcudf.table.table_view cimport table_view
-from pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.types cimport size_type, data_type
+from pylibcudf.libcudf.utilities.span cimport device_span
+
+from rmm.librmm.cuda_stream_view cimport cuda_stream_view
+
+cdef extern from "cuda/functional" namespace "cuda::std":
+    cdef cppclass byte:
+        pass
 
 
 cdef extern from "cudf/reshape.hpp" namespace "cudf" nogil:
@@ -14,3 +21,8 @@ cdef extern from "cudf/reshape.hpp" namespace "cudf" nogil:
     cdef unique_ptr[table] tile(
         table_view source_table, size_type count
     ) except +libcudf_exception_handler
+    cdef void table_to_array(
+        table_view input_table,
+        device_span[byte] output,
+        cuda_stream_view stream
+    ) except +libcudf_exception_handler
diff --git a/python/pylibcudf/pylibcudf/reshape.pxd b/python/pylibcudf/pylibcudf/reshape.pxd
@@ -1,11 +1,24 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
+# Copyright (c) 2024-2025, NVIDIA CORPORATION.
+
+from libc.stddef cimport size_t
+from libc.stdint cimport uintptr_t
 
 from pylibcudf.libcudf.types cimport size_type
 
+from rmm.pylibrmm.stream cimport Stream
+from rmm.pylibrmm.device_buffer cimport DeviceBuffer
+
 from .column cimport Column
 from .scalar cimport Scalar
 from .table cimport Table
+from .types cimport DataType
 
 
 cpdef Column interleave_columns(Table source_table)
 cpdef Table tile(Table source_table, size_type count)
+cpdef void table_to_array(
+    Table input_table,
+    uintptr_t ptr,
+    size_t size,
+    Stream stream=*
+)
diff --git a/python/pylibcudf/pylibcudf/reshape.pyi b/python/pylibcudf/pylibcudf/reshape.pyi
@@ -1,7 +1,15 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
+from rmm.pylibrmm.stream import Stream
+
 from pylibcudf.column import Column
 from pylibcudf.table import Table
 
 def interleave_columns(source_table: Table) -> Column: ...
 def tile(source_table: Table, count: int) -> Table: ...
+def table_to_array(
+    input_table: Table,
+    ptr: int,
+    size: int,
+    stream: Stream = ...,
+) -> None: ...
diff --git a/python/pylibcudf/pylibcudf/reshape.pyx b/python/pylibcudf/pylibcudf/reshape.pyx
diff --git a/python/pylibcudf/pylibcudf/tests/test_reshape.py b/python/pylibcudf/pylibcudf/tests/test_reshape.py