Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion py-polars/polars/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
from polars.config import Config
from polars.convert import (
from_arrow,
from_dataframe,
from_dict,
from_dicts,
from_numpy,
Expand Down Expand Up @@ -148,6 +147,7 @@
when,
zeros,
)
from polars.interchange.from_dataframe import from_dataframe
from polars.io import (
read_avro,
read_csv,
Expand Down
80 changes: 1 addition & 79 deletions py-polars/polars/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,11 @@
Struct,
Utf8,
)
from polars.dependencies import _PYARROW_AVAILABLE
from polars.dependencies import pandas as pd
from polars.dependencies import pyarrow as pa
from polars.exceptions import NoDataError
from polars.io import read_csv
from polars.utils.various import _cast_repr_strings_with_schema, parse_version
from polars.utils.various import _cast_repr_strings_with_schema

if TYPE_CHECKING:
from polars import DataFrame, Series
Expand Down Expand Up @@ -726,80 +725,3 @@ def from_pandas(
)
else:
raise ValueError(f"Expected pandas DataFrame or Series, got {type(data)}.")


def from_dataframe(df: Any, *, allow_copy: bool = True) -> DataFrame:
    """
    Build a Polars DataFrame from any dataframe supporting the interchange protocol.

    Parameters
    ----------
    df
        Object supporting the dataframe interchange protocol, i.e. must have implemented
        the ``__dataframe__`` method.
    allow_copy
        Allow memory to be copied to perform the conversion. If set to False, causes
        conversions that are not zero-copy to fail.

    Notes
    -----
    Details on the dataframe interchange protocol:
    https://data-apis.org/dataframe-protocol/latest/index.html

    Using a dedicated function like :func:`from_pandas` or :func:`from_arrow` is a more
    efficient method of conversion.

    Polars currently relies on pyarrow's implementation of the dataframe interchange
    protocol. Therefore, pyarrow>=11.0.0 is required for this function to work.

    Because Polars can not currently guarantee zero-copy conversion from Arrow for
    categorical columns, ``allow_copy=False`` will not work if the dataframe contains
    categorical data.

    """
    # A Polars DataFrame is returned as-is: no conversion needed.
    if isinstance(df, pl.DataFrame):
        return df
    if not hasattr(df, "__dataframe__"):
        raise TypeError(
            f"`df` of type {type(df)} does not support the dataframe interchange protocol."
        )

    pa_table = _df_to_pyarrow_table(df, allow_copy=allow_copy)
    # Rechunking is itself a copy, so only ask for it when copies are allowed.
    return from_arrow(pa_table, rechunk=allow_copy)  # type: ignore[return-value]


def _df_to_pyarrow_table(df: Any, *, allow_copy: bool = False) -> pa.Table:
    """Convert an interchange-protocol object to a pyarrow Table.

    Requires pyarrow>=11.0.0, the first release implementing the dataframe
    interchange protocol; raises ImportError otherwise.
    """
    pyarrow_ok = _PYARROW_AVAILABLE and parse_version(pa.__version__) >= parse_version(
        "11"
    )
    if not pyarrow_ok:
        raise ImportError(
            "pyarrow>=11.0.0 is required for converting a dataframe interchange object"
            " to a Polars dataframe."
        )

    # Ensure the pa.interchange submodule is loaded before it is accessed below.
    import pyarrow.interchange  # noqa: F401

    if allow_copy:
        return pa.interchange.from_dataframe(df, allow_copy=True)
    return _df_to_pyarrow_table_zero_copy(df)


def _df_to_pyarrow_table_zero_copy(df: Any) -> pa.Table:
    """Convert an interchange-protocol object to a pyarrow Table without copying.

    Raises TypeError if the dataframe contains categorical columns, for which
    zero-copy conversion cannot currently be guaranteed.
    """
    dfi = df.__dataframe__(allow_copy=False)
    if _dfi_contains_categorical_data(dfi):
        # Fixed message spacing: the original string concatenation produced a
        # double space ("for  categorical").
        raise TypeError(
            "Polars can not currently guarantee zero-copy conversion from Arrow for"
            " categorical columns. Set `allow_copy=True` or cast categorical columns to"
            " string first."
        )

    # pyarrow-native inputs can be returned (or batched up) directly.
    if isinstance(df, pa.Table):
        return df
    elif isinstance(df, pa.RecordBatch):
        return pa.Table.from_batches([df])
    else:
        return pa.interchange.from_dataframe(dfi, allow_copy=False)


def _dfi_contains_categorical_data(dfi: Any) -> bool:
CATEGORICAL_DTYPE = 23
return any(c.dtype[0] == CATEGORICAL_DTYPE for c in dfi.get_columns())
57 changes: 29 additions & 28 deletions py-polars/polars/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@
Time,
Utf8,
py_type_to_dtype,
unpack_dtypes,
)
from polars.dependencies import (
_PYARROW_AVAILABLE,
Expand Down Expand Up @@ -112,18 +111,17 @@
with contextlib.suppress(ImportError): # Module not available when building docs
from polars.polars import PyDataFrame


if TYPE_CHECKING:
import sys
from datetime import timedelta
from io import IOBase
from typing import Literal

import deltalake
from pyarrow.interchange.dataframe import _PyArrowDataFrame
from xlsxwriter import Workbook

from polars import Expr, LazyFrame, Series
from polars.interchange.dataframe import PolarsDataFrame
from polars.type_aliases import (
AsofJoinStrategy,
AvroCompression,
Expand Down Expand Up @@ -1208,48 +1206,51 @@ def __array__(self, dtype: Any = None) -> np.ndarray[Any, Any]:

def __dataframe__(
self, nan_as_null: bool = False, allow_copy: bool = True
) -> _PyArrowDataFrame:
) -> PolarsDataFrame:
"""
Convert to a dataframe object implementing the dataframe interchange protocol.

Parameters
----------
nan_as_null
Overwrite null values in the data with ``NaN``.

.. warning::
This functionality has not been implemented and the parameter will be
removed in a future version.
Setting this to ``True`` will raise a ``NotImplementedError``.
allow_copy
Allow memory to be copied to perform the conversion. If set to False, causes
conversions that are not zero-copy to fail.
Allow memory to be copied to perform the conversion. If set to ``False``,
causes conversions that are not zero-copy to fail.

Notes
-----
Details on the dataframe interchange protocol:
Details on the Python dataframe interchange protocol:
https://data-apis.org/dataframe-protocol/latest/index.html

``nan_as_null`` currently has no effect; once support for nullable extension
dtypes is added, this value should be propagated to columns.

Polars currently relies on pyarrow's implementation of the dataframe interchange
protocol. Therefore, pyarrow>=11.0.0 is required for this method to work.
Examples
--------
Convert a Polars dataframe to a generic dataframe object and access some
properties.

Because Polars can not currently guarantee zero-copy conversion to Arrow for
categorical columns, ``allow_copy=False`` will not work if the dataframe
contains categorical data.
>>> df = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["x", "y"]})
>>> dfi = df.__dataframe__()
>>> dfi.num_rows()
2
>>> dfi.get_column(1).dtype
(<DtypeKind.FLOAT: 2>, 64, 'g', '=')

"""
if not _PYARROW_AVAILABLE or parse_version(pa.__version__) < parse_version(
"11"
):
raise ImportError(
"pyarrow>=11.0.0 is required for converting a Polars dataframe to a"
" dataframe interchange object."
)
if not allow_copy and Categorical in unpack_dtypes(*self.dtypes):
raise TypeError(
"Polars can not currently guarantee zero-copy conversion to Arrow for"
" categorical columns. Set `allow_copy=True` or cast categorical"
" columns to string first."
if nan_as_null:
raise NotImplementedError(
"functionality for `nan_as_null` has not been implemented and the"
" parameter will be removed in a future version."
" Use the default `nan_as_null=False`."
)
return self.to_arrow().__dataframe__(nan_as_null, allow_copy)

from polars.interchange.dataframe import PolarsDataFrame

return PolarsDataFrame(self, allow_copy=allow_copy)

def __dataframe_consortium_standard__(
self, *, api_version: str | None = None
Expand Down
2 changes: 1 addition & 1 deletion py-polars/polars/datatypes/classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def __repr__(cls) -> str:
def _string_repr(cls) -> str:
return _dtype_str_repr(cls)

def base_type(cls) -> PolarsDataType:
def base_type(cls) -> DataTypeClass:
"""Return the base type."""
return cls

Expand Down
6 changes: 6 additions & 0 deletions py-polars/polars/interchange/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
"""
Module containing the implementation of the Python dataframe interchange protocol.
Details on the protocol:
https://data-apis.org/dataframe-protocol/latest/index.html
"""
73 changes: 73 additions & 0 deletions py-polars/polars/interchange/buffer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
from __future__ import annotations

from typing import TYPE_CHECKING

from polars.interchange.protocol import DlpackDeviceType, DtypeKind
from polars.interchange.utils import polars_dtype_to_dtype

if TYPE_CHECKING:
from typing import NoReturn

from polars import Series


class PolarsBuffer:
    """
    A buffer object backed by a Polars Series consisting of a single chunk.

    Parameters
    ----------
    data
        The Polars Series backing the buffer object.
    allow_copy
        Allow data to be copied during operations on this column. If set to ``False``,
        a RuntimeError will be raised if data would be copied.

    """

    def __init__(self, data: Series, *, allow_copy: bool = True) -> None:
        # A multi-chunk Series is not contiguous in memory; rechunking copies data,
        # which is only permitted when allow_copy is True.
        if data.n_chunks() > 1:
            if not allow_copy:
                raise RuntimeError(
                    "non-contiguous buffer must be made contiguous, which is not zero-copy"
                )
            data = data.rechunk()

        self._data = data

    @property
    def bufsize(self) -> int:
        """Buffer size in bytes."""
        dtype = polars_dtype_to_dtype(self._data.dtype)

        # For strings, the data buffer holds the concatenated bytes of all values.
        if dtype[0] == DtypeKind.STRING:
            return self._data.str.lengths().sum()  # type: ignore[return-value]

        n_bits = self._data.len() * dtype[1]

        result, rest = divmod(n_bits, 8)
        # Round up to the nearest byte
        if rest:
            return result + 1
        else:
            return result

    @property
    def ptr(self) -> int:
        """Pointer to start of the buffer as an integer."""
        _offset, _length, pointer = self._data._s.get_ptr()
        return pointer

    def __dlpack__(self) -> NoReturn:
        """Represent this structure as DLPack interface."""
        # DLPack export is not supported; the protocol allows raising here.
        raise NotImplementedError("__dlpack__")

    def __dlpack_device__(self) -> tuple[DlpackDeviceType, None]:
        """Device type and device ID for where the data in the buffer resides."""
        return (DlpackDeviceType.CPU, None)

    def __repr__(self) -> str:
        bufsize = self.bufsize
        ptr = self.ptr
        device = self.__dlpack_device__()[0].name
        return f"PolarsBuffer(bufsize={bufsize}, ptr={ptr}, device={device!r})"
Loading