Skip to content

Commit 878da45

Browse files
authored
feat(python): Native implementation of dataframe interchange protocol (#10267)
1 parent 6c6d6b0 commit 878da45

File tree

19 files changed

+1767
-162
lines changed

19 files changed

+1767
-162
lines changed

py-polars/polars/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
from polars.config import Config
1717
from polars.convert import (
1818
from_arrow,
19-
from_dataframe,
2019
from_dict,
2120
from_dicts,
2221
from_numpy,
@@ -148,6 +147,7 @@
148147
when,
149148
zeros,
150149
)
150+
from polars.interchange.from_dataframe import from_dataframe
151151
from polars.io import (
152152
read_avro,
153153
read_csv,

py-polars/polars/convert.py

Lines changed: 1 addition & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,11 @@
1515
Struct,
1616
Utf8,
1717
)
18-
from polars.dependencies import _PYARROW_AVAILABLE
1918
from polars.dependencies import pandas as pd
2019
from polars.dependencies import pyarrow as pa
2120
from polars.exceptions import NoDataError
2221
from polars.io import read_csv
23-
from polars.utils.various import _cast_repr_strings_with_schema, parse_version
22+
from polars.utils.various import _cast_repr_strings_with_schema
2423

2524
if TYPE_CHECKING:
2625
from polars import DataFrame, Series
@@ -726,80 +725,3 @@ def from_pandas(
726725
)
727726
else:
728727
raise ValueError(f"Expected pandas DataFrame or Series, got {type(data)}.")
729-
730-
731-
def from_dataframe(df: Any, *, allow_copy: bool = True) -> DataFrame:
732-
"""
733-
Build a Polars DataFrame from any dataframe supporting the interchange protocol.
734-
735-
Parameters
736-
----------
737-
df
738-
Object supporting the dataframe interchange protocol, i.e. must have implemented
739-
the ``__dataframe__`` method.
740-
allow_copy
741-
Allow memory to be copied to perform the conversion. If set to False, causes
742-
conversions that are not zero-copy to fail.
743-
744-
Notes
745-
-----
746-
Details on the dataframe interchange protocol:
747-
https://data-apis.org/dataframe-protocol/latest/index.html
748-
749-
Using a dedicated function like :func:`from_pandas` or :func:`from_arrow` is a more
750-
efficient method of conversion.
751-
752-
Polars currently relies on pyarrow's implementation of the dataframe interchange
753-
protocol. Therefore, pyarrow>=11.0.0 is required for this function to work.
754-
755-
Because Polars can not currently guarantee zero-copy conversion from Arrow for
756-
categorical columns, ``allow_copy=False`` will not work if the dataframe contains
757-
categorical data.
758-
759-
"""
760-
if isinstance(df, pl.DataFrame):
761-
return df
762-
if not hasattr(df, "__dataframe__"):
763-
raise TypeError(
764-
f"`df` of type {type(df)} does not support the dataframe interchange protocol."
765-
)
766-
767-
pa_table = _df_to_pyarrow_table(df, allow_copy=allow_copy)
768-
return from_arrow(pa_table, rechunk=allow_copy) # type: ignore[return-value]
769-
770-
771-
def _df_to_pyarrow_table(df: Any, *, allow_copy: bool = False) -> pa.Table:
772-
if not _PYARROW_AVAILABLE or parse_version(pa.__version__) < parse_version("11"):
773-
raise ImportError(
774-
"pyarrow>=11.0.0 is required for converting a dataframe interchange object"
775-
" to a Polars dataframe."
776-
)
777-
778-
import pyarrow.interchange # noqa: F401
779-
780-
if not allow_copy:
781-
return _df_to_pyarrow_table_zero_copy(df)
782-
783-
return pa.interchange.from_dataframe(df, allow_copy=True)
784-
785-
786-
def _df_to_pyarrow_table_zero_copy(df: Any) -> pa.Table:
787-
dfi = df.__dataframe__(allow_copy=False)
788-
if _dfi_contains_categorical_data(dfi):
789-
raise TypeError(
790-
"Polars can not currently guarantee zero-copy conversion from Arrow for "
791-
" categorical columns. Set `allow_copy=True` or cast categorical columns to"
792-
" string first."
793-
)
794-
795-
if isinstance(df, pa.Table):
796-
return df
797-
elif isinstance(df, pa.RecordBatch):
798-
return pa.Table.from_batches([df])
799-
else:
800-
return pa.interchange.from_dataframe(dfi, allow_copy=False)
801-
802-
803-
def _dfi_contains_categorical_data(dfi: Any) -> bool:
804-
CATEGORICAL_DTYPE = 23
805-
return any(c.dtype[0] == CATEGORICAL_DTYPE for c in dfi.get_columns())

py-polars/polars/dataframe/frame.py

Lines changed: 29 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,6 @@
4747
Time,
4848
Utf8,
4949
py_type_to_dtype,
50-
unpack_dtypes,
5150
)
5251
from polars.dependencies import (
5352
_PYARROW_AVAILABLE,
@@ -112,18 +111,17 @@
112111
with contextlib.suppress(ImportError): # Module not available when building docs
113112
from polars.polars import PyDataFrame
114113

115-
116114
if TYPE_CHECKING:
117115
import sys
118116
from datetime import timedelta
119117
from io import IOBase
120118
from typing import Literal
121119

122120
import deltalake
123-
from pyarrow.interchange.dataframe import _PyArrowDataFrame
124121
from xlsxwriter import Workbook
125122

126123
from polars import Expr, LazyFrame, Series
124+
from polars.interchange.dataframe import PolarsDataFrame
127125
from polars.type_aliases import (
128126
AsofJoinStrategy,
129127
AvroCompression,
@@ -1208,48 +1206,51 @@ def __array__(self, dtype: Any = None) -> np.ndarray[Any, Any]:
12081206

12091207
def __dataframe__(
12101208
self, nan_as_null: bool = False, allow_copy: bool = True
1211-
) -> _PyArrowDataFrame:
1209+
) -> PolarsDataFrame:
12121210
"""
12131211
Convert to a dataframe object implementing the dataframe interchange protocol.
12141212
12151213
Parameters
12161214
----------
12171215
nan_as_null
12181216
Overwrite null values in the data with ``NaN``.
1217+
1218+
.. warning::
1219+
This functionality has not been implemented and the parameter will be
1220+
removed in a future version.
1221+
Setting this to ``True`` will raise a ``NotImplementedError``.
12191222
allow_copy
1220-
Allow memory to be copied to perform the conversion. If set to False, causes
1221-
conversions that are not zero-copy to fail.
1223+
Allow memory to be copied to perform the conversion. If set to ``False``,
1224+
causes conversions that are not zero-copy to fail.
12221225
12231226
Notes
12241227
-----
1225-
Details on the dataframe interchange protocol:
1228+
Details on the Python dataframe interchange protocol:
12261229
https://data-apis.org/dataframe-protocol/latest/index.html
12271230
1228-
``nan_as_null`` currently has no effect; once support for nullable extension
1229-
dtypes is added, this value should be propagated to columns.
1230-
1231-
Polars currently relies on pyarrow's implementation of the dataframe interchange
1232-
protocol. Therefore, pyarrow>=11.0.0 is required for this method to work.
1231+
Examples
1232+
--------
1233+
Convert a Polars dataframe to a generic dataframe object and access some
1234+
properties.
12331235
1234-
Because Polars can not currently guarantee zero-copy conversion to Arrow for
1235-
categorical columns, ``allow_copy=False`` will not work if the dataframe
1236-
contains categorical data.
1236+
>>> df = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["x", "y"]})
1237+
>>> dfi = df.__dataframe__()
1238+
>>> dfi.num_rows()
1239+
2
1240+
>>> dfi.get_column(1).dtype
1241+
(<DtypeKind.FLOAT: 2>, 64, 'g', '=')
12371242
12381243
"""
1239-
if not _PYARROW_AVAILABLE or parse_version(pa.__version__) < parse_version(
1240-
"11"
1241-
):
1242-
raise ImportError(
1243-
"pyarrow>=11.0.0 is required for converting a Polars dataframe to a"
1244-
" dataframe interchange object."
1245-
)
1246-
if not allow_copy and Categorical in unpack_dtypes(*self.dtypes):
1247-
raise TypeError(
1248-
"Polars can not currently guarantee zero-copy conversion to Arrow for"
1249-
" categorical columns. Set `allow_copy=True` or cast categorical"
1250-
" columns to string first."
1244+
if nan_as_null:
1245+
raise NotImplementedError(
1246+
"functionality for `nan_as_null` has not been implemented and the"
1247+
" parameter will be removed in a future version."
1248+
" Use the default `nan_as_null=False`."
12511249
)
1252-
return self.to_arrow().__dataframe__(nan_as_null, allow_copy)
1250+
1251+
from polars.interchange.dataframe import PolarsDataFrame
1252+
1253+
return PolarsDataFrame(self, allow_copy=allow_copy)
12531254

12541255
def __dataframe_consortium_standard__(
12551256
self, *, api_version: str | None = None

py-polars/polars/datatypes/classes.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ def __repr__(cls) -> str:
4545
def _string_repr(cls) -> str:
4646
return _dtype_str_repr(cls)
4747

48-
def base_type(cls) -> PolarsDataType:
48+
def base_type(cls) -> DataTypeClass:
4949
"""Return the base type."""
5050
return cls
5151

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
"""
2+
Module containing the implementation of the Python dataframe interchange protocol.
3+
4+
Details on the protocol:
5+
https://data-apis.org/dataframe-protocol/latest/index.html
6+
"""
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
from __future__ import annotations
2+
3+
from typing import TYPE_CHECKING
4+
5+
from polars.interchange.protocol import DlpackDeviceType, DtypeKind
6+
from polars.interchange.utils import polars_dtype_to_dtype
7+
8+
if TYPE_CHECKING:
9+
from typing import NoReturn
10+
11+
from polars import Series
12+
13+
14+
class PolarsBuffer:
    """
    A buffer object backed by a Polars Series consisting of a single chunk.

    Parameters
    ----------
    data
        The Polars Series backing the buffer object.
    allow_copy
        Allow data to be copied during operations on this column. If set to ``False``,
        a RuntimeError will be raised if data would be copied.

    Raises
    ------
    RuntimeError
        If ``data`` consists of more than one chunk and ``allow_copy`` is ``False``.

    """

    def __init__(self, data: Series, *, allow_copy: bool = True):
        # The buffer must be backed by exactly one chunk; merging several
        # chunks requires a copy, which is only done when the caller allows it.
        if data.n_chunks() > 1:
            if not allow_copy:
                raise RuntimeError(
                    "non-contiguous buffer must be made contiguous, which is not zero-copy"
                )
            data = data.rechunk()

        self._data = data

    @property
    def bufsize(self) -> int:
        """Buffer size in bytes."""
        dtype = polars_dtype_to_dtype(self._data.dtype)

        if dtype[0] == DtypeKind.STRING:
            # String data has no fixed width per value, so the size is the sum
            # of the individual string lengths.
            total = self._data.str.lengths().sum()
            # NOTE(review): `sum()` may yield ``None`` for an empty or all-null
            # column in some Polars versions - confirm. Normalise so that this
            # property always honours its ``int`` return type.
            return 0 if total is None else int(total)

        # Fixed-width data: `dtype[1]` is the width of one value in bits.
        n_bits = self._data.len() * dtype[1]

        # Ceiling division: round up to the nearest whole byte.
        return (n_bits + 7) // 8

    @property
    def ptr(self) -> int:
        """Pointer to start of the buffer as an integer."""
        # `get_ptr` returns a 3-tuple; only the last element is exposed here.
        _offset, _length, pointer = self._data._s.get_ptr()
        return pointer

    def __dlpack__(self) -> NoReturn:
        """
        Represent this structure as DLPack interface.

        Raises
        ------
        NotImplementedError
            Always; DLPack export is not implemented for this buffer.

        """
        raise NotImplementedError("__dlpack__")

    def __dlpack_device__(self) -> tuple[DlpackDeviceType, None]:
        """Device type and device ID for where the data in the buffer resides."""
        return (DlpackDeviceType.CPU, None)

    def __repr__(self) -> str:
        bufsize = self.bufsize
        ptr = self.ptr
        device = self.__dlpack_device__()[0].name
        return f"PolarsBuffer(bufsize={bufsize}, ptr={ptr}, device={device!r})"

0 commit comments

Comments
 (0)