Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion py-polars/polars/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
from polars.config import Config
from polars.convert import (
from_arrow,
from_dataframe,
from_dict,
from_dicts,
from_numpy,
Expand Down Expand Up @@ -148,6 +147,7 @@
when,
zeros,
)
from polars.interchange.from_dataframe import from_dataframe
from polars.io import (
read_avro,
read_csv,
Expand Down
80 changes: 1 addition & 79 deletions py-polars/polars/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,11 @@
Struct,
Utf8,
)
from polars.dependencies import _PYARROW_AVAILABLE
from polars.dependencies import pandas as pd
from polars.dependencies import pyarrow as pa
from polars.exceptions import NoDataError
from polars.io import read_csv
from polars.utils.various import _cast_repr_strings_with_schema, parse_version
from polars.utils.various import _cast_repr_strings_with_schema

if TYPE_CHECKING:
from polars import DataFrame, Series
Expand Down Expand Up @@ -726,80 +725,3 @@ def from_pandas(
)
else:
raise ValueError(f"Expected pandas DataFrame or Series, got {type(data)}.")


def from_dataframe(df: Any, *, allow_copy: bool = True) -> DataFrame:
    """
    Build a Polars DataFrame from any dataframe supporting the interchange protocol.

    Parameters
    ----------
    df
        Object supporting the dataframe interchange protocol, i.e. must have implemented
        the ``__dataframe__`` method.
    allow_copy
        Allow memory to be copied to perform the conversion. If set to False, causes
        conversions that are not zero-copy to fail.

    Notes
    -----
    Details on the dataframe interchange protocol:
    https://data-apis.org/dataframe-protocol/latest/index.html

    Using a dedicated function like :func:`from_pandas` or :func:`from_arrow` is a more
    efficient method of conversion.

    Polars currently relies on pyarrow's implementation of the dataframe interchange
    protocol. Therefore, pyarrow>=11.0.0 is required for this function to work.

    Because Polars can not currently guarantee zero-copy conversion from Arrow for
    categorical columns, ``allow_copy=False`` will not work if the dataframe contains
    categorical data.

    """
    # A Polars DataFrame is returned as-is: no conversion needed.
    if isinstance(df, pl.DataFrame):
        return df
    if not hasattr(df, "__dataframe__"):
        raise TypeError(
            f"`df` of type {type(df)} does not support the dataframe interchange protocol."
        )

    pa_table = _df_to_pyarrow_table(df, allow_copy=allow_copy)
    # Rechunking is itself a copy, so only ask for it when copies are allowed.
    return from_arrow(pa_table, rechunk=allow_copy)  # type: ignore[return-value]


def _df_to_pyarrow_table(df: Any, *, allow_copy: bool = False) -> pa.Table:
    """Convert an interchange-protocol object to a pyarrow Table.

    Requires pyarrow>=11.0.0, the first release implementing the dataframe
    interchange protocol; raises ImportError otherwise.
    """
    pyarrow_ok = _PYARROW_AVAILABLE and parse_version(pa.__version__) >= parse_version(
        "11"
    )
    if not pyarrow_ok:
        raise ImportError(
            "pyarrow>=11.0.0 is required for converting a dataframe interchange object"
            " to a Polars dataframe."
        )

    # Ensure the pa.interchange submodule is loaded before it is accessed below.
    import pyarrow.interchange  # noqa: F401

    if allow_copy:
        return pa.interchange.from_dataframe(df, allow_copy=True)
    return _df_to_pyarrow_table_zero_copy(df)


def _df_to_pyarrow_table_zero_copy(df: Any) -> pa.Table:
    """Convert an interchange-protocol object to a pyarrow Table without copying.

    Raises TypeError if the dataframe contains categorical columns, for which
    zero-copy conversion cannot currently be guaranteed.
    """
    dfi = df.__dataframe__(allow_copy=False)
    if _dfi_contains_categorical_data(dfi):
        # Fixed message spacing: the original string concatenation produced a
        # double space ("for  categorical").
        raise TypeError(
            "Polars can not currently guarantee zero-copy conversion from Arrow for"
            " categorical columns. Set `allow_copy=True` or cast categorical columns to"
            " string first."
        )

    # pyarrow-native inputs can be returned (or batched up) directly.
    if isinstance(df, pa.Table):
        return df
    elif isinstance(df, pa.RecordBatch):
        return pa.Table.from_batches([df])
    else:
        return pa.interchange.from_dataframe(dfi, allow_copy=False)


def _dfi_contains_categorical_data(dfi: Any) -> bool:
CATEGORICAL_DTYPE = 23
return any(c.dtype[0] == CATEGORICAL_DTYPE for c in dfi.get_columns())
57 changes: 29 additions & 28 deletions py-polars/polars/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@
Time,
Utf8,
py_type_to_dtype,
unpack_dtypes,
)
from polars.dependencies import (
_PYARROW_AVAILABLE,
Expand Down Expand Up @@ -112,18 +111,17 @@
with contextlib.suppress(ImportError): # Module not available when building docs
from polars.polars import PyDataFrame


if TYPE_CHECKING:
import sys
from datetime import timedelta
from io import IOBase
from typing import Literal

import deltalake
from pyarrow.interchange.dataframe import _PyArrowDataFrame
from xlsxwriter import Workbook

from polars import Expr, LazyFrame, Series
from polars.interchange.dataframe import PolarsDataFrame
from polars.type_aliases import (
AsofJoinStrategy,
AvroCompression,
Expand Down Expand Up @@ -1208,48 +1206,51 @@ def __array__(self, dtype: Any = None) -> np.ndarray[Any, Any]:

def __dataframe__(
self, nan_as_null: bool = False, allow_copy: bool = True
) -> _PyArrowDataFrame:
) -> PolarsDataFrame:
"""
Convert to a dataframe object implementing the dataframe interchange protocol.

Parameters
----------
nan_as_null
Overwrite null values in the data with ``NaN``.

.. warning::
This functionality has not been implemented and the parameter will be
removed in a future version.
Setting this to ``True`` will raise a ``NotImplementedError``.
allow_copy
Allow memory to be copied to perform the conversion. If set to False, causes
conversions that are not zero-copy to fail.
Allow memory to be copied to perform the conversion. If set to ``False``,
causes conversions that are not zero-copy to fail.

Notes
-----
Details on the dataframe interchange protocol:
Details on the Python dataframe interchange protocol:
https://data-apis.org/dataframe-protocol/latest/index.html

``nan_as_null`` currently has no effect; once support for nullable extension
dtypes is added, this value should be propagated to columns.

Polars currently relies on pyarrow's implementation of the dataframe interchange
protocol. Therefore, pyarrow>=11.0.0 is required for this method to work.
Examples
--------
Convert a Polars dataframe to a generic dataframe object and access some
properties.

Because Polars can not currently guarantee zero-copy conversion to Arrow for
categorical columns, ``allow_copy=False`` will not work if the dataframe
contains categorical data.
>>> df = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["x", "y"]})
>>> dfi = df.__dataframe__()
>>> dfi.num_rows()
2
>>> dfi.get_column(1).dtype
(<DtypeKind.FLOAT: 2>, 64, 'g', '=')

"""
if not _PYARROW_AVAILABLE or parse_version(pa.__version__) < parse_version(
"11"
):
raise ImportError(
"pyarrow>=11.0.0 is required for converting a Polars dataframe to a"
" dataframe interchange object."
)
if not allow_copy and Categorical in unpack_dtypes(*self.dtypes):
raise TypeError(
"Polars can not currently guarantee zero-copy conversion to Arrow for"
" categorical columns. Set `allow_copy=True` or cast categorical"
" columns to string first."
if nan_as_null:
raise NotImplementedError(
"functionality for `nan_as_null` has not been implemented and the"
" parameter will be removed in a future version."
" Use the default `nan_as_null=False`."
)
return self.to_arrow().__dataframe__(nan_as_null, allow_copy)

from polars.interchange.dataframe import PolarsDataFrame

return PolarsDataFrame(self, allow_copy=allow_copy)

def __dataframe_consortium_standard__(
self, *, api_version: str | None = None
Expand Down
2 changes: 1 addition & 1 deletion py-polars/polars/datatypes/classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def __repr__(cls) -> str:
def _string_repr(cls) -> str:
return _dtype_str_repr(cls)

def base_type(cls) -> PolarsDataType:
def base_type(cls) -> DataTypeClass:
"""Return the base type."""
return cls

Expand Down
6 changes: 6 additions & 0 deletions py-polars/polars/interchange/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
"""
Module containing the implementation of the Python dataframe interchange protocol.
Details on the protocol:
https://data-apis.org/dataframe-protocol/latest/index.html
"""
73 changes: 73 additions & 0 deletions py-polars/polars/interchange/buffer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
from __future__ import annotations

from typing import TYPE_CHECKING

from polars.interchange.protocol import DlpackDeviceType, DtypeKind
from polars.interchange.utils import polars_dtype_to_dtype

if TYPE_CHECKING:
from typing import NoReturn

from polars import Series


class PolarsBuffer:
    """
    A buffer object backed by a Polars Series consisting of a single chunk.

    Parameters
    ----------
    data
        The Polars Series backing the buffer object.
    allow_copy
        Allow data to be copied during operations on this column. If set to ``False``,
        a RuntimeError will be raised if data would be copied.

    """

    def __init__(self, data: Series, *, allow_copy: bool = True) -> None:
        # A multi-chunk Series is not contiguous in memory; rechunking copies data,
        # which is only permitted when allow_copy is True.
        if data.n_chunks() > 1:
            if not allow_copy:
                raise RuntimeError(
                    "non-contiguous buffer must be made contiguous, which is not zero-copy"
                )
            data = data.rechunk()

        self._data = data

    @property
    def bufsize(self) -> int:
        """Buffer size in bytes."""
        dtype = polars_dtype_to_dtype(self._data.dtype)

        # For strings, the data buffer holds the concatenated bytes of all values.
        if dtype[0] == DtypeKind.STRING:
            return self._data.str.lengths().sum()  # type: ignore[return-value]

        n_bits = self._data.len() * dtype[1]

        result, rest = divmod(n_bits, 8)
        # Round up to the nearest byte
        if rest:
            return result + 1
        else:
            return result

    @property
    def ptr(self) -> int:
        """Pointer to start of the buffer as an integer."""
        _offset, _length, pointer = self._data._s.get_ptr()
        return pointer

    def __dlpack__(self) -> NoReturn:
        """Represent this structure as DLPack interface."""
        # DLPack export is not supported; the protocol allows raising here.
        raise NotImplementedError("__dlpack__")

    def __dlpack_device__(self) -> tuple[DlpackDeviceType, None]:
        """Device type and device ID for where the data in the buffer resides."""
        return (DlpackDeviceType.CPU, None)

    def __repr__(self) -> str:
        bufsize = self.bufsize
        ptr = self.ptr
        device = self.__dlpack_device__()[0].name
        return f"PolarsBuffer(bufsize={bufsize}, ptr={ptr}, device={device!r})"
Loading