diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py
index ad39d0ebf4326..721f6fecd118f 100644
--- a/pandas/io/parsers/arrow_parser_wrapper.py
+++ b/pandas/io/parsers/arrow_parser_wrapper.py
@@ -318,6 +318,7 @@ def read(self) -> DataFrame:
             table,
             dtype_backend=dtype_backend,
             null_to_int64=True,
+            to_pandas_kwargs=self.kwds.get("to_pandas_kwargs"),
             dtype=self.dtype,
             names=self.names,
         )
diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
index 64fd2836e87d4..bcfb6c4e199bc 100644
--- a/pandas/io/parsers/base_parser.py
+++ b/pandas/io/parsers/base_parser.py
@@ -830,6 +830,7 @@ def date_converter(
     "encoding_errors": "strict",
     "on_bad_lines": ParserBase.BadLineHandleMethod.ERROR,
     "dtype_backend": lib.no_default,
+    "to_pandas_kwargs": None,
 }
 
 
diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py
index e517647d071d6..b4333ce531ea5 100644
--- a/pandas/io/parsers/c_parser_wrapper.py
+++ b/pandas/io/parsers/c_parser_wrapper.py
@@ -83,6 +83,7 @@ def __init__(self, src: ReadCsvBuffer[str], **kwds) -> None:
             "encoding",
             "memory_map",
             "compression",
+            "to_pandas_kwargs",
         ):
             kwds.pop(key, None)
 
diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
index 3c2c9bffc26fe..22c6f219df3ef 100644
--- a/pandas/io/parsers/readers.py
+++ b/pandas/io/parsers/readers.py
@@ -403,6 +403,7 @@ def read_csv(
     float_precision: Literal["high", "legacy", "round_trip"] | None = None,
     storage_options: StorageOptions | None = None,
     dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
+    to_pandas_kwargs: dict | None = None,
 ) -> DataFrame | TextFileReader:
     """
     Read a comma-separated values (csv) file into DataFrame.
@@ -762,6 +763,12 @@
 
         .. versionadded:: 2.0
 
+    to_pandas_kwargs : dict | None, default None
+        Keyword arguments to pass through to :func:`pyarrow.Table.to_pandas`
+        when ``engine="pyarrow"``.
+
+        .. versionadded:: 3.1.0
+
     Returns
     -------
     DataFrame or TextFileReader
@@ -970,6 +977,7 @@ def read_table(
     float_precision: Literal["high", "legacy", "round_trip"] | None = None,
     storage_options: StorageOptions | None = None,
     dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
+    to_pandas_kwargs: dict | None = None,
 ) -> DataFrame | TextFileReader:
     """
     Read general delimited file into DataFrame.
@@ -1325,6 +1333,12 @@
 
         .. versionadded:: 2.0
 
+    to_pandas_kwargs : dict | None, default None
+        Keyword arguments to pass through to :func:`pyarrow.Table.to_pandas`
+        when ``engine="pyarrow"``.
+
+        .. versionadded:: 3.1.0
+
     Returns
     -------
     DataFrame or TextFileReader
@@ -1668,6 +1682,16 @@ def _get_options_with_defaults(self, engine: CSVEngine) -> dict[str, Any]:
                 raise ValueError(
                     f"The {argname!r} option is not supported with the 'pyarrow' engine"
                 )
+            # GH#34823: to_pandas_kwargs is only valid for pyarrow engine
+            if (
+                argname == "to_pandas_kwargs"
+                and value is not None
+                and engine != "pyarrow"
+            ):
+                raise ValueError(
+                    "The 'to_pandas_kwargs' option is only supported with the "
+                    "'pyarrow' engine"
+                )
             options[argname] = value
 
         for argname, default in _c_parser_defaults.items():
diff --git a/pandas/tests/io/parser/test_pyarrow_parser_only.py b/pandas/tests/io/parser/test_pyarrow_parser_only.py
new file mode 100644
index 0000000000000..88940e435d6d4
--- /dev/null
+++ b/pandas/tests/io/parser/test_pyarrow_parser_only.py
@@ -0,0 +1,55 @@
+"""
+Tests that apply specifically to the PyArrow parser.
+""" + +from io import StringIO + +import pytest + +pa = pytest.importorskip("pyarrow") + + +@pytest.mark.parametrize("split_blocks", [True, False]) +def test_to_pandas_kwargs_split_blocks(pyarrow_parser_only, split_blocks): + # GH#34823 + # split_blocks=True prevents consolidation of same-dtype columns + data = "a,b\n1,2\n3,4" + + result = pyarrow_parser_only.read_csv( + StringIO(data), + to_pandas_kwargs={"split_blocks": split_blocks}, + ) + assert list(result.columns) == ["a", "b"] + assert len(result) == 2 + # With split_blocks=True, each column should be in its own block + assert len(result._mgr.blocks) == len(result.columns) if split_blocks else 1 + + +def test_to_pandas_kwargs_zero_copy_only_raises(pyarrow_parser_only): + # zero_copy_only=True raises if zero-copy conversion not possible + data = "a,b\n1,2\n3,4" + + # zero_copy_only=True raises without split_blocks for multi-column data + with pytest.raises(pa.lib.ArrowInvalid, match="zero copy"): + pyarrow_parser_only.read_csv( + StringIO(data), + to_pandas_kwargs={"zero_copy_only": True}, + ) + + +@pytest.mark.parametrize("zero_copy_only", [True, False]) +def test_to_pandas_kwargs_zero_copy_only_success(pyarrow_parser_only, zero_copy_only): + # GH#34823 + # zero_copy_only with split_blocks=True enables zero-copy conversion + # No exception means pyarrow confirmed zero-copy conversion is possible + data = "a,b\n1,2\n3,4" + + result = pyarrow_parser_only.read_csv( + StringIO(data), + to_pandas_kwargs={"zero_copy_only": zero_copy_only, "split_blocks": True}, + ) + assert list(result.columns) == ["a", "b"] + assert len(result) == 2 + # Zero-copy arrays share memory with pyarrow and are not writeable + assert not result["a"].values.flags.writeable + assert not result["b"].values.flags.writeable diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py index 8812a65ee4e7d..d53733fd4d651 100644 --- a/pandas/tests/io/parser/test_unsupported.py +++ b/pandas/tests/io/parser/test_unsupported.py @@ -13,6 +13,7 @@ import pytest +from pandas.compat import HAS_PYARROW from pandas.errors import ParserError import pandas._testing as tm @@ -205,3 +206,26 @@ def test_invalid_dtype_backend(all_parsers): ) with pytest.raises(ValueError, match=msg): parser.read_csv("test", dtype_backend="numpy") + + +@pytest.mark.parametrize("engine", ["c", "python"]) +def test_to_pandas_kwargs_non_pyarrow_engine(engine): + data = "a,b\n1,2\n3,4" + msg = "The 'to_pandas_kwargs' option is only supported with the 'pyarrow' engine" + + with pytest.raises(ValueError, match=msg): + read_csv( + StringIO(data), engine=engine, to_pandas_kwargs={"self_destruct": False} + ) + + +@pytest.mark.skipif(not HAS_PYARROW, reason="pyarrow is not installed") +def test_to_pandas_kwargs_invalid(): + data = "a,b\n1,2\n3,4" + + with pytest.raises(TypeError, match="unexpected keyword argument"): + read_csv( + StringIO(data), + engine="pyarrow", + to_pandas_kwargs={"invalid_kwarg": True}, + )