diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index 610c5f9f1cc33..2530113eda6e5 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -1822,12 +1822,28 @@ def _concat_same_type(cls, to_concat) -> Self:
         -------
         ArrowExtensionArray
         """
-        chunks = [array for ea in to_concat for array in ea._pa_array.iterchunks()]
+        from pandas import get_option
+
+        _ARROW_RECHUNK_THRESHOLD = 1_000_000  # 1 million elements
+
         if to_concat[0].dtype == "string":
             # StringDtype has no attribute pyarrow_dtype
             pa_dtype = pa.large_string()
         else:
             pa_dtype = to_concat[0].dtype.pyarrow_dtype
+
+        chunks = [array for ea in to_concat for array in ea._pa_array.iterchunks()]
+
+        # GH#42357: Combine chunks if average size is below threshold
+        num_chunks = len(chunks)
+        total_length = sum(len(chunk) for chunk in chunks)
+        if (
+            get_option("mode.arrow_rechunk_on_concat")
+            and num_chunks > 1
+            and total_length / num_chunks < _ARROW_RECHUNK_THRESHOLD
+        ):
+            chunks = [pa.concat_arrays(chunks).cast(pa_dtype)]
+
         arr = pa.chunked_array(chunks, type=pa_dtype)
         return to_concat[0]._from_pyarrow_array(arr)
 
diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py
index fcb7e1b9fff0a..f278a3f644b43 100644
--- a/pandas/core/config_init.py
+++ b/pandas/core/config_init.py
@@ -481,6 +481,23 @@ def is_valid_string_storage(value: Any) -> None:
     )
 
 
+arrow_rechunk_doc = """
+: bool
+    When True, automatically rechunk PyArrow-backed arrays after concatenation
+    if the average chunk size falls below 1 million elements. This improves
+    performance for subsequent operations at the cost of a memory copy during concat.
+"""
+
+
+with cf.config_prefix("mode"):
+    cf.register_option(
+        "arrow_rechunk_on_concat",
+        True,
+        arrow_rechunk_doc,
+        validator=is_bool,
+    )
+
+
 # Set up the io.excel specific reader configuration.
 reader_engine_doc = """
 : string
diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
index b579394d5cdf6..96bd53bc68e62 100644
--- a/pandas/tests/extension/test_arrow.py
+++ b/pandas/tests/extension/test_arrow.py
@@ -3927,3 +3927,38 @@ def test_timestamp_reduction_consistency(unit, method):
         f"{method} for {unit} returned {type(result)}"
     )
     assert result.unit == unit
+
+
+@pytest.mark.parametrize(
+    "rechunk, num_duplicate, num_chunks",
+    [
+        (False, 100, 100),
+        (True, 100, 1),
+    ],
+)
+def test_concat_rechunk_auto(rechunk, num_duplicate, num_chunks):
+    """GH#42357: concat of many small ArrowExtensionArrays rechunks to one."""
+    with pd.option_context("mode.arrow_rechunk_on_concat", rechunk):
+        s1 = pd.Series(list("abc"), dtype="string[pyarrow]")
+        result = pd.concat([s1] * num_duplicate, ignore_index=True)
+        assert result.array._pa_array.num_chunks == num_chunks
+
+
+@pytest.mark.parametrize(
+    "rechunk, arr_size, num_chunks",
+    [
+        (False, 2_000_000, 2),
+        (True, 2_000_000, 2),
+        (False, 900_000, 2),
+        (True, 900_000, 1),
+    ],
+)
+def test_concat_rechunk_threshold(rechunk, arr_size, num_chunks):
+    """GH#42357: rechunking respects the 1M average-chunk-size threshold."""
+    with pd.option_context("mode.arrow_rechunk_on_concat", rechunk):
+        # Each input Series contributes a single chunk of arr_size elements
+        s1 = pd.Series(range(arr_size), dtype="int64[pyarrow]")
+        s2 = pd.Series(range(arr_size), dtype="int64[pyarrow]")
+        result = pd.concat([s1, s2], ignore_index=True)
+        # Above-threshold chunks stay split; smaller ones combine if enabled
+        assert result.array._pa_array.num_chunks == num_chunks
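
For reviewers, a minimal usage sketch of the new option, assuming the patch above is applied as-is (the option name, its default, and the observable chunk counts are taken from the diff and its tests):

```python
import pandas as pd

# Many small concats: with the option enabled (the default in this patch),
# the result collapses to a single chunk because the average chunk size
# (3 elements) is far below the 1_000_000-element threshold.
s = pd.Series(list("abc"), dtype="string[pyarrow]")
out = pd.concat([s] * 500, ignore_index=True)
print(out.array._pa_array.num_chunks)  # 1

# Opting out restores the old behavior: one chunk per input array.
with pd.option_context("mode.arrow_rechunk_on_concat", False):
    out = pd.concat([s] * 500, ignore_index=True)
    print(out.array._pa_array.num_chunks)  # 500
```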
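
The core of the change, isolated at the pyarrow level (a standalone sketch mirroring the new branch in `_concat_same_type`; here `threshold` stands in for the patch's `_ARROW_RECHUNK_THRESHOLD`):

```python
import pyarrow as pa

# 100 tiny chunks, as produced by concatenating 100 single-chunk arrays.
chunks = [pa.array([1, 2, 3], type=pa.int64()) for _ in range(100)]

num_chunks = len(chunks)
total_length = sum(len(chunk) for chunk in chunks)
threshold = 1_000_000  # mirrors _ARROW_RECHUNK_THRESHOLD in the patch

# Combine into one contiguous array when the average chunk is small;
# pa.concat_arrays copies, which is the memory cost the option doc notes.
if num_chunks > 1 and total_length / num_chunks < threshold:
    chunks = [pa.concat_arrays(chunks).cast(pa.int64())]

arr = pa.chunked_array(chunks, type=pa.int64())
print(arr.num_chunks)  # 1: average chunk length (3) is below the threshold
```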