pandas/core/arrays/arrow/array.py (17 additions, 1 deletion)
@@ -1822,12 +1822,28 @@ def _concat_same_type(cls, to_concat) -> Self:
-------
ArrowExtensionArray
"""
chunks = [array for ea in to_concat for array in ea._pa_array.iterchunks()]
from pandas import get_option

_ARROW_RECHUNK_THRESHOLD = 1_000_000 # 1 million elements

if to_concat[0].dtype == "string":
# StringDtype has no attribute pyarrow_dtype
pa_dtype = pa.large_string()
else:
pa_dtype = to_concat[0].dtype.pyarrow_dtype

chunks = [array for ea in to_concat for array in ea._pa_array.iterchunks()]

# GH#42357: Combine chunks if average size is below threshold
num_chunks = len(chunks)
total_length = sum(len(chunk) for chunk in chunks)
if (
get_option("mode.arrow_rechunk_on_concat")
and num_chunks > 1
and total_length / num_chunks < _ARROW_RECHUNK_THRESHOLD
):
chunks = [pa.concat_arrays(chunks).cast(pa_dtype)]

arr = pa.chunked_array(chunks, type=pa_dtype)
return to_concat[0]._from_pyarrow_array(arr)

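For context, here is a minimal standalone sketch of the rechunk decision introduced above, written against plain PyArrow. The `maybe_rechunk` helper and its signature are illustrative only and not part of this patch; the threshold constant mirrors the one added in the diff.

```python
import pyarrow as pa

_ARROW_RECHUNK_THRESHOLD = 1_000_000  # mirrors the constant added in the diff


def maybe_rechunk(chunked: pa.ChunkedArray, enabled: bool = True) -> pa.ChunkedArray:
    """Illustrative helper: combine chunks when the average chunk is small."""
    num_chunks = chunked.num_chunks
    if not enabled or num_chunks <= 1:
        return chunked
    if len(chunked) / num_chunks >= _ARROW_RECHUNK_THRESHOLD:
        # Average chunk is already large; skip the extra copy.
        return chunked
    # One concatenation copy now saves per-chunk overhead in later kernels.
    combined = pa.concat_arrays(list(chunked.iterchunks()))
    return pa.chunked_array([combined], type=chunked.type)
```

The trade-off is the same one described in the option docstring below: a single memory copy at concat time in exchange for fewer chunks for subsequent operations to iterate over.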
pandas/core/config_init.py (17 additions, 0 deletions)
@@ -481,6 +481,23 @@ def is_valid_string_storage(value: Any) -> None:
)


arrow_rechunk_doc = """
: bool
When True, automatically rechunk PyArrow-backed arrays after concatenation
if the average chunk size falls below a threshold. This improves performance
for subsequent operations at the cost of a memory copy during concat.
"""


with cf.config_prefix("mode"):
cf.register_option(
"arrow_rechunk_on_concat",
True,
arrow_rechunk_doc,
validator=is_bool,
)


# Set up the io.excel specific reader configuration.
reader_engine_doc = """
: string
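Assuming the option is registered as above, toggling it from user code would look roughly like the following usage sketch (not part of the diff; `_pa_array` is the private attribute the tests below also rely on, and the expected chunk counts match those tests):

```python
import pandas as pd

# Temporarily opt out of automatic rechunking for one concat.
with pd.option_context("mode.arrow_rechunk_on_concat", False):
    s = pd.Series(list("abc"), dtype="string[pyarrow]")
    many_chunks = pd.concat([s] * 100, ignore_index=True)
    print(many_chunks.array._pa_array.num_chunks)  # expected: 100

# With the option left at its default of True, the same concat
# collapses the small chunks into a single chunk.
rechunked = pd.concat([s] * 100, ignore_index=True)
print(rechunked.array._pa_array.num_chunks)  # expected: 1
```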
pandas/tests/extension/test_arrow.py (35 additions, 0 deletions)
@@ -3927,3 +3927,38 @@ def test_timestamp_reduction_consistency(unit, method):
f"{method} for {unit} returned {type(result)}"
)
assert result.unit == unit


@pytest.mark.parametrize(
"rechunk, num_duplicate, num_chunks",
[
(False, 100, 100),
(True, 100, 1),
],
)
def test_concat_rechunk_auto(rechunk, num_duplicate, num_chunks):
"""GH#42357: Test concat with many small ArrowExtensionArrays."""
with pd.option_context("mode.arrow_rechunk_on_concat", rechunk):
s1 = pd.Series(list("abc"), dtype="string[pyarrow]")
result = pd.concat([s1] * num_duplicate, ignore_index=True)
assert result.array._pa_array.num_chunks == num_chunks


@pytest.mark.parametrize(
"rechunk, arr_size, num_chunks",
[
(False, 2_000_000, 2),
(True, 2_000_000, 2),
(False, 900_000, 2),
(True, 900_000, 1),
],
)
def test_concat_rechunk_threshold(rechunk, arr_size, num_chunks):
"""GH#42357: Large chunks should not be rechunked unnecessarily."""
with pd.option_context("mode.arrow_rechunk_on_concat", rechunk):
# Each input Series is a single chunk of arr_size elements (threshold is 1M)
s1 = pd.Series(range(arr_size), dtype="int64[pyarrow]")
s2 = pd.Series(range(arr_size), dtype="int64[pyarrow]")
result = pd.concat([s1, s2], ignore_index=True)
# Chunks at or above the threshold stay separate; smaller ones are combined only when rechunking is enabled
assert result.array._pa_array.num_chunks == num_chunks