pandas/core/arrays/arrow/array.py (17 additions, 1 deletion)
@@ -1822,12 +1822,28 @@ def _concat_same_type(cls, to_concat) -> Self:
-------
ArrowExtensionArray
"""
chunks = [array for ea in to_concat for array in ea._pa_array.iterchunks()]
from pandas import get_option

_ARROW_RECHUNK_THRESHOLD = 1_000_000 # 1 million elements

if to_concat[0].dtype == "string":
# StringDtype has no attribute pyarrow_dtype
pa_dtype = pa.large_string()
else:
pa_dtype = to_concat[0].dtype.pyarrow_dtype

chunks = [array for ea in to_concat for array in ea._pa_array.iterchunks()]

# GH#42357: Combine chunks if average size is below threshold
num_chunks = len(chunks)
total_length = sum(len(chunk) for chunk in chunks)
if (
get_option("mode.arrow_rechunk_on_concat")
and num_chunks > 1
and total_length / num_chunks < _ARROW_RECHUNK_THRESHOLD
):
chunks = [pa.concat_arrays(chunks).cast(pa_dtype)]

arr = pa.chunked_array(chunks, type=pa_dtype)
return to_concat[0]._from_pyarrow_array(arr)

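For context, here is a minimal standalone sketch of the rechunk decision introduced above, written against plain PyArrow. The `maybe_rechunk` helper and its signature are illustrative only and not part of this patch; the threshold constant mirrors the one added in the diff.

```python
import pyarrow as pa

_ARROW_RECHUNK_THRESHOLD = 1_000_000  # mirrors the constant added in the diff


def maybe_rechunk(chunked: pa.ChunkedArray, enabled: bool = True) -> pa.ChunkedArray:
    """Illustrative helper: combine chunks when the average chunk is small."""
    num_chunks = chunked.num_chunks
    if not enabled or num_chunks <= 1:
        return chunked
    if len(chunked) / num_chunks >= _ARROW_RECHUNK_THRESHOLD:
        # Average chunk is already large; skip the extra copy.
        return chunked
    # One concatenation copy now saves per-chunk overhead in later kernels.
    combined = pa.concat_arrays(list(chunked.iterchunks()))
    return pa.chunked_array([combined], type=chunked.type)
```

The trade-off is the same one described in the option docstring below: a single memory copy at concat time in exchange for fewer chunks for subsequent operations to iterate over.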
pandas/core/config_init.py (17 additions, 0 deletions)
@@ -481,6 +481,23 @@ def is_valid_string_storage(value: Any) -> None:
)


arrow_rechunk_doc = """
: bool
When True, automatically rechunk PyArrow-backed arrays after concatenation
if the average chunk size falls below a threshold. This improves performance
for subsequent operations at the cost of a memory copy during concat.
"""


with cf.config_prefix("mode"):
cf.register_option(
"arrow_rechunk_on_concat",
True,
arrow_rechunk_doc,
validator=is_bool,
)


# Set up the io.excel specific reader configuration.
reader_engine_doc = """
: string
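Assuming the option is registered as above, toggling it from user code would look roughly like the following usage sketch (not part of the diff; `_pa_array` is the private attribute the tests below also rely on, and the expected chunk counts match those tests):

```python
import pandas as pd

# Temporarily opt out of automatic rechunking for one concat.
with pd.option_context("mode.arrow_rechunk_on_concat", False):
    s = pd.Series(list("abc"), dtype="string[pyarrow]")
    many_chunks = pd.concat([s] * 100, ignore_index=True)
    print(many_chunks.array._pa_array.num_chunks)  # expected: 100

# With the option left at its default of True, the same concat
# collapses the small chunks into a single chunk.
rechunked = pd.concat([s] * 100, ignore_index=True)
print(rechunked.array._pa_array.num_chunks)  # expected: 1
```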
pandas/tests/extension/test_arrow.py (35 additions, 0 deletions)
@@ -3927,3 +3927,38 @@ def test_timestamp_reduction_consistency(unit, method):
f"{method} for {unit} returned {type(result)}"
)
assert result.unit == unit


@pytest.mark.parametrize(
"rechunk, num_duplicate, num_chunks",
[
(False, 100, 100),
(True, 100, 1),
],
)
def test_concat_rechunk_auto(rechunk, num_duplicate, num_chunks):
"""GH#42357: Test concat with many small ArrowExtensionArrays."""
with pd.option_context("mode.arrow_rechunk_on_concat", rechunk):
s1 = pd.Series(list("abc"), dtype="string[pyarrow]")
result = pd.concat([s1] * num_duplicate, ignore_index=True)
assert result.array._pa_array.num_chunks == num_chunks


@pytest.mark.parametrize(
"rechunk, arr_size, num_chunks",
[
(False, 2_000_000, 2),
(True, 2_000_000, 2),
(False, 900_000, 2),
(True, 900_000, 1),
],
)
def test_concat_rechunk_threshold(rechunk, arr_size, num_chunks):
"""GH#42357: Large chunks should not be rechunked unnecessarily."""
with pd.option_context("mode.arrow_rechunk_on_concat", rechunk):
# Each input Series is a single chunk of arr_size elements (threshold is 1M)
s1 = pd.Series(range(arr_size), dtype="int64[pyarrow]")
s2 = pd.Series(range(arr_size), dtype="int64[pyarrow]")
result = pd.concat([s1, s2], ignore_index=True)
# Chunks at or above the threshold stay separate; smaller ones are combined only when rechunking is enabled
assert result.array._pa_array.num_chunks == num_chunks