Skip to content

GH-46403: [C++] Add support for limiting element size when printing data #46536

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Jun 9, 2025
Merged
31 changes: 25 additions & 6 deletions cpp/src/arrow/pretty_print.cc
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,9 @@ class PrettyPrinter {
: options_(options), indent_(options.indent), sink_(sink) {}

inline void Write(std::string_view data);
inline void Write(std::string_view data, int max_chars);
inline void WriteIndented(std::string_view data);
inline void WriteIndented(std::string_view data, int max_chars);
inline void Newline();
inline void Indent();
inline void IndentAfterNewline();
Expand Down Expand Up @@ -104,11 +106,26 @@ void PrettyPrinter::CloseArray(const Array& array) {
(*sink_) << options_.array_delimiters.close;
}

void PrettyPrinter::Write(std::string_view data) { (*sink_) << data; }
void PrettyPrinter::Write(std::string_view data) {
Write(data, options_.element_size_limit);
}

// Write `data` to the sink, emitting at most `max_chars` chars of it.
//
// When `data` is longer than the limit, only its first `max_chars` chars are
// written, followed by a " (... N chars omitted)" marker where N is the number
// of chars that were dropped. Lengths are measured in `char` units of the
// string_view, so a multi-byte UTF-8 sequence may be split at the cut point.
//
// A negative `max_chars` (which callers can produce, e.g. by subtracting the
// two quote chars from a very small element size limit) is treated as 0.
void PrettyPrinter::Write(std::string_view data, int max_chars) {
  // Clamp before converting: a negative int cast to size_t wraps around to a
  // huge value, which would silently disable truncation altogether.
  const size_t limit = max_chars < 0 ? 0 : static_cast<size_t>(max_chars);
  (*sink_) << data.substr(0, limit);
  if (data.size() > limit) {
    (*sink_) << " (... " << data.size() - limit << " chars omitted)";
  }
}

void PrettyPrinter::WriteIndented(std::string_view data) {
Indent();
Write(data);
Write(data, options_.element_size_limit);
}

void PrettyPrinter::WriteIndented(std::string_view data, int max_chars) {
Indent();
Write(data, max_chars);
}

void PrettyPrinter::Newline() {
Expand Down Expand Up @@ -176,7 +193,7 @@ class ArrayPrinter : public PrettyPrinter {

template <typename ArrayType, typename Formatter>
Status WritePrimitiveValues(const ArrayType& array, Formatter* formatter) {
auto appender = [&](std::string_view v) { (*sink_) << v; };
auto appender = [&](std::string_view v) { Write(v); };
auto format_func = [&](int64_t i) {
(*formatter)(array.GetView(i), appender);
return Status::OK();
Expand Down Expand Up @@ -232,9 +249,11 @@ class ArrayPrinter : public PrettyPrinter {
enable_if_has_string_view<T, Status> WriteDataValues(const ArrayType& array) {
return WriteValues(array, [&](int64_t i) {
if constexpr (T::is_utf8) {
(*sink_) << "\"" << array.GetView(i) << "\"";
(*sink_) << "\"";
this->Write(array.GetView(i), options_.element_size_limit - 2);
(*sink_) << "\"";
} else {
(*sink_) << HexEncode(array.GetView(i));
this->Write(HexEncode(array.GetView(i)));
}
return Status::OK();
});
Expand All @@ -243,7 +262,7 @@ class ArrayPrinter : public PrettyPrinter {
template <typename ArrayType, typename T = typename ArrayType::TypeClass>
enable_if_decimal<T, Status> WriteDataValues(const ArrayType& array) {
return WriteValues(array, [&](int64_t i) {
(*sink_) << array.FormatValue(i);
this->Write(array.FormatValue(i));
return Status::OK();
});
}
Expand Down
8 changes: 6 additions & 2 deletions cpp/src/arrow/pretty_print.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,14 +58,15 @@ struct ARROW_EXPORT PrettyPrintOptions {
PrettyPrintOptions(int indent, // NOLINT runtime/explicit
int window = 10, int indent_size = 2, std::string null_rep = "null",
bool skip_new_lines = false, bool truncate_metadata = true,
int container_window = 2)
int container_window = 2, int element_size_limit = 100)
: indent(indent),
indent_size(indent_size),
window(window),
container_window(container_window),
null_rep(std::move(null_rep)),
skip_new_lines(skip_new_lines),
truncate_metadata(truncate_metadata) {}
truncate_metadata(truncate_metadata),
element_size_limit(element_size_limit) {}

/// Create a PrettyPrintOptions instance with default values
static PrettyPrintOptions Defaults() { return PrettyPrintOptions(); }
Expand Down Expand Up @@ -99,6 +100,9 @@ struct ARROW_EXPORT PrettyPrintOptions {
/// If true, display schema metadata when pretty-printing a Schema
bool show_schema_metadata = true;

/// Maximum number of characters printed for a single element, defaults to 100.
/// Longer elements are cut and followed by a " (... N chars omitted)" marker.
/// For UTF-8 string values the two surrounding quotes count towards the limit.
int element_size_limit = 100;

/// Delimiters to use when printing an Array
PrettyPrintDelimiters array_delimiters = PrettyPrintDelimiters::Defaults();

Expand Down
60 changes: 46 additions & 14 deletions cpp/src/arrow/pretty_print_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include <memory>
#include <sstream>
#include <string>
#include <string_view>
#include <vector>

#include "arrow/array.h"
Expand All @@ -47,37 +48,37 @@ class TestPrettyPrint : public ::testing::Test {
};

template <typename T>
void CheckStream(const T& obj, const PrettyPrintOptions& options, const char* expected) {
void CheckStream(const T& obj, const PrettyPrintOptions& options,
std::string_view expected) {
std::ostringstream sink;
ASSERT_OK(PrettyPrint(obj, options, &sink));
std::string result = sink.str();
ASSERT_EQ(std::string(expected, strlen(expected)), result);
ASSERT_EQ(expected, result);
}

void CheckArray(const Array& arr, const PrettyPrintOptions& options, const char* expected,
bool check_operator = true) {
void CheckArray(const Array& arr, const PrettyPrintOptions& options,
std::string_view expected, bool check_operator = true) {
ARROW_SCOPED_TRACE("For datatype: ", arr.type()->ToString());
CheckStream(arr, options, expected);

if (options.indent == 0 && check_operator) {
if (options.indent == 0 && options.element_size_limit == 100 && check_operator) {
std::stringstream ss;
ss << arr;
std::string result = std::string(expected, strlen(expected));
ASSERT_EQ(result, ss.str());
ASSERT_EQ(expected, ss.str());
}
}

template <typename T>
void Check(const T& obj, const PrettyPrintOptions& options, const char* expected) {
void Check(const T& obj, const PrettyPrintOptions& options, std::string_view expected) {
std::string result;
ASSERT_OK(PrettyPrint(obj, options, &result));
ASSERT_EQ(std::string(expected, strlen(expected)), result);
ASSERT_EQ(expected, result);
}

template <typename TYPE, typename C_TYPE>
void CheckPrimitive(const std::shared_ptr<DataType>& type,
const PrettyPrintOptions& options, const std::vector<bool>& is_valid,
const std::vector<C_TYPE>& values, const char* expected,
const std::vector<C_TYPE>& values, std::string_view expected,
bool check_operator = true) {
std::shared_ptr<Array> array;
ArrayFromVector<TYPE, C_TYPE>(type, is_valid, values, &array);
Expand All @@ -86,7 +87,7 @@ void CheckPrimitive(const std::shared_ptr<DataType>& type,

template <typename TYPE, typename C_TYPE>
void CheckPrimitive(const PrettyPrintOptions& options, const std::vector<bool>& is_valid,
const std::vector<C_TYPE>& values, const char* expected,
const std::vector<C_TYPE>& values, std::string_view expected,
bool check_operator = true) {
CheckPrimitive<TYPE, C_TYPE>(TypeTraits<TYPE>::type_singleton(), options, is_valid,
values, expected, check_operator);
Expand Down Expand Up @@ -158,12 +159,12 @@ TEST_F(TestPrettyPrint, PrimitiveType) {
])expected";
CheckPrimitive<DoubleType, double>({2, 10}, is_valid, values2, ex2_in2);

std::vector<std::string> values3 = {"foo", "bar", "", "baz", ""};
std::vector<std::string> values3 = {"foo", "bar", "", "a longer string", ""};
static const char* ex3 = R"expected([
"foo",
"bar",
null,
"baz",
"a longer string",
null
])expected";
CheckPrimitive<StringType, std::string>({0, 10}, is_valid, values3, ex3);
Expand All @@ -172,11 +173,23 @@ TEST_F(TestPrettyPrint, PrimitiveType) {
"foo",
"bar",
null,
"baz",
"a longer string",
null
])expected";
CheckPrimitive<StringType, std::string>({2, 10}, is_valid, values3, ex3_in2);
CheckPrimitive<LargeStringType, std::string>({2, 10}, is_valid, values3, ex3_in2);

PrettyPrintOptions options{2, 10};
options.element_size_limit = 8;
static const char* ex3_in3 = R"expected( [
"foo",
"bar",
null,
"a long (... 9 chars omitted)",
null
])expected";
CheckPrimitive<StringType, std::string>(options, is_valid, values3, ex3_in3);
CheckPrimitive<LargeStringType, std::string>(options, is_valid, values3, ex3_in3);
}

TEST_F(TestPrettyPrint, PrimitiveTypeNoNewlines) {
Expand Down Expand Up @@ -772,6 +785,12 @@ TEST_F(TestPrettyPrint, BinaryNoNewlines) {
options.window = 2;
expected = "[666F6F,626172,...,,FF]";
CheckPrimitive<BinaryType, std::string>(options, is_valid, values, expected, false);

// With truncated element size
options.element_size_limit = 1;
expected =
"[6 (... 5 chars omitted),6 (... 5 chars omitted),...,,F (... 1 chars omitted)]";
CheckPrimitive<BinaryType, std::string>(options, is_valid, values, expected, false);
}

template <typename TypeClass>
Expand Down Expand Up @@ -1103,6 +1122,12 @@ TEST_F(TestPrettyPrint, FixedSizeBinaryType) {
CheckArray(*array, {0, 10}, ex);
static const char* ex_2 = " [\n 666F6F,\n ...\n 62617A\n ]";
CheckArray(*array, {2, 1}, ex_2);

auto options = PrettyPrintOptions{2, 1};
options.element_size_limit = 3;
static const char* ex_3 =
" [\n 666 (... 3 chars omitted),\n ...\n 626 (... 3 chars omitted)\n ]";
CheckArray(*array, options, ex_3);
}

TEST_F(TestPrettyPrint, DecimalTypes) {
Expand All @@ -1115,6 +1140,12 @@ TEST_F(TestPrettyPrint, DecimalTypes) {

static const char* ex = "[\n 123.4567,\n 456.7891,\n null\n]";
CheckArray(*array, {0}, ex);

auto options = PrettyPrintOptions();
options.element_size_limit = 3;
static const char* ex_2 =
"[\n 123 (... 5 chars omitted),\n 456 (... 5 chars omitted),\n null\n]";
CheckArray(*array, options, ex_2);
}
}

Expand Down Expand Up @@ -1417,6 +1448,7 @@ lorem: 'Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nulla accumsan
sapien commodo massa, vel volutpat orci nisi eu justo. Nulla non blandit
sapien. Quisque pretium vestibulum urna eu vehicula.')";
options.truncate_metadata = false;
options.element_size_limit = 10000;
Check(*my_schema, options, expected_verbose);

// Metadata that exactly fits
Expand Down
6 changes: 5 additions & 1 deletion python/pyarrow/array.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -1357,7 +1357,8 @@ cdef class Array(_PandasConvertible):
return f'{type_format}\n{self}'

def to_string(self, *, int indent=2, int top_level_indent=0, int window=10,
int container_window=2, c_bool skip_new_lines=False):
int container_window=2, c_bool skip_new_lines=False,
int element_size_limit=100):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you perhaps add a unit test for passing a specific element_size_limit?

"""
Render a "pretty-printed" string representation of the Array.

Expand All @@ -1383,6 +1384,8 @@ cdef class Array(_PandasConvertible):
skip_new_lines : bool
If the array should be rendered as a single line of text
or if each element should be on its own line.
element_size_limit : int, default 100
Maximum number of characters of a single element before it is truncated.
"""
cdef:
c_string result
Expand All @@ -1392,6 +1395,7 @@ cdef class Array(_PandasConvertible):
options = PrettyPrintOptions(top_level_indent, window)
options.skip_new_lines = skip_new_lines
options.indent_size = indent
options.element_size_limit = element_size_limit
check_status(
PrettyPrint(
deref(self.ap),
Expand Down
1 change: 1 addition & 0 deletions python/pyarrow/includes/libarrow.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -652,6 +652,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
c_bool truncate_metadata
c_bool show_field_metadata
c_bool show_schema_metadata
int element_size_limit

@staticmethod
PrettyPrintOptions Defaults()
Expand Down
6 changes: 5 additions & 1 deletion python/pyarrow/table.pxi
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
Expand Down Expand Up @@ -116,7 +117,7 @@ cdef class ChunkedArray(_PandasConvertible):
return f"{type_format}\n{self}"

def to_string(self, *, int indent=0, int window=5, int container_window=2,
c_bool skip_new_lines=False):
c_bool skip_new_lines=False, int element_size_limit=100):
"""
Render a "pretty-printed" string representation of the ChunkedArray

Expand All @@ -137,6 +138,8 @@ cdef class ChunkedArray(_PandasConvertible):
skip_new_lines : bool
If the array should be rendered as a single line of text
or if each element should be on its own line.
element_size_limit : int, default 100
Maximum number of characters of a single element before it is truncated.

Examples
--------
Expand All @@ -153,6 +156,7 @@ cdef class ChunkedArray(_PandasConvertible):
options = PrettyPrintOptions(indent, window)
options.skip_new_lines = skip_new_lines
options.container_window = container_window
options.element_size_limit = element_size_limit
check_status(
PrettyPrint(
deref(self.chunked_array),
Expand Down
23 changes: 19 additions & 4 deletions python/pyarrow/tests/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,17 +75,32 @@ def test_constructor_raises():


def test_list_format():
arr = pa.array([[1], None, [2, 3, None]])
arr = pa.array([["foo"], None, ["bar", "a longer string", None]])
result = arr.to_string()
expected = """\
[
[
1
"foo"
],
null,
[
2,
3,
"bar",
"a longer string",
null
]
]"""
assert result == expected

result = arr.to_string(element_size_limit=10)
expected = """\
[
[
"foo"
],
null,
[
"bar",
"a longer (... 7 chars omitted)",
null
]
]"""
Expand Down
7 changes: 4 additions & 3 deletions python/pyarrow/tests/test_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -321,14 +321,15 @@ def test_schema_to_string_with_metadata():
-- field metadata --
key3: 'value3'
-- schema metadata --
lorem: '{lorem}'"""
lorem: '{lorem[:92]} (... {len(lorem) - 91} chars omitted)"""

assert my_schema.to_string(truncate_metadata=False,
show_field_metadata=False) == f"""\
show_field_metadata=False,
element_size_limit=50) == f"""\
foo: int32 not null
bar: string
-- schema metadata --
lorem: '{lorem}'"""
lorem: '{lorem[:50 - 8]} (... {len(lorem) - (50 - 9)} chars omitted)"""

assert my_schema.to_string(truncate_metadata=False,
show_schema_metadata=False) == """\
Expand Down
5 changes: 4 additions & 1 deletion python/pyarrow/types.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -3566,7 +3566,7 @@ cdef class Schema(_Weakrefable):
return pyarrow_wrap_schema(new_schema)

def to_string(self, truncate_metadata=True, show_field_metadata=True,
show_schema_metadata=True):
show_schema_metadata=True, element_size_limit=100):
"""
Return human-readable representation of Schema

Expand All @@ -3579,6 +3579,8 @@ cdef class Schema(_Weakrefable):
Display Field-level KeyValueMetadata
show_schema_metadata : boolean, default True
Display Schema-level KeyValueMetadata
element_size_limit : int, default 100
Maximum number of characters of a single element before it is truncated.

Returns
-------
Expand All @@ -3592,6 +3594,7 @@ cdef class Schema(_Weakrefable):
options.truncate_metadata = truncate_metadata
options.show_field_metadata = show_field_metadata
options.show_schema_metadata = show_schema_metadata
options.element_size_limit = element_size_limit

with nogil:
check_status(
Expand Down
Loading