Skip to content

Commit e0b5609

Browse files
committed
apache GH-39122: [C++][Parquet] Optimize FLBA record reader
1 parent 1b634e7 commit e0b5609

File tree

1 file changed

+51
-20
lines changed

1 file changed

+51
-20
lines changed

cpp/src/parquet/column_reader.cc

+51-20
Original file line numberDiff line numberDiff line change
@@ -23,13 +23,15 @@
2323
#include <exception>
2424
#include <iostream>
2525
#include <memory>
26+
#include <optional>
2627
#include <string>
2728
#include <type_traits>
2829
#include <unordered_map>
2930
#include <utility>
3031
#include <vector>
3132

3233
#include "arrow/array.h"
34+
#include "arrow/array/array_binary.h"
3335
#include "arrow/array/builder_binary.h"
3436
#include "arrow/array/builder_dict.h"
3537
#include "arrow/array/builder_primitive.h"
@@ -2040,23 +2042,29 @@ class TypedRecordReader : public TypedColumnReaderImpl<DType>,
20402042
LevelInfo leaf_info_;
20412043
};
20422044

2043-
class FLBARecordReader : public TypedRecordReader<FLBAType>,
2044-
virtual public BinaryRecordReader {
2045+
class FLBARecordReader final : public TypedRecordReader<FLBAType>,
2046+
virtual public BinaryRecordReader {
20452047
public:
20462048
// Constructor for the fixed-length byte array (FLBA) record reader.
// Forwards descr, leaf_info, pool and read_dense_for_nullable to the
// TypedRecordReader<FLBAType> base.
// Reading the diff: gutter lines ending in '-' precede lines removed by this
// commit, gutter lines ending in '+' precede lines it adds.
FLBARecordReader(const ColumnDescriptor* descr, LevelInfo leaf_info,
20472049
::arrow::MemoryPool* pool, bool read_dense_for_nullable)
20482050
: TypedRecordReader<FLBAType>(descr, leaf_info, pool, read_dense_for_nullable),
// Removed: the reader used to own a FixedSizeBinaryBuilder, created in the body.
2049-
builder_(nullptr) {
// Added: the value width, a zero-filled placeholder of that width, the Arrow
// fixed_size_binary type, and two raw buffer builders (validity bitmap and
// value bytes) are now set up directly in the initializer list.
2051+
byte_width_(descr_->type_length()),
2052+
empty_(byte_width_, 0),
2053+
type_(::arrow::fixed_size_binary(byte_width_)),
2054+
null_bitmap_builder_(pool),
2055+
data_builder_(pool) {
// Checks that the column's physical type is FIXED_LEN_BYTE_ARRAY
// (NOTE(review): DCHECK is presumably active in debug builds only - confirm).
20502056
ARROW_DCHECK_EQ(descr_->physical_type(), Type::FIXED_LEN_BYTE_ARRAY);
// Removed: body-level construction of the FixedSizeBinaryBuilder from the
// column's type length.
2051-
int byte_width = descr_->type_length();
2052-
std::shared_ptr<::arrow::DataType> type = ::arrow::fixed_size_binary(byte_width);
2053-
builder_ = std::make_unique<::arrow::FixedSizeBinaryBuilder>(type, this->pool_);
20542057
}
20552058

20562059
// Returns the values accumulated so far as a one-element ArrayVector.
// Reading the diff: gutter lines ending in '-' precede lines removed by this
// commit, gutter lines ending in '+' precede lines it adds.
::arrow::ArrayVector GetBuilderChunks() override {
// Removed: the chunk used to come from FixedSizeBinaryBuilder::Finish().
2057-
std::shared_ptr<::arrow::Array> chunk;
2058-
PARQUET_THROW_NOT_OK(builder_->Finish(&chunk));
2059-
return ::arrow::ArrayVector({chunk});
// Added: the array is assembled by hand from the two buffer builders.
// The null count (number of false validity bits) and the length are read
// before either Finish() call
// (NOTE(review): presumably because Finish() resets the builder - confirm).
2060+
const int64_t null_count = null_bitmap_builder_.false_count();
2061+
const int64_t length = null_bitmap_builder_.length();
// The data buffer is expected to hold exactly byte_width_ bytes per slot.
2062+
ARROW_DCHECK_EQ(length * byte_width_, data_builder_.length());
2063+
PARQUET_ASSIGN_OR_THROW(auto data_buffer, data_builder_.Finish());
2064+
PARQUET_ASSIGN_OR_THROW(auto null_bitmap, null_bitmap_builder_.Finish());
2065+
auto chunk = std::make_shared<::arrow::FixedSizeBinaryArray>(
2066+
type_, length, data_buffer, null_bitmap, null_count);
2067+
return ::arrow::ArrayVector({std::move(chunk)});
20602068
}
20612069

20622070
void ReadValuesDense(int64_t values_to_read) override {
@@ -2065,9 +2073,9 @@ class FLBARecordReader : public TypedRecordReader<FLBAType>,
20652073
this->current_decoder_->Decode(values, static_cast<int>(values_to_read));
20662074
CheckNumberDecoded(num_decoded, values_to_read);
20672075

2068-
for (int64_t i = 0; i < num_decoded; i++) {
2069-
PARQUET_THROW_NOT_OK(builder_->Append(values[i].ptr));
2070-
}
2076+
PARQUET_THROW_NOT_OK(null_bitmap_builder_.Reserve(num_decoded));
2077+
PARQUET_THROW_NOT_OK(data_builder_.Reserve(num_decoded * byte_width_));
2078+
UnsafeAppendDense(values, num_decoded);
20712079
ResetValues();
20722080
}
20732081

@@ -2081,22 +2089,45 @@ class FLBARecordReader : public TypedRecordReader<FLBAType>,
20812089
valid_bits, valid_bits_offset);
20822090
ARROW_DCHECK_EQ(num_decoded, values_to_read);
20832091

2092+
PARQUET_THROW_NOT_OK(null_bitmap_builder_.Reserve(num_decoded));
2093+
PARQUET_THROW_NOT_OK(data_builder_.Reserve(num_decoded * byte_width_));
2094+
if (null_count == 0) {
2095+
UnsafeAppendDense(values, num_decoded);
2096+
} else {
2097+
UnsafeAppendSpaced(values, num_decoded, valid_bits, valid_bits_offset);
2098+
}
2099+
ResetValues();
2100+
}
2101+
2102+
// Appends num_decoded values, all of them valid, to the two buffer builders.
// Uses the UnsafeAppend calls only; the visible callers call Reserve() on
// both builders before invoking this helper.
//   values:      decoded FLBA entries; byte_width_ bytes are copied from each
//                entry's ptr.
//   num_decoded: number of entries to append.
void UnsafeAppendDense(const FLBA* values, int64_t num_decoded) {
// Mark every appended slot as valid in one call.
2103+
null_bitmap_builder_.UnsafeAppend(num_decoded, /*value=*/true);
2104+
for (int64_t i = 0; i < num_decoded; i++) {
2105+
data_builder_.UnsafeAppend(values[i].ptr, byte_width_);
2106+
}
2107+
}
2108+
2109+
// Appends num_decoded slots, some of which may be null, to the two buffer
// builders. Uses the UnsafeAppend calls only; the visible caller calls
// Reserve() on both builders before invoking this helper.
//   values:            decoded FLBA entries, one per slot.
//   num_decoded:       number of slots to append.
//   valid_bits:        validity bitmap; a set bit means the slot has a value.
//   valid_bits_offset: bit offset into valid_bits of the first slot.
// Reading the diff: gutter lines ending in '-' precede lines removed by this
// commit, gutter lines ending in '+' precede lines it adds; lines with two
// numbers in the gutter are unchanged context from the old loop.
void UnsafeAppendSpaced(const FLBA* values, int64_t num_decoded,
2110+
const uint8_t* valid_bits, int64_t valid_bits_offset) {
// Copy the validity bits for all slots in one call.
2111+
null_bitmap_builder_.UnsafeAppend(valid_bits, valid_bits_offset, num_decoded);
20842112
for (int64_t i = 0; i < num_decoded; i++) {
20852113
if (::arrow::bit_util::GetBit(valid_bits, valid_bits_offset + i)) {
2086-
PARQUET_THROW_NOT_OK(builder_->Append(values[i].ptr));
2114+
data_builder_.UnsafeAppend(values[i].ptr, byte_width_);
20872115
} else {
// Null slots still take byte_width_ bytes in the data buffer; they are
// filled from the zero-initialized empty_ placeholder.
2088-
PARQUET_THROW_NOT_OK(builder_->AppendNull());
2116+
data_builder_.UnsafeAppend(empty_.data(), byte_width_);
20892117
}
20902118
}
// Removed here: the ResetValues() call is not part of this helper in the new code.
2091-
ResetValues();
20922119
}
20932120

20942121
// Private state of FLBARecordReader.
// Reading the diff: gutter lines ending in '-' precede lines removed by this
// commit, gutter lines ending in '+' precede lines it adds.
private:
// Removed: the single FixedSizeBinaryBuilder previously used to accumulate values.
2095-
std::unique_ptr<::arrow::FixedSizeBinaryBuilder> builder_;
// Width in bytes of each value, taken from descr_->type_length().
2122+
const int byte_width_;
// byte_width_ zero bytes, copied into the data buffer for null slots.
2123+
const std::vector<uint8_t> empty_;
// Arrow fixed_size_binary(byte_width_) type used when constructing the output array.
2124+
std::shared_ptr<::arrow::DataType> type_;
// Accumulates one validity bit per appended slot.
2125+
::arrow::TypedBufferBuilder<bool> null_bitmap_builder_;
// Accumulates byte_width_ value bytes per appended slot.
2126+
::arrow::BufferBuilder data_builder_;
20962127
};
20972128

2098-
class ByteArrayChunkedRecordReader : public TypedRecordReader<ByteArrayType>,
2099-
virtual public BinaryRecordReader {
2129+
class ByteArrayChunkedRecordReader final : public TypedRecordReader<ByteArrayType>,
2130+
virtual public BinaryRecordReader {
21002131
public:
21012132
ByteArrayChunkedRecordReader(const ColumnDescriptor* descr, LevelInfo leaf_info,
21022133
::arrow::MemoryPool* pool, bool read_dense_for_nullable)
@@ -2137,8 +2168,8 @@ class ByteArrayChunkedRecordReader : public TypedRecordReader<ByteArrayType>,
21372168
typename EncodingTraits<ByteArrayType>::Accumulator accumulator_;
21382169
};
21392170

2140-
class ByteArrayDictionaryRecordReader : public TypedRecordReader<ByteArrayType>,
2141-
virtual public DictionaryRecordReader {
2171+
class ByteArrayDictionaryRecordReader final : public TypedRecordReader<ByteArrayType>,
2172+
virtual public DictionaryRecordReader {
21422173
public:
21432174
ByteArrayDictionaryRecordReader(const ColumnDescriptor* descr, LevelInfo leaf_info,
21442175
::arrow::MemoryPool* pool, bool read_dense_for_nullable)

0 commit comments

Comments
 (0)