Skip to content

Commit 3e1cfcb

Browse files
committed
apacheGH-44072: [C++][Parquet] Add Float16 reading benchmarks
1 parent 27acf8b commit 3e1cfcb

File tree

1 file changed

+75
-11
lines changed

1 file changed

+75
-11
lines changed

cpp/src/parquet/arrow/reader_writer_benchmark.cc

+75-11
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
#include "parquet/file_reader.h"
2929
#include "parquet/file_writer.h"
3030
#include "parquet/platform.h"
31+
#include "parquet/properties.h"
3132

3233
#include "arrow/array.h"
3334
#include "arrow/array/builder_primitive.h"
@@ -88,6 +89,11 @@ struct benchmark_traits<BooleanType> {
8889
using arrow_type = ::arrow::BooleanType;
8990
};
9091

92+
// Trait specialization keyed on Float16LogicalType: selects ::arrow::HalfFloatType
// as the Arrow type, so ArrowType<Float16LogicalType> resolves to it.
template <>
93+
struct benchmark_traits<Float16LogicalType> {
94+
using arrow_type = ::arrow::HalfFloatType;
95+
};
96+
9197
template <typename ParquetType>
9298
using ArrowType = typename benchmark_traits<ParquetType>::arrow_type;
9399

@@ -125,15 +131,15 @@ std::vector<T> RandomVector(int64_t true_percentage, int64_t vector_size,
125131
return values;
126132
}
127133

128-
template <typename ParquetType>
134+
template <typename ParquetType, typename ArrowType = ArrowType<ParquetType>>
129135
std::shared_ptr<::arrow::Table> TableFromVector(
130-
const std::vector<typename ParquetType::c_type>& vec, bool nullable,
136+
const std::vector<typename ArrowType::c_type>& vec, bool nullable,
131137
int64_t null_percentage = kAlternatingOrNa) {
132138
if (!nullable) {
133139
ARROW_CHECK_EQ(null_percentage, kAlternatingOrNa);
134140
}
135-
std::shared_ptr<::arrow::DataType> type = std::make_shared<ArrowType<ParquetType>>();
136-
NumericBuilder<ArrowType<ParquetType>> builder;
141+
std::shared_ptr<::arrow::DataType> type = std::make_shared<ArrowType>();
142+
NumericBuilder<ArrowType> builder;
137143
if (nullable) {
138144
// Note true values select index 1 of sample_values
139145
auto valid_bytes = RandomVector<uint8_t>(/*true_percentage=*/null_percentage,
@@ -258,18 +264,20 @@ struct Examples<bool> {
258264
};
259265

260266
static void BenchmarkReadTable(::benchmark::State& state, const ::arrow::Table& table,
267+
std::shared_ptr<WriterProperties> properties,
261268
int64_t num_values = -1, int64_t total_bytes = -1) {
262269
auto output = CreateOutputStream();
263-
EXIT_NOT_OK(
264-
WriteTable(table, ::arrow::default_memory_pool(), output, table.num_rows()));
270+
EXIT_NOT_OK(WriteTable(table, ::arrow::default_memory_pool(), output,
271+
/*chunk_size=*/table.num_rows(), properties));
265272
PARQUET_ASSIGN_OR_THROW(auto buffer, output->Finish());
266273

267-
while (state.KeepRunning()) {
274+
for (auto _ : state) {
268275
auto reader =
269276
ParquetFileReader::Open(std::make_shared<::arrow::io::BufferReader>(buffer));
270277
std::unique_ptr<FileReader> arrow_reader;
271278
EXIT_NOT_OK(FileReader::Make(::arrow::default_memory_pool(), std::move(reader),
272279
&arrow_reader));
280+
273281
std::shared_ptr<::arrow::Table> table;
274282
EXIT_NOT_OK(arrow_reader->ReadTable(&table));
275283
}
@@ -283,8 +291,14 @@ static void BenchmarkReadTable(::benchmark::State& state, const ::arrow::Table&
283291
}
284292
}
285293

294+
// Convenience overload of BenchmarkReadTable for callers that do not need custom
// writer settings: the table is written using default_writer_properties().
// `num_values` and `total_bytes` are forwarded unchanged to the overload that
// takes explicit writer properties.
static void BenchmarkReadTable(::benchmark::State& state, const ::arrow::Table& table,
                               int64_t num_values = -1, int64_t total_bytes = -1) {
  std::shared_ptr<WriterProperties> default_properties = default_writer_properties();
  BenchmarkReadTable(state, table, std::move(default_properties), num_values,
                     total_bytes);
}
298+
286299
static void BenchmarkReadArray(::benchmark::State& state,
287300
const std::shared_ptr<Array>& array, bool nullable,
301+
std::shared_ptr<WriterProperties> properties,
288302
int64_t num_values = -1, int64_t total_bytes = -1) {
289303
auto schema = ::arrow::schema({field("s", array->type(), nullable)});
290304
auto table = ::arrow::Table::Make(schema, {array}, array->length());
@@ -294,8 +308,15 @@ static void BenchmarkReadArray(::benchmark::State& state,
294308
// Forward the caller-supplied writer properties. Omitting them here selects the
// default-properties overload, which silently ignores the `properties` argument.
BenchmarkReadTable(state, *table, properties, num_values, total_bytes);
295309
}
296310

311+
// Convenience overload of BenchmarkReadArray for callers that do not need custom
// writer settings: delegates to the overload taking explicit writer properties,
// passing default_writer_properties(). All other arguments are forwarded unchanged.
static void BenchmarkReadArray(::benchmark::State& state,
                               const std::shared_ptr<Array>& array, bool nullable,
                               int64_t num_values = -1, int64_t total_bytes = -1) {
  std::shared_ptr<WriterProperties> default_properties = default_writer_properties();
  BenchmarkReadArray(state, array, nullable, std::move(default_properties), num_values,
                     total_bytes);
}
317+
297318
//
298-
// Benchmark reading a primitive column
319+
// Benchmark reading a dict-encoded primitive column
299320
//
300321

301322
template <bool nullable, typename ParquetType>
@@ -308,23 +329,27 @@ static void BM_ReadColumn(::benchmark::State& state) {
308329
std::shared_ptr<::arrow::Table> table =
309330
TableFromVector<ParquetType>(values, nullable, state.range(0));
310331

311-
BenchmarkReadTable(state, *table, table->num_rows(),
332+
auto properties = WriterProperties::Builder().disable_dictionary()->build();
333+
334+
BenchmarkReadTable(state, *table, properties, table->num_rows(),
312335
sizeof(typename ParquetType::c_type) * table->num_rows());
313336
}
314337

315338
// There are two parameters here that cover different data distributions.
316339
// null_percentage governs distribution and therefore runs of null values.
317340
// first_value_percentage governs distribution of values (we select from 1 of 2)
318341
// so when it is 0 or 100, RLE is triggered all the time. For values in (0, 100)
319-
// there will be some percentage of RLE encoded values and some percentage of literal
320-
// encoded values (RLE is much less likely with percentages close to 50).
342+
// there will be some percentage of RLE-encoded dictionary indices and some
343+
// percentage of literal encoded dictionary indices
344+
// (RLE is much less likely with percentages close to 50).
321345
// Non-nullable Int32 column: the null percentage stays at kAlternatingOrNa and only
// the first-value percentage varies.
BENCHMARK_TEMPLATE2(BM_ReadColumn, false, Int32Type)
322346
->Args({/*null_percentage=*/kAlternatingOrNa, /*first_value_percentage=*/1})
323347
->Args({/*null_percentage=*/kAlternatingOrNa, /*first_value_percentage=*/10})
324348
->Args({/*null_percentage=*/kAlternatingOrNa, /*first_value_percentage=*/50});
325349

326350
BENCHMARK_TEMPLATE2(BM_ReadColumn, true, Int32Type)
327351
->Args({/*null_percentage=*/kAlternatingOrNa, /*first_value_percentage=*/0})
352+
->Args({/*null_percentage=*/0, /*first_value_percentage=*/1})
328353
->Args({/*null_percentage=*/1, /*first_value_percentage=*/1})
329354
->Args({/*null_percentage=*/10, /*first_value_percentage=*/10})
330355
->Args({/*null_percentage=*/25, /*first_value_percentage=*/5})
@@ -369,6 +394,45 @@ BENCHMARK_TEMPLATE2(BM_ReadColumn, true, BooleanType)
369394
->Args({kAlternatingOrNa, 1})
370395
->Args({5, 10});
371396

397+
//
398+
// Benchmark reading a PLAIN-encoded primitive column
399+
//
400+
401+
template <bool nullable, typename ParquetType>
402+
static void BM_ReadColumnPlain(::benchmark::State& state) {
403+
using c_type = typename ArrowType<ParquetType>::c_type;
404+
405+
const std::vector<c_type> values(BENCHMARK_SIZE, static_cast<c_type>(42));
406+
std::shared_ptr<::arrow::Table> table =
407+
TableFromVector<ParquetType>(values, /*nullable=*/nullable, state.range(0));
408+
409+
auto properties = WriterProperties::Builder().disable_dictionary()->build();
410+
BenchmarkReadTable(state, *table, properties, table->num_rows(),
411+
sizeof(c_type) * table->num_rows());
412+
}
413+
414+
// Int32, non-nullable: the single argument must be kAlternatingOrNa because no
// null percentage applies.
BENCHMARK_TEMPLATE2(BM_ReadColumnPlain, false, Int32Type)
    ->ArgName("null_probability")
    ->Arg(kAlternatingOrNa);
// Int32, nullable: sweep the null percentage from "no nulls" to "all nulls".
BENCHMARK_TEMPLATE2(BM_ReadColumnPlain, true, Int32Type)
    ->ArgName("null_probability")
    ->Arg(0)
    ->Arg(1)
    ->Arg(50)
    ->Arg(99)
    ->Arg(100);

// Float16, non-nullable.
BENCHMARK_TEMPLATE2(BM_ReadColumnPlain, false, Float16LogicalType)
    ->ArgName("null_probability")
    ->Arg(kAlternatingOrNa);
// Float16, nullable: same null-percentage sweep as the Int32 case.
BENCHMARK_TEMPLATE2(BM_ReadColumnPlain, true, Float16LogicalType)
    ->ArgName("null_probability")
    ->Arg(0)
    ->Arg(1)
    ->Arg(50)
    ->Arg(99)
    ->Arg(100);
435+
372436
//
373437
// Benchmark reading binary column
374438
//

0 commit comments

Comments
 (0)