28
28
#include " parquet/file_reader.h"
29
29
#include " parquet/file_writer.h"
30
30
#include " parquet/platform.h"
31
+ #include " parquet/properties.h"
31
32
32
33
#include " arrow/array.h"
33
34
#include " arrow/array/builder_primitive.h"
@@ -88,6 +89,11 @@ struct benchmark_traits<BooleanType> {
88
89
using arrow_type = ::arrow::BooleanType;
89
90
};
90
91
92
// Traits specialization for Float16LogicalType: benchmarks instantiated with
// this Parquet type use ::arrow::HalfFloatType as their Arrow-side type.
+ template <>
93
+ struct benchmark_traits <Float16LogicalType> {
94
+ using arrow_type = ::arrow::HalfFloatType;
95
+ };
96
+
91
97
// Shorthand for the Arrow type that benchmark_traits associates with
// ParquetType (the `arrow_type` member of the matching specialization).
template <typename ParquetType>
92
98
using ArrowType = typename benchmark_traits<ParquetType>::arrow_type;
93
99
@@ -125,15 +131,15 @@ std::vector<T> RandomVector(int64_t true_percentage, int64_t vector_size,
125
131
return values;
126
132
}
127
133
128
- template <typename ParquetType>
134
+ template <typename ParquetType, typename ArrowType = ArrowType<ParquetType> >
129
135
std::shared_ptr<::arrow::Table> TableFromVector (
130
- const std::vector<typename ParquetType ::c_type>& vec, bool nullable,
136
+ const std::vector<typename ArrowType ::c_type>& vec, bool nullable,
131
137
int64_t null_percentage = kAlternatingOrNa ) {
132
138
if (!nullable) {
133
139
ARROW_CHECK_EQ (null_percentage, kAlternatingOrNa );
134
140
}
135
- std::shared_ptr<::arrow::DataType> type = std::make_shared<ArrowType<ParquetType> >();
136
- NumericBuilder<ArrowType<ParquetType> > builder;
141
+ std::shared_ptr<::arrow::DataType> type = std::make_shared<ArrowType>();
142
+ NumericBuilder<ArrowType> builder;
137
143
if (nullable) {
138
144
// Note true values select index 1 of sample_values
139
145
auto valid_bytes = RandomVector<uint8_t >(/* true_percentage=*/ null_percentage,
@@ -258,18 +264,20 @@ struct Examples<bool> {
258
264
};
259
265
260
266
static void BenchmarkReadTable (::benchmark::State& state, const ::arrow::Table& table,
267
+ std::shared_ptr<WriterProperties> properties,
261
268
int64_t num_values = -1 , int64_t total_bytes = -1 ) {
262
269
auto output = CreateOutputStream ();
263
- EXIT_NOT_OK (
264
- WriteTable (table, :: arrow::default_memory_pool (), output, table.num_rows ()));
270
+ EXIT_NOT_OK (WriteTable (table, :: arrow::default_memory_pool (), output,
271
+ /* chunk_size= */ table.num_rows (), properties ));
265
272
PARQUET_ASSIGN_OR_THROW (auto buffer, output->Finish ());
266
273
267
- while ( state. KeepRunning () ) {
274
+ for ( auto _ : state) {
268
275
auto reader =
269
276
ParquetFileReader::Open (std::make_shared<::arrow::io::BufferReader>(buffer));
270
277
std::unique_ptr<FileReader> arrow_reader;
271
278
EXIT_NOT_OK (FileReader::Make (::arrow::default_memory_pool (), std::move (reader),
272
279
&arrow_reader));
280
+
273
281
std::shared_ptr<::arrow::Table> table;
274
282
EXIT_NOT_OK (arrow_reader->ReadTable (&table));
275
283
}
@@ -283,8 +291,14 @@ static void BenchmarkReadTable(::benchmark::State& state, const ::arrow::Table&
283
291
}
284
292
}
285
293
294
// Convenience overload: runs the table-read benchmark with
// default_writer_properties() by forwarding to the overload that takes an
// explicit std::shared_ptr<WriterProperties>.
// NOTE(review): both overloads default num_values and total_bytes, so a call
// with only (state, table) is ambiguous between them; callers have to pass
// num_values (or properties) explicitly.
+ static void BenchmarkReadTable (::benchmark::State& state, const ::arrow::Table& table,
295
+ int64_t num_values = -1 , int64_t total_bytes = -1 ) {
296
+ BenchmarkReadTable (state, table, default_writer_properties (), num_values, total_bytes);
297
+ }
298
+
286
299
static void BenchmarkReadArray (::benchmark::State& state,
287
300
const std::shared_ptr<Array>& array, bool nullable,
301
+ std::shared_ptr<WriterProperties> properties,
288
302
int64_t num_values = -1 , int64_t total_bytes = -1 ) {
289
303
auto schema = ::arrow::schema ({field (" s" , array->type (), nullable)});
290
304
auto table = ::arrow::Table::Make (schema, {array}, array->length ());
@@ -294,8 +308,15 @@ static void BenchmarkReadArray(::benchmark::State& state,
294
308
BenchmarkReadTable (state, *table, num_values, total_bytes);
295
309
}
296
310
311
// Convenience overload: runs the array-read benchmark with
// default_writer_properties() by forwarding to the overload that takes an
// explicit std::shared_ptr<WriterProperties>.
// NOTE(review): the visible tail of that properties-taking overload calls
// BenchmarkReadTable(state, *table, num_values, total_bytes) without passing
// `properties`, so caller-supplied properties appear to be dropped there --
// confirm against the lines elided from this view.
+ static void BenchmarkReadArray (::benchmark::State& state,
312
+ const std::shared_ptr<Array>& array, bool nullable,
313
+ int64_t num_values = -1 , int64_t total_bytes = -1 ) {
314
+ BenchmarkReadArray (state, array, nullable, default_writer_properties (), num_values,
315
+ total_bytes);
316
+ }
317
+
297
318
//
298
- // Benchmark reading a primitive column
319
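+ // Benchmark reading a dict-encoded primitive column
+ // NOTE(review): BM_ReadColumn below builds its WriterProperties with
+ // disable_dictionary(), which contradicts "dict-encoded" -- confirm intent.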
299
320
//
300
321
301
322
template <bool nullable, typename ParquetType>
@@ -308,23 +329,27 @@ static void BM_ReadColumn(::benchmark::State& state) {
308
329
std::shared_ptr<::arrow::Table> table =
309
330
TableFromVector<ParquetType>(values, nullable, state.range (0 ));
310
331
311
- BenchmarkReadTable (state, *table, table->num_rows (),
332
+ auto properties = WriterProperties::Builder ().disable_dictionary ()->build ();
333
+
334
+ BenchmarkReadTable (state, *table, properties, table->num_rows (),
312
335
sizeof (typename ParquetType::c_type) * table->num_rows ());
313
336
}
314
337
315
338
// There are two parameters here that cover different data distributions.
316
339
// null_percentage governs distribution and therefore runs of null values.
317
340
// first_value_percentage governs distribution of values (we select from 1 of 2)
318
341
// so when it is 0 or 100, RLE is triggered all the time. For a value in the range (0, 100)
319
- // there will be some percentage of RLE encoded values and some percentage of literal
320
- // encoded values (RLE is much less likely with percentages close to 50).
342
+ // there will be some percentage of RLE-encoded dictionary indices and some
343
+ // percentage of literal encoded dictionary indices
344
+ // (RLE is much less likely with percentages close to 50).
321
345
BENCHMARK_TEMPLATE2 (BM_ReadColumn, false , Int32Type)
322
346
->Args ({/* null_percentage=*/ kAlternatingOrNa , 1 })
323
347
->Args({/* null_percentage=*/ kAlternatingOrNa , 10 })
324
348
->Args({/* null_percentage=*/ kAlternatingOrNa , 50 });
325
349
326
350
BENCHMARK_TEMPLATE2 (BM_ReadColumn, true , Int32Type)
327
351
->Args ({/* null_percentage=*/ kAlternatingOrNa , /* first_value_percentage=*/ 0 })
352
+ ->Args({/* null_percentage=*/ 0 , /* first_value_percentage=*/ 1 })
328
353
->Args({/* null_percentage=*/ 1 , /* first_value_percentage=*/ 1 })
329
354
->Args({/* null_percentage=*/ 10 , /* first_value_percentage=*/ 10 })
330
355
->Args({/* null_percentage=*/ 25 , /* first_value_percentage=*/ 5 })
@@ -369,6 +394,45 @@ BENCHMARK_TEMPLATE2(BM_ReadColumn, true, BooleanType)
369
394
->Args ({kAlternatingOrNa , 1 })
370
395
->Args({5 , 10 });
371
396
397
+ //
398
+ // Benchmark reading a PLAIN-encoded primitive column
399
+ //
400
+
401
// Benchmarks reading a single column that was written with dictionary
// encoding disabled (the section header above calls this the PLAIN-encoded
// case).
//   nullable       - forwarded to TableFromVector as the column's nullability.
//   ParquetType    - selects the Arrow type through ArrowType<ParquetType>.
//   state.range(0) - forwarded to TableFromVector as its null_percentage.
+ template <bool nullable, typename ParquetType>
402
+ static void BM_ReadColumnPlain (::benchmark::State& state) {
403
+ using c_type = typename ArrowType<ParquetType>::c_type;
404
+
405
// The input is BENCHMARK_SIZE copies of the constant 42 converted to c_type.
+ const std::vector<c_type> values (BENCHMARK_SIZE, static_cast <c_type>(42 ));
406
+ std::shared_ptr<::arrow::Table> table =
407
+ TableFromVector<ParquetType>(values, /* nullable=*/ nullable, state.range (0 ));
408
+
409
// Turn dictionary encoding off in the writer properties used to produce the
// file that the benchmark reads back.
+ auto properties = WriterProperties::Builder ().disable_dictionary ()->build ();
410
// The row count is passed as num_values and sizeof(c_type) * rows as
// total_bytes.
+ BenchmarkReadTable (state, *table, properties, table->num_rows (),
411
+ sizeof (c_type) * table->num_rows ());
412
+ }
413
+
414
// Non-nullable Int32 column. TableFromVector checks that null_percentage
// equals kAlternatingOrNa when nullable is false, so that is the only
// argument registered here.
// NOTE(review): the argument is labelled as a probability, but
// BM_ReadColumnPlain forwards it as TableFromVector's null_percentage.
+ BENCHMARK_TEMPLATE2 (BM_ReadColumnPlain, false , Int32Type)
415
+ ->ArgNames ({" null_probability" })
416
+ ->Args({kAlternatingOrNa });
417
// Nullable Int32 column, registered for the values 0, 1, 50, 99 and 100.
+ BENCHMARK_TEMPLATE2 (BM_ReadColumnPlain, true , Int32Type)
418
+ ->ArgNames ({" null_probability" })
419
+ ->Args({0 })
420
+ ->Args({1 })
421
+ ->Args({50 })
422
+ ->Args({99 })
423
+ ->Args({100 });
424
+
425
// Non-nullable Float16 column. TableFromVector checks that null_percentage
// equals kAlternatingOrNa when nullable is false, so that is the only
// argument registered here.
// NOTE(review): the argument is labelled as a probability, but
// BM_ReadColumnPlain forwards it as TableFromVector's null_percentage.
+ BENCHMARK_TEMPLATE2 (BM_ReadColumnPlain, false , Float16LogicalType)
426
+ ->ArgNames ({" null_probability" })
427
+ ->Args({kAlternatingOrNa });
428
// Nullable Float16 column, registered for the values 0, 1, 50, 99 and 100.
+ BENCHMARK_TEMPLATE2 (BM_ReadColumnPlain, true , Float16LogicalType)
429
+ ->ArgNames ({" null_probability" })
430
+ ->Args({0 })
431
+ ->Args({1 })
432
+ ->Args({50 })
433
+ ->Args({99 })
434
+ ->Args({100 });
435
+
372
436
//
373
437
// Benchmark reading binary column
374
438
//
0 commit comments