-
Notifications
You must be signed in to change notification settings - Fork 3.9k
GH-46205: [C++][Parquet][WIP] Read/Write null count statistics for UNKNOWN sort order #46275
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -307,8 +307,10 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { | |
DCHECK(writer_version_ != nullptr); | ||
// If the column statistics don't exist or column sort order is unknown | ||
// we cannot use the column stats | ||
bool is_geometry = | ||
descr_->logical_type() != nullptr && descr_->logical_type()->is_geometry(); | ||
if (!column_metadata_->__isset.statistics || | ||
descr_->sort_order() == SortOrder::UNKNOWN) { | ||
(descr_->sort_order() == SortOrder::UNKNOWN && !is_geometry)) { | ||
Comment on lines
+310
to
+313
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Again, maybe we need a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. According to the spec, only
However, I think parquet-java does not cleanly implement the column order semantics. |
||
return false; | ||
} | ||
{ | ||
|
@@ -1552,8 +1554,8 @@ bool ApplicationVersion::HasCorrectStatistics(Type::type col_type, | |
return true; | ||
} | ||
|
||
// Unknown sort order has incorrect stats | ||
if (SortOrder::UNKNOWN == sort_order) { | ||
// Unknown sort order has incorrect stats if the min or the max are specified | ||
if (SortOrder::UNKNOWN == sort_order && (statistics.has_min || statistics.has_max)) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we just check geometry & geography types here? |
||
return false; | ||
} | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -963,6 +963,89 @@ std::shared_ptr<Comparator> DoMakeComparator(Type::type physical_type, | |
return nullptr; | ||
} | ||
|
||
template <typename DType> | ||
class UnsortedTypedStatisticsImpl : public TypedStatistics<DType> { | ||
Comment on lines
+966
to
+967
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not sure this is the answer here...I just needed a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Perhaps we can reuse the existing typed one and just ignore the stats on write? (Seems inefficient but may be more compact?) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, the Java impl did this because the spec advises that min/max values are undefined and should not be used in this case. If we go with this approach, perhaps we need to disable page index (at least the column index) to reduce file size. |
||
public: | ||
using T = typename DType::c_type; | ||
|
||
explicit UnsortedTypedStatisticsImpl(const ColumnDescriptor* descr) : descr_(descr) {} | ||
|
||
UnsortedTypedStatisticsImpl(const ColumnDescriptor* descr, int64_t num_values, | ||
int64_t null_count) | ||
: descr_(descr), num_values_(num_values), null_count_(null_count) {} | ||
|
||
bool HasDistinctCount() const override { return false; }; | ||
bool HasMinMax() const override { return false; } | ||
bool HasNullCount() const override { return true; }; | ||
|
||
int64_t null_count() const override { return null_count_; } | ||
|
||
int64_t distinct_count() const override { return num_values_; } | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we throw or return -1? |
||
|
||
void Reset() override { | ||
null_count_ = 0; | ||
num_values_ = 0; | ||
} | ||
|
||
std::string EncodeMin() const override { return ""; } | ||
|
||
std::string EncodeMax() const override { return ""; } | ||
|
||
EncodedStatistics Encode() override { | ||
EncodedStatistics out; | ||
out.set_null_count(null_count_); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we need |
||
return out; | ||
} | ||
|
||
Type::type physical_type() const override { return DType::type_num; } | ||
|
||
bool Equals(const Statistics& other) const override { return false; } | ||
|
||
int64_t num_values() const override { return num_values_; } | ||
|
||
const ColumnDescriptor* descr() const override { return descr_; } | ||
|
||
const T& min() const override { return dummy_minmax_; } | ||
|
||
const T& max() const override { return dummy_minmax_; } | ||
|
||
void Merge(const TypedStatistics<DType>& other) override { | ||
num_values_ += other.num_values(); | ||
null_count_ += other.null_count(); | ||
} | ||
|
||
void Update(const T* values, int64_t num_values, int64_t null_count) override { | ||
num_values_ += num_values; | ||
null_count_ += null_count; | ||
} | ||
|
||
void UpdateSpaced(const T* values, const uint8_t* valid_bits, int64_t valid_bits_offset, | ||
int64_t num_spaced_values, int64_t num_values, | ||
int64_t null_count) override { | ||
num_values_ += num_values; | ||
null_count_ += null_count; | ||
} | ||
|
||
void Update(const ::arrow::Array& values, bool update_counts = true) override { | ||
if (update_counts) { | ||
num_values_ += values.length(); | ||
null_count_ += values.null_count(); | ||
} | ||
} | ||
|
||
void SetMinMax(const T& min, const T& max) override {} | ||
|
||
void IncrementNullCount(int64_t n) override { null_count_ += n; } | ||
|
||
void IncrementNumValues(int64_t n) override { num_values_ += n; } | ||
|
||
private: | ||
const ColumnDescriptor* descr_{}; | ||
int64_t num_values_{}; | ||
int64_t null_count_{}; | ||
T dummy_minmax_{}; | ||
}; | ||
|
||
} // namespace | ||
|
||
// ---------------------------------------------------------------------- | ||
|
@@ -982,6 +1065,15 @@ std::shared_ptr<Comparator> Comparator::Make(const ColumnDescriptor* descr) { | |
|
||
std::shared_ptr<Statistics> Statistics::Make(const ColumnDescriptor* descr, | ||
::arrow::MemoryPool* pool) { | ||
if (descr->logical_type() && descr->logical_type()->is_geometry()) { | ||
switch (descr->physical_type()) { | ||
case Type::BYTE_ARRAY: | ||
return std::make_shared<UnsortedTypedStatisticsImpl<ByteArrayType>>(descr); | ||
default: | ||
ParquetException::NYI("Unsorted statistics not implemented"); | ||
} | ||
} | ||
|
||
switch (descr->physical_type()) { | ||
case Type::BOOLEAN: | ||
return std::make_shared<TypedStatisticsImpl<BooleanType>>(descr, pool); | ||
|
@@ -1046,6 +1138,16 @@ std::shared_ptr<Statistics> Statistics::Make(const ColumnDescriptor* descr, | |
int64_t distinct_count, bool has_min_max, | ||
bool has_null_count, bool has_distinct_count, | ||
::arrow::MemoryPool* pool) { | ||
if (descr->logical_type() && descr->logical_type()->is_geometry()) { | ||
switch (descr->physical_type()) { | ||
case Type::BYTE_ARRAY: | ||
return std::make_shared<UnsortedTypedStatisticsImpl<ByteArrayType>>( | ||
descr, num_values, null_count); | ||
default: | ||
ParquetException::NYI("Unsorted statistics not implemented"); | ||
} | ||
} | ||
|
||
#define MAKE_STATS(CAP_TYPE, KLASS) \ | ||
case Type::CAP_TYPE: \ | ||
return std::make_shared<TypedStatisticsImpl<KLASS>>( \ | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The actual version of this should maybe modify the
if (SortOrder::UNKNOWN != descr_->sort_order()) {
check just above. Perhaps there needs to be adescr_->can_write_statistics()
to separate the sortedness from whether or not we can write anything?