diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc
index 546e09472a3..ff3a2c7f6a2 100644
--- a/cpp/src/parquet/column_writer.cc
+++ b/cpp/src/parquet/column_writer.cc
@@ -1250,8 +1250,13 @@ class TypedColumnWriterImpl : public ColumnWriterImpl,
       page_statistics_ = MakeStatistics<DType>(descr_, allocator_);
       chunk_statistics_ = MakeStatistics<DType>(descr_, allocator_);
     }
+
     if (descr_->logical_type() != nullptr && descr_->logical_type()->is_geometry()) {
       chunk_geospatial_statistics_ = std::make_shared<geospatial::GeoStatistics>();
+      // NOTE(review): created for every geometry column; confirm whether this should
+      // also be gated on statistics being enabled for the column.
+      page_statistics_ = MakeStatistics<DType>(descr_, allocator_);
+      chunk_statistics_ = MakeStatistics<DType>(descr_, allocator_);
     }
   }
 
diff --git a/cpp/src/parquet/column_writer_test.cc b/cpp/src/parquet/column_writer_test.cc
index 0d456ee5890..9fda9fb3bd0 100644
--- a/cpp/src/parquet/column_writer_test.cc
+++ b/cpp/src/parquet/column_writer_test.cc
@@ -1923,8 +1923,8 @@ TEST_F(TestGeometryValuesWriter, TestWriteAndRead) {
   }
 
-  // Statistics are unset because the sort order is unknown
-  ASSERT_FALSE(metadata_accessor()->is_stats_set());
-  ASSERT_EQ(metadata_accessor()->statistics(), nullptr);
+  // Statistics are set even though the sort order is unknown; check the null count
+  ASSERT_TRUE(metadata_accessor()->is_stats_set());
+  ASSERT_EQ(metadata_accessor()->statistics()->null_count(), 0);
 
   ASSERT_TRUE(metadata_accessor()->is_geo_stats_set());
   std::shared_ptr<geospatial::GeoStatistics> geospatial_statistics = metadata_geo_stats();
@@ -2007,9 +2007,11 @@ TEST_F(TestGeometryValuesWriter, TestWriteAndReadAllNull) {
     EXPECT_EQ(this->definition_levels_out_[i], 0);
   }
 
-  // Statistics are unset because the sort order is unknown
-  ASSERT_FALSE(metadata_accessor()->is_stats_set());
-  ASSERT_EQ(metadata_accessor()->statistics(), nullptr);
+  // Statistics are set but, with an unknown sort order, hold only the null count
+  ASSERT_TRUE(metadata_accessor()->is_stats_set());
+  ASSERT_FALSE(metadata_accessor()->statistics()->HasDistinctCount());
+  ASSERT_FALSE(metadata_accessor()->statistics()->HasMinMax());
+  ASSERT_EQ(metadata_accessor()->statistics()->null_count(), SMALL_SIZE);
 
   // GeoStatistics should exist but all components should be marked as uncalculated
   ASSERT_TRUE(metadata_accessor()->is_geo_stats_set());
diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc
index 82822e3e354..aaf727afd6e 100644
--- a/cpp/src/parquet/metadata.cc
+++ b/cpp/src/parquet/metadata.cc
@@ -307,8 +307,10 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl {
     DCHECK(writer_version_ != nullptr);
-    // If the column statistics don't exist or column sort order is unknown
-    // we cannot use the column stats
+    // If the column statistics don't exist, or the sort order is unknown for a
+    // non-geometry column, we cannot use the column stats
+    const bool is_geometry =
+        descr_->logical_type() != nullptr && descr_->logical_type()->is_geometry();
     if (!column_metadata_->__isset.statistics ||
-        descr_->sort_order() == SortOrder::UNKNOWN) {
+        (descr_->sort_order() == SortOrder::UNKNOWN && !is_geometry)) {
       return false;
     }
     {
@@ -1552,8 +1554,8 @@ bool ApplicationVersion::HasCorrectStatistics(Type::type col_type,
     return true;
   }
 
-  // Unknown sort order has incorrect stats
-  if (SortOrder::UNKNOWN == sort_order) {
+  // Unknown sort order has incorrect stats if the min or the max is specified
+  if (SortOrder::UNKNOWN == sort_order && (statistics.has_min || statistics.has_max)) {
     return false;
   }
 
diff --git a/cpp/src/parquet/statistics.cc b/cpp/src/parquet/statistics.cc
index b225071d5fe..fd804711321 100644
--- a/cpp/src/parquet/statistics.cc
+++ b/cpp/src/parquet/statistics.cc
@@ -963,6 +963,115 @@ std::shared_ptr<Comparator> DoMakeComparator(Type::type physical_type,
   return nullptr;
 }
 
+// Statistics for columns with an unknown sort order (currently only GEOMETRY stored as
+// BYTE_ARRAY, see Statistics::Make). Only value and null counts are tracked; min/max
+// and distinct count are never available.
+template <typename DType>
+class UnsortedTypedStatisticsImpl : public TypedStatistics<DType> {
+ public:
+  using T = typename DType::c_type;
+
+  explicit UnsortedTypedStatisticsImpl(const ColumnDescriptor* descr) : descr_(descr) {}
+
+  UnsortedTypedStatisticsImpl(const ColumnDescriptor* descr, int64_t num_values,
+                              int64_t null_count, bool has_null_count = true)
+      : descr_(descr),
+        num_values_(num_values),
+        null_count_(null_count),
+        has_null_count_(has_null_count) {}
+
+  bool HasDistinctCount() const override { return false; }
+  bool HasMinMax() const override { return false; }
+  bool HasNullCount() const override { return has_null_count_; }
+
+  int64_t null_count() const override { return null_count_; }
+
+  // No distinct count is tracked (HasDistinctCount() is always false), so report 0
+  // rather than a number that could be mistaken for a real count.
+  int64_t distinct_count() const override { return 0; }
+
+  void Reset() override {
+    null_count_ = 0;
+    num_values_ = 0;
+    has_null_count_ = true;
+  }
+
+  std::string EncodeMin() const override { return ""; }
+
+  std::string EncodeMax() const override { return ""; }
+
+  // Only the null count is ever encoded, and only when it is known.
+  EncodedStatistics Encode() override {
+    EncodedStatistics out;
+    if (has_null_count_) {
+      out.set_null_count(null_count_);
+    }
+    return out;
+  }
+
+  Type::type physical_type() const override { return DType::type_num; }
+
+  // Compares through the public interface only, so `other` may be any implementation.
+  bool Equals(const Statistics& other) const override {
+    return physical_type() == other.physical_type() && !other.HasMinMax() &&
+           !other.HasDistinctCount() && HasNullCount() == other.HasNullCount() &&
+           null_count() == other.null_count() && num_values() == other.num_values();
+  }
+
+  int64_t num_values() const override { return num_values_; }
+
+  const ColumnDescriptor* descr() const override { return descr_; }
+
+  // Min/max are never computed; both accessors return a value-initialized placeholder
+  // that must not be relied upon (HasMinMax() is always false).
+  const T& min() const override { return dummy_minmax_; }
+
+  const T& max() const override { return dummy_minmax_; }
+
+  void Merge(const TypedStatistics<DType>& other) override {
+    num_values_ += other.num_values();
+    if (other.HasNullCount()) {
+      null_count_ += other.null_count();
+    } else {
+      has_null_count_ = false;
+    }
+  }
+
+  void Update(const T* values, int64_t num_values, int64_t null_count) override {
+    num_values_ += num_values;
+    null_count_ += null_count;
+  }
+
+  void UpdateSpaced(const T* values, const uint8_t* valid_bits, int64_t valid_bits_offset,
+                    int64_t num_spaced_values, int64_t num_values,
+                    int64_t null_count) override {
+    num_values_ += num_values;
+    null_count_ += null_count;
+  }
+
+  void Update(const ::arrow::Array& values, bool update_counts = true) override {
+    if (update_counts) {
+      // Nulls are tracked separately in null_count_, so exclude them here, as the
+      // other overloads do by taking num_values apart from null_count.
+      num_values_ += values.length() - values.null_count();
+      null_count_ += values.null_count();
+    }
+  }
+
+  void SetMinMax(const T& min, const T& max) override {}
+
+  void IncrementNullCount(int64_t n) override { null_count_ += n; }
+
+  void IncrementNumValues(int64_t n) override { num_values_ += n; }
+
+ private:
+  const ColumnDescriptor* descr_{};
+  int64_t num_values_{};
+  int64_t null_count_{};
+  bool has_null_count_{true};
+  T dummy_minmax_{};
+};
+
 }  // namespace
 
 // ----------------------------------------------------------------------
@@ -982,6 +1091,15 @@ std::shared_ptr<Comparator> Comparator::Make(const ColumnDescriptor* descr) {
 
 std::shared_ptr<Statistics> Statistics::Make(const ColumnDescriptor* descr,
                                              ::arrow::MemoryPool* pool) {
+  if (descr->logical_type() && descr->logical_type()->is_geometry()) {
+    switch (descr->physical_type()) {
+      case Type::BYTE_ARRAY:
+        return std::make_shared<UnsortedTypedStatisticsImpl<ByteArrayType>>(descr);
+      default:
+        ParquetException::NYI("Unsorted statistics not implemented");
+    }
+  }
+
   switch (descr->physical_type()) {
     case Type::BOOLEAN:
       return std::make_shared<TypedStatisticsImpl<BooleanType>>(descr, pool);
@@ -1046,6 +1164,16 @@ std::shared_ptr<Statistics> Statistics::Make(const ColumnDescriptor* descr,
                                              int64_t distinct_count, bool has_min_max,
                                              bool has_null_count, bool has_distinct_count,
                                              ::arrow::MemoryPool* pool) {
+  if (descr->logical_type() && descr->logical_type()->is_geometry()) {
+    switch (descr->physical_type()) {
+      case Type::BYTE_ARRAY:
+        return std::make_shared<UnsortedTypedStatisticsImpl<ByteArrayType>>(
+            descr, num_values, null_count, has_null_count);
+      default:
+        ParquetException::NYI("Unsorted statistics not implemented");
+    }
+  }
+
 #define MAKE_STATS(CAP_TYPE, KLASS)                                              \
   case Type::CAP_TYPE:                                                           \
     return std::make_shared<TypedStatisticsImpl<KLASS>>(                         \