Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions cpp/src/parquet/column_writer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1250,8 +1250,11 @@ class TypedColumnWriterImpl : public ColumnWriterImpl,
page_statistics_ = MakeStatistics<ParquetType>(descr_, allocator_);
chunk_statistics_ = MakeStatistics<ParquetType>(descr_, allocator_);
}

if (descr_->logical_type() != nullptr && descr_->logical_type()->is_geometry()) {
chunk_geospatial_statistics_ = std::make_shared<geospatial::GeoStatistics>();
page_statistics_ = MakeStatistics<ParquetType>(descr_, allocator_);
chunk_statistics_ = MakeStatistics<ParquetType>(descr_, allocator_);
Comment on lines +1256 to +1257
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The actual version of this should maybe modify the if (SortOrder::UNKNOWN != descr_->sort_order()) { check just above. Perhaps there needs to be a descr_->can_write_statistics() to separate the sortedness from whether or not we can write anything?

}
}

Expand Down
12 changes: 7 additions & 5 deletions cpp/src/parquet/column_writer_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1923,8 +1923,8 @@ TEST_F(TestGeometryValuesWriter, TestWriteAndRead) {
}

// Min/max are not written because the sort order is unknown, but the null count is
ASSERT_FALSE(metadata_accessor()->is_stats_set());
ASSERT_EQ(metadata_accessor()->statistics(), nullptr);
ASSERT_TRUE(metadata_accessor()->is_stats_set());
ASSERT_EQ(metadata_accessor()->statistics()->null_count(), 0);

ASSERT_TRUE(metadata_accessor()->is_geo_stats_set());
std::shared_ptr<geospatial::GeoStatistics> geospatial_statistics = metadata_geo_stats();
Expand Down Expand Up @@ -2007,9 +2007,11 @@ TEST_F(TestGeometryValuesWriter, TestWriteAndReadAllNull) {
EXPECT_EQ(this->definition_levels_out_[i], 0);
}

// Statistics are unset because the sort order is unknown
ASSERT_FALSE(metadata_accessor()->is_stats_set());
ASSERT_EQ(metadata_accessor()->statistics(), nullptr);
// Min/max and distinct count are unset because the sort order is unknown; only the null count is written
ASSERT_TRUE(metadata_accessor()->is_stats_set());
ASSERT_FALSE(metadata_accessor()->statistics()->HasDistinctCount());
ASSERT_FALSE(metadata_accessor()->statistics()->HasMinMax());
ASSERT_EQ(metadata_accessor()->statistics()->null_count(), SMALL_SIZE);

// GeoStatistics should exist but all components should be marked as uncalculated
ASSERT_TRUE(metadata_accessor()->is_geo_stats_set());
Expand Down
8 changes: 5 additions & 3 deletions cpp/src/parquet/metadata.cc
Original file line number Diff line number Diff line change
Expand Up @@ -307,8 +307,10 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl {
DCHECK(writer_version_ != nullptr);
// If the column statistics don't exist, or the column sort order is unknown
// and the column is not a geometry column, we cannot use the column stats
bool is_geometry =
descr_->logical_type() != nullptr && descr_->logical_type()->is_geometry();
if (!column_metadata_->__isset.statistics ||
descr_->sort_order() == SortOrder::UNKNOWN) {
(descr_->sort_order() == SortOrder::UNKNOWN && !is_geometry)) {
Comment on lines +310 to +313
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Again, maybe we need a descr_->can_read_statistics? There's a HasCorrectStatistics(), too, and maybe the check needs to be there. I wonder whether the types currently marked as unsorted had null counts written reliably by other implementations or whether we have to ignore those?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

According to the spec, only the INT96 primitive type and the INTERVAL logical type have an undefined column order (apart from complex types such as map, list, variant, and the geo types).

However, I think parquet-java does not cleanly implement the column order semantics.

return false;
}
{
Expand Down Expand Up @@ -1552,8 +1554,8 @@ bool ApplicationVersion::HasCorrectStatistics(Type::type col_type,
return true;
}

// Unknown sort order has incorrect stats
if (SortOrder::UNKNOWN == sort_order) {
// Unknown sort order has incorrect stats if the min or the max are specified
if (SortOrder::UNKNOWN == sort_order && (statistics.has_min || statistics.has_max)) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we just check geometry & geography types here?

return false;
}

Expand Down
102 changes: 102 additions & 0 deletions cpp/src/parquet/statistics.cc
Original file line number Diff line number Diff line change
Expand Up @@ -963,6 +963,89 @@ std::shared_ptr<Comparator> DoMakeComparator(Type::type physical_type,
return nullptr;
}

template <typename DType>
class UnsortedTypedStatisticsImpl : public TypedStatistics<DType> {
Comment on lines +966 to +967
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure this is the answer here...I just needed a TypedStatistics<> to make this work in the ColumnWriter

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps we can reuse the existing typed one and just ignore the stats on write? (Seems inefficient but may be more compact?)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, the Java impl did this because the spec advises that min/max values are undefined and should not be used in this case. If we go with this approach, perhaps we need to disable page index (at least the column index) to reduce file size.

public:
using T = typename DType::c_type;

// Builds an empty statistics object for `descr`; both counters start at zero
// through their in-class initializers.
explicit UnsortedTypedStatisticsImpl(const ColumnDescriptor* descr)
    : descr_{descr} {}

// Builds a statistics object for `descr` whose counters are seeded with the
// supplied `num_values` and `null_count`.
UnsortedTypedStatisticsImpl(const ColumnDescriptor* descr, int64_t num_values,
                            int64_t null_count)
    : descr_{descr}, num_values_{num_values}, null_count_{null_count} {}

// A distinct count is never reported by this implementation.
bool HasDistinctCount() const override {
  return false;
}
// Min/max are never reported by this implementation.
bool HasMinMax() const override {
  return false;
}
// The null count is always reported as available.
bool HasNullCount() const override {
  return true;
}

// Returns the accumulated null counter.
int64_t null_count() const override {
  return null_count_;
}

// No distinct count is tracked here (HasDistinctCount() is always false), so
// report 0 instead of the value counter: returning the number of values could
// be mistaken for a real cardinality by a caller that skips the Has* check.
int64_t distinct_count() const override {
  return 0;
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we throw or return -1?


void Reset() override {
null_count_ = 0;
num_values_ = 0;
}

// No minimum is tracked, so the encoded minimum is always the empty string.
std::string EncodeMin() const override {
  return std::string();
}

// No maximum is tracked, so the encoded maximum is always the empty string.
std::string EncodeMax() const override {
  return std::string();
}

EncodedStatistics Encode() override {
EncodedStatistics out;
out.set_null_count(null_count_);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need all_null_value here?

return out;
}

// Reports the physical type associated with the DType template parameter.
Type::type physical_type() const override {
  return DType::type_num;
}

bool Equals(const Statistics& other) const override { return false; }

// Returns the accumulated value counter.
int64_t num_values() const override {
  return num_values_;
}

// Returns the column descriptor supplied at construction.
const ColumnDescriptor* descr() const override {
  return descr_;
}

// No minimum is tracked; returns a reference to the value-initialized
// placeholder member so the accessor still has something to bind to.
const T& min() const override {
  return dummy_minmax_;
}

// No maximum is tracked; returns a reference to the value-initialized
// placeholder member so the accessor still has something to bind to.
const T& max() const override {
  return dummy_minmax_;
}

// Folds another statistics object into this one by adding its null and value
// counters to ours.
void Merge(const TypedStatistics<DType>& other) override {
  const int64_t other_values = other.num_values();
  const int64_t other_nulls = other.null_count();
  num_values_ += other_values;
  null_count_ += other_nulls;
}

void Update(const T* values, int64_t num_values, int64_t null_count) override {
num_values_ += num_values;
null_count_ += null_count;
}

void UpdateSpaced(const T* values, const uint8_t* valid_bits, int64_t valid_bits_offset,
int64_t num_spaced_values, int64_t num_values,
int64_t null_count) override {
num_values_ += num_values;
null_count_ += null_count;
}

// Accounts for an Arrow array. The array contents are not inspected; when
// `update_counts` is true only the counters are advanced, otherwise this is a
// no-op.
//
// The value counter is advanced by the number of non-null entries
// (length - null_count) rather than by the full array length. This matches the
// pointer-based overloads, which receive `num_values` separately from the
// spaced length, so the counter has the same meaning whichever path feeds it.
void Update(const ::arrow::Array& values, bool update_counts = true) override {
  if (!update_counts) {
    return;
  }
  const int64_t nulls = values.null_count();
  null_count_ += nulls;
  num_values_ += values.length() - nulls;
}

// Deliberately a no-op: min/max are not stored by this implementation.
void SetMinMax(const T& min, const T& max) override {
}

// Adds `n` to the null counter.
void IncrementNullCount(int64_t n) override {
  null_count_ += n;
}

// Adds `n` to the value counter.
void IncrementNumValues(int64_t n) override {
  num_values_ += n;
}

private:
// Column descriptor supplied at construction; null when default-initialized.
const ColumnDescriptor* descr_{};
// Running value counter.
// NOTE(review): presumably counts non-null values only -- confirm against callers.
int64_t num_values_{};
// Running null counter.
int64_t null_count_{};
// Value-initialized placeholder handed out by min()/max(); SetMinMax() never writes it.
T dummy_minmax_{};
};

} // namespace

// ----------------------------------------------------------------------
Expand All @@ -982,6 +1065,15 @@ std::shared_ptr<Comparator> Comparator::Make(const ColumnDescriptor* descr) {

std::shared_ptr<Statistics> Statistics::Make(const ColumnDescriptor* descr,
::arrow::MemoryPool* pool) {
if (descr->logical_type() && descr->logical_type()->is_geometry()) {
switch (descr->physical_type()) {
case Type::BYTE_ARRAY:
return std::make_shared<UnsortedTypedStatisticsImpl<ByteArrayType>>(descr);
default:
ParquetException::NYI("Unsorted statistics not implemented");
}
}

switch (descr->physical_type()) {
case Type::BOOLEAN:
return std::make_shared<TypedStatisticsImpl<BooleanType>>(descr, pool);
Expand Down Expand Up @@ -1046,6 +1138,16 @@ std::shared_ptr<Statistics> Statistics::Make(const ColumnDescriptor* descr,
int64_t distinct_count, bool has_min_max,
bool has_null_count, bool has_distinct_count,
::arrow::MemoryPool* pool) {
if (descr->logical_type() && descr->logical_type()->is_geometry()) {
switch (descr->physical_type()) {
case Type::BYTE_ARRAY:
return std::make_shared<UnsortedTypedStatisticsImpl<ByteArrayType>>(
descr, num_values, null_count);
default:
ParquetException::NYI("Unsorted statistics not implemented");
}
}

#define MAKE_STATS(CAP_TYPE, KLASS) \
case Type::CAP_TYPE: \
return std::make_shared<TypedStatisticsImpl<KLASS>>( \
Expand Down