Skip to content

Commit 8978cef

Browse files
committed
apacheGH-39562: [C++][Parquet] Fix crash
TODO complete this description
1 parent ac50918 commit 8978cef

File tree

2 files changed

+15
-4
lines changed

2 files changed

+15
-4
lines changed

cpp/src/arrow/dataset/file_parquet.cc

+11-3
Original file line numberDiff line numberDiff line change
@@ -813,11 +813,17 @@ Status ParquetFileFragment::EnsureCompleteMetadata(parquet::arrow::FileReader* r
813813

814814
Status ParquetFileFragment::SetMetadata(
815815
std::shared_ptr<parquet::FileMetaData> metadata,
816-
std::shared_ptr<parquet::arrow::SchemaManifest> manifest) {
816+
std::shared_ptr<parquet::arrow::SchemaManifest> manifest,
817+
std::shared_ptr<parquet::FileMetaData> original_metadata) {
817818
DCHECK(row_groups_.has_value());
818819

819820
metadata_ = std::move(metadata);
820821
manifest_ = std::move(manifest);
822+
original_metadata_ = original_metadata ? std::move(original_metadata) : metadata_;
823+
// The SchemaDescriptor needs to be owned by a FileMetaData instance,
824+
// because SchemaManifest only stores a raw pointer (GH-39562).
825+
DCHECK_EQ(manifest_->descr, original_metadata_->schema())
826+
<< "SchemaDescriptor should be owned by the original FileMetaData";
821827

822828
statistics_expressions_.resize(row_groups_->size(), compute::literal(true));
823829
statistics_expressions_complete_.resize(manifest_->descr->num_columns(), false);
@@ -846,7 +852,8 @@ Result<FragmentVector> ParquetFileFragment::SplitByRowGroup(
846852
parquet_format_.MakeFragment(source_, partition_expression(),
847853
physical_schema_, {row_group}));
848854

849-
RETURN_NOT_OK(fragment->SetMetadata(metadata_, manifest_));
855+
RETURN_NOT_OK(fragment->SetMetadata(metadata_, manifest_,
856+
/*original_metadata=*/original_metadata_));
850857
fragments[i++] = std::move(fragment);
851858
}
852859

@@ -1106,7 +1113,8 @@ ParquetDatasetFactory::CollectParquetFragments(const Partitioning& partitioning)
11061113
format_->MakeFragment({path, filesystem_}, std::move(partition_expression),
11071114
physical_schema_, std::move(row_groups)));
11081115

1109-
RETURN_NOT_OK(fragment->SetMetadata(metadata_subset, manifest_));
1116+
RETURN_NOT_OK(fragment->SetMetadata(metadata_subset, manifest_,
1117+
/*original_metadata=*/metadata_));
11101118
fragments[i++] = std::move(fragment);
11111119
}
11121120

cpp/src/arrow/dataset/file_parquet.h

+4-1
Original file line numberDiff line numberDiff line change
@@ -188,7 +188,8 @@ class ARROW_DS_EXPORT ParquetFileFragment : public FileFragment {
188188
std::optional<std::vector<int>> row_groups);
189189

190190
Status SetMetadata(std::shared_ptr<parquet::FileMetaData> metadata,
191-
std::shared_ptr<parquet::arrow::SchemaManifest> manifest);
191+
std::shared_ptr<parquet::arrow::SchemaManifest> manifest,
192+
std::shared_ptr<parquet::FileMetaData> original_metadata = {});
192193

193194
// Overridden to opportunistically set metadata since a reader must be opened anyway.
194195
Result<std::shared_ptr<Schema>> ReadPhysicalSchemaImpl() override {
@@ -219,6 +220,8 @@ class ARROW_DS_EXPORT ParquetFileFragment : public FileFragment {
219220
std::vector<bool> statistics_expressions_complete_;
220221
std::shared_ptr<parquet::FileMetaData> metadata_;
221222
std::shared_ptr<parquet::arrow::SchemaManifest> manifest_;
223+
// The FileMetaData that owns the SchemaDescriptor pointed by SchemaManifest.
224+
std::shared_ptr<parquet::FileMetaData> original_metadata_;
222225

223226
friend class ParquetFileFormat;
224227
friend class ParquetDatasetFactory;

0 commit comments

Comments
 (0)