Skip to content

[opt](variant) improve performance for handling nullable column #50021

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 16, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ Status convert_and_write_column(vectorized::OlapBlockDataConvertor* converter,
const uint8_t* nullmap = converted_column->get_nullmap();
RETURN_IF_ERROR(writer->append(nullmap, converted_column->get_data(), num_rows));

converter->clear_source_content();
converter->clear_source_content(column_id);
return Status::OK();
}

Expand Down Expand Up @@ -291,8 +291,8 @@ Status VariantColumnWriterImpl::_process_root_column(vectorized::ColumnObject* p
.data()
: nullptr;
RETURN_IF_ERROR(_root_writer->append(nullmap, column->get_data(), num_rows));
converter->clear_source_content(column_id);
++column_id;
converter->clear_source_content();

_opts.meta->set_num_rows(num_rows);
return Status::OK();
Expand Down Expand Up @@ -408,8 +408,8 @@ Status VariantColumnWriterImpl::_process_sparse_column(
vectorized::ColumnObject::get_sparse_column_type());
RETURN_IF_ERROR(
_sparse_column_writer->append(column->get_nullmap(), column->get_data(), num_rows));
converter->clear_source_content(column_id);
++column_id;
converter->clear_source_content();

// get statistics
// todo: reuse the statistics already collected during the compaction stage
Expand Down
21 changes: 9 additions & 12 deletions be/src/vec/columns/column_object.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,7 @@ void ColumnObject::Subcolumn::insert(Field field, FieldInfo info) {
if (schema_util::is_conversion_required_between_integers(
base_type.idx, least_common_type.get_base_type_id())) {
VLOG_DEBUG << "Conversion between " << getTypeName(base_type.idx) << " and "
<< getTypeName(least_common_type.get_type_id());
<< getTypeName(least_common_type.get_base_type_id());
DataTypePtr base_data_type;
TypeIndex base_data_type_id;
get_least_supertype_jsonb(
Expand Down Expand Up @@ -792,7 +792,8 @@ void ColumnObject::try_insert(const Field& field) {
}
for (auto& entry : subcolumns) {
if (old_size == entry->data.size()) {
bool inserted = try_insert_default_from_nested(entry);
bool inserted = UNLIKELY(entry->path.has_nested_part() &&
try_insert_default_from_nested(entry));
if (!inserted) {
entry->data.insert_default();
}
Expand Down Expand Up @@ -838,7 +839,6 @@ bool ColumnObject::Subcolumn::is_null_at(size_t n) const {
}
ind -= part->size();
}

throw doris::Exception(ErrorCode::OUT_OF_BOUND, "Index ({}) for getting field is out of range",
n);
}
Expand Down Expand Up @@ -873,7 +873,6 @@ void ColumnObject::Subcolumn::get(size_t n, Field& res) const {

ind -= part->size();
}

throw doris::Exception(ErrorCode::OUT_OF_BOUND, "Index ({}) for getting field is out of range",
n);
}
Expand All @@ -894,20 +893,18 @@ void ColumnObject::Subcolumn::serialize_to_sparse_column(ColumnString* key, std:
row -= num_of_defaults_in_prefix;
for (size_t i = 0; i < data.size(); ++i) {
const auto& part = data[i];
size_t current_column_size = part->size();
const auto& nullable_col =
assert_cast<const ColumnNullable&, TypeCheckOnRelease::DISABLE>(*part);
size_t current_column_size = nullable_col.get_null_map_data().size();
if (row < current_column_size) {
// null values are not stored in the sparse column
if (!assert_cast<const ColumnNullable&, TypeCheckOnRelease::DISABLE>(*part).is_null_at(
row)) {
if (!nullable_col.is_null_at(row)) {
// insert key
key->insert_data(path.data(), path.size());

// every subcolumn is always Nullable
auto nullable_serde =
std::static_pointer_cast<DataTypeNullableSerDe>(data_serdes[i]);
auto& nullable_col =
assert_cast<const ColumnNullable&, TypeCheckOnRelease::DISABLE>(*part);

// insert value
ColumnString::Chars& chars = value->get_chars();
nullable_serde->get_nested_serde()->write_one_cell_to_binary(
Expand Down Expand Up @@ -1343,7 +1340,6 @@ size_t ColumnObject::Subcolumn::serialize_text_json(size_t n, BufferWritable& ou

ind -= part->size();
}

throw doris::Exception(ErrorCode::OUT_OF_BOUND,
"Index ({}) for serializing JSON is out of range", n);
}
Expand Down Expand Up @@ -1906,7 +1902,7 @@ Status ColumnObject::finalize(FinalizeMode mode) {
for (size_t i = 0; i < std::min(size_t(_max_subcolumns_count), sorted_by_size.size());
++i) {
// if too many null values, then consider it as sparse column
if ((double)sorted_by_size[i].second < (double)num_rows * 0.95) {
if ((double)sorted_by_size[i].second < (double)num_rows * 0.99) {
continue;
}
selected_path.insert(sorted_by_size[i].first);
Expand Down Expand Up @@ -2035,6 +2031,7 @@ void ColumnObject::clear_column_data() {
(*std::move(part)).clear();
}
entry->data.num_of_defaults_in_prefix = 0;
entry->data.current_num_of_defaults = 0;
entry->data.num_rows = 0;
}
serialized_sparse_column->clear();
Expand Down
2 changes: 1 addition & 1 deletion be/src/vec/columns/column_object.h
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ class ColumnObject final : public COWHelper<IColumn, ColumnObject> {

void reset_current_num_of_defaults() { current_num_of_defaults = 0; }

size_t cur_num_of_defaults() { return current_num_of_defaults; }
size_t cur_num_of_defaults() const { return current_num_of_defaults; }

void insert_many_defaults(size_t length);

Expand Down
Loading