Skip to content

Commit f50cf6b

Browse files
kaka11chen924060929mrhhsg
committed
[feature](reader) Optimize Complex Type Column Reading with Column Pruning.
Co-authored-by: 924060929 <[email protected]> Co-authored-by: Jerry Hu <[email protected]>
1 parent ada3074 commit f50cf6b

File tree

150 files changed

+23523
-10093
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

150 files changed

+23523
-10093
lines changed

be/src/http/action/debug_point_action.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,6 @@ void BaseDebugPointAction::handle(HttpRequest* req) {
3535
"Disable debug points. please check config::enable_debug_points");
3636
}
3737
std::string result = status.to_json();
38-
LOG(INFO) << "handle request result:" << result;
3938
if (status.ok()) {
4039
HttpChannel::send_reply(req, HttpStatus::OK, result);
4140
} else {

be/src/olap/iterators.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,6 @@ struct IteratorRowRef;
4646
namespace segment_v2 {
4747
struct SubstreamIterator;
4848
}
49-
5049
class StorageReadOptions {
5150
public:
5251
struct KeyRange {
@@ -143,6 +142,9 @@ class StorageReadOptions {
143142
std::map<ColumnId, size_t> vir_cid_to_idx_in_block;
144143
std::map<size_t, vectorized::DataTypePtr> vir_col_idx_to_type;
145144

145+
std::map<int32_t, TColumnAccessPaths> all_access_paths;
146+
std::map<int32_t, TColumnAccessPaths> predicate_access_paths;
147+
146148
std::shared_ptr<vectorized::ScoreRuntime> score_runtime;
147149
CollectionStatisticsPtr collection_statistics;
148150

be/src/olap/rowset/beta_rowset_reader.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,10 @@ Status BetaRowsetReader::get_segment_iterators(RowsetReaderContext* read_context
103103
_read_options.remaining_conjunct_roots = _read_context->remaining_conjunct_roots;
104104
_read_options.common_expr_ctxs_push_down = _read_context->common_expr_ctxs_push_down;
105105
_read_options.virtual_column_exprs = _read_context->virtual_column_exprs;
106+
107+
_read_options.all_access_paths = _read_context->all_access_paths;
108+
_read_options.predicate_access_paths = _read_context->predicate_access_paths;
109+
106110
_read_options.ann_topn_runtime = _read_context->ann_topn_runtime;
107111
_read_options.vir_cid_to_idx_in_block = _read_context->vir_cid_to_idx_in_block;
108112
_read_options.vir_col_idx_to_type = _read_context->vir_col_idx_to_type;

be/src/olap/rowset/rowset_reader_context.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@
1818
#ifndef DORIS_BE_SRC_OLAP_ROWSET_ROWSET_READER_CONTEXT_H
1919
#define DORIS_BE_SRC_OLAP_ROWSET_ROWSET_READER_CONTEXT_H
2020

21+
#include <vector>
22+
2123
#include "io/io_common.h"
2224
#include "olap/column_predicate.h"
2325
#include "olap/olap_common.h"
@@ -90,6 +92,9 @@ struct RowsetReaderContext {
9092
std::map<ColumnId, size_t> vir_cid_to_idx_in_block;
9193
std::map<size_t, vectorized::DataTypePtr> vir_col_idx_to_type;
9294

95+
std::map<int32_t, TColumnAccessPaths> all_access_paths;
96+
std::map<int32_t, TColumnAccessPaths> predicate_access_paths;
97+
9398
std::shared_ptr<vectorized::ScoreRuntime> score_runtime;
9499
CollectionStatisticsPtr collection_statistics;
95100
std::shared_ptr<segment_v2::AnnTopNRuntime> ann_topn_runtime;

be/src/olap/rowset/segment_v2/column_reader.cpp

Lines changed: 351 additions & 10 deletions
Large diffs are not rendered by default.

be/src/olap/rowset/segment_v2/column_reader.h

Lines changed: 69 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
#pragma once
1919

20+
#include <gen_cpp/Descriptors_types.h>
2021
#include <gen_cpp/segment_v2.pb.h>
2122
#include <sys/types.h>
2223

@@ -69,6 +70,8 @@ class FileReader;
6970
struct Slice;
7071
struct StringRef;
7172

73+
using TColumnAccessPaths = std::vector<TColumnAccessPath>;
74+
7275
namespace segment_v2 {
7376

7477
class EncodingInfo;
@@ -271,7 +274,6 @@ class ColumnReader : public MetadataAdder<ColumnReader>,
271274
Status _calculate_row_ranges(const std::vector<uint32_t>& page_indexes, RowRanges* row_ranges,
272275
const ColumnIteratorOptions& iter_opts);
273276

274-
private:
275277
int64_t _meta_length;
276278
FieldType _meta_type;
277279
FieldType _meta_children_column_type;
@@ -366,8 +368,54 @@ class ColumnIterator {
366368

367369
virtual bool is_all_dict_encoding() const { return false; }
368370

371+
virtual Status set_access_paths(const TColumnAccessPaths& all_access_paths,
372+
const TColumnAccessPaths& predicate_access_paths) {
373+
if (!predicate_access_paths.empty()) {
374+
_reading_flag = ReadingFlag::READING_FOR_PREDICATE;
375+
}
376+
return Status::OK();
377+
}
378+
379+
void set_column_name(const std::string& column_name) { _column_name = column_name; }
380+
381+
const std::string& column_name() const { return _column_name; }
382+
383+
// Since there may be multiple paths with conflicts or overlaps,
384+
// we need to define several reading flags:
385+
//
386+
// NORMAL_READING — Default value, indicating that the column should be read.
387+
// SKIP_READING — The column should not be read.
388+
// NEED_TO_READ — The column must be read.
389+
// READING_FOR_PREDICATE — The column is required for predicate evaluation.
390+
//
391+
// For example, suppose there are two paths:
392+
// - Path 1 specifies that column A needs to be read, so it is marked as NEED_TO_READ.
393+
// - Path 2 specifies that the column should not be read, but since it is already marked as NEED_TO_READ,
394+
// it should not be changed to SKIP_READING.
395+
enum class ReadingFlag : int {
396+
NORMAL_READING,
397+
SKIP_READING,
398+
NEED_TO_READ,
399+
READING_FOR_PREDICATE
400+
};
401+
void set_reading_flag(ReadingFlag flag) {
402+
if (static_cast<int>(flag) > static_cast<int>(_reading_flag)) {
403+
_reading_flag = flag;
404+
}
405+
}
406+
407+
ReadingFlag reading_flag() const { return _reading_flag; }
408+
409+
virtual void set_need_to_read() { set_reading_flag(ReadingFlag::NEED_TO_READ); }
410+
411+
virtual void remove_pruned_sub_iterators() {};
412+
369413
protected:
414+
Result<TColumnAccessPaths> _get_sub_access_paths(const TColumnAccessPaths& access_paths);
370415
ColumnIteratorOptions _opts;
416+
417+
ReadingFlag _reading_flag {ReadingFlag::NORMAL_READING};
418+
std::string _column_name;
371419
};
372420

373421
// This iterator is used to read column data from file
@@ -504,6 +552,13 @@ class MapFileColumnIterator final : public ColumnIterator {
504552
return _offsets_iterator->get_current_ordinal();
505553
}
506554

555+
Status set_access_paths(const TColumnAccessPaths& all_access_paths,
556+
const TColumnAccessPaths& predicate_access_paths) override;
557+
558+
void set_need_to_read() override;
559+
560+
void remove_pruned_sub_iterators() override;
561+
507562
private:
508563
std::shared_ptr<ColumnReader> _map_reader = nullptr;
509564
ColumnIteratorUPtr _null_iterator;
@@ -533,6 +588,13 @@ class StructFileColumnIterator final : public ColumnIterator {
533588
return _sub_column_iterators[0]->get_current_ordinal();
534589
}
535590

591+
Status set_access_paths(const TColumnAccessPaths& all_access_paths,
592+
const TColumnAccessPaths& predicate_access_paths) override;
593+
594+
void set_need_to_read() override;
595+
596+
void remove_pruned_sub_iterators() override;
597+
536598
private:
537599
std::shared_ptr<ColumnReader> _struct_reader = nullptr;
538600
ColumnIteratorUPtr _null_iterator;
@@ -561,6 +623,12 @@ class ArrayFileColumnIterator final : public ColumnIterator {
561623
return _offset_iterator->get_current_ordinal();
562624
}
563625

626+
Status set_access_paths(const TColumnAccessPaths& all_access_paths,
627+
const TColumnAccessPaths& predicate_access_paths) override;
628+
void set_need_to_read() override;
629+
630+
void remove_pruned_sub_iterators() override;
631+
564632
private:
565633
std::shared_ptr<ColumnReader> _array_reader = nullptr;
566634
std::unique_ptr<OffsetFileColumnIterator> _offset_iterator;

be/src/olap/rowset/segment_v2/segment.cpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,14 @@
1717

1818
#include "olap/rowset/segment_v2/segment.h"
1919

20+
#include <gen_cpp/Descriptors_types.h>
2021
#include <gen_cpp/PlanNodes_types.h>
2122
#include <gen_cpp/olap_file.pb.h>
2223
#include <gen_cpp/segment_v2.pb.h>
2324

2425
#include <cstring>
2526
#include <memory>
27+
#include <sstream>
2628
#include <utility>
2729

2830
#include "cloud/config.h"
@@ -765,6 +767,20 @@ Status Segment::new_column_iterator(const TabletColumn& tablet_column,
765767
sparse_column_cache_ptr));
766768
} else {
767769
RETURN_IF_ERROR(reader->new_iterator(iter, &tablet_column, opt));
770+
if (opt->all_access_paths.contains(unique_id) ||
771+
opt->predicate_access_paths.contains(unique_id)) {
772+
const auto& all_access_paths = opt->all_access_paths.contains(unique_id)
773+
? opt->all_access_paths.at(unique_id)
774+
: TColumnAccessPaths {};
775+
const auto& predicate_access_paths = opt->predicate_access_paths.contains(unique_id)
776+
? opt->predicate_access_paths.at(unique_id)
777+
: TColumnAccessPaths {};
778+
779+
// set column name to apply access paths.
780+
(*iter)->set_column_name(tablet_column.name());
781+
RETURN_IF_ERROR((*iter)->set_access_paths(all_access_paths, predicate_access_paths));
782+
(*iter)->remove_pruned_sub_iterators();
783+
}
768784
}
769785

770786
if (config::enable_column_type_check && !tablet_column.has_path_info() &&

be/src/olap/tablet_reader.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -263,6 +263,9 @@ Status TabletReader::_capture_rs_readers(const ReaderParams& read_params) {
263263
_reader_context.ann_topn_runtime = read_params.ann_topn_runtime;
264264

265265
_reader_context.condition_cache_digest = read_params.condition_cache_digest;
266+
_reader_context.all_access_paths = read_params.all_access_paths;
267+
_reader_context.predicate_access_paths = read_params.predicate_access_paths;
268+
266269
return Status::OK();
267270
}
268271

be/src/olap/tablet_reader.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
#pragma once
1919

20+
#include <gen_cpp/Descriptors_types.h>
2021
#include <gen_cpp/PaloInternalService_types.h>
2122
#include <gen_cpp/PlanNodes_types.h>
2223
#include <stddef.h>
@@ -146,6 +147,9 @@ class TabletReader {
146147
// slots that cast may be eliminated in storage layer
147148
std::map<std::string, vectorized::DataTypePtr> target_cast_type_for_variants;
148149

150+
std::map<int32_t, TColumnAccessPaths> all_access_paths;
151+
std::map<int32_t, TColumnAccessPaths> predicate_access_paths;
152+
149153
std::vector<RowSetSplits> rs_splits;
150154
// For unique key table with merge-on-write
151155
DeleteBitmapPtr delete_bitmap = nullptr;

be/src/olap/tablet_schema.cpp

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1684,6 +1684,14 @@ vectorized::Block TabletSchema::create_block(
16841684
tablet_columns_need_convert_null->find(cid) !=
16851685
tablet_columns_need_convert_null->end());
16861686
auto data_type = vectorized::DataTypeFactory::instance().create_data_type(col, is_nullable);
1687+
if (col.type() == FieldType::OLAP_FIELD_TYPE_STRUCT ||
1688+
col.type() == FieldType::OLAP_FIELD_TYPE_MAP ||
1689+
col.type() == FieldType::OLAP_FIELD_TYPE_ARRAY) {
1690+
if (_pruned_columns_data_type.contains(col.unique_id())) {
1691+
data_type = _pruned_columns_data_type.at(col.unique_id());
1692+
}
1693+
}
1694+
16871695
if (_vir_col_idx_to_unique_id.contains(cid)) {
16881696
block.insert({vectorized::ColumnNothing::create(0), data_type, col.name()});
16891697
VLOG_DEBUG << fmt::format(
@@ -1703,7 +1711,13 @@ vectorized::Block TabletSchema::create_block(bool ignore_dropped_col) const {
17031711
if (ignore_dropped_col && is_dropped_column(*col)) {
17041712
continue;
17051713
}
1714+
17061715
auto data_type = vectorized::DataTypeFactory::instance().create_data_type(*col);
1716+
if (col->type() == FieldType::OLAP_FIELD_TYPE_STRUCT) {
1717+
if (_pruned_columns_data_type.contains(col->unique_id())) {
1718+
data_type = _pruned_columns_data_type.at(col->unique_id());
1719+
}
1720+
}
17071721
block.insert({data_type->create_column(), data_type, col->name()});
17081722
}
17091723
return block;
@@ -1714,6 +1728,11 @@ vectorized::Block TabletSchema::create_block_by_cids(const std::vector<uint32_t>
17141728
for (const auto& cid : cids) {
17151729
const auto& col = *_cols[cid];
17161730
auto data_type = vectorized::DataTypeFactory::instance().create_data_type(col);
1731+
if (col.type() == FieldType::OLAP_FIELD_TYPE_STRUCT) {
1732+
if (_pruned_columns_data_type.contains(col.unique_id())) {
1733+
data_type = _pruned_columns_data_type.at(col.unique_id());
1734+
}
1735+
}
17171736
block.insert({data_type->create_column(), data_type, col.name()});
17181737
}
17191738
return block;

0 commit comments

Comments
 (0)