Skip to content

Commit 3647221

Browse files
committed
update
1 parent c767d40 commit 3647221

10 files changed

+177
-2698
lines changed

be/src/vec/exec/format/parquet/byte_array_dict_decoder.cpp

Lines changed: 2 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -140,19 +140,11 @@ Status ByteArrayDictDecoder::_decode_values(MutableColumnPtr& doris_column, Data
140140
break;
141141
}
142142
case ColumnSelectVector::FILTERED_CONTENT: {
143-
// In lazy materialization, keep filtered rows (fill with dict data to maintain row count)
144-
std::vector<StringRef> string_values;
145-
string_values.reserve(run_length);
146-
for (size_t i = 0; i < run_length; ++i) {
147-
string_values.emplace_back(_dict_items[_indexes[dict_index++]]);
148-
}
149-
doris_column->insert_many_strings_overflow(string_values.data(), run_length,
150-
_max_value_length);
143+
dict_index += run_length;
151144
break;
152145
}
153146
case ColumnSelectVector::FILTERED_NULL: {
154-
// In lazy materialization, keep filtered null rows (fill with defaults to maintain row count)
155-
doris_column->insert_many_defaults(run_length);
147+
// do nothing
156148
break;
157149
}
158150
}

be/src/vec/exec/format/parquet/parquet_common.h

Lines changed: 14 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -154,19 +154,22 @@ class ColumnSelectVector {
154154

155155
_num_filtered = num_values - num_read;
156156

157-
// Fill null_map for ALL rows (including filtered ones) to maintain row count consistency
158-
if (null_map != nullptr) {
157+
if (null_map != nullptr && num_read > 0) {
159158
NullMap& map_data_column = *null_map;
160159
auto null_map_index = map_data_column.size();
161-
map_data_column.resize(null_map_index +
162-
num_values); // Resize to num_values, not num_read
163-
164-
// Fill null map for all rows based on _data_map
165-
for (i = 0; i < num_values; ++i) {
166-
if (_data_map[i] == CONTENT || _data_map[i] == FILTERED_CONTENT) {
167-
map_data_column[null_map_index++] = (UInt8) false;
168-
} else { // NULL_DATA or FILTERED_NULL
169-
map_data_column[null_map_index++] = (UInt8) true;
160+
map_data_column.resize(null_map_index + num_read);
161+
162+
if (_num_nulls == 0) {
163+
memset(map_data_column.data() + null_map_index, 0, num_read);
164+
} else if (_num_nulls == num_values) {
165+
memset(map_data_column.data() + null_map_index, 1, num_read);
166+
} else {
167+
for (i = 0; i < num_values; ++i) {
168+
if (_data_map[i] == CONTENT) {
169+
map_data_column[null_map_index++] = (UInt8) false;
170+
} else if (_data_map[i] == NULL_DATA) {
171+
map_data_column[null_map_index++] = (UInt8) true;
172+
}
170173
}
171174
}
172175
}

be/src/vec/exec/format/parquet/vparquet_column_reader.cpp

Lines changed: 9 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -111,13 +111,13 @@ Status ParquetColumnReader::create(io::FileReaderSPtr file, FieldSchema* field,
111111
size_t max_buf_size, const tparquet::OffsetIndex* offset_index,
112112
const std::set<uint64_t>& column_ids,
113113
const std::set<uint64_t>& filter_column_ids) {
114-
if (!column_ids.empty()) {
115-
uint64_t field_column_id = field->get_column_id();
116-
if (column_ids.find(field_column_id) == column_ids.end()) {
117-
reader = nullptr; // Don't create reader for this column
118-
return Status::OK();
119-
}
120-
}
114+
// if (!column_ids.empty()) {
115+
// uint64_t field_column_id = field->get_column_id();
116+
// if (column_ids.find(field_column_id) == column_ids.end()) {
117+
// reader = nullptr; // Don't create reader for this column
118+
// return Status::OK();
119+
// }
120+
// }
121121
if (field->data_type->get_primitive_type() == TYPE_ARRAY) {
122122
std::unique_ptr<ParquetColumnReader> element_reader;
123123
RETURN_IF_ERROR(create(file, &field->children[0], row_group, row_ranges, ctz, io_ctx,
@@ -146,7 +146,8 @@ Status ParquetColumnReader::create(io::FileReaderSPtr file, FieldSchema* field,
146146
std::unordered_map<std::string, std::unique_ptr<ParquetColumnReader>> child_readers;
147147
child_readers.reserve(field->children.size());
148148
for (int i = 0; i < field->children.size(); ++i) {
149-
if (column_ids.find(field->children[i].get_column_id()) == column_ids.end()) {
149+
if (!column_ids.empty() &&
150+
column_ids.find(field->children[i].get_column_id()) == column_ids.end()) {
150151
continue; // Skip this child as it's not in the required column_ids
151152
}
152153
std::unique_ptr<ParquetColumnReader> child_reader;

be/src/vec/exec/format/table/hive/hive_orc_nested_column_utils.cpp

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,44 @@ void HiveOrcNestedColumnUtils::extract_nested_column_ids(
5151
// Track whether any child column was added to determine if parent should be included
5252
bool has_child_columns = false;
5353

54+
// For MAP type, check if we have conflicting access patterns and normalize them
55+
// Problem scenario: Same MAP column has mixed access patterns in different paths:
56+
// - Some paths use "*" (need both keys and values)
57+
// - Some paths use "KEYS" or "VALUES" (only need one side)
58+
// Example:
59+
// path1: ["map_col", "*", "nested_field"] -> needs key AND value
60+
// path2: ["map_col", "KEYS"] -> only needs keys
61+
// Solution: Merge all into "*" because wildcard is most permissive and subsumes specific access
62+
bool has_wildcard =
63+
child_paths_by_table_col_name.find("*") != child_paths_by_table_col_name.end();
64+
bool has_keys =
65+
child_paths_by_table_col_name.find("KEYS") != child_paths_by_table_col_name.end();
66+
bool has_values =
67+
child_paths_by_table_col_name.find("VALUES") != child_paths_by_table_col_name.end();
68+
69+
// If wildcard exists with KEYS or VALUES, merge them into wildcard:
70+
// - Wildcard "*" requires reading both keys and values
71+
// - Specific KEYS/VALUES requests are subsumed by wildcard
72+
// - After merge, only wildcard path remains, ensuring correct processing
73+
if (type.getKind() == orc::TypeKind::MAP && has_wildcard && (has_keys || has_values)) {
74+
// Merge KEYS paths into wildcard if present
75+
if (has_keys) {
76+
auto& wildcard_paths = child_paths_by_table_col_name["*"];
77+
auto& keys_paths = child_paths_by_table_col_name["KEYS"];
78+
wildcard_paths.insert(wildcard_paths.end(), keys_paths.begin(), keys_paths.end());
79+
child_paths_by_table_col_name.erase("KEYS");
80+
has_keys = false;
81+
}
82+
// Merge VALUES paths into wildcard if present
83+
if (has_values) {
84+
auto& wildcard_paths = child_paths_by_table_col_name["*"];
85+
auto& values_paths = child_paths_by_table_col_name["VALUES"];
86+
wildcard_paths.insert(wildcard_paths.end(), values_paths.begin(), values_paths.end());
87+
child_paths_by_table_col_name.erase("VALUES");
88+
has_values = false;
89+
}
90+
}
91+
5492
bool only_access_keys = false;
5593
bool only_access_values = false;
5694
// Efficiently traverse children - similar to create_iceberg_projected_layout's nested column processing
@@ -73,6 +111,7 @@ void HiveOrcNestedColumnUtils::extract_nested_column_ids(
73111
case orc::TypeKind::MAP:
74112
if (i == 0) {
75113
DCHECK(type.getSubtypeCount() == 2);
114+
// Re-check after potential merge above
76115
if (child_paths_by_table_col_name.find("KEYS") !=
77116
child_paths_by_table_col_name.end()) {
78117
only_access_keys = true;

be/src/vec/exec/format/table/hive/hive_parquet_nested_column_utils.cpp

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,45 @@ void HiveParquetNestedColumnUtils::extract_nested_column_ids(
5151
// Track whether any child column was added to determine if parent should be included
5252
bool has_child_columns = false;
5353

54+
// For MAP type, check if we have conflicting access patterns and normalize them
55+
// Problem scenario: Same MAP column has mixed access patterns in different paths:
56+
// - Some paths use "*" (need both keys and values)
57+
// - Some paths use "KEYS" or "VALUES" (only need one side)
58+
// Example:
59+
// path1: ["map_col", "*", "nested_field"] -> needs key AND value
60+
// path2: ["map_col", "KEYS"] -> only needs keys
61+
// Solution: Merge all into "*" because wildcard is most permissive and subsumes specific access
62+
bool has_wildcard =
63+
child_paths_by_table_col_name.find("*") != child_paths_by_table_col_name.end();
64+
bool has_keys =
65+
child_paths_by_table_col_name.find("KEYS") != child_paths_by_table_col_name.end();
66+
bool has_values =
67+
child_paths_by_table_col_name.find("VALUES") != child_paths_by_table_col_name.end();
68+
69+
// If wildcard exists with KEYS or VALUES, merge them into wildcard:
70+
// - Wildcard "*" requires reading both keys and values
71+
// - Specific KEYS/VALUES requests are subsumed by wildcard
72+
// - After merge, only wildcard path remains, ensuring correct processing
73+
if (field_schema.data_type->get_primitive_type() == PrimitiveType::TYPE_MAP && has_wildcard &&
74+
(has_keys || has_values)) {
75+
// Merge KEYS paths into wildcard if present
76+
if (has_keys) {
77+
auto& wildcard_paths = child_paths_by_table_col_name["*"];
78+
auto& keys_paths = child_paths_by_table_col_name["KEYS"];
79+
wildcard_paths.insert(wildcard_paths.end(), keys_paths.begin(), keys_paths.end());
80+
child_paths_by_table_col_name.erase("KEYS");
81+
has_keys = false;
82+
}
83+
// Merge VALUES paths into wildcard if present
84+
if (has_values) {
85+
auto& wildcard_paths = child_paths_by_table_col_name["*"];
86+
auto& values_paths = child_paths_by_table_col_name["VALUES"];
87+
wildcard_paths.insert(wildcard_paths.end(), values_paths.begin(), values_paths.end());
88+
child_paths_by_table_col_name.erase("VALUES");
89+
has_values = false;
90+
}
91+
}
92+
5493
// Efficiently traverse children - similar to create_iceberg_projected_layout's nested column processing
5594
bool only_access_keys = false;
5695
bool only_access_values = false;

be/src/vec/exec/format/table/iceberg/iceberg_orc_nested_column_utils.cpp

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,41 @@ void IcebergOrcNestedColumnUtils::extract_nested_column_ids(
4747
}
4848
}
4949

50+
// For MAP type, check if we have conflicting access patterns and normalize them
51+
// Problem scenario: Same MAP column has mixed access patterns in different paths:
52+
// - Some paths use "*" (need both keys and values)
53+
// - Some paths use "KEYS" or "VALUES" (only need one side)
54+
// Example:
55+
// path1: ["map_col", "*", "nested_field"] -> needs key AND value
56+
// path2: ["map_col", "KEYS"] -> only needs keys
57+
// Solution: Merge all into "*" because wildcard is most permissive and subsumes specific access
58+
bool has_wildcard = child_paths_by_field_id.find("*") != child_paths_by_field_id.end();
59+
bool has_keys = child_paths_by_field_id.find("KEYS") != child_paths_by_field_id.end();
60+
bool has_values = child_paths_by_field_id.find("VALUES") != child_paths_by_field_id.end();
61+
62+
// If wildcard exists with KEYS or VALUES, merge them into wildcard:
63+
// - Wildcard "*" requires reading both keys and values
64+
// - Specific KEYS/VALUES requests are subsumed by wildcard
65+
// - After merge, only wildcard path remains, ensuring correct processing
66+
if (type.getKind() == orc::TypeKind::MAP && has_wildcard && (has_keys || has_values)) {
67+
// Merge KEYS paths into wildcard if present
68+
if (has_keys) {
69+
auto& wildcard_paths = child_paths_by_field_id["*"];
70+
auto& keys_paths = child_paths_by_field_id["KEYS"];
71+
wildcard_paths.insert(wildcard_paths.end(), keys_paths.begin(), keys_paths.end());
72+
child_paths_by_field_id.erase("KEYS");
73+
has_keys = false;
74+
}
75+
// Merge VALUES paths into wildcard if present
76+
if (has_values) {
77+
auto& wildcard_paths = child_paths_by_field_id["*"];
78+
auto& values_paths = child_paths_by_field_id["VALUES"];
79+
wildcard_paths.insert(wildcard_paths.end(), values_paths.begin(), values_paths.end());
80+
child_paths_by_field_id.erase("VALUES");
81+
has_values = false;
82+
}
83+
}
84+
5085
bool has_child_columns = false;
5186
bool only_access_keys = false;
5287
bool only_access_values = false;

be/src/vec/exec/format/table/iceberg/iceberg_parquet_nested_column_utils.cpp

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,42 @@ void IcebergParquetNestedColumnUtils::extract_nested_column_ids(
5050
// Track whether any child column was added to determine if parent should be included
5151
bool has_child_columns = false;
5252

53+
// For MAP type, check if we have conflicting access patterns and normalize them
54+
// Problem scenario: Same MAP column has mixed access patterns in different paths:
55+
// - Some paths use "*" (need both keys and values)
56+
// - Some paths use "KEYS" or "VALUES" (only need one side)
57+
// Example:
58+
// path1: ["map_col", "*", "nested_field"] -> needs key AND value
59+
// path2: ["map_col", "KEYS"] -> only needs keys
60+
// Solution: Merge all into "*" because wildcard is most permissive and subsumes specific access
61+
bool has_wildcard = child_paths_by_field_id.find("*") != child_paths_by_field_id.end();
62+
bool has_keys = child_paths_by_field_id.find("KEYS") != child_paths_by_field_id.end();
63+
bool has_values = child_paths_by_field_id.find("VALUES") != child_paths_by_field_id.end();
64+
65+
// If wildcard exists with KEYS or VALUES, merge them into wildcard:
66+
// - Wildcard "*" requires reading both keys and values
67+
// - Specific KEYS/VALUES requests are subsumed by wildcard
68+
// - After merge, only wildcard path remains, ensuring correct processing
69+
if (field_schema.data_type->get_primitive_type() == PrimitiveType::TYPE_MAP && has_wildcard &&
70+
(has_keys || has_values)) {
71+
// Merge KEYS paths into wildcard if present
72+
if (has_keys) {
73+
auto& wildcard_paths = child_paths_by_field_id["*"];
74+
auto& keys_paths = child_paths_by_field_id["KEYS"];
75+
wildcard_paths.insert(wildcard_paths.end(), keys_paths.begin(), keys_paths.end());
76+
child_paths_by_field_id.erase("KEYS");
77+
has_keys = false;
78+
}
79+
// Merge VALUES paths into wildcard if present
80+
if (has_values) {
81+
auto& wildcard_paths = child_paths_by_field_id["*"];
82+
auto& values_paths = child_paths_by_field_id["VALUES"];
83+
wildcard_paths.insert(wildcard_paths.end(), values_paths.begin(), values_paths.end());
84+
child_paths_by_field_id.erase("VALUES");
85+
has_values = false;
86+
}
87+
}
88+
5389
// Efficiently traverse children - similar to create_iceberg_projected_layout's nested column processing
5490
bool only_access_keys = false;
5591
bool only_access_values = false;

0 commit comments

Comments
 (0)