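update: skip filtered rows in Parquet dict decoding, size the null map to rows actually read, and merge mixed MAP "*"/"KEYS"/"VALUES" access paths into the wildcard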

kaka11chen · kaka11chen · commit 3647221f1bd5 · 2025-10-22T10:58:00.000+08:00
diff --git a/be/src/vec/exec/format/parquet/byte_array_dict_decoder.cpp b/be/src/vec/exec/format/parquet/byte_array_dict_decoder.cpp
@@ -140,19 +140,11 @@ Status ByteArrayDictDecoder::_decode_values(MutableColumnPtr& doris_column, Data
             break;
         }
         case ColumnSelectVector::FILTERED_CONTENT: {
-            // In lazy materialization, keep filtered rows (fill with dict data to maintain row count)
-            std::vector<StringRef> string_values;
-            string_values.reserve(run_length);
-            for (size_t i = 0; i < run_length; ++i) {
-                string_values.emplace_back(_dict_items[_indexes[dict_index++]]);
-            }
-            doris_column->insert_many_strings_overflow(string_values.data(), run_length,
-                                                       _max_value_length);
+            dict_index += run_length;
             break;
         }
         case ColumnSelectVector::FILTERED_NULL: {
-            // In lazy materialization, keep filtered null rows (fill with defaults to maintain row count)
-            doris_column->insert_many_defaults(run_length);
+            // do nothing
             break;
         }
         }
diff --git a/be/src/vec/exec/format/parquet/parquet_common.h b/be/src/vec/exec/format/parquet/parquet_common.h
@@ -154,19 +154,22 @@ class ColumnSelectVector {
 
             _num_filtered = num_values - num_read;
 
-            // Fill null_map for ALL rows (including filtered ones) to maintain row count consistency
-            if (null_map != nullptr) {
+            if (null_map != nullptr && num_read > 0) {
                 NullMap& map_data_column = *null_map;
                 auto null_map_index = map_data_column.size();
-                map_data_column.resize(null_map_index +
-                                       num_values); // Resize to num_values, not num_read
-
-                // Fill null map for all rows based on _data_map
-                for (i = 0; i < num_values; ++i) {
-                    if (_data_map[i] == CONTENT || _data_map[i] == FILTERED_CONTENT) {
-                        map_data_column[null_map_index++] = (UInt8) false;
-                    } else { // NULL_DATA or FILTERED_NULL
-                        map_data_column[null_map_index++] = (UInt8) true;
+                map_data_column.resize(null_map_index + num_read);
+
+                if (_num_nulls == 0) {
+                    memset(map_data_column.data() + null_map_index, 0, num_read);
+                } else if (_num_nulls == num_values) {
+                    memset(map_data_column.data() + null_map_index, 1, num_read);
+                } else {
+                    for (i = 0; i < num_values; ++i) {
+                        if (_data_map[i] == CONTENT) {
+                            map_data_column[null_map_index++] = (UInt8) false;
+                        } else if (_data_map[i] == NULL_DATA) {
+                            map_data_column[null_map_index++] = (UInt8) true;
+                        }
                     }
                 }
             }
diff --git a/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp
@@ -111,13 +111,13 @@ Status ParquetColumnReader::create(io::FileReaderSPtr file, FieldSchema* field,
                                    size_t max_buf_size, const tparquet::OffsetIndex* offset_index,
                                    const std::set<uint64_t>& column_ids,
                                    const std::set<uint64_t>& filter_column_ids) {
-    if (!column_ids.empty()) {
-        uint64_t field_column_id = field->get_column_id();
-        if (column_ids.find(field_column_id) == column_ids.end()) {
-            reader = nullptr; // Don't create reader for this column
-            return Status::OK();
-        }
-    }
+    // if (!column_ids.empty()) {
+    //     uint64_t field_column_id = field->get_column_id();
+    //     if (column_ids.find(field_column_id) == column_ids.end()) {
+    //         reader = nullptr; // Don't create reader for this column
+    //         return Status::OK();
+    //     }
+    // }
     if (field->data_type->get_primitive_type() == TYPE_ARRAY) {
         std::unique_ptr<ParquetColumnReader> element_reader;
         RETURN_IF_ERROR(create(file, &field->children[0], row_group, row_ranges, ctz, io_ctx,
@@ -146,7 +146,8 @@ Status ParquetColumnReader::create(io::FileReaderSPtr file, FieldSchema* field,
         std::unordered_map<std::string, std::unique_ptr<ParquetColumnReader>> child_readers;
         child_readers.reserve(field->children.size());
         for (int i = 0; i < field->children.size(); ++i) {
-            if (column_ids.find(field->children[i].get_column_id()) == column_ids.end()) {
+            if (!column_ids.empty() &&
+                column_ids.find(field->children[i].get_column_id()) == column_ids.end()) {
                 continue; // Skip this child as it's not in the required column_ids
             }
             std::unique_ptr<ParquetColumnReader> child_reader;
diff --git a/be/src/vec/exec/format/table/hive/hive_orc_nested_column_utils.cpp b/be/src/vec/exec/format/table/hive/hive_orc_nested_column_utils.cpp
@@ -51,6 +51,44 @@ void HiveOrcNestedColumnUtils::extract_nested_column_ids(
     // Track whether any child column was added to determine if parent should be included
     bool has_child_columns = false;
 
+    // For MAP type, check if we have conflicting access patterns and normalize them
+    // Problem scenario: Same MAP column has mixed access patterns in different paths:
+    //   - Some paths use "*" (need both keys and values)
+    //   - Some paths use "KEYS" or "VALUES" (only need one side)
+    // Example:
+    //   path1: ["map_col", "*", "nested_field"]     -> needs key AND value
+    //   path2: ["map_col", "KEYS"]                  -> only needs keys
+    // Solution: Merge all into "*" because wildcard is most permissive and subsumes specific access
+    bool has_wildcard =
+            child_paths_by_table_col_name.find("*") != child_paths_by_table_col_name.end();
+    bool has_keys =
+            child_paths_by_table_col_name.find("KEYS") != child_paths_by_table_col_name.end();
+    bool has_values =
+            child_paths_by_table_col_name.find("VALUES") != child_paths_by_table_col_name.end();
+
+    // If wildcard exists with KEYS or VALUES, merge them into wildcard:
+    // - Wildcard "*" requires reading both keys and values
+    // - Specific KEYS/VALUES requests are subsumed by wildcard
+    // - After merge, only wildcard path remains, ensuring correct processing
+    if (type.getKind() == orc::TypeKind::MAP && has_wildcard && (has_keys || has_values)) {
+        // Merge KEYS paths into wildcard if present
+        if (has_keys) {
+            auto& wildcard_paths = child_paths_by_table_col_name["*"];
+            auto& keys_paths = child_paths_by_table_col_name["KEYS"];
+            wildcard_paths.insert(wildcard_paths.end(), keys_paths.begin(), keys_paths.end());
+            child_paths_by_table_col_name.erase("KEYS");
+            has_keys = false;
+        }
+        // Merge VALUES paths into wildcard if present
+        if (has_values) {
+            auto& wildcard_paths = child_paths_by_table_col_name["*"];
+            auto& values_paths = child_paths_by_table_col_name["VALUES"];
+            wildcard_paths.insert(wildcard_paths.end(), values_paths.begin(), values_paths.end());
+            child_paths_by_table_col_name.erase("VALUES");
+            has_values = false;
+        }
+    }
+
     bool only_access_keys = false;
     bool only_access_values = false;
     // Efficiently traverse children - similar to create_iceberg_projected_layout's nested column processing
@@ -73,6 +111,7 @@ void HiveOrcNestedColumnUtils::extract_nested_column_ids(
         case orc::TypeKind::MAP:
             if (i == 0) {
                 DCHECK(type.getSubtypeCount() == 2);
+                // Re-check after potential merge above
                 if (child_paths_by_table_col_name.find("KEYS") !=
                     child_paths_by_table_col_name.end()) {
                     only_access_keys = true;
diff --git a/be/src/vec/exec/format/table/hive/hive_parquet_nested_column_utils.cpp b/be/src/vec/exec/format/table/hive/hive_parquet_nested_column_utils.cpp
@@ -51,6 +51,45 @@ void HiveParquetNestedColumnUtils::extract_nested_column_ids(
     // Track whether any child column was added to determine if parent should be included
     bool has_child_columns = false;
 
+    // For MAP type, check if we have conflicting access patterns and normalize them
+    // Problem scenario: Same MAP column has mixed access patterns in different paths:
+    //   - Some paths use "*" (need both keys and values)
+    //   - Some paths use "KEYS" or "VALUES" (only need one side)
+    // Example:
+    //   path1: ["map_col", "*", "nested_field"]     -> needs key AND value
+    //   path2: ["map_col", "KEYS"]                  -> only needs keys
+    // Solution: Merge all into "*" because wildcard is most permissive and subsumes specific access
+    bool has_wildcard =
+            child_paths_by_table_col_name.find("*") != child_paths_by_table_col_name.end();
+    bool has_keys =
+            child_paths_by_table_col_name.find("KEYS") != child_paths_by_table_col_name.end();
+    bool has_values =
+            child_paths_by_table_col_name.find("VALUES") != child_paths_by_table_col_name.end();
+
+    // If wildcard exists with KEYS or VALUES, merge them into wildcard:
+    // - Wildcard "*" requires reading both keys and values
+    // - Specific KEYS/VALUES requests are subsumed by wildcard
+    // - After merge, only wildcard path remains, ensuring correct processing
+    if (field_schema.data_type->get_primitive_type() == PrimitiveType::TYPE_MAP && has_wildcard &&
+        (has_keys || has_values)) {
+        // Merge KEYS paths into wildcard if present
+        if (has_keys) {
+            auto& wildcard_paths = child_paths_by_table_col_name["*"];
+            auto& keys_paths = child_paths_by_table_col_name["KEYS"];
+            wildcard_paths.insert(wildcard_paths.end(), keys_paths.begin(), keys_paths.end());
+            child_paths_by_table_col_name.erase("KEYS");
+            has_keys = false;
+        }
+        // Merge VALUES paths into wildcard if present
+        if (has_values) {
+            auto& wildcard_paths = child_paths_by_table_col_name["*"];
+            auto& values_paths = child_paths_by_table_col_name["VALUES"];
+            wildcard_paths.insert(wildcard_paths.end(), values_paths.begin(), values_paths.end());
+            child_paths_by_table_col_name.erase("VALUES");
+            has_values = false;
+        }
+    }
+
     // Efficiently traverse children - similar to create_iceberg_projected_layout's nested column processing
     bool only_access_keys = false;
     bool only_access_values = false;
diff --git a/be/src/vec/exec/format/table/iceberg/iceberg_orc_nested_column_utils.cpp b/be/src/vec/exec/format/table/iceberg/iceberg_orc_nested_column_utils.cpp
@@ -47,6 +47,41 @@ void IcebergOrcNestedColumnUtils::extract_nested_column_ids(
         }
     }
 
+    // For MAP type, check if we have conflicting access patterns and normalize them
+    // Problem scenario: Same MAP column has mixed access patterns in different paths:
+    //   - Some paths use "*" (need both keys and values)
+    //   - Some paths use "KEYS" or "VALUES" (only need one side)
+    // Example:
+    //   path1: ["map_col", "*", "nested_field"]     -> needs key AND value
+    //   path2: ["map_col", "KEYS"]                  -> only needs keys
+    // Solution: Merge all into "*" because wildcard is most permissive and subsumes specific access
+    bool has_wildcard = child_paths_by_field_id.find("*") != child_paths_by_field_id.end();
+    bool has_keys = child_paths_by_field_id.find("KEYS") != child_paths_by_field_id.end();
+    bool has_values = child_paths_by_field_id.find("VALUES") != child_paths_by_field_id.end();
+
+    // If wildcard exists with KEYS or VALUES, merge them into wildcard:
+    // - Wildcard "*" requires reading both keys and values
+    // - Specific KEYS/VALUES requests are subsumed by wildcard
+    // - After merge, only wildcard path remains, ensuring correct processing
+    if (type.getKind() == orc::TypeKind::MAP && has_wildcard && (has_keys || has_values)) {
+        // Merge KEYS paths into wildcard if present
+        if (has_keys) {
+            auto& wildcard_paths = child_paths_by_field_id["*"];
+            auto& keys_paths = child_paths_by_field_id["KEYS"];
+            wildcard_paths.insert(wildcard_paths.end(), keys_paths.begin(), keys_paths.end());
+            child_paths_by_field_id.erase("KEYS");
+            has_keys = false;
+        }
+        // Merge VALUES paths into wildcard if present
+        if (has_values) {
+            auto& wildcard_paths = child_paths_by_field_id["*"];
+            auto& values_paths = child_paths_by_field_id["VALUES"];
+            wildcard_paths.insert(wildcard_paths.end(), values_paths.begin(), values_paths.end());
+            child_paths_by_field_id.erase("VALUES");
+            has_values = false;
+        }
+    }
+
     bool has_child_columns = false;
     bool only_access_keys = false;
     bool only_access_values = false;
diff --git a/be/src/vec/exec/format/table/iceberg/iceberg_parquet_nested_column_utils.cpp b/be/src/vec/exec/format/table/iceberg/iceberg_parquet_nested_column_utils.cpp
@@ -50,6 +50,42 @@ void IcebergParquetNestedColumnUtils::extract_nested_column_ids(
     // Track whether any child column was added to determine if parent should be included
     bool has_child_columns = false;
 
+    // For MAP type, check if we have conflicting access patterns and normalize them
+    // Problem scenario: Same MAP column has mixed access patterns in different paths:
+    //   - Some paths use "*" (need both keys and values)
+    //   - Some paths use "KEYS" or "VALUES" (only need one side)
+    // Example:
+    //   path1: ["map_col", "*", "nested_field"]     -> needs key AND value
+    //   path2: ["map_col", "KEYS"]                  -> only needs keys
+    // Solution: Merge all into "*" because wildcard is most permissive and subsumes specific access
+    bool has_wildcard = child_paths_by_field_id.find("*") != child_paths_by_field_id.end();
+    bool has_keys = child_paths_by_field_id.find("KEYS") != child_paths_by_field_id.end();
+    bool has_values = child_paths_by_field_id.find("VALUES") != child_paths_by_field_id.end();
+
+    // If wildcard exists with KEYS or VALUES, merge them into wildcard:
+    // - Wildcard "*" requires reading both keys and values
+    // - Specific KEYS/VALUES requests are subsumed by wildcard
+    // - After merge, only wildcard path remains, ensuring correct processing
+    if (field_schema.data_type->get_primitive_type() == PrimitiveType::TYPE_MAP && has_wildcard &&
+        (has_keys || has_values)) {
+        // Merge KEYS paths into wildcard if present
+        if (has_keys) {
+            auto& wildcard_paths = child_paths_by_field_id["*"];
+            auto& keys_paths = child_paths_by_field_id["KEYS"];
+            wildcard_paths.insert(wildcard_paths.end(), keys_paths.begin(), keys_paths.end());
+            child_paths_by_field_id.erase("KEYS");
+            has_keys = false;
+        }
+        // Merge VALUES paths into wildcard if present
+        if (has_values) {
+            auto& wildcard_paths = child_paths_by_field_id["*"];
+            auto& values_paths = child_paths_by_field_id["VALUES"];
+            wildcard_paths.insert(wildcard_paths.end(), values_paths.begin(), values_paths.end());
+            child_paths_by_field_id.erase("VALUES");
+            has_values = false;
+        }
+    }
+
     // Efficiently traverse children - similar to create_iceberg_projected_layout's nested column processing
     bool only_access_keys = false;
     bool only_access_values = false;
diff --git a/be/test/vec/exec/format/table/hive/hive_reader_create_column_ids_test.cpp b/be/test/vec/exec/format/table/hive/hive_reader_create_column_ids_test.cpp
diff --git a/be/test/vec/exec/format/table/hive/hive_reader_predicate_test.cpp b/be/test/vec/exec/format/table/hive/hive_reader_predicate_test.cpp
diff --git a/be/test/vec/exec/format/table/iceberg/iceberg_reader_create_column_ids_test.cpp b/be/test/vec/exec/format/table/iceberg/iceberg_reader_create_column_ids_test.cpp

Original file line number	Diff line number	Diff line change
`@@ -140,19 +140,11 @@ Status ByteArrayDictDecoder::_decode_values(MutableColumnPtr& doris_column, Data`
`140`	`140`	`break;`
`141`	`141`	`}`
`142`	`142`	`case ColumnSelectVector::FILTERED_CONTENT: {`
`143`		`- // In lazy materialization, keep filtered rows (fill with dict data to maintain row count)`
`144`		`- std::vector<StringRef> string_values;`
`145`		`- string_values.reserve(run_length);`
`146`		`- for (size_t i = 0; i < run_length; ++i) {`
`147`		`- string_values.emplace_back(_dict_items[_indexes[dict_index++]]);`
`148`		`- }`
`149`		`- doris_column->insert_many_strings_overflow(string_values.data(), run_length,`
`150`		`- _max_value_length);`
	`143`	`+ dict_index += run_length;`
`151`	`144`	`break;`
`152`	`145`	`}`
`153`	`146`	`case ColumnSelectVector::FILTERED_NULL: {`
`154`		`- // In lazy materialization, keep filtered null rows (fill with defaults to maintain row count)`
`155`		`- doris_column->insert_many_defaults(run_length);`
	`147`	`+ // do nothing`
`156`	`148`	`break;`
`157`	`149`	`}`
`158`	`150`	`}`