|
54 | 54 | import org.apache.iceberg.expressions.ResidualEvaluator; |
55 | 55 | import org.apache.iceberg.hive.HiveVersion; |
56 | 56 | import org.apache.iceberg.mr.InputFormatConfig; |
| 57 | +import org.apache.iceberg.mr.hive.variant.VariantFilterRewriter; |
57 | 58 | import org.apache.iceberg.mr.mapred.AbstractMapredIcebergRecordReader; |
58 | 59 | import org.apache.iceberg.mr.mapred.Container; |
59 | 60 | import org.apache.iceberg.mr.mapred.MapredIcebergInputFormat; |
@@ -91,65 +92,129 @@ public class HiveIcebergInputFormat extends MapredIcebergInputFormat<Record> |
91 | 92 | } |
92 | 93 |
|
93 | 94 | /** |
94 | | - * Converts the Hive filter found in the job conf to an Iceberg filter expression. |
95 | | - * @param conf - job conf |
96 | | - * @return - Iceberg data filter expression |
| 95 | + * Encapsulates planning-time and reader-time Iceberg filter expressions derived from Hive predicates. |
97 | 96 | */ |
98 | | - static Expression icebergDataFilterFromHiveConf(Configuration conf) { |
99 | | - Expression icebergFilter = SerializationUtil.deserializeFromBase64(conf.get(InputFormatConfig.FILTER_EXPRESSION)); |
100 | | - if (icebergFilter != null) { |
101 | | - // in case we already have it prepared.. |
102 | | - return icebergFilter; |
| 97 | + private static final class FilterExpressions { |
| 98 | + |
| 99 | + private static Expression planningFilter(Configuration conf) { |
| 100 | + // Planning-safe filter (extract removed) may already be serialized for reuse. |
| 101 | + Expression planningFilter = SerializationUtil |
| 102 | + .deserializeFromBase64(conf.get(InputFormatConfig.FILTER_EXPRESSION)); |
| 103 | + if (planningFilter != null) { |
| 104 | + // Already prepared and serialized into the conf; reuse it as-is. |
| 105 | + return planningFilter; |
| 106 | + } |
| 107 | + // Nothing cached: rebuild the full filter from the Hive predicate (it may carry extract(...) terms meant for |
| 108 | + // reader-level pruning), then strip those terms so the result is safe to use for planning. |
| 109 | + Expression readerFilter = icebergDataFilterFromHiveConf(conf); |
| 110 | + if (readerFilter != null) { |
| 111 | + return VariantFilterRewriter.stripVariantExtractPredicates(readerFilter); |
| 112 | + } |
| 113 | + return null; |
103 | 114 | } |
104 | | - String hiveFilter = conf.get(TableScanDesc.FILTER_EXPR_CONF_STR); |
105 | | - if (hiveFilter != null) { |
106 | | - ExprNodeGenericFuncDesc exprNodeDesc = SerializationUtilities |
107 | | - .deserializeObject(hiveFilter, ExprNodeGenericFuncDesc.class); |
108 | | - return getFilterExpr(conf, exprNodeDesc); |
| 115 | + |
| 116 | + private static Expression icebergDataFilterFromHiveConf(Configuration conf) { |
| 117 | + // Build an Iceberg filter from Hive's serialized predicate so we can preserve extract(...) terms for |
| 118 | + // reader-level pruning (e.g. Parquet shredded VARIANT row-group pruning). |
| 119 | + // |
| 120 | + // This intentionally does NOT consult FILTER_EXPRESSION, because FILTER_EXPRESSION must remain safe for |
| 121 | + // Iceberg planning-time utilities (some of which cannot stringify extract(...) terms). |
| 122 | + String hiveFilter = conf.get(TableScanDesc.FILTER_EXPR_CONF_STR); |
| 123 | + if (hiveFilter != null) { |
| 124 | + ExprNodeGenericFuncDesc exprNodeDesc = |
| 125 | + SerializationUtilities.deserializeObject(hiveFilter, ExprNodeGenericFuncDesc.class); |
| 126 | + return getFilterExpr(conf, exprNodeDesc); |
| 127 | + } |
| 128 | + return null; |
| 129 | + } |
| 130 | + |
| 131 | + private static Expression planningResidual(FileScanTask task, Configuration conf) { |
| 132 | + return residual(task, conf, planningFilter(conf)); |
| 133 | + } |
| 134 | + |
| 135 | + private static Expression readerResidual(FileScanTask task, Configuration conf) { |
| 136 | + return residual(task, conf, icebergDataFilterFromHiveConf(conf)); |
| 137 | + } |
| 138 | + |
| 139 | + private static Expression residual(FileScanTask task, Configuration conf, Expression filter) { |
| 140 | + if (filter == null) { |
| 141 | + return Expressions.alwaysTrue(); |
| 142 | + } |
| 143 | + boolean caseSensitive = conf.getBoolean( |
| 144 | + InputFormatConfig.CASE_SENSITIVE, InputFormatConfig.CASE_SENSITIVE_DEFAULT); |
| 145 | + |
| 146 | + return ResidualEvaluator.of(task.spec(), filter, caseSensitive) |
| 147 | + .residualFor(task.file().partition()); |
109 | 148 | } |
110 | | - return null; |
111 | 149 | } |
112 | 150 |
|
113 | 151 | /** |
114 | | - * getFilterExpr extracts search argument from ExprNodeGenericFuncDesc and returns Iceberg Filter Expression |
| 152 | + * Builds an Iceberg filter expression from a Hive predicate expression node. |
115 | 153 | * @param conf - job conf |
116 | 154 | * @param exprNodeDesc - Describes a GenericFunc node |
117 | 155 | * @return Iceberg Filter Expression |
118 | 156 | */ |
119 | 157 | static Expression getFilterExpr(Configuration conf, ExprNodeGenericFuncDesc exprNodeDesc) { |
120 | | - if (exprNodeDesc != null) { |
121 | | - SearchArgument sarg = ConvertAstToSearchArg.create(conf, exprNodeDesc); |
122 | | - try { |
123 | | - return HiveIcebergFilterFactory.generateFilterExpression(sarg); |
124 | | - } catch (UnsupportedOperationException e) { |
125 | | - LOG.warn("Unable to create Iceberg filter, proceeding without it (will be applied by Hive later): ", e); |
| 158 | + if (exprNodeDesc == null) { |
| 159 | + return null; |
| 160 | + } |
| 161 | + |
| 162 | + ExprNodeGenericFuncDesc exprForSarg = exprNodeDesc; |
| 163 | + if (Boolean.parseBoolean(conf.get(InputFormatConfig.VARIANT_SHREDDING_ENABLED))) { |
| 164 | + ExprNodeGenericFuncDesc rewritten = VariantFilterRewriter.rewriteForShredding(exprNodeDesc); |
| 165 | + if (rewritten != null) { |
| 166 | + exprForSarg = rewritten; |
126 | 167 | } |
127 | 168 | } |
128 | | - return null; |
| 169 | + |
| 170 | + SearchArgument sarg = ConvertAstToSearchArg.create(conf, exprForSarg); |
| 171 | + if (sarg == null) { |
| 172 | + return null; |
| 173 | + } |
| 174 | + |
| 175 | + try { |
| 176 | + return HiveIcebergFilterFactory.generateFilterExpression(sarg); |
| 177 | + } catch (UnsupportedOperationException e) { |
| 178 | + LOG.warn( |
| 179 | + "Unable to create Iceberg filter, proceeding without it (will be applied by Hive later): ", |
| 180 | + e); |
| 181 | + return null; |
| 182 | + } |
129 | 183 | } |
130 | 184 |
|
131 | 185 | /** |
132 | | - * Converts Hive filter found in the passed job conf to an Iceberg filter expression. Then evaluates this |
133 | | - * against the task's partition value producing a residual filter expression. |
| 186 | + * Returns a residual expression that is safe to apply as a record-level filter. |
| 187 | + * |
| 188 | + * <p>This residual is derived from the task-level Iceberg planning filter (already extract-free) after |
| 189 | + * evaluating it against the task's partition value. |
134 | 190 | * @param task - file scan task to evaluate the expression against |
135 | 191 | * @param conf - job conf |
136 | 192 | * @return - Iceberg residual filter expression |
137 | 193 | */ |
138 | 194 | public static Expression residualForTask(FileScanTask task, Configuration conf) { |
139 | | - Expression dataFilter = icebergDataFilterFromHiveConf(conf); |
140 | | - if (dataFilter == null) { |
141 | | - return Expressions.alwaysTrue(); |
142 | | - } |
143 | | - return ResidualEvaluator.of( |
144 | | - task.spec(), dataFilter, |
145 | | - conf.getBoolean(InputFormatConfig.CASE_SENSITIVE, InputFormatConfig.CASE_SENSITIVE_DEFAULT) |
146 | | - ).residualFor(task.file().partition()); |
| 195 | + return FilterExpressions.planningResidual(task, conf); |
| 196 | + } |
| 197 | + |
| 198 | + /** |
| 199 | + * Returns a residual expression intended only for reader-level pruning (best-effort). |
| 200 | + * |
| 201 | + * <p>This residual is derived from the task-level Iceberg filter after evaluating it against the task's |
| 202 | + * partition value. It may include {@code extract(...)} predicates and is suitable for formats/readers that |
| 203 | + * can leverage such terms for pruning (e.g. Parquet row-group pruning using shredded VARIANT columns). |
| 204 | + * |
| 205 | + * <p><strong>Do not</strong> use this for record-level residual filtering, as {@code extract} cannot be |
| 206 | + * evaluated at record level in Iceberg readers. |
| 207 | + */ |
| 208 | + public static Expression residualForReaderPruning(FileScanTask task, Configuration conf) { |
| 209 | + return FilterExpressions.readerResidual(task, conf); |
147 | 210 | } |
148 | 211 |
|
149 | 212 | @Override |
150 | 213 | public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException { |
151 | | - Expression filter = icebergDataFilterFromHiveConf(job); |
| 214 | + Expression filter = FilterExpressions.planningFilter(job); |
152 | 215 | if (filter != null) { |
| 216 | + // Iceberg planning-time utilities may attempt to stringify the filter, so the planning filter serialized |
| 217 | + // below must never contain extract(...) or shredded typed_value references. |
153 | 218 | job.set(InputFormatConfig.FILTER_EXPRESSION, SerializationUtil.serializeToBase64(filter)); |
154 | 219 | } |
155 | 220 |
|
|
0 commit comments