From 930ceeb81cb1572b0b879d9a99b13a40cf1ab9b5 Mon Sep 17 00:00:00 2001 From: Joy Haldar Date: Sun, 16 Nov 2025 00:09:12 +0530 Subject: [PATCH 1/2] Optimize NOT IN and != predicates to prune single-value files --- .../InclusiveMetricsEvaluator.java | 29 +++++++ .../TestInclusiveMetricsEvaluator.java | 86 +++++++++++++++++++ 2 files changed, 115 insertions(+) diff --git a/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java index aa0441f49011..fd416fabb5b1 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java +++ b/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java @@ -327,6 +327,21 @@ public Boolean eq(Bound term, Literal lit) { public Boolean notEq(Bound term, Literal lit) { // because the bounds are not necessarily a min or max value, this cannot be answered using // them. notEq(col, X) with (X, Y) doesn't guarantee that X is a value in col. + // However, when min == max (single value) and the file has no nulls, we can safely prune + // if the single value equals the literal. + int id = term.ref().fieldId(); + if (mayContainNull(id)) { + return ROWS_MIGHT_MATCH; + } + T lower = lowerBound(term); + T upper = upperBound(term); + + if (lower != null && upper != null && lower.equals(upper)) { + int cmp = lit.comparator().compare(lower, lit.value()); + if (cmp == 0) { + return ROWS_CANNOT_MATCH; + } + } return ROWS_MIGHT_MATCH; } @@ -381,6 +396,20 @@ public Boolean in(Bound term, Set literalSet) { public Boolean notIn(Bound term, Set literalSet) { // because the bounds are not necessarily a min or max value, this cannot be answered using // them. notIn(col, {X, ...}) with (X, Y) doesn't guarantee that X is a value in col. + // However, when min == max (single value) and the file has no nulls, we can safely prune + // if the single value is in the exclusion set. + int id = term.ref().fieldId(); + if (mayContainNull(id)) { + return ROWS_MIGHT_MATCH; + } + T lower = lowerBound(term); + T upper = upperBound(term); + + if (lower != null && upper != null && lower.equals(upper)) { + if (literalSet.contains(lower)) { + return ROWS_CANNOT_MATCH; + } + } return ROWS_MIGHT_MATCH; } diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java index 2f4fbf395739..1fce8ad1bb49 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java @@ -970,4 +970,90 @@ public void testNotNullInNestedStruct() { .as("Should not read: optional_address.optional_street2 is optional") .isFalse(); } + + @Test + public void testNotEqWithSingleValue() { + DataFile singleValueFile = + new TestDataFile( + "single_value.avro", + Row.of(), + 10, + ImmutableMap.of(3, 10L), + ImmutableMap.of(3, 0L), + null, + ImmutableMap.of(3, toByteBuffer(StringType.get(), "abc")), + ImmutableMap.of(3, toByteBuffer(StringType.get(), "abc"))); + + boolean shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, notEqual("required", "abc")).eval(singleValueFile); + assertThat(shouldRead) + .as("Should prune: file contains single value equal to literal") + .isFalse(); + + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, notEqual("required", "def")).eval(singleValueFile); + assertThat(shouldRead) + .as("Should read: file contains single value not equal to literal") + .isTrue(); + + DataFile singleValueWithNulls = + new TestDataFile( + "single_value_nulls.avro", + Row.of(), + 10, + ImmutableMap.of(3, 10L), + ImmutableMap.of(3, 2L), + null, + ImmutableMap.of(3, toByteBuffer(StringType.get(), "abc")), + ImmutableMap.of(3, toByteBuffer(StringType.get(), "abc"))); + + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, notEqual("required", "abc")) + .eval(singleValueWithNulls); + assertThat(shouldRead).as("Should read: file has nulls which match != predicate").isTrue(); + } + + @Test + public void testNotInWithSingleValue() { + DataFile singleValueFile = + new TestDataFile( + "single_value.avro", + Row.of(), + 10, + ImmutableMap.of(3, 10L), + ImmutableMap.of(3, 0L), + null, + ImmutableMap.of(3, toByteBuffer(StringType.get(), "abc")), + ImmutableMap.of(3, toByteBuffer(StringType.get(), "abc"))); + + boolean shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, notIn("required", "abc", "def")) + .eval(singleValueFile); + assertThat(shouldRead) + .as("Should prune: file contains single value in exclusion list") + .isFalse(); + + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, notIn("required", "def", "ghi")) + .eval(singleValueFile); + assertThat(shouldRead) + .as("Should read: file contains single value not in exclusion list") + .isTrue(); + + DataFile singleValueWithNulls = + new TestDataFile( + "single_value_nulls.avro", + Row.of(), + 10, + ImmutableMap.of(3, 10L), + ImmutableMap.of(3, 2L), + null, + ImmutableMap.of(3, toByteBuffer(StringType.get(), "abc")), + ImmutableMap.of(3, toByteBuffer(StringType.get(), "abc"))); + + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, notIn("required", "abc", "def")) + .eval(singleValueWithNulls); + assertThat(shouldRead).as("Should read: file has nulls which match NOT IN predicate").isTrue(); + } } From 9bae0c62184e470c0999723465f96f1dc1f5e218 Mon Sep 17 00:00:00 2001 From: Joy Haldar Date: Sun, 16 Nov 2025 10:14:17 +0530 Subject: [PATCH 2/2] Simplify comments in notEq and notIn methods --- .../iceberg/expressions/InclusiveMetricsEvaluator.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java index fd416fabb5b1..aed7fdeab56a 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java +++ b/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java @@ -327,8 +327,8 @@ public Boolean eq(Bound term, Literal lit) { public Boolean notEq(Bound term, Literal lit) { // because the bounds are not necessarily a min or max value, this cannot be answered using // them. notEq(col, X) with (X, Y) doesn't guarantee that X is a value in col. - // However, when min == max (single value) and the file has no nulls, we can safely prune - // if the single value equals the literal. + // However, when min == max and the file has no nulls, we can safely prune + // if that value equals the literal. int id = term.ref().fieldId(); if (mayContainNull(id)) { return ROWS_MIGHT_MATCH; @@ -396,8 +396,8 @@ public Boolean in(Bound term, Set literalSet) { public Boolean notIn(Bound term, Set literalSet) { // because the bounds are not necessarily a min or max value, this cannot be answered using // them. notIn(col, {X, ...}) with (X, Y) doesn't guarantee that X is a value in col. - // However, when min == max (single value) and the file has no nulls, we can safely prune - // if the single value is in the exclusion set. + // However, when min == max and the file has no nulls, we can safely prune + // if that value is in the exclusion set. int id = term.ref().fieldId(); if (mayContainNull(id)) { return ROWS_MIGHT_MATCH;