Skip to content

[Core] Support Truncate(0) for metrics #11905

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions api/src/main/java/org/apache/iceberg/util/BinaryUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,9 @@ public static ByteBuffer truncateBinaryUnsafe(ByteBuffer value, int width) {
* the given input
*/
public static Literal<ByteBuffer> truncateBinaryMin(Literal<ByteBuffer> input, int length) {
if (length == 0) {
return null;
}
ByteBuffer inputBuffer = input.value();
if (length >= inputBuffer.remaining()) {
return input;
Expand All @@ -81,6 +84,9 @@ public static Literal<ByteBuffer> truncateBinaryMin(Literal<ByteBuffer> input, i
* than the given input
*/
public static Literal<ByteBuffer> truncateBinaryMax(Literal<ByteBuffer> input, int length) {
if (length == 0) {
return null;
}
ByteBuffer inputBuffer = input.value();
if (length >= inputBuffer.remaining()) {
return input;
Expand Down
8 changes: 7 additions & 1 deletion api/src/main/java/org/apache/iceberg/util/UnicodeUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ public static boolean isCharHighSurrogate(char ch) {
* length
*/
public static CharSequence truncateString(CharSequence input, int length) {
Preconditions.checkArgument(length > 0, "Truncate length should be positive");
Preconditions.checkArgument(length >= 0, "Truncate length should be non-negative");
StringBuilder sb = new StringBuilder(input);
// Get the number of unicode characters in the input
int numUniCodeCharacters = sb.codePointCount(0, sb.length());
Expand All @@ -60,6 +60,9 @@ public static CharSequence truncateString(CharSequence input, int length) {
*/
public static Literal<CharSequence> truncateStringMin(Literal<CharSequence> input, int length) {
// Truncate the input to the specified truncate length.
if (length == 0) {
return null;
}
CharSequence truncatedInput = truncateString(input.value(), length);
return Literal.of(truncatedInput);
}
Expand All @@ -69,6 +72,9 @@ public static Literal<CharSequence> truncateStringMin(Literal<CharSequence> inpu
* of unicode characters in the truncated charSequence is lesser than or equal to length
*/
public static Literal<CharSequence> truncateStringMax(Literal<CharSequence> input, int length) {
if (length == 0) {
return null;
}
CharSequence inputCharSeq = input.value();
// Truncate the input to the specified truncate length.
StringBuilder truncatedStringBuilder = new StringBuilder(truncateString(inputCharSeq, length));
Expand Down
5 changes: 3 additions & 2 deletions core/src/main/java/org/apache/iceberg/MetricsModes.java
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,8 @@ public String toString() {

/**
* Under this mode, value_counts, null_value_counts, nan_value_counts and truncated lower_bounds,
* upper_bounds are persisted.
* upper_bounds are persisted. If truncate is set to 0 then the no values will be persisted for
* binaries or strings.
*/
public static class Truncate extends ProxySerializableMetricsMode {
private final int length;
Expand All @@ -104,7 +105,7 @@ private Truncate(int length) {
}

public static Truncate withLength(int length) {
Preconditions.checkArgument(length > 0, "Truncate length should be positive");
Preconditions.checkArgument(length >= 0, "Truncate length should be non-negative");
return new Truncate(length);
}

Expand Down
5 changes: 3 additions & 2 deletions core/src/test/java/org/apache/iceberg/TestMetricsModes.java
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ public void testMetricsModeParsing() {
assertThat(MetricsModes.fromString("nOnE")).isEqualTo(None.get());
assertThat(MetricsModes.fromString("counts")).isEqualTo(Counts.get());
assertThat(MetricsModes.fromString("coUntS")).isEqualTo(Counts.get());
assertThat(MetricsModes.fromString("truncate(0)")).isEqualTo(Truncate.withLength(0));
assertThat(MetricsModes.fromString("truncate(1)")).isEqualTo(Truncate.withLength(1));
assertThat(MetricsModes.fromString("truNcAte(10)")).isEqualTo(Truncate.withLength(10));
assertThat(MetricsModes.fromString("full")).isEqualTo(Full.get());
Expand All @@ -71,9 +72,9 @@ public void testMetricsModeParsing() {

@TestTemplate
public void testInvalidTruncationLength() {
assertThatThrownBy(() -> MetricsModes.fromString("truncate(0)"))
assertThatThrownBy(() -> MetricsModes.fromString("truncate(-1)"))
.isInstanceOf(IllegalArgumentException.class)
.hasMessage("Truncate length should be positive");
.hasMessage("Truncate length should be non-negative");
}

@TestTemplate
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ public void testTruncateBinaryMin() {
"Output must have the first two bytes of the input. A lower bound exists "
+ "even though the first two bytes are the max value")
.isEqualTo(0);
assertThat(truncateBinaryMin(Literal.of(test2), 0)).isNull();
}

@Test
Expand Down Expand Up @@ -135,6 +136,7 @@ public void testTruncateBinaryMax() {
"Since a shorter sequence is considered smaller, output must have two bytes "
+ "and the second byte of the input must be incremented")
.isEqualTo(0);
assertThat(truncateBinaryMax(Literal.of(test4), 0)).isNull();
}

@SuppressWarnings("checkstyle:AvoidEscapedUnicodeCharacters")
Expand Down Expand Up @@ -182,6 +184,7 @@ public void testTruncateStringMin() {
assertThat(cmp.compare(truncateStringMin(Literal.of(test4), 1).value(), test4_1_expected))
.as("Output must have the first 4 byte UTF-8 character of the input")
.isEqualTo(0);
assertThat(truncateStringMax(Literal.of(test4), 0)).isNull();
}

@SuppressWarnings("checkstyle:AvoidEscapedUnicodeCharacters")
Expand Down Expand Up @@ -301,5 +304,6 @@ public void testTruncateStringMax() {
"Test the last character is `Character.MIN_SURROGATE - 1` after truncated, it should be incremented to "
+ "next valid Unicode scalar value `Character.MAX_SURROGATE + 1`")
.isEqualTo(0);
assertThat(truncateStringMax(Literal.of(test9), 0)).isNull();
}
}
1 change: 0 additions & 1 deletion orc/src/main/java/org/apache/iceberg/orc/OrcMetrics.java
Original file line number Diff line number Diff line change
Expand Up @@ -335,7 +335,6 @@ private static Object truncateIfNeeded(
CharSequence charSequence = (CharSequence) value;
MetricsModes.Truncate truncateMode = (MetricsModes.Truncate) metricsMode;
int truncateLength = truncateMode.length();

switch (bound) {
case UPPER:
return Optional.ofNullable(
Expand Down
15 changes: 11 additions & 4 deletions parquet/src/main/java/org/apache/iceberg/parquet/ParquetUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -286,13 +286,19 @@ private static <T> void updateMin(
int truncateLength = truncateMode.length();
switch (type.typeId()) {
case STRING:
lowerBounds.put(
id, UnicodeUtil.truncateStringMin((Literal<CharSequence>) min, truncateLength));
Literal<CharSequence> truncatedStringMin =
UnicodeUtil.truncateStringMin((Literal<CharSequence>) min, truncateLength);
if (truncatedStringMin != null) {
lowerBounds.put(id, truncatedStringMin);
}
break;
case FIXED:
case BINARY:
lowerBounds.put(
id, BinaryUtil.truncateBinaryMin((Literal<ByteBuffer>) min, truncateLength));
Literal<ByteBuffer> truncatedBinaryMin =
BinaryUtil.truncateBinaryMin((Literal<ByteBuffer>) min, truncateLength);
if (truncatedBinaryMin != null) {
lowerBounds.put(id, truncatedBinaryMin);
}
break;
default:
lowerBounds.put(id, min);
Expand Down Expand Up @@ -322,6 +328,7 @@ private static <T> void updateMax(
if (truncatedMaxString != null) {
upperBounds.put(id, truncatedMaxString);
}

break;
case FIXED:
case BINARY:
Expand Down
Loading