Skip to content

Commit c295c33

Browse files
committed
Optimize UTF8Validator.validate for ASCII prefixed Strings
Use a plain loop to check if it is ASCII symbol before going into more complicated UTF8 parsing. Avoid ValueAccessor to get extra boost for the ASCII check, especially in non-monomorphic cases. Patch by Dmitry Konstantinov; reviewed by Jyothsna Konisa, Stefan Miklosovic for CASSANDRA-21075
1 parent d7bd753 commit c295c33

File tree

4 files changed

+333
-2
lines changed

4 files changed

+333
-2
lines changed

CHANGES.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
5.1
2+
* Optimize UTF8Validator.validate for ASCII prefixed Strings (CASSANDRA-21075)
23
* Switch LatencyMetrics to use ThreadLocalTimer/ThreadLocalCounter (CASSANDRA-21080)
34
* Accord: write rejections would be returned to users as server errors rather than INVALID and TxnReferenceOperation didn't handle all collections properly (CASSANDRA-21061)
45
* Use byte[] directly in QueryOptions instead of ByteBuffer and convert them to ArrayCell instead of BufferCell to reduce allocations (CASSANDRA-20166)

src/java/org/apache/cassandra/serializers/UTF8Serializer.java

Lines changed: 50 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,11 @@
1717
*/
1818
package org.apache.cassandra.serializers;
1919

20+
import java.nio.ByteBuffer;
2021
import java.nio.charset.StandardCharsets;
2122

23+
import org.apache.cassandra.db.marshal.ByteArrayAccessor;
24+
import org.apache.cassandra.db.marshal.ByteBufferAccessor;
2225
import org.apache.cassandra.db.marshal.ValueAccessor;
2326

2427
public class UTF8Serializer extends AbstractTextSerializer
@@ -57,8 +60,53 @@ static <V> boolean validate(V value, ValueAccessor<V> accessor)
5760
if (value == null)
5861
return false;
5962

60-
int b = 0;
61-
int offset = 0;
63+
// perf optimizations:
64+
// 1) avoid bimorphic/megamorphic calls via ValueAccessor
65+
// 2) use a simplified logic to handle ASCII prefixed String scenario faster
66+
if (accessor == ByteArrayAccessor.instance)
67+
{
68+
byte[] valueAsArray = accessor.toArray(value);
69+
return validateByteArray(valueAsArray, 0, valueAsArray.length, value, accessor);
70+
}
71+
72+
if (accessor == ByteBufferAccessor.instance)
73+
{
74+
ByteBuffer valueAsBuffer = accessor.toBuffer(value);
75+
if (valueAsBuffer.hasArray())
76+
{
77+
byte[] valueAsArray = valueAsBuffer.array();
78+
int start = valueAsBuffer.position();
79+
int end = start + valueAsBuffer.remaining();
80+
return validateByteArray(valueAsArray, start, end, value, accessor);
81+
}
82+
}
83+
84+
int end = accessor.size(value);
85+
for (int i = 0; i < end; i++)
86+
{
87+
if (accessor.getByte(value, i) < 0)
88+
{
89+
return validateSlowPath(value, accessor, i);
90+
}
91+
}
92+
return true;
93+
}
94+
95+
private static <V> boolean validateByteArray(byte[] valueAsArray, int start, int end,
96+
V value, ValueAccessor<V> accessor)
97+
{
98+
assert start >= 0 && end <= valueAsArray.length;
99+
for (int i = start; i < end; i++)
100+
{
101+
if (valueAsArray[i] < 0)
102+
return validateSlowPath(value, accessor, i - start);
103+
}
104+
return true;
105+
}
106+
107+
private static <V> boolean validateSlowPath(V value, ValueAccessor<V> accessor, int offset)
108+
{
109+
int b;
62110
State state = State.START;
63111
while (!accessor.isEmptyFromOffset(value, offset))
64112
{
Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
19+
package org.apache.cassandra.test.microbench;
20+
21+
import java.nio.ByteBuffer;
22+
import java.nio.charset.StandardCharsets;
23+
import java.util.concurrent.TimeUnit;
24+
25+
import org.apache.cassandra.db.marshal.ByteArrayAccessor;
26+
import org.apache.cassandra.db.marshal.ByteBufferAccessor;
27+
import org.apache.cassandra.serializers.UTF8Serializer;
28+
import org.openjdk.jmh.annotations.Benchmark;
29+
import org.openjdk.jmh.annotations.BenchmarkMode;
30+
import org.openjdk.jmh.annotations.Fork;
31+
import org.openjdk.jmh.annotations.Level;
32+
import org.openjdk.jmh.annotations.Measurement;
33+
import org.openjdk.jmh.annotations.Mode;
34+
import org.openjdk.jmh.annotations.OutputTimeUnit;
35+
import org.openjdk.jmh.annotations.Param;
36+
import org.openjdk.jmh.annotations.Scope;
37+
import org.openjdk.jmh.annotations.Setup;
38+
import org.openjdk.jmh.annotations.State;
39+
import org.openjdk.jmh.annotations.Threads;
40+
import org.openjdk.jmh.annotations.Warmup;
41+
42+
@BenchmarkMode(Mode.AverageTime)
43+
@OutputTimeUnit(TimeUnit.NANOSECONDS)
44+
@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
45+
@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
46+
@Fork(value = 3, jvmArgsAppend = "-Xmx512M")
47+
@Threads(1)
48+
@State(Scope.Benchmark)
49+
public class UTF8ValidatorBench
50+
{
51+
52+
@Param({ "short ASCII", "long ASCII", "short ASCII prefix non-ASCII", "short non-ASCII", "long non-ASCII"})
53+
private String stringType;
54+
55+
byte[] arrayValue;
56+
ByteBuffer heapByteBufferValue;
57+
58+
@Setup(Level.Trial)
59+
public void setup() throws Throwable
60+
{
61+
switch (stringType)
62+
{
63+
case "short ASCII":
64+
arrayValue = "ASCII string".getBytes(StandardCharsets.UTF_8);
65+
break;
66+
case "long ASCII":
67+
arrayValue = ("ASCII is an acronym for American Standard Code for Information Interchange, " +
68+
"is a character encoding standard for representing a particular set of 95 " +
69+
"(English language focused) printable and 33 control characters – a total of 128 code points. " +
70+
"The set of available punctuation had significant impact on the syntax of computer languages " +
71+
"and text markup. ASCII hugely influenced the design of character sets used by modern computers; " +
72+
"for example, the first 128 code points of Unicode are the same as ASCII.").getBytes(StandardCharsets.UTF_8);
73+
break;
74+
case "short ASCII prefix non-ASCII":
75+
arrayValue = "a hierarchy of number systems: ℕ ⊆ ℕ₀ ⊂ ℤ ⊂ ℚ ⊂ ℝ ⊂ ℂ".getBytes(StandardCharsets.UTF_8);
76+
break;
77+
case "short non-ASCII":
78+
arrayValue = "ℕ ⊆ ℕ₀ ⊂ ℤ ⊂ ℚ ⊂ ℝ ⊂ ℂ".getBytes(StandardCharsets.UTF_8);
79+
break;
80+
case "long non-ASCII": // https://www.w3.org/2001/06/utf-8-test/UTF-8-demo.html
81+
arrayValue = ("⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠙⠑⠁⠙⠒ ⠞⠕ ⠃⠑⠛⠔ ⠺⠊⠹⠲ ⡹⠻⠑ ⠊⠎ ⠝⠕ ⠙⠳⠃⠞\n" +
82+
" ⠱⠁⠞⠑⠧⠻ ⠁⠃⠳⠞ ⠹⠁⠞⠲ ⡹⠑ ⠗⠑⠛⠊⠌⠻ ⠕⠋ ⠙⠊⠎ ⠃⠥⠗⠊⠁⠇ ⠺⠁⠎\n" +
83+
" ⠎⠊⠛⠝⠫ ⠃⠹ ⠹⠑ ⠊⠇⠻⠛⠹⠍⠁⠝⠂ ⠹⠑ ⠊⠇⠻⠅⠂ ⠹⠑ ⠥⠝⠙⠻⠞⠁⠅⠻⠂\n" +
84+
" ⠁⠝⠙ ⠹⠑ ⠡⠊⠑⠋ ⠍⠳⠗⠝⠻⠲ ⡎⠊⠗⠕⠕⠛⠑ ⠎⠊⠛⠝⠫ ⠊⠞⠲ ⡁⠝⠙\n" +
85+
" ⡎⠊⠗⠕⠕⠛⠑⠰⠎ ⠝⠁⠍⠑ ⠺⠁⠎ ⠛⠕⠕⠙ ⠥⠏⠕⠝ ⠰⡡⠁⠝⠛⠑⠂ ⠋⠕⠗ ⠁⠝⠹⠹⠔⠛ ⠙⠑ \n" +
86+
" ⠡⠕⠎⠑ ⠞⠕ ⠏⠥⠞ ⠙⠊⠎ ⠙⠁⠝⠙ ⠞⠕⠲").getBytes(StandardCharsets.UTF_8);
87+
break;
88+
default:
89+
throw new UnsupportedOperationException();
90+
}
91+
heapByteBufferValue = ByteBuffer.allocate(arrayValue.length);
92+
heapByteBufferValue.put(arrayValue).rewind();
93+
}
94+
95+
96+
@Benchmark
97+
public void testBimorphic()
98+
{
99+
UTF8Serializer.instance.validate(heapByteBufferValue, ByteBufferAccessor.instance);
100+
UTF8Serializer.instance.validate(arrayValue, ByteArrayAccessor.instance);
101+
}
102+
103+
104+
@Benchmark
105+
public void testMonomorphicArray()
106+
{
107+
UTF8Serializer.instance.validate(arrayValue, ByteArrayAccessor.instance);
108+
UTF8Serializer.instance.validate(arrayValue, ByteArrayAccessor.instance);
109+
}
110+
111+
@Benchmark
112+
public void testMonomorphicHeapByteBuffer()
113+
{
114+
UTF8Serializer.instance.validate(heapByteBufferValue, ByteBufferAccessor.instance);
115+
UTF8Serializer.instance.validate(heapByteBufferValue, ByteBufferAccessor.instance);
116+
}
117+
}
Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
19+
package org.apache.cassandra.serializers;
20+
21+
import java.nio.ByteBuffer;
22+
import java.nio.charset.StandardCharsets;
23+
24+
import org.junit.Test;
25+
26+
import org.apache.cassandra.db.marshal.ByteArrayAccessor;
27+
import org.apache.cassandra.db.marshal.ByteBufferAccessor;
28+
29+
import static org.junit.Assert.assertFalse;
30+
import static org.junit.Assert.assertTrue;
31+
32+
// https://www.w3.org/2001/06/utf-8-test/UTF-8-demo.html
33+
public class UTF8ValidatorTest
34+
{
35+
@Test
36+
public void testValidStrings()
37+
{
38+
assertValidUtf8String("");
39+
assertValidUtf8String("ASCII text");
40+
assertValidUtf8String("\n\r");
41+
assertValidUtf8String("a hierarchy of number systems: ℕ ⊆ ℕ₀ ⊂ ℤ ⊂ ℚ ⊂ ℝ ⊂ ℂ");
42+
assertValidUtf8String("ℕ ⊆ ℕ₀ ⊂ ℤ ⊂ ℚ ⊂ ℝ ⊂ ℂ");
43+
assertValidUtf8String("⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠙⠑⠁⠙⠒ ⠞⠕ ⠃⠑⠛⠔ ⠺⠊⠹⠲ ⡹⠻⠑ ⠊⠎ ⠝⠕ ⠙⠳⠃⠞\n" +
44+
" ⠱⠁⠞⠑⠧⠻ ⠁⠃⠳⠞ ⠹⠁⠞⠲ ⡹⠑ ⠗⠑⠛⠊⠌⠻ ⠕⠋ ⠙⠊⠎ ⠃⠥⠗⠊⠁⠇ ⠺⠁⠎\n" +
45+
" ⠎⠊⠛⠝⠫ ⠃⠹ ⠹⠑ ⠊⠇⠻⠛⠹⠍⠁⠝⠂ ⠹⠑ ⠊⠇⠻⠅⠂ ⠹⠑ ⠥⠝⠙⠻⠞⠁⠅⠻⠂\n" +
46+
" ⠁⠝⠙ ⠹⠑ ⠡⠊⠑⠋ ⠍⠳⠗⠝⠻⠲ ⡎⠊⠗⠕⠕⠛⠑ ⠎⠊⠛⠝⠫ ⠊⠞⠲ ⡁⠝⠙\n" +
47+
" ⡎⠊⠗⠕⠕⠛⠑⠰⠎ ⠝⠁⠍⠑ ⠺⠁⠎ ⠛⠕⠕⠙ ⠥⠏⠕⠝ ⠰⡡⠁⠝⠛⠑⠂ ⠋⠕⠗ ⠁⠝⠹⠹⠔⠛ ⠙⠑ \n" +
48+
" ⠡⠕⠎⠑ ⠞⠕ ⠏⠥⠞ ⠙⠊⠎ ⠙⠁⠝⠙ ⠞⠕⠲");
49+
assertValidUtf8String("Excerpt from a poetry on The Romance of The Three Kingdoms (a Chinese\n" +
50+
" classic 'San Gua'):\n" +
51+
"\n" +
52+
" [----------------------------|------------------------]\n" +
53+
" ๏ แผ่นดินฮั่นเสื่อมโทรมแสนสังเวช พระปกเกศกองบู๊กู้ขึ้นใหม่\n" +
54+
" สิบสองกษัตริย์ก่อนหน้าแลถัดไป สององค์ไซร้โง่เขลาเบาปัญญา\n" +
55+
" ทรงนับถือขันทีเป็นที่พึ่ง บ้านเมืองจึงวิปริตเป็นนักหนา\n" +
56+
" โฮจิ๋นเรียกทัพทั่วหัวเมืองมา หมายจะฆ่ามดชั่วตัวสำคัญ\n" +
57+
" เหมือนขับไสไล่เสือจากเคหา รับหมาป่าเข้ามาเลยอาสัญ\n" +
58+
" ฝ่ายอ้องอุ้นยุแยกให้แตกกัน ใช้สาวนั้นเป็นชนวนชื่นชวนใจ\n" +
59+
" พลันลิฉุยกุยกีกลับก่อเหตุ ช่างอาเพศจริงหนาฟ้าร้องไห้\n" +
60+
" ต้องรบราฆ่าฟันจนบรรลัย ฤๅหาใครค้ำชูกู้บรรลังก์ ฯ");
61+
62+
assertValidUtf8String("ᚻᛖ ᚳᚹᚫᚦ ᚦᚫᛏ ᚻᛖ ᛒᚢᛞᛖ ᚩᚾ ᚦᚫᛗ ᛚᚪᚾᛞᛖ ᚾᚩᚱᚦᚹᛖᚪᚱᛞᚢᛗ ᚹᛁᚦ ᚦᚪ ᚹᛖᛥᚫ");
63+
assertValidUtf8String("Box drawing alignment tests: █\n" +
64+
" ▉\n" +
65+
" ╔══╦══╗ ┌──┬──┐ ╭──┬──╮ ╭──┬──╮ ┏━━┳━━┓ ┎┒┏┑ ╷ ╻ ┏┯┓ ┌┰┐ ▊ ╱╲╱╲╳╳╳\n" +
66+
" ║┌─╨─┐║ │╔═╧═╗│ │╒═╪═╕│ │╓─╁─╖│ ┃┌─╂─┐┃ ┗╃╄┙ ╶┼╴╺╋╸┠┼┨ ┝╋┥ ▋ ╲╱╲╱╳╳╳\n" +
67+
" ║│╲ ╱│║ │║ ║│ ││ │ ││ │║ ┃ ║│ ┃│ ╿ │┃ ┍╅╆┓ ╵ ╹ ┗┷┛ └┸┘ ▌ ╱╲╱╲╳╳╳\n" +
68+
" ╠╡ ╳ ╞╣ ├╢ ╟┤ ├┼─┼─┼┤ ├╫─╂─╫┤ ┣┿╾┼╼┿┫ ┕┛┖┚ ┌┄┄┐ ╎ ┏┅┅┓ ┋ ▍ ╲╱╲╱╳╳╳\n" +
69+
" ║│╱ ╲│║ │║ ║│ ││ │ ││ │║ ┃ ║│ ┃│ ╽ │┃ ░░▒▒▓▓██ ┊ ┆ ╎ ╏ ┇ ┋ ▎\n" +
70+
" ║└─╥─┘║ │╚═╤═╝│ │╘═╪═╛│ │╙─╀─╜│ ┃└─╂─┘┃ ░░▒▒▓▓██ ┊ ┆ ╎ ╏ ┇ ┋ ▏\n" +
71+
" ╚══╩══╝ └──┴──┘ ╰──┴──╯ ╰──┴──╯ ┗━━┻━━┛ └╌╌┘ ╎ ┗╍╍┛ ┋ ▁▂▃▄▅▆▇█\n");
72+
73+
}
74+
75+
@Test // https://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html
76+
public void testInvalidStrings()
77+
{
78+
// continuation bytes only
79+
assertInvalidUtf8String(0x80);
80+
assertInvalidUtf8String(0xbf);
81+
assertInvalidUtf8String(0x80,0x80);
82+
// Bad trailing bytes
83+
assertInvalidUtf8String(0xF0, 0xA4, 0xAD, 0x7F);
84+
assertInvalidUtf8String(0xF0, 0xA4, 0xAD, 0x7F);
85+
// first bytes of 2-byte sequences (0xc0-0xdf), each followed by a space character
86+
assertInvalidUtf8String(0xc0, ' ', 0xdf, ' ');
87+
// first bytes of 3-byte sequences (0xe0-0xef), each followed by a space character
88+
assertInvalidUtf8String(0xe0, ' ', 0xe1, ' ');
89+
// first bytes of 4-byte sequences (0xf0-0xf7), each followed by a space character
90+
assertInvalidUtf8String(0xf0, ' ', 0xf7, ' ');
91+
// first bytes of 5-byte sequences (0xf8-0xfb), each followed by a space character
92+
assertInvalidUtf8String(0xf8, ' ', 0xfb, ' ');
93+
// first bytes of 6-byte sequences (0xfc-0xfd), each followed by a space character
94+
assertInvalidUtf8String(0xfc, ' ', 0xfd, ' ');
95+
// Impossible bytes
96+
assertInvalidUtf8String(0xfe);
97+
assertInvalidUtf8String(0xff);
98+
assertInvalidUtf8String(0xfe, 0xfe, 0xff, 0xff);
99+
// Sequences with last continuation byte missing
100+
assertInvalidUtf8String(0xc0);
101+
assertInvalidUtf8String(0xe0, 0x80);
102+
// Maximum overlong sequences
103+
assertInvalidUtf8String(0xc1, 0xbf);
104+
105+
// 'ASCII' + continuation byte at the end
106+
assertInvalidUtf8String(0x41, 0x53, 0x43, 0x49, 0x49, 0x80);
107+
}
108+
109+
public void assertValidUtf8String(String value)
110+
{
111+
byte[] byteArrayValue = value.getBytes(StandardCharsets.UTF_8);
112+
ByteBuffer bufferValue = ByteBuffer.wrap(byteArrayValue);
113+
ByteBuffer bufferValueInTheMiddle = ByteBuffer.allocate(byteArrayValue.length + 2 + 2);
114+
wrapValueWithImpossibleBytes(bufferValueInTheMiddle, byteArrayValue);
115+
116+
ByteBuffer directBufferValue = ByteBuffer.allocateDirect(byteArrayValue.length);
117+
directBufferValue.put(byteArrayValue);
118+
directBufferValue.rewind();
119+
ByteBuffer directBufferValueInTheMiddle = ByteBuffer.allocate(byteArrayValue.length + 2 + 2);
120+
wrapValueWithImpossibleBytes(directBufferValueInTheMiddle, byteArrayValue);
121+
122+
assertTrue(UTF8Serializer.UTF8Validator.validate(byteArrayValue, ByteArrayAccessor.instance));
123+
124+
assertTrue(UTF8Serializer.UTF8Validator.validate(bufferValue, ByteBufferAccessor.instance));
125+
assertTrue(UTF8Serializer.UTF8Validator.validate(bufferValueInTheMiddle, ByteBufferAccessor.instance));
126+
127+
assertTrue(UTF8Serializer.UTF8Validator.validate(directBufferValue, ByteBufferAccessor.instance));
128+
assertTrue(UTF8Serializer.UTF8Validator.validate(bufferValueInTheMiddle, ByteBufferAccessor.instance));
129+
}
130+
131+
private static void wrapValueWithImpossibleBytes(ByteBuffer bufferValueInTheMiddle, byte[] byteArrayValue)
132+
{
133+
// bufferValue wrapped by impossible bytes
134+
// to ensure that validate method does not read outside of buffer boundaries
135+
bufferValueInTheMiddle.put((byte)0xfe);
136+
bufferValueInTheMiddle.put((byte)0xfe);
137+
bufferValueInTheMiddle.put(byteArrayValue);
138+
bufferValueInTheMiddle.put((byte)0xfe);
139+
bufferValueInTheMiddle.put((byte)0xfe);
140+
bufferValueInTheMiddle.rewind();
141+
bufferValueInTheMiddle.position(2);
142+
bufferValueInTheMiddle.limit(bufferValueInTheMiddle.limit() - 2);
143+
}
144+
145+
public void assertInvalidUtf8String(int ... bytes)
146+
{
147+
byte[] byteArrayValue = toByteArray(bytes);
148+
ByteBuffer bufferValue = ByteBuffer.wrap(byteArrayValue);
149+
ByteBuffer directBufferValue = ByteBuffer.allocateDirect(byteArrayValue.length);
150+
directBufferValue.put(byteArrayValue);
151+
directBufferValue.rewind();
152+
153+
assertFalse(UTF8Serializer.UTF8Validator.validate(byteArrayValue, ByteArrayAccessor.instance));
154+
assertFalse(UTF8Serializer.UTF8Validator.validate(bufferValue, ByteBufferAccessor.instance));
155+
assertFalse(UTF8Serializer.UTF8Validator.validate(directBufferValue, ByteBufferAccessor.instance));
156+
}
157+
158+
private static byte[] toByteArray(int... bytes)
159+
{
160+
byte[] value = new byte[bytes.length];
161+
for (int i = 0; i < bytes.length; i++)
162+
value[i] = (byte) bytes[i];
163+
return value;
164+
}
165+
}

0 commit comments

Comments
 (0)