|
| 1 | +/* |
| 2 | + * Licensed to the Apache Software Foundation (ASF) under one |
| 3 | + * or more contributor license agreements. See the NOTICE file |
| 4 | + * distributed with this work for additional information |
| 5 | + * regarding copyright ownership. The ASF licenses this file |
| 6 | + * to you under the Apache License, Version 2.0 (the |
| 7 | + * "License"); you may not use this file except in compliance |
| 8 | + * with the License. You may obtain a copy of the License at |
| 9 | + * |
| 10 | + * http://www.apache.org/licenses/LICENSE-2.0 |
| 11 | + * |
| 12 | + * Unless required by applicable law or agreed to in writing, software |
| 13 | + * distributed under the License is distributed on an "AS IS" BASIS, |
| 14 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 15 | + * See the License for the specific language governing permissions and |
| 16 | + * limitations under the License. |
| 17 | + */ |
| 18 | + |
| 19 | +package org.apache.cassandra.serializers; |
| 20 | + |
| 21 | +import java.nio.ByteBuffer; |
| 22 | +import java.nio.charset.StandardCharsets; |
| 23 | + |
| 24 | +import org.junit.Test; |
| 25 | + |
| 26 | +import org.apache.cassandra.db.marshal.ByteArrayAccessor; |
| 27 | +import org.apache.cassandra.db.marshal.ByteBufferAccessor; |
| 28 | + |
| 29 | +import static org.junit.Assert.assertFalse; |
| 30 | +import static org.junit.Assert.assertTrue; |
| 31 | + |
| 32 | +// https://www.w3.org/2001/06/utf-8-test/UTF-8-demo.html |
| 33 | +public class UTF8ValidatorTest |
| 34 | +{ |
| 35 | + @Test |
| 36 | + public void testValidStrings() |
| 37 | + { |
| 38 | + assertValidUtf8String(""); |
| 39 | + assertValidUtf8String("ASCII text"); |
| 40 | + assertValidUtf8String("\n\r"); |
| 41 | + assertValidUtf8String("a hierarchy of number systems: ℕ ⊆ ℕ₀ ⊂ ℤ ⊂ ℚ ⊂ ℝ ⊂ ℂ"); |
| 42 | + assertValidUtf8String("ℕ ⊆ ℕ₀ ⊂ ℤ ⊂ ℚ ⊂ ℝ ⊂ ℂ"); |
| 43 | + assertValidUtf8String("⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠙⠑⠁⠙⠒ ⠞⠕ ⠃⠑⠛⠔ ⠺⠊⠹⠲ ⡹⠻⠑ ⠊⠎ ⠝⠕ ⠙⠳⠃⠞\n" + |
| 44 | + " ⠱⠁⠞⠑⠧⠻ ⠁⠃⠳⠞ ⠹⠁⠞⠲ ⡹⠑ ⠗⠑⠛⠊⠌⠻ ⠕⠋ ⠙⠊⠎ ⠃⠥⠗⠊⠁⠇ ⠺⠁⠎\n" + |
| 45 | + " ⠎⠊⠛⠝⠫ ⠃⠹ ⠹⠑ ⠊⠇⠻⠛⠹⠍⠁⠝⠂ ⠹⠑ ⠊⠇⠻⠅⠂ ⠹⠑ ⠥⠝⠙⠻⠞⠁⠅⠻⠂\n" + |
| 46 | + " ⠁⠝⠙ ⠹⠑ ⠡⠊⠑⠋ ⠍⠳⠗⠝⠻⠲ ⡎⠊⠗⠕⠕⠛⠑ ⠎⠊⠛⠝⠫ ⠊⠞⠲ ⡁⠝⠙\n" + |
| 47 | + " ⡎⠊⠗⠕⠕⠛⠑⠰⠎ ⠝⠁⠍⠑ ⠺⠁⠎ ⠛⠕⠕⠙ ⠥⠏⠕⠝ ⠰⡡⠁⠝⠛⠑⠂ ⠋⠕⠗ ⠁⠝⠹⠹⠔⠛ ⠙⠑ \n" + |
| 48 | + " ⠡⠕⠎⠑ ⠞⠕ ⠏⠥⠞ ⠙⠊⠎ ⠙⠁⠝⠙ ⠞⠕⠲"); |
| 49 | + assertValidUtf8String("Excerpt from a poetry on The Romance of The Three Kingdoms (a Chinese\n" + |
| 50 | + " classic 'San Gua'):\n" + |
| 51 | + "\n" + |
| 52 | + " [----------------------------|------------------------]\n" + |
| 53 | + " ๏ แผ่นดินฮั่นเสื่อมโทรมแสนสังเวช พระปกเกศกองบู๊กู้ขึ้นใหม่\n" + |
| 54 | + " สิบสองกษัตริย์ก่อนหน้าแลถัดไป สององค์ไซร้โง่เขลาเบาปัญญา\n" + |
| 55 | + " ทรงนับถือขันทีเป็นที่พึ่ง บ้านเมืองจึงวิปริตเป็นนักหนา\n" + |
| 56 | + " โฮจิ๋นเรียกทัพทั่วหัวเมืองมา หมายจะฆ่ามดชั่วตัวสำคัญ\n" + |
| 57 | + " เหมือนขับไสไล่เสือจากเคหา รับหมาป่าเข้ามาเลยอาสัญ\n" + |
| 58 | + " ฝ่ายอ้องอุ้นยุแยกให้แตกกัน ใช้สาวนั้นเป็นชนวนชื่นชวนใจ\n" + |
| 59 | + " พลันลิฉุยกุยกีกลับก่อเหตุ ช่างอาเพศจริงหนาฟ้าร้องไห้\n" + |
| 60 | + " ต้องรบราฆ่าฟันจนบรรลัย ฤๅหาใครค้ำชูกู้บรรลังก์ ฯ"); |
| 61 | + |
| 62 | + assertValidUtf8String("ᚻᛖ ᚳᚹᚫᚦ ᚦᚫᛏ ᚻᛖ ᛒᚢᛞᛖ ᚩᚾ ᚦᚫᛗ ᛚᚪᚾᛞᛖ ᚾᚩᚱᚦᚹᛖᚪᚱᛞᚢᛗ ᚹᛁᚦ ᚦᚪ ᚹᛖᛥᚫ"); |
| 63 | + assertValidUtf8String("Box drawing alignment tests: █\n" + |
| 64 | + " ▉\n" + |
| 65 | + " ╔══╦══╗ ┌──┬──┐ ╭──┬──╮ ╭──┬──╮ ┏━━┳━━┓ ┎┒┏┑ ╷ ╻ ┏┯┓ ┌┰┐ ▊ ╱╲╱╲╳╳╳\n" + |
| 66 | + " ║┌─╨─┐║ │╔═╧═╗│ │╒═╪═╕│ │╓─╁─╖│ ┃┌─╂─┐┃ ┗╃╄┙ ╶┼╴╺╋╸┠┼┨ ┝╋┥ ▋ ╲╱╲╱╳╳╳\n" + |
| 67 | + " ║│╲ ╱│║ │║ ║│ ││ │ ││ │║ ┃ ║│ ┃│ ╿ │┃ ┍╅╆┓ ╵ ╹ ┗┷┛ └┸┘ ▌ ╱╲╱╲╳╳╳\n" + |
| 68 | + " ╠╡ ╳ ╞╣ ├╢ ╟┤ ├┼─┼─┼┤ ├╫─╂─╫┤ ┣┿╾┼╼┿┫ ┕┛┖┚ ┌┄┄┐ ╎ ┏┅┅┓ ┋ ▍ ╲╱╲╱╳╳╳\n" + |
| 69 | + " ║│╱ ╲│║ │║ ║│ ││ │ ││ │║ ┃ ║│ ┃│ ╽ │┃ ░░▒▒▓▓██ ┊ ┆ ╎ ╏ ┇ ┋ ▎\n" + |
| 70 | + " ║└─╥─┘║ │╚═╤═╝│ │╘═╪═╛│ │╙─╀─╜│ ┃└─╂─┘┃ ░░▒▒▓▓██ ┊ ┆ ╎ ╏ ┇ ┋ ▏\n" + |
| 71 | + " ╚══╩══╝ └──┴──┘ ╰──┴──╯ ╰──┴──╯ ┗━━┻━━┛ └╌╌┘ ╎ ┗╍╍┛ ┋ ▁▂▃▄▅▆▇█\n"); |
| 72 | + |
| 73 | + } |
| 74 | + |
| 75 | + @Test // https://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html |
| 76 | + public void testInvalidStrings() |
| 77 | + { |
| 78 | + // continuation bytes only |
| 79 | + assertInvalidUtf8String(0x80); |
| 80 | + assertInvalidUtf8String(0xbf); |
| 81 | + assertInvalidUtf8String(0x80,0x80); |
| 82 | + // Bad trailing bytes |
| 83 | + assertInvalidUtf8String(0xF0, 0xA4, 0xAD, 0x7F); |
| 84 | + assertInvalidUtf8String(0xF0, 0xA4, 0xAD, 0x7F); |
| 85 | + // first bytes of 2-byte sequences (0xc0-0xdf), each followed by a space character |
| 86 | + assertInvalidUtf8String(0xc0, ' ', 0xdf, ' '); |
| 87 | + // first bytes of 3-byte sequences (0xe0-0xef), each followed by a space character |
| 88 | + assertInvalidUtf8String(0xe0, ' ', 0xe1, ' '); |
| 89 | + // first bytes of 4-byte sequences (0xf0-0xf7), each followed by a space character |
| 90 | + assertInvalidUtf8String(0xf0, ' ', 0xf7, ' '); |
| 91 | + // first bytes of 5-byte sequences (0xf8-0xfb), each followed by a space character |
| 92 | + assertInvalidUtf8String(0xf8, ' ', 0xfb, ' '); |
| 93 | + // first bytes of 6-byte sequences (0xfc-0xfd), each followed by a space character |
| 94 | + assertInvalidUtf8String(0xfc, ' ', 0xfd, ' '); |
| 95 | + // Impossible bytes |
| 96 | + assertInvalidUtf8String(0xfe); |
| 97 | + assertInvalidUtf8String(0xff); |
| 98 | + assertInvalidUtf8String(0xfe, 0xfe, 0xff, 0xff); |
| 99 | + // Sequences with last continuation byte missing |
| 100 | + assertInvalidUtf8String(0xc0); |
| 101 | + assertInvalidUtf8String(0xe0, 0x80); |
| 102 | + // Maximum overlong sequences |
| 103 | + assertInvalidUtf8String(0xc1, 0xbf); |
| 104 | + |
| 105 | + // 'ASCII' + continuation byte at the end |
| 106 | + assertInvalidUtf8String(0x41, 0x53, 0x43, 0x49, 0x49, 0x80); |
| 107 | + } |
| 108 | + |
| 109 | + public void assertValidUtf8String(String value) |
| 110 | + { |
| 111 | + byte[] byteArrayValue = value.getBytes(StandardCharsets.UTF_8); |
| 112 | + ByteBuffer bufferValue = ByteBuffer.wrap(byteArrayValue); |
| 113 | + ByteBuffer bufferValueInTheMiddle = ByteBuffer.allocate(byteArrayValue.length + 2 + 2); |
| 114 | + wrapValueWithImpossibleBytes(bufferValueInTheMiddle, byteArrayValue); |
| 115 | + |
| 116 | + ByteBuffer directBufferValue = ByteBuffer.allocateDirect(byteArrayValue.length); |
| 117 | + directBufferValue.put(byteArrayValue); |
| 118 | + directBufferValue.rewind(); |
| 119 | + ByteBuffer directBufferValueInTheMiddle = ByteBuffer.allocate(byteArrayValue.length + 2 + 2); |
| 120 | + wrapValueWithImpossibleBytes(directBufferValueInTheMiddle, byteArrayValue); |
| 121 | + |
| 122 | + assertTrue(UTF8Serializer.UTF8Validator.validate(byteArrayValue, ByteArrayAccessor.instance)); |
| 123 | + |
| 124 | + assertTrue(UTF8Serializer.UTF8Validator.validate(bufferValue, ByteBufferAccessor.instance)); |
| 125 | + assertTrue(UTF8Serializer.UTF8Validator.validate(bufferValueInTheMiddle, ByteBufferAccessor.instance)); |
| 126 | + |
| 127 | + assertTrue(UTF8Serializer.UTF8Validator.validate(directBufferValue, ByteBufferAccessor.instance)); |
| 128 | + assertTrue(UTF8Serializer.UTF8Validator.validate(bufferValueInTheMiddle, ByteBufferAccessor.instance)); |
| 129 | + } |
| 130 | + |
| 131 | + private static void wrapValueWithImpossibleBytes(ByteBuffer bufferValueInTheMiddle, byte[] byteArrayValue) |
| 132 | + { |
| 133 | + // bufferValue wrapped by impossible bytes |
| 134 | + // to ensure that validate method does not read outside of buffer boundaries |
| 135 | + bufferValueInTheMiddle.put((byte)0xfe); |
| 136 | + bufferValueInTheMiddle.put((byte)0xfe); |
| 137 | + bufferValueInTheMiddle.put(byteArrayValue); |
| 138 | + bufferValueInTheMiddle.put((byte)0xfe); |
| 139 | + bufferValueInTheMiddle.put((byte)0xfe); |
| 140 | + bufferValueInTheMiddle.rewind(); |
| 141 | + bufferValueInTheMiddle.position(2); |
| 142 | + bufferValueInTheMiddle.limit(bufferValueInTheMiddle.limit() - 2); |
| 143 | + } |
| 144 | + |
| 145 | + public void assertInvalidUtf8String(int ... bytes) |
| 146 | + { |
| 147 | + byte[] byteArrayValue = toByteArray(bytes); |
| 148 | + ByteBuffer bufferValue = ByteBuffer.wrap(byteArrayValue); |
| 149 | + ByteBuffer directBufferValue = ByteBuffer.allocateDirect(byteArrayValue.length); |
| 150 | + directBufferValue.put(byteArrayValue); |
| 151 | + directBufferValue.rewind(); |
| 152 | + |
| 153 | + assertFalse(UTF8Serializer.UTF8Validator.validate(byteArrayValue, ByteArrayAccessor.instance)); |
| 154 | + assertFalse(UTF8Serializer.UTF8Validator.validate(bufferValue, ByteBufferAccessor.instance)); |
| 155 | + assertFalse(UTF8Serializer.UTF8Validator.validate(directBufferValue, ByteBufferAccessor.instance)); |
| 156 | + } |
| 157 | + |
| 158 | + private static byte[] toByteArray(int... bytes) |
| 159 | + { |
| 160 | + byte[] value = new byte[bytes.length]; |
| 161 | + for (int i = 0; i < bytes.length; i++) |
| 162 | + value[i] = (byte) bytes[i]; |
| 163 | + return value; |
| 164 | + } |
| 165 | +} |
0 commit comments