| // Protocol Buffers - Google's data interchange format |
| // Copyright 2008 Google Inc. All rights reserved. |
| // https://developers.google.com/protocol-buffers/ |
| // |
| // Redistribution and use in source and binary forms, with or without |
| // modification, are permitted provided that the following conditions are |
| // met: |
| // |
| // * Redistributions of source code must retain the above copyright |
| // notice, this list of conditions and the following disclaimer. |
| // * Redistributions in binary form must reproduce the above |
| // copyright notice, this list of conditions and the following disclaimer |
| // in the documentation and/or other materials provided with the |
| // distribution. |
| // * Neither the name of Google Inc. nor the names of its |
| // contributors may be used to endorse or promote products derived from |
| // this software without specific prior written permission. |
| // |
| // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| package com.google.protobuf; |
| |
| import static org.junit.Assert.assertEquals; |
| import static org.junit.Assert.assertFalse; |
| import static org.junit.Assert.assertSame; |
| import static org.junit.Assert.assertTrue; |
| import static org.junit.Assert.fail; |
| |
| import java.lang.ref.SoftReference; |
| import java.nio.ByteBuffer; |
| import java.nio.CharBuffer; |
| import java.nio.charset.CharsetDecoder; |
| import java.nio.charset.CharsetEncoder; |
| import java.nio.charset.CoderResult; |
| import java.nio.charset.CodingErrorAction; |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.List; |
| import java.util.Random; |
| import java.util.logging.Logger; |
| |
| /** |
| * Shared testing code for {@link IsValidUtf8Test} and {@link IsValidUtf8FourByteTest}. |
| * |
| * @author jonp@google.com (Jon Perlow) |
| * @author martinrb@google.com (Martin Buchholz) |
| */ |
| final class IsValidUtf8TestUtil { |
| private static Logger logger = Logger.getLogger(IsValidUtf8TestUtil.class.getName()); |
| |
| private IsValidUtf8TestUtil() {} |
| |
| static interface ByteStringFactory { |
| ByteString newByteString(byte[] bytes); |
| } |
| |
| static final ByteStringFactory LITERAL_FACTORY = |
| new ByteStringFactory() { |
| @Override |
| public ByteString newByteString(byte[] bytes) { |
| return ByteString.wrap(bytes); |
| } |
| }; |
| |
| static final ByteStringFactory HEAP_NIO_FACTORY = |
| new ByteStringFactory() { |
| @Override |
| public ByteString newByteString(byte[] bytes) { |
| return new NioByteString(ByteBuffer.wrap(bytes)); |
| } |
| }; |
| |
| private static ThreadLocal<SoftReference<ByteBuffer>> directBuffer = |
| new ThreadLocal<SoftReference<ByteBuffer>>(); |
| |
| /** |
| * Factory for direct {@link ByteBuffer} instances. To reduce direct memory usage, this uses a |
| * thread local direct buffer. This means that each call will overwrite the buffer's contents from |
| * the previous call, so the calling code must be careful not to continue using a buffer returned |
| * from a previous invocation. |
| */ |
| static final ByteStringFactory DIRECT_NIO_FACTORY = |
| new ByteStringFactory() { |
| @Override |
| public ByteString newByteString(byte[] bytes) { |
| SoftReference<ByteBuffer> ref = directBuffer.get(); |
| ByteBuffer buffer = ref == null ? null : ref.get(); |
| if (buffer == null || buffer.capacity() < bytes.length) { |
| buffer = ByteBuffer.allocateDirect(bytes.length); |
| directBuffer.set(new SoftReference<ByteBuffer>(buffer)); |
| } |
| buffer.clear(); |
| buffer.put(bytes); |
| buffer.flip(); |
| return new NioByteString(buffer); |
| } |
| }; |
| |
| // 128 - [chars 0x0000 to 0x007f] |
| static final long ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x007f - 0x0000 + 1; |
| |
| // 128 |
| static final long EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT = ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS; |
| |
| // 1920 [chars 0x0080 to 0x07FF] |
| static final long TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x07FF - 0x0080 + 1; |
| |
| // 18,304 |
| static final long EXPECTED_TWO_BYTE_ROUNDTRIPPABLE_COUNT = |
| // Both bytes are one byte characters |
| (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 2) |
| + |
| // The possible number of two byte characters |
| TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS; |
| |
| // 2048 |
| static final long THREE_BYTE_SURROGATES = 2 * 1024; |
| |
| // 61,440 [chars 0x0800 to 0xFFFF, minus surrogates] |
| static final long THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS = |
| 0xFFFF - 0x0800 + 1 - THREE_BYTE_SURROGATES; |
| |
| // 2,650,112 |
| static final long EXPECTED_THREE_BYTE_ROUNDTRIPPABLE_COUNT = |
| // All one byte characters |
| (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 3) |
| + |
| // One two byte character and a one byte character |
| 2 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS |
| + |
| // Three byte characters |
| THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS; |
| |
| // 1,048,576 [chars 0x10000L to 0x10FFFF] |
| static final long FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x10FFFF - 0x10000L + 1; |
| |
| // 289,571,839 |
| static final long EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT = |
| // All one byte characters |
| (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 4) |
| + |
| // One and three byte characters |
| 2 * THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS |
| + |
| // Two two byte characters |
| TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS |
| + |
| // Permutations of one and two byte characters |
| 3 |
| * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS |
| * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS |
| * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS |
| + |
| // Four byte characters |
| FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS; |
| |
| static final class Shard { |
| final long index; |
| final long start; |
| final long lim; |
| final long expected; |
| |
| public Shard(long index, long start, long lim, long expected) { |
| assertTrue(start < lim); |
| this.index = index; |
| this.start = start; |
| this.lim = lim; |
| this.expected = expected; |
| } |
| } |
| |
| static final long[] FOUR_BYTE_SHARDS_EXPECTED_ROUNTRIPPABLES = |
| generateFourByteShardsExpectedRunnables(); |
| |
| private static long[] generateFourByteShardsExpectedRunnables() { |
| long[] expected = new long[128]; |
| |
| // 0-63 are all 5300224 |
| for (int i = 0; i <= 63; i++) { |
| expected[i] = 5300224; |
| } |
| |
| // 97-111 are all 2342912 |
| for (int i = 97; i <= 111; i++) { |
| expected[i] = 2342912; |
| } |
| |
| // 113-117 are all 1048576 |
| for (int i = 113; i <= 117; i++) { |
| expected[i] = 1048576; |
| } |
| |
| // One offs |
| expected[112] = 786432; |
| expected[118] = 786432; |
| expected[119] = 1048576; |
| expected[120] = 458752; |
| expected[121] = 524288; |
| expected[122] = 65536; |
| |
| // Anything not assigned was the default 0. |
| return expected; |
| } |
| |
| static final List<Shard> FOUR_BYTE_SHARDS = |
| generateFourByteShards(128, FOUR_BYTE_SHARDS_EXPECTED_ROUNTRIPPABLES); |
| |
| private static List<Shard> generateFourByteShards(int numShards, long[] expected) { |
| assertEquals(numShards, expected.length); |
| List<Shard> shards = new ArrayList<Shard>(numShards); |
| long lim = 1L << 32; |
| long increment = lim / numShards; |
| assertTrue(lim % numShards == 0); |
| for (int i = 0; i < numShards; i++) { |
| shards.add(new Shard(i, increment * i, increment * (i + 1), expected[i])); |
| } |
| return shards; |
| } |
| |
| /** |
| * Helper to run the loop to test all the permutations for the number of bytes specified. |
| * |
| * @param factory the factory for {@link ByteString} instances. |
| * @param numBytes the number of bytes in the byte array |
| * @param expectedCount the expected number of roundtrippable permutations |
| */ |
| static void testBytes(ByteStringFactory factory, int numBytes, long expectedCount) { |
| testBytes(factory, numBytes, expectedCount, 0, -1); |
| } |
| |
| /** |
| * Helper to run the loop to test all the permutations for the number of bytes specified. This |
| * overload is useful for debugging to get the loop to start at a certain character. |
| * |
| * @param factory the factory for {@link ByteString} instances. |
| * @param numBytes the number of bytes in the byte array |
| * @param expectedCount the expected number of roundtrippable permutations |
| * @param start the starting bytes encoded as a long as big-endian |
| * @param lim the limit of bytes to process encoded as a long as big-endian, or -1 to mean the max |
| * limit for numBytes |
| */ |
| static void testBytes( |
| ByteStringFactory factory, int numBytes, long expectedCount, long start, long lim) { |
| Random rnd = new Random(); |
| byte[] bytes = new byte[numBytes]; |
| |
| if (lim == -1) { |
| lim = 1L << (numBytes * 8); |
| } |
| long count = 0; |
| long countRoundTripped = 0; |
| for (long byteChar = start; byteChar < lim; byteChar++) { |
| long tmpByteChar = byteChar; |
| for (int i = 0; i < numBytes; i++) { |
| bytes[bytes.length - i - 1] = (byte) tmpByteChar; |
| tmpByteChar = tmpByteChar >> 8; |
| } |
| ByteString bs = factory.newByteString(bytes); |
| boolean isRoundTrippable = bs.isValidUtf8(); |
| String s = new String(bytes, Internal.UTF_8); |
| byte[] bytesReencoded = s.getBytes(Internal.UTF_8); |
| boolean bytesEqual = Arrays.equals(bytes, bytesReencoded); |
| |
| if (bytesEqual != isRoundTrippable) { |
| outputFailure(byteChar, bytes, bytesReencoded); |
| } |
| |
| // Check agreement with static Utf8 methods. |
| assertEquals(isRoundTrippable, Utf8.isValidUtf8(bytes)); |
| assertEquals(isRoundTrippable, Utf8.isValidUtf8(bytes, 0, numBytes)); |
| |
| try { |
| assertEquals(s, Utf8.decodeUtf8(bytes, 0, numBytes)); |
| } catch (InvalidProtocolBufferException e) { |
| if (isRoundTrippable) { |
| System.out.println("Could not decode utf-8"); |
| outputFailure(byteChar, bytes, bytesReencoded); |
| } |
| } |
| |
| // Test partial sequences. |
| // Partition numBytes into three segments (not necessarily non-empty). |
| int i = rnd.nextInt(numBytes); |
| int j = rnd.nextInt(numBytes); |
| if (j < i) { |
| int tmp = i; |
| i = j; |
| j = tmp; |
| } |
| int state1 = Utf8.partialIsValidUtf8(Utf8.COMPLETE, bytes, 0, i); |
| int state2 = Utf8.partialIsValidUtf8(state1, bytes, i, j); |
| int state3 = Utf8.partialIsValidUtf8(state2, bytes, j, numBytes); |
| if (isRoundTrippable != (state3 == Utf8.COMPLETE)) { |
| System.out.printf("state=%04x %04x %04x i=%d j=%d%n", state1, state2, state3, i, j); |
| outputFailure(byteChar, bytes, bytesReencoded); |
| } |
| assertEquals(isRoundTrippable, (state3 == Utf8.COMPLETE)); |
| |
| // Test ropes built out of small partial sequences |
| ByteString rope = |
| RopeByteString.newInstanceForTest( |
| bs.substring(0, i), |
| RopeByteString.newInstanceForTest(bs.substring(i, j), bs.substring(j, numBytes))); |
| assertSame(RopeByteString.class, rope.getClass()); |
| |
| ByteString[] byteStrings = {bs, bs.substring(0, numBytes), rope}; |
| for (ByteString x : byteStrings) { |
| assertEquals(isRoundTrippable, x.isValidUtf8()); |
| assertEquals(state3, x.partialIsValidUtf8(Utf8.COMPLETE, 0, numBytes)); |
| |
| assertEquals(state1, x.partialIsValidUtf8(Utf8.COMPLETE, 0, i)); |
| assertEquals(state1, x.substring(0, i).partialIsValidUtf8(Utf8.COMPLETE, 0, i)); |
| assertEquals(state2, x.partialIsValidUtf8(state1, i, j - i)); |
| assertEquals(state2, x.substring(i, j).partialIsValidUtf8(state1, 0, j - i)); |
| assertEquals(state3, x.partialIsValidUtf8(state2, j, numBytes - j)); |
| assertEquals(state3, x.substring(j, numBytes).partialIsValidUtf8(state2, 0, numBytes - j)); |
| } |
| |
| // ByteString reduplication should not affect its UTF-8 validity. |
| ByteString ropeADope = RopeByteString.newInstanceForTest(bs, bs.substring(0, numBytes)); |
| assertEquals(isRoundTrippable, ropeADope.isValidUtf8()); |
| |
| if (isRoundTrippable) { |
| countRoundTripped++; |
| } |
| count++; |
| if (byteChar != 0 && byteChar % 1000000L == 0) { |
| logger.info("Processed " + (byteChar / 1000000L) + " million characters"); |
| } |
| } |
| logger.info("Round tripped " + countRoundTripped + " of " + count); |
| assertEquals(expectedCount, countRoundTripped); |
| } |
| |
| /** |
| * Variation of {@link #testBytes} that does less allocation using the low-level encoders/decoders |
| * directly. Checked in because it's useful for debugging when trying to process bytes faster, but |
| * since it doesn't use the actual String class, it's possible for incompatibilities to develop |
| * (although unlikely). |
| * |
| * @param factory the factory for {@link ByteString} instances. |
| * @param numBytes the number of bytes in the byte array |
| * @param expectedCount the expected number of roundtrippable permutations |
| * @param start the starting bytes encoded as a long as big-endian |
| * @param lim the limit of bytes to process encoded as a long as big-endian, or -1 to mean the max |
| * limit for numBytes |
| */ |
| static void testBytesUsingByteBuffers( |
| ByteStringFactory factory, int numBytes, long expectedCount, long start, long lim) { |
| CharsetDecoder decoder = |
| Internal.UTF_8 |
| .newDecoder() |
| .onMalformedInput(CodingErrorAction.REPLACE) |
| .onUnmappableCharacter(CodingErrorAction.REPLACE); |
| CharsetEncoder encoder = |
| Internal.UTF_8 |
| .newEncoder() |
| .onMalformedInput(CodingErrorAction.REPLACE) |
| .onUnmappableCharacter(CodingErrorAction.REPLACE); |
| byte[] bytes = new byte[numBytes]; |
| int maxChars = (int) (decoder.maxCharsPerByte() * numBytes) + 1; |
| char[] charsDecoded = new char[(int) (decoder.maxCharsPerByte() * numBytes) + 1]; |
| int maxBytes = (int) (encoder.maxBytesPerChar() * maxChars) + 1; |
| byte[] bytesReencoded = new byte[maxBytes]; |
| |
| ByteBuffer bb = ByteBuffer.wrap(bytes); |
| CharBuffer cb = CharBuffer.wrap(charsDecoded); |
| ByteBuffer bbReencoded = ByteBuffer.wrap(bytesReencoded); |
| if (lim == -1) { |
| lim = 1L << (numBytes * 8); |
| } |
| long count = 0; |
| long countRoundTripped = 0; |
| for (long byteChar = start; byteChar < lim; byteChar++) { |
| bb.rewind(); |
| bb.limit(bytes.length); |
| cb.rewind(); |
| cb.limit(charsDecoded.length); |
| bbReencoded.rewind(); |
| bbReencoded.limit(bytesReencoded.length); |
| encoder.reset(); |
| decoder.reset(); |
| long tmpByteChar = byteChar; |
| for (int i = 0; i < bytes.length; i++) { |
| bytes[bytes.length - i - 1] = (byte) tmpByteChar; |
| tmpByteChar = tmpByteChar >> 8; |
| } |
| boolean isRoundTrippable = factory.newByteString(bytes).isValidUtf8(); |
| CoderResult result = decoder.decode(bb, cb, true); |
| assertFalse(result.isError()); |
| result = decoder.flush(cb); |
| assertFalse(result.isError()); |
| |
| int charLen = cb.position(); |
| cb.rewind(); |
| cb.limit(charLen); |
| result = encoder.encode(cb, bbReencoded, true); |
| assertFalse(result.isError()); |
| result = encoder.flush(bbReencoded); |
| assertFalse(result.isError()); |
| |
| boolean bytesEqual = true; |
| int bytesLen = bbReencoded.position(); |
| if (bytesLen != numBytes) { |
| bytesEqual = false; |
| } else { |
| for (int i = 0; i < numBytes; i++) { |
| if (bytes[i] != bytesReencoded[i]) { |
| bytesEqual = false; |
| break; |
| } |
| } |
| } |
| if (bytesEqual != isRoundTrippable) { |
| outputFailure(byteChar, bytes, bytesReencoded, bytesLen); |
| } |
| |
| count++; |
| if (isRoundTrippable) { |
| countRoundTripped++; |
| } |
| if (byteChar != 0 && byteChar % 1000000 == 0) { |
| logger.info("Processed " + (byteChar / 1000000) + " million characters"); |
| } |
| } |
| logger.info("Round tripped " + countRoundTripped + " of " + count); |
| assertEquals(expectedCount, countRoundTripped); |
| } |
| |
| private static void outputFailure(long byteChar, byte[] bytes, byte[] after) { |
| outputFailure(byteChar, bytes, after, after.length); |
| } |
| |
| private static void outputFailure(long byteChar, byte[] bytes, byte[] after, int len) { |
| fail( |
| String.format( |
| "Failure: (%s) %s => %s", |
| Long.toHexString(byteChar), toHexString(bytes), toHexString(after, len))); |
| } |
| |
| private static String toHexString(byte[] b) { |
| return toHexString(b, b.length); |
| } |
| |
| private static String toHexString(byte[] b, int len) { |
| StringBuilder s = new StringBuilder(); |
| s.append("\""); |
| for (int i = 0; i < len; i++) { |
| if (i > 0) { |
| s.append(" "); |
| } |
| s.append(String.format("%02x", b[i] & 0xFF)); |
| } |
| s.append("\""); |
| return s.toString(); |
| } |
| } |