| // Protocol Buffers - Google's data interchange format |
| // Copyright 2008 Google Inc. All rights reserved. |
| // https://developers.google.com/protocol-buffers/ |
| // |
| // Redistribution and use in source and binary forms, with or without |
| // modification, are permitted provided that the following conditions are |
| // met: |
| // |
| // * Redistributions of source code must retain the above copyright |
| // notice, this list of conditions and the following disclaimer. |
| // * Redistributions in binary form must reproduce the above |
| // copyright notice, this list of conditions and the following disclaimer |
| // in the documentation and/or other materials provided with the |
| // distribution. |
| // * Neither the name of Google Inc. nor the names of its |
| // contributors may be used to endorse or promote products derived from |
| // this software without specific prior written permission. |
| // |
| // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| package com.google.protobuf; |
| |
| import static java.lang.Character.MIN_HIGH_SURROGATE; |
| import static java.lang.Character.MIN_LOW_SURROGATE; |
| import static java.lang.Character.MIN_SURROGATE; |
| |
| import java.util.Random; |
| |
/**
 * Utilities for benchmarking UTF-8.
 *
 * <p>All generators in this class seed their {@link Random} with the same fixed value, so repeated
 * calls with the same arguments produce the same strings.
 */
final class Utf8Utils {
  /** Fixed seed shared by every generator so that benchmark inputs are reproducible. */
  private static final long SEED = 99;

  private Utf8Utils() {}

  /**
   * An upper bound on generated code points, parsed from a user-friendly string.
   *
   * <p>{@link #value} is handed to {@link Random#nextInt(int)} as its bound by {@link
   * Utf8Utils#randomString}, so it is exclusive: generated code points lie in {@code [0, value)}.
   */
  static class MaxCodePoint {
    final int value;

    /**
     * Convert the input string to a code point. Accepts regular decimal numerals, hex strings, and
     * some symbolic names meaningful to humans.
     *
     * @param userFriendly a number understood by {@link Integer#decode(String)} or one of the
     *     symbolic names matched below (case-insensitive)
     * @return the decoded bound
     * @throws IllegalArgumentException if the input is neither a number nor a known name
     */
    private static int decode(String userFriendly) {
      try {
        return Integer.decode(userFriendly);
      } catch (NumberFormatException ignored) {
        // Not numeric: fall through to the symbolic names. The branches are ordered so that
        // "Western European" is recognised before the broader "European".
        if (userFriendly.matches("(?i)(?:American|English|ASCII)")) {
          // 1-byte UTF-8 sequences - "American" ASCII text
          return 0x80;
        } else if (userFriendly.matches("(?i)(?:Danish|Latin|Western.*European)")) {
          // Mostly 1-byte UTF-8 sequences, mixed with occasional 2-byte
          // sequences - "Western European" text
          return 0x90;
        } else if (userFriendly.matches("(?i)(?:Greek|Cyrillic|European|ISO.?8859)")) {
          // Mostly 2-byte UTF-8 sequences - "European" text
          return 0x800;
        } else if (userFriendly.matches("(?i)(?:Chinese|Han|Asian|BMP)")) {
          // Mostly 3-byte UTF-8 sequences - "Asian" text
          return Character.MIN_SUPPLEMENTARY_CODE_POINT;
        } else if (userFriendly.matches("(?i)(?:Cuneiform|rare|exotic|supplementary.*)")) {
          // Mostly 4-byte UTF-8 sequences - "rare exotic" text
          return Character.MAX_CODE_POINT;
        } else {
          throw new IllegalArgumentException("Can't decode codepoint " + userFriendly);
        }
      }
    }

    /**
     * Static factory equivalent to {@link #MaxCodePoint(String)}.
     *
     * @param userFriendly see {@link #MaxCodePoint(String)}
     * @return a new instance holding the decoded bound
     */
    public static MaxCodePoint valueOf(String userFriendly) {
      return new MaxCodePoint(userFriendly);
    }

    /**
     * Parses {@code userFriendly} into a code point bound.
     *
     * @param userFriendly a decimal/hex/octal number or a symbolic name such as {@code "ASCII"},
     *     {@code "European"}, {@code "BMP"} or {@code "supplementary"}
     * @throws IllegalArgumentException if the input cannot be decoded
     */
    public MaxCodePoint(String userFriendly) {
      value = decode(userFriendly);
    }
  }

  /**
   * The Utf8 distribution of real data. The distribution is an array with length 4.
   * "distribution[i]" means the total number of characters that are encoded with (i + 1) bytes.
   *
   * <p>GMM_UTF8_DISTRIBUTION is the distribution of the gmm data set. GSR_UTF8_DISTRIBUTION is the
   * distribution of the gsreq/gsresp data set.
   */
  public enum Utf8Distribution {
    GMM_UTF8_DISTRIBUTION {
      @Override
      public int[] getDistribution() {
        return new int[] {53059, 104, 0, 0};
      }
    },
    GSR_UTF8_DISTRIBUTION {
      @Override
      public int[] getDistribution() {
        return new int[] {119458, 74, 2706, 0};
      }
    };

    /**
     * Returns the per-byte-width character counts.
     *
     * @return an array of length 4; both constants above allocate a new array on each call, so
     *     callers may modify the result
     */
    public abstract int[] getDistribution();
  }

  /**
   * Creates an array of random strings.
   *
   * @param stringCount the number of strings to be created.
   * @param charCount the number of characters (code points) per string.
   * @param maxCodePoint the exclusive upper bound for the code points in the strings.
   * @return an array of random strings.
   */
  static String[] randomStrings(int stringCount, int charCount, MaxCodePoint maxCodePoint) {
    final Random rnd = new Random(SEED);
    String[] strings = new String[stringCount];
    for (int i = 0; i < stringCount; i++) {
      strings[i] = randomString(rnd, charCount, maxCodePoint);
    }
    return strings;
  }

  /**
   * Creates a random string.
   *
   * @param rnd the random generator.
   * @param charCount the number of characters (code points) per string.
   * @param maxCodePoint the exclusive upper bound for the code points in the string.
   * @return a string of {@code charCount} code points, none of which is a surrogate.
   */
  static String randomString(Random rnd, int charCount, MaxCodePoint maxCodePoint) {
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < charCount; i++) {
      int codePoint;
      // Lone surrogates are not encodable as UTF-8, so redraw until we get a scalar value.
      do {
        codePoint = rnd.nextInt(maxCodePoint.value);
      } while (Utf8Utils.isSurrogate(codePoint));
      sb.appendCodePoint(codePoint);
    }
    return sb.toString();
  }

  /**
   * Returns whether {@code c} lies in the surrogate range. Character.isSurrogate was added in
   * Java SE 7.
   */
  static boolean isSurrogate(int c) {
    return Character.MIN_HIGH_SURROGATE <= c && c <= Character.MAX_LOW_SURROGATE;
  }

  /**
   * Creates an array of random strings according to UTF8 distribution.
   *
   * <p>Every generated character is a fixed representative of its byte width (U+007F, U+07FF,
   * U+D7FF or U+10000); only the choice of width is random. A 4-byte character occupies two Java
   * {@code char}s, so {@link String#length()} may exceed {@code charCount}.
   *
   * @param stringCount the number of strings to be created.
   * @param charCount the number of characters (code points) per string.
   * @param utf8Distribution the relative frequency of 1-, 2-, 3- and 4-byte characters.
   * @return an array of {@code stringCount} strings, each of exactly {@code charCount} code points.
   * @throws IllegalArgumentException if a character has to be generated and the distribution's
   *     counts do not sum to a positive number.
   */
  static String[] randomStringsWithDistribution(
      int stringCount, int charCount, Utf8Distribution utf8Distribution) {
    // Turn the per-width counts into cumulative counts in place:
    // cumulative[k] = number of characters that need at most (k + 1) bytes.
    final int[] cumulative = utf8Distribution.getDistribution();
    for (int i = 0; i < 3; i++) {
      cumulative[i + 1] += cumulative[i];
    }
    final Random rnd = new Random(SEED);
    String[] strings = new String[stringCount];
    for (int i = 0; i < stringCount; i++) {
      StringBuilder sb = new StringBuilder();
      for (int j = 0; j < charCount; j++) {
        // The draw only selects a bucket (with probability proportional to its count); it is not
        // itself a code point and must not be appended to the string.
        int draw = rnd.nextInt(cumulative[3]);
        // The (char) casts matter: without them StringBuilder.append(int) would append the
        // decimal digits of the number instead of a single character.
        if (draw < cumulative[0]) {
          // 1 byte
          sb.append((char) 0x7F);
        } else if (draw < cumulative[1]) {
          // 2 bytes
          sb.append((char) 0x7FF);
        } else if (draw < cumulative[2]) {
          // 3 bytes
          sb.append((char) (MIN_SURROGATE - 1));
        } else {
          // 4 bytes: a surrogate pair encoding U+10000
          sb.append(MIN_HIGH_SURROGATE);
          sb.append(MIN_LOW_SURROGATE);
        }
      }
      strings[i] = sb.toString();
    }
    return strings;
  }
}