java/com/google/flatbuffers/Utf8.java - third_party/flatbuffers - Git at Google

 /*
  * Copyright 2014 Google Inc. All rights reserved.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package com.google.flatbuffers;

 import java.nio.ByteBuffer;

 import static java.lang.Character.MAX_SURROGATE;
 import static java.lang.Character.MIN_SURROGATE;
 import static java.lang.Character.MIN_HIGH_SURROGATE;
 import static java.lang.Character.MIN_LOW_SURROGATE;
 import static java.lang.Character.MIN_SUPPLEMENTARY_CODE_POINT;
 import static java.lang.Character.isSurrogatePair;
 import static java.lang.Character.toCodePoint;

 /**
  * Base class for UTF-8 encoding and decoding of Java character data.
  *
  * <p>Subclasses supply the actual encode/decode strategy. A shared default instance is obtained
  * with {@link #getDefault()} and can be replaced with {@link #setDefault(Utf8)}.
  */
 public abstract class Utf8 {

   /**
    * Returns the number of bytes in the UTF-8-encoded form of {@code sequence}. For a string,
    * this method is equivalent to {@code string.getBytes(UTF_8).length}, but is more efficient in
    * both time and space.
    *
    * @param sequence the characters whose encoded size is wanted
    * @return the UTF-8 length of {@code sequence} in bytes
    * @throws IllegalArgumentException if {@code sequence} contains ill-formed UTF-16 (unpaired
    *     surrogates)
    */
   public abstract int encodedLength(CharSequence sequence);

   /**
    * Encodes the given characters to the target {@link ByteBuffer} using UTF-8 encoding.
    *
    * <p>Selects an optimal algorithm based on the type of {@link ByteBuffer} (i.e. heap or direct)
    * and the capabilities of the platform.
    *
    * @param in the source string to be encoded
    * @param out the target buffer to receive the encoded string.
    */
   public abstract void encodeUtf8(CharSequence in, ByteBuffer out);

   /**
    * Decodes the given UTF-8 portion of the {@link ByteBuffer} into a {@link String}.
    *
    * @param buffer the buffer holding the encoded bytes
    * @param offset offset within {@code buffer} at which the encoded bytes start
    * @param length number of encoded bytes to decode
    * @return the decoded string
    * @throws IllegalArgumentException if the input is not valid UTF-8.
    */
   public abstract String decodeUtf8(ByteBuffer buffer, int offset, int length);

   /** Shared default processor; {@code null} until first requested or explicitly set. */
   private static Utf8 defaultInstance;

   /**
    * Returns the default UTF-8 processor, creating a {@code Utf8Safe} on first use (or after the
    * default has been reset to {@code null}).
    *
    * <p>The field is read exactly once into a local, so the value returned is never {@code null}
    * even if another thread calls {@code setDefault(null)} while this method is running. No
    * locking is performed: concurrent first calls may each create their own instance.
    *
    * @return the default processor, never {@code null}
    */
   public static Utf8 getDefault() {
     Utf8 current = defaultInstance;
     if (current == null) {
       current = new Utf8Safe();
       defaultInstance = current;
     }
     return current;
   }

   /**
    * Sets the default instance of the UTF-8 processor.
    *
    * @param instance the new instance to use; passing {@code null} makes the next
    *     {@link #getDefault()} call create a fresh {@code Utf8Safe}
    */
   public static void setDefault(Utf8 instance) {
     defaultInstance = instance;
   }

   /**
    * Encodes the single code point that begins at {@code in.charAt(start)} as UTF-8.
    *
    * <p>The encoded bytes are written to {@code out} starting at index 0. A supplementary code
    * point consumes two chars of {@code in} (a surrogate pair) and produces four bytes; any other
    * code point consumes one char and produces one to three bytes.
    *
    * @param in CharSequence to be encoded
    * @param start index in {@code in} of the first char of the code point
    * @param out destination array; must have room for 4 bytes (verified only by an {@code assert})
    * @return the number of bytes written to {@code out}, or 0 if {@code start} is at or past the
    *     end of {@code in}
    * @throws IllegalArgumentException (an {@code UnpairedSurrogateException}) if the char at
    *     {@code start} is a surrogate that does not begin a valid high/low surrogate pair
    */
   public static int encodeUtf8CodePoint(CharSequence in, int start, byte[] out) {
     // A single code point occupies at most 4 bytes in UTF-8, so 4 slots always suffice.
     assert out.length >= 4;

     final int inLength = in.length();
     if (start >= inLength) {
       return 0;
     }

     final char c = in.charAt(start);
     if (c < 0x80) {
       // One byte (0xxx xxxx)
       out[0] = (byte) c;
       return 1;
     } else if (c < 0x800) {
       // Two bytes (110x xxxx 10xx xxxx)
       out[0] = (byte) (0xC0 | (c >>> 6));
       out[1] = (byte) (0x80 | (0x3F & c));
       return 2;
     } else if (c < MIN_SURROGATE || MAX_SURROGATE < c) {
       // Three bytes (1110 xxxx 10xx xxxx 10xx xxxx)
       // Maximum single-char code point is 0xFFFF, 16 bits.
       out[0] = (byte) (0xE0 | (c >>> 12));
       out[1] = (byte) (0x80 | (0x3F & (c >>> 6)));
       out[2] = (byte) (0x80 | (0x3F & c));
       return 3;
     } else {
       // Four bytes (1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx)
       // Minimum code point represented by a surrogate pair is 0x10000, 17 bits, four UTF-8
       // bytes
       final char low;
       // 'low' is only assigned (and later read) when a following char exists.
       if (start + 1 == inLength || !isSurrogatePair(c, (low = in.charAt(start + 1)))) {
         throw new UnpairedSurrogateException(start, inLength);
       }
       final int codePoint = toCodePoint(c, low);
       out[0] = (byte) ((0xF << 4) | (codePoint >>> 18));
       out[1] = (byte) (0x80 | (0x3F & (codePoint >>> 12)));
       out[2] = (byte) (0x80 | (0x3F & (codePoint >>> 6)));
       out[3] = (byte) (0x80 | (0x3F & codePoint));
       return 4;
     }
   }

   /**
    * Utility methods for decoding bytes into {@link String}. Callers are responsible for extracting
    * bytes (possibly using Unsafe methods), and checking remaining bytes. All other UTF-8 validity
    * checks and codepoint conversion happen in this class.
    *
    * <p>All byte comparisons below are signed: bytes 0x80-0xFF are negative, so for example
    * {@code b < (byte) 0xE0} holds exactly for the unsigned range 0x80-0xDF.
    */
   static class DecodeUtil {

     /**
      * Returns whether this is a single-byte codepoint (i.e., ASCII) with the form '0XXXXXXX'.
      */
     static boolean isOneByte(byte b) {
       return b >= 0;
     }

     /**
      * Returns whether {@code b} lies in the unsigned range 0x80-0xDF.
      *
      * <p>For a byte that is not {@link #isOneByte}, this selects the two-byte path, whose lead
      * bytes have the form '110XXXXX'. Continuation bytes ('10XXXXXX') and the overlong leads
      * 0xC0/0xC1 also satisfy this test; they are rejected by {@link #handleTwoBytes}.
      */
     static boolean isTwoBytes(byte b) {
       return b < (byte) 0xE0;
     }

     /**
      * Returns whether {@code b} lies in the unsigned range 0x80-0xEF.
      *
      * <p>This identifies a three-byte lead of the form '1110XXXX' only when the caller has
      * already ruled out {@link #isOneByte} and {@link #isTwoBytes} for the same byte.
      */
     static boolean isThreeBytes(byte b) {
       return b < (byte) 0xF0;
     }

     /**
      * Stores the one-byte codepoint {@code byte1} at {@code resultArr[resultPos]}.
      *
      * <p>No validation is done here; a negative byte would be sign-extended by the cast.
      */
     static void handleOneByte(byte byte1, char[] resultArr, int resultPos) {
       resultArr[resultPos] = (char) byte1;
     }

     /**
      * Decodes a two-byte sequence into one char stored at {@code resultArr[resultPos]}.
      *
      * @throws IllegalArgumentException if {@code byte1} is below 0xC2 (signed comparison) or
      *     {@code byte2} is not a continuation byte
      */
     static void handleTwoBytes(
         byte byte1, byte byte2, char[] resultArr, int resultPos)
         throws IllegalArgumentException {
       // Simultaneously checks for illegal trailing-byte in leading position (<= '11000000') and
       // overlong 2-byte, '11000001'.
       if (byte1 < (byte) 0xC2) {
         throw new IllegalArgumentException("Invalid UTF-8: Illegal leading byte in 2 bytes utf");
       }
       if (isNotTrailingByte(byte2)) {
         throw new IllegalArgumentException("Invalid UTF-8: Illegal trailing byte in 2 bytes utf");
       }
       resultArr[resultPos] = (char) (((byte1 & 0x1F) << 6) | trailingByteValue(byte2));
     }

     /**
      * Decodes a three-byte sequence into one char stored at {@code resultArr[resultPos]}.
      *
      * <p>Rejects non-continuation second/third bytes, overlong encodings (lead 0xE0 with a second
      * byte below 0xA0) and encoded surrogates (lead 0xED with a second byte of 0xA0 or above).
      * The lead byte itself is not range-checked here.
      *
      * @throws IllegalArgumentException if any of the checks above fails
      */
     static void handleThreeBytes(
         byte byte1, byte byte2, byte byte3, char[] resultArr, int resultPos)
         throws IllegalArgumentException {
       if (isNotTrailingByte(byte2)
           // overlong? 5 most significant bits must not all be zero
           || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
           // check for illegal surrogate codepoints
           || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)
           || isNotTrailingByte(byte3)) {
         throw new IllegalArgumentException("Invalid UTF-8");
       }
       resultArr[resultPos] =
           (char)
               (((byte1 & 0x0F) << 12)
                   | (trailingByteValue(byte2) << 6)
                   | trailingByteValue(byte3));
     }

     /**
      * Decodes a four-byte sequence into a surrogate pair stored at {@code resultArr[resultPos]}
      * (high surrogate) and {@code resultArr[resultPos + 1]} (low surrogate).
      *
      * @throws IllegalArgumentException if a trailing byte is not a continuation byte or the
      *     plane check on the first two bytes fails
      */
     static void handleFourBytes(
         byte byte1, byte byte2, byte byte3, byte byte4, char[] resultArr, int resultPos)
         throws IllegalArgumentException {
       if (isNotTrailingByte(byte2)
           // Check that 1 <= plane <= 16.  Tricky optimized form of:
           //   valid 4-byte leading byte?
           // if (byte1 > (byte) 0xF4 ||
           //   overlong? 4 most significant bits must not all be zero
           //     byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 ||
           //   codepoint larger than the highest code point (U+10FFFF)?
           //     byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
           || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
           || isNotTrailingByte(byte3)
           || isNotTrailingByte(byte4)) {
         throw new IllegalArgumentException("Invalid UTF-8");
       }
       final int codepoint =
           ((byte1 & 0x07) << 18)
               | (trailingByteValue(byte2) << 12)
               | (trailingByteValue(byte3) << 6)
               | trailingByteValue(byte4);
       resultArr[resultPos] = highSurrogate(codepoint);
       resultArr[resultPos + 1] = lowSurrogate(codepoint);
     }

     /**
      * Returns whether the byte is not a valid continuation of the form '10XXXXXX'.
      */
     private static boolean isNotTrailingByte(byte b) {
       // Continuation bytes 0x80-0xBF are the signed values -128..-65; anything greater is not one.
       return b > (byte) 0xBF;
     }

     /**
      * Returns the actual value of the trailing byte (removes the prefix '10') for composition.
      */
     private static int trailingByteValue(byte b) {
       return b & 0x3F;
     }

     /** Returns the high (leading) surrogate computed from the bits of {@code codePoint} above bit 9. */
     private static char highSurrogate(int codePoint) {
       return (char)
           ((MIN_HIGH_SURROGATE - (MIN_SUPPLEMENTARY_CODE_POINT >>> 10)) + (codePoint >>> 10));
     }

     /** Returns the low (trailing) surrogate computed from the low 10 bits of {@code codePoint}. */
     private static char lowSurrogate(int codePoint) {
       return (char) (MIN_LOW_SURROGATE + (codePoint & 0x3ff));
     }
   }

   /**
    * Thrown when a surrogate char is not part of a valid high/low surrogate pair.
    *
    * <p>Provenance: the UTF-8 handling methods here are copied from Guava's Utf8Unsafe class with a
    * modification to throw this dedicated exception. In protocol buffers the exception is caught
    * in CodedOutputStream so it can fall back to more lenient behavior; no such handler is visible
    * in this file. Because the type extends {@link IllegalArgumentException}, callers may catch
    * either type.
    */
   static class UnpairedSurrogateException extends IllegalArgumentException {
     /**
      * @param index position of the offending char in the input
      * @param length total length of the input
      */
     UnpairedSurrogateException(int index, int length) {
       super("Unpaired surrogate at index " + index + " of " + length);
     }
   }
 }
	/*
	* Copyright 2014 Google Inc. All rights reserved.
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package com.google.flatbuffers;

	import java.nio.ByteBuffer;

	import static java.lang.Character.MAX_SURROGATE;
	import static java.lang.Character.MIN_SURROGATE;
	import static java.lang.Character.MIN_HIGH_SURROGATE;
	import static java.lang.Character.MIN_LOW_SURROGATE;
	import static java.lang.Character.MIN_SUPPLEMENTARY_CODE_POINT;
	import static java.lang.Character.isSurrogatePair;
	import static java.lang.Character.toCodePoint;

	public abstract class Utf8 {

	/**
	* Returns the number of bytes in the UTF-8-encoded form of {@code sequence}. For a string,
	* this method is equivalent to {@code string.getBytes(UTF_8).length}, but is more efficient in
	* both time and space.
	*
	* @throws IllegalArgumentException if {@code sequence} contains ill-formed UTF-16 (unpaired
	* surrogates)
	*/
	public abstract int encodedLength(CharSequence sequence);

	/**
	* Encodes the given characters to the target {@link ByteBuffer} using UTF-8 encoding.
	*
	* <p>Selects an optimal algorithm based on the type of {@link ByteBuffer} (i.e. heap or direct)
	* and the capabilities of the platform.
	*
	* @param in the source string to be encoded
	* @param out the target buffer to receive the encoded string.
	*/
	public abstract void encodeUtf8(CharSequence in, ByteBuffer out);

	/**
	* Decodes the given UTF-8 portion of the {@link ByteBuffer} into a {@link String}.
	*
	* @throws IllegalArgumentException if the input is not valid UTF-8.
	*/
	public abstract String decodeUtf8(ByteBuffer buffer, int offset, int length);

	private static Utf8 DEFAULT;

	/**
	* Get the default UTF-8 processor.
	* @return the default processor
	*/
	public static Utf8 getDefault() {
	if (DEFAULT == null) {
	DEFAULT = new Utf8Safe();
	}
	return DEFAULT;
	}

	/**
	* Set the default instance of the UTF-8 processor.
	* @param instance the new instance to use
	*/
	public static void setDefault(Utf8 instance) {
	DEFAULT = instance;
	}

	/**
	* Encode a Java's CharSequence UTF8 codepoint into a byte array.
	* @param in CharSequence to be encoded
	* @param start start position of the first char in the codepoint
	* @param out byte array of 4 bytes to be filled
	* @return return the amount of bytes occupied by the codepoint
	*/
	public static int encodeUtf8CodePoint(CharSequence in, int start, byte[] out) {
	// utf8 codepoint needs at least 4 bytes
	assert out.length >= 4;

	final int inLength = in.length();
	if (start >= inLength) {
	return 0;
	}

	char c = in.charAt(start);
	if (c < 0x80) {
	// One byte (0xxx xxxx)
	out[0] = (byte) c;
	return 1;
	} else if (c < 0x800) {
	// Two bytes (110x xxxx 10xx xxxx)
	out[0] = (byte) (0xC0 \| (c >>> 6));
	out[1] = (byte) (0x80 \| (0x3F & c));
	return 2;
	} else if (c < MIN_SURROGATE \|\| MAX_SURROGATE < c) {
	// Three bytes (1110 xxxx 10xx xxxx 10xx xxxx)
	// Maximum single-char code point is 0xFFFF, 16 bits.
	out[0] = (byte) (0xE0 \| (c >>> 12));
	out[1] =(byte) (0x80 \| (0x3F & (c >>> 6)));
	out[2] = (byte) (0x80 \| (0x3F & c));
	return 3;
	} else {
	// Four bytes (1111 xxxx 10xx xxxx 10xx xxxx 10xx xxxx)
	// Minimum code point represented by a surrogate pair is 0x10000, 17 bits, four UTF-8
	// bytes
	final char low;
	if (start + 1 == inLength \|\| !isSurrogatePair(c, (low = in.charAt(start+1)))) {
	throw new UnpairedSurrogateException(start, inLength);
	}
	int codePoint = toCodePoint(c, low);
	out[0] = (byte) ((0xF << 4) \| (codePoint >>> 18));
	out[1] = (byte) (0x80 \| (0x3F & (codePoint >>> 12)));
	out[2] = (byte) (0x80 \| (0x3F & (codePoint >>> 6)));
	out[3] = (byte) (0x80 \| (0x3F & codePoint));
	return 4;
	}
	}

	/**
	* Utility methods for decoding bytes into {@link String}. Callers are responsible for extracting
	* bytes (possibly using Unsafe methods), and checking remaining bytes. All other UTF-8 validity
	* checks and codepoint conversion happen in this class.
	*/
	static class DecodeUtil {

	/**
	* Returns whether this is a single-byte codepoint (i.e., ASCII) with the form '0XXXXXXX'.
	*/
	static boolean isOneByte(byte b) {
	return b >= 0;
	}

	/**
	* Returns whether this is a two-byte codepoint with the form '10XXXXXX'.
	*/
	static boolean isTwoBytes(byte b) {
	return b < (byte) 0xE0;
	}

	/**
	* Returns whether this is a three-byte codepoint with the form '110XXXXX'.
	*/
	static boolean isThreeBytes(byte b) {
	return b < (byte) 0xF0;
	}

	static void handleOneByte(byte byte1, char[] resultArr, int resultPos) {
	resultArr[resultPos] = (char) byte1;
	}

	static void handleTwoBytes(
	byte byte1, byte byte2, char[] resultArr, int resultPos)
	throws IllegalArgumentException {
	// Simultaneously checks for illegal trailing-byte in leading position (<= '11000000') and
	// overlong 2-byte, '11000001'.
	if (byte1 < (byte) 0xC2) {
	throw new IllegalArgumentException("Invalid UTF-8: Illegal leading byte in 2 bytes utf");
	}
	if (isNotTrailingByte(byte2)) {
	throw new IllegalArgumentException("Invalid UTF-8: Illegal trailing byte in 2 bytes utf");
	}
	resultArr[resultPos] = (char) (((byte1 & 0x1F) << 6) \| trailingByteValue(byte2));
	}

	static void handleThreeBytes(
	byte byte1, byte byte2, byte byte3, char[] resultArr, int resultPos)
	throws IllegalArgumentException {
	if (isNotTrailingByte(byte2)
	// overlong? 5 most significant bits must not all be zero
	\|\| (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
	// check for illegal surrogate codepoints
	\|\| (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)
	\|\| isNotTrailingByte(byte3)) {
	throw new IllegalArgumentException("Invalid UTF-8");
	}
	resultArr[resultPos] = (char)
	(((byte1 & 0x0F) << 12) \| (trailingByteValue(byte2) << 6) \| trailingByteValue(byte3));
	}

	static void handleFourBytes(
	byte byte1, byte byte2, byte byte3, byte byte4, char[] resultArr, int resultPos)
	throws IllegalArgumentException{
	if (isNotTrailingByte(byte2)
	// Check that 1 <= plane <= 16. Tricky optimized form of:
	// valid 4-byte leading byte?
	// if (byte1 > (byte) 0xF4 \|\|
	// overlong? 4 most significant bits must not all be zero
	// byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 \|\|
	// codepoint larger than the highest code point (U+10FFFF)?
	// byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
	\|\| (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
	\|\| isNotTrailingByte(byte3)
	\|\| isNotTrailingByte(byte4)) {
	throw new IllegalArgumentException("Invalid UTF-8");
	}
	int codepoint = ((byte1 & 0x07) << 18)
	\| (trailingByteValue(byte2) << 12)
	\| (trailingByteValue(byte3) << 6)
	\| trailingByteValue(byte4);
	resultArr[resultPos] = DecodeUtil.highSurrogate(codepoint);
	resultArr[resultPos + 1] = DecodeUtil.lowSurrogate(codepoint);
	}

	/**
	* Returns whether the byte is not a valid continuation of the form '10XXXXXX'.
	*/
	private static boolean isNotTrailingByte(byte b) {
	return b > (byte) 0xBF;
	}

	/**
	* Returns the actual value of the trailing byte (removes the prefix '10') for composition.
	*/
	private static int trailingByteValue(byte b) {
	return b & 0x3F;
	}

	private static char highSurrogate(int codePoint) {
	return (char) ((MIN_HIGH_SURROGATE - (MIN_SUPPLEMENTARY_CODE_POINT >>> 10))
	+ (codePoint >>> 10));
	}

	private static char lowSurrogate(int codePoint) {
	return (char) (MIN_LOW_SURROGATE + (codePoint & 0x3ff));
	}
	}

	// These UTF-8 handling methods are copied from Guava's Utf8Unsafe class with a modification to throw
	// a protocol buffer local exception. This exception is then caught in CodedOutputStream so it can
	// fallback to more lenient behavior.
	static class UnpairedSurrogateException extends IllegalArgumentException {
	UnpairedSurrogateException(int index, int length) {
	super("Unpaired surrogate at index " + index + " of " + length);
	}
	}
	}