blob: bacff2522ea34866149b8d8a4b4c9c593a861bb5 [file] [log] [blame]
//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2016 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See http://swift.org/LICENSE.txt for license information
// See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//
// Conversions between different Unicode encodings. The UTF-8, UTF-16 and
// UTF-32 decoders below all report ill-formed input by returning
// `UnicodeDecodingResult.Error`.
/// The outcome of a single Unicode decoding step.
///
/// Exactly one of: a decoded unicode scalar value, an indication that no
/// more unicode scalars are available, or an indication of a decoding error.
public enum UnicodeDecodingResult {
  /// A unicode scalar value was decoded.
  case Result(UnicodeScalar)
  /// No more unicode scalars are available.
  case EmptyInput
  /// A decoding error was encountered.
  case Error

  /// Return true if `self` indicates no more unicode scalars are
  /// available.
  @warn_unused_result
  public func isEmptyInput() -> Bool {
    if case .EmptyInput = self {
      return true
    }
    return false
  }
}
/// A Unicode [encoding scheme](http://www.unicode.org/glossary/#character_encoding_scheme).
///
/// Consists of an underlying [code unit](http://www.unicode.org/glossary/#code_unit) and functions to
/// translate between sequences of these code units and [unicode scalar values](http://www.unicode.org/glossary/#unicode_scalar_value).
public protocol UnicodeCodecType {
/// A type that can hold [code unit](http://www.unicode.org/glossary/#code_unit) values for this
/// encoding.
typealias CodeUnit
/// Create a codec instance. Decoding is a `mutating` operation, so a
/// conforming type may keep decoder state in the instance.
init()
/// Start or continue decoding a UTF sequence.
///
/// In order to decode a code unit sequence completely, this function should
/// be called repeatedly until it returns `UnicodeDecodingResult.EmptyInput`.
/// Checking that the generator was exhausted is not sufficient. The decoder
/// can have an internal buffer that is pre-filled with data from the input
/// generator.
///
/// Because of buffering, it is impossible to find the corresponding position
/// in the generator for a given returned `UnicodeScalar` or an error.
///
/// - parameter next: A *generator* of code units to be decoded.
/// - returns: The result of this decoding step: a decoded scalar, an
///   indication that the input is empty, or an indication of an error.
mutating func decode<
G : GeneratorType where G.Element == CodeUnit
>(inout next: G) -> UnicodeDecodingResult
/// Encode a `UnicodeScalar` as a series of `CodeUnit`s by
/// calling `output` on each `CodeUnit`.
///
/// - parameter input: The scalar to encode.
/// - parameter output: A function that receives each resulting `CodeUnit`.
static func encode(input: UnicodeScalar, output: (CodeUnit) -> Void)
}
/// A codec for [UTF-8](http://www.unicode.org/glossary/#UTF_8).
public struct UTF8 : UnicodeCodecType {
/// A type that can hold [code unit](http://www.unicode.org/glossary/#code_unit) values for this
/// encoding.
public typealias CodeUnit = UInt8
public init() {}
/// Returns how many trailing bytes are expected after the first byte `cu0`:
/// 0, 1, 2 or 3. Returns 4 when `cu0` cannot start a valid UTF-8 code unit
/// sequence.
@warn_unused_result
public static func _numTrailingBytes(cu0: CodeUnit) -> UInt8 {
  if _fastPath(cu0 & 0x80 == 0) {
    // 0x00 -- 0x7f: 1-byte sequences, nothing follows.
    return 0
  }

  // First-byte classification:
  //
  //   0xc0 -- 0xc1: invalid first byte.
  //   0xc2 -- 0xdf: 2-byte sequences.
  //   0xe0 -- 0xef: 3-byte sequences.
  //   0xf0 -- 0xf4: 4-byte sequences.
  //   0xf5 -- 0xff: invalid first byte.
  //
  // The classification is stored as a two-bit-per-entry lookup table split
  // across two 64-bit words: bit `i` of `highTable` is the high bit and bit
  // `i` of `lowTable` is the low bit of the entry for first byte `0xc0 + i`.
  //
  //   high | low | meaning
  //   -----+-----+----------------
  //     0  |  0  | 2-byte sequence
  //     0  |  1  | 3-byte sequence
  //     1  |  0  | 4-byte sequence
  //     1  |  1  | invalid
  //
  // Looking the answer up this way avoids branching on the byte ranges.

  //  ---------0xf?-------  ---------0xe?-------  ---------0xd?-------  ---------0xc?-------
  let lowTable: UInt64 =
    0b1111_1111__1110_0000__1111_1111__1111_1111__0000_0000__0000_0000__0000_0000__0000_0011
  let highTable: UInt64 =
    0b1111_1111__1111_1111__0000_0000__0000_0000__0000_0000__0000_0000__0000_0000__0000_0011

  // Bytes below 0xc0 (continuation bytes when used as a first byte) share
  // entry 0, which is marked invalid.
  let tableIndex: UInt64 = cu0 < 0xc0 ? 0 : UInt64(cu0 - 0xc0)
  let entryHigh = (highTable >> tableIndex) & 1
  let entryLow = (lowTable >> tableIndex) & 1
  return UInt8(1 + ((entryHigh << 1) | entryLow))
}
/// Lookahead buffer used for UTF-8 decoding. New bytes are inserted at LSB,
/// and bytes are read at MSB.
var _decodeLookahead: UInt32 = 0
/// Flags with layout: `0bxxxx_yyyy`.
///
/// `xxxx` is the EOF flag. A non-zero value means that the input generator
/// has signaled end of sequence. Out of the four bits, only one bit can be
/// set. The bit position records by how many bytes `_decodeLookahead` has
/// to be shifted left in order to bring the first unread byte to MSB:
/// `1000` -- 0 bytes (the first unread byte is already at MSB), `0100` --
/// 1 byte, `0010` -- 2 bytes, `0001` -- 3 bytes.
///
/// `yyyy` specifies how many bytes are valid in the lookahead buffer. Value
/// is expressed in unary code. Valid values: `1111` (4), `0111` (3),
/// `0011` (2), `0001` (1), `0000` (0).
///
/// This representation is crafted to allow one to consume a byte from a
/// buffer with a shift, and update flags with a single-bit right shift:
/// shifting the whole flags byte right by one removes one byte from the
/// unary count and, when the EOF flag is set, moves the EOF bit one position
/// to the right at the same time.
var _lookaheadFlags: UInt8 = 0
/// Return `true` if the first `length` bytes of `buffer`, read starting
/// from the LSB, form a well-formed UTF-8 code unit sequence.
///
/// Only multi-byte sequences (`length` of 2, 3 or 4) are handled here, and
/// the first byte is expected to have been classified by the caller.
@warn_unused_result
static func _isValidUTF8Impl(buffer: UInt32, length: UInt8) -> Bool {
  guard length >= 2 && length <= 4 else {
    _sanityCheckFailure("one-byte sequences should be handled in the caller")
  }

  // The fourth and third bytes, when present, must be plain continuation
  // bytes (0x80 -- 0xbf).
  if length == 4 {
    let fourth = UInt8((buffer >> 24) & 0xff)
    if fourth < 0x80 || fourth > 0xbf {
      return false
    }
  }
  if length >= 3 {
    let third = UInt8((buffer >> 16) & 0xff)
    if third < 0x80 || third > 0xbf {
      return false
    }
  }

  // The range allowed for the second byte depends on the first byte.
  let lead = UInt8(buffer & 0xff)
  let second = UInt8((buffer >> 8) & 0xff)
  let lowest: UInt8
  let highest: UInt8
  switch lead {
  case 0xe0:
    lowest = 0xa0
    highest = 0xbf
  case 0xed:
    lowest = 0x80
    highest = 0x9f
  case 0xf0:
    lowest = 0x90
    highest = 0xbf
  case 0xf4:
    lowest = 0x80
    highest = 0x8f
  default:
    _sanityCheck(lead >= 0xc2 && lead <= 0xf4,
      "invalid first bytes should be handled in the caller")
    lowest = 0x80
    highest = 0xbf
  }
  return second >= lowest && second <= highest
}
/// Return `true` if `buffer`, read starting from the LSB, begins with a
/// well-formed UTF-8 code unit sequence.
///
/// `validBytes` must have at least one of its low four bits set, i.e. the
/// buffer must not be empty.
@warn_unused_result
static func _isValidUTF8(buffer: UInt32, validBytes: UInt8) -> Bool {
  _sanityCheck(validBytes & 0b0000_1111 != 0,
    "input buffer should not be empty")

  let leadByte = UInt8(buffer & 0xff)
  let expectedTrailing = _numTrailingBytes(leadByte)

  if expectedTrailing == 0 {
    // A 1-byte sequence is always well-formed.
    return true
  }
  if expectedTrailing > 3 {
    // The first byte cannot start a sequence.
    return false
  }

  // We *don't* need to check whether the buffer actually contains at least
  // `expectedTrailing` more bytes. Here's why.
  //
  // If the buffer is not full -- contains fewer than 4 bytes, we are at
  // EOF, and the buffer will be padded with 0x00. Thus, an incomplete
  // code unit sequence just before EOF would be seen by code below as
  // padded with nuls. This sequence will be rejected by the logic in
  // `_isValidUTF8Impl`, because the nul byte is not a valid continuation
  // byte for UTF-8.
  return _isValidUTF8Impl(buffer, length: expectedTrailing + 1)
}
/// Given an ill-formed sequence, find the length of its maximal subpart.
///
/// - parameter buffer: The code units, first byte at the LSB.
/// - parameter validBytes: Lookahead flags; only the low four bits (the
///   unary count of valid bytes) are used.
/// - returns: 1, 2 or 3 -- the number of bytes in the maximal subpart.
@inline(never)
@warn_unused_result
static func _findMaximalSubpartOfIllFormedUTF8Sequence(
  buffer: UInt32, validBytes: UInt8) -> UInt8 {
  // This function is '@inline(never)' because it is used only in the error
  // handling path.

  // Drop the EOF flag, it plays no role here.
  let unaryCount = validBytes & 0b0000_1111

  _sanityCheck(unaryCount != 0,
    "input buffer should not be empty")
  _sanityCheck(!UTF8._isValidUTF8(buffer, validBytes: unaryCount),
    "input sequence should be ill-formed UTF-8")

  // Unicode 6.3.0, D93b:
  //
  //     Maximal subpart of an ill-formed subsequence: The longest code unit
  //     subsequence starting at an unconvertible offset that is either:
  //     a. the initial subsequence of a well-formed code unit sequence, or
  //     b. a subsequence of length one.
  //
  // The case analysis follows Unicode 6.3.0, Table 3-7. Well-Formed UTF-8
  // Byte Sequences.

  let cu0 = UInt8(buffer & 0xff)
  if cu0 >= 0xc2 && cu0 <= 0xdf {
    // First byte is valid, but we know that this code unit sequence is
    // invalid, so the maximal subpart has to end after the first byte.
    return 1
  }

  // `unaryCount` is unary, so shifting it right by N tells whether more
  // than N bytes are present.
  let hasSecondByte = (unaryCount >> 1) != 0
  let hasThirdByte = (unaryCount >> 2) != 0
  guard hasSecondByte else {
    return 1
  }

  let cu1 = UInt8((buffer >> 8) & 0xff)
  let cu2 = UInt8((buffer >> 16) & 0xff)

  // Range allowed for the second byte, and whether the first byte starts a
  // 4-byte sequence (only then can the subpart extend to a third byte).
  let secondLowest: UInt8
  let secondHighest: UInt8
  let startsFourByteSequence: Bool
  switch cu0 {
  case 0xe0:
    secondLowest = 0xa0
    secondHighest = 0xbf
    startsFourByteSequence = false
  case 0xe1...0xec, 0xee...0xef:
    secondLowest = 0x80
    secondHighest = 0xbf
    startsFourByteSequence = false
  case 0xed:
    secondLowest = 0x80
    secondHighest = 0x9f
    startsFourByteSequence = false
  case 0xf0:
    secondLowest = 0x90
    secondHighest = 0xbf
    startsFourByteSequence = true
  case 0xf1...0xf3:
    secondLowest = 0x80
    secondHighest = 0xbf
    startsFourByteSequence = true
  case 0xf4:
    secondLowest = 0x80
    secondHighest = 0x8f
    startsFourByteSequence = true
  default:
    _sanityCheck((cu0 >= 0x80 && cu0 <= 0xc1) || cu0 >= 0xf5,
      "case analysis above should have handled all valid first bytes")
    // There are no well-formed sequences that start with these bytes.
    // Maximal subpart is defined to have length 1 in these cases.
    return 1
  }

  guard cu1 >= secondLowest && cu1 <= secondHighest else {
    return 1
  }
  guard startsFourByteSequence && hasThirdByte else {
    return 2
  }
  return (cu2 >= 0x80 && cu2 <= 0xbf) ? 3 : 2
}
/// Start or continue decoding a UTF sequence.
///
/// In order to decode a code unit sequence completely, this function should
/// be called repeatedly until it returns `UnicodeDecodingResult.EmptyInput`.
/// Checking that the generator was exhausted is not sufficient. The decoder
/// can have an internal buffer that is pre-filled with data from the input
/// generator.
///
/// Because of buffering, it is impossible to find the corresponding position
/// in the generator for a given returned `UnicodeScalar` or an error.
///
/// - parameter next: A *generator* of code units to be decoded.
/// - returns: `.Result` with the decoded scalar, `.Error` when the buffered
///   bytes start with an ill-formed sequence (its maximal subpart is
///   consumed), or `.EmptyInput` when no bytes are left.
public mutating func decode<
G : GeneratorType where G.Element == CodeUnit
>(inout next: G) -> UnicodeDecodingResult {
// If the EOF flag is not set, fill the lookahead buffer from the input
// generator.
if _lookaheadFlags & 0b1111_0000 == 0 {
// Add more bytes into the buffer until we have 4.
while _lookaheadFlags != 0b0000_1111 {
if let codeUnit = next.next() {
// Insert the new byte at LSB and extend the unary byte count by one.
_decodeLookahead = (_decodeLookahead << 8) | UInt32(codeUnit)
_lookaheadFlags = (_lookaheadFlags << 1) | 1
} else {
// Set the EOF flag.
// The bit chosen encodes how far the unread bytes are from MSB; it is
// turned into a left shift of `_decodeLookahead` further below.
switch _lookaheadFlags & 0b0000_1111 {
case 0b1111:
_sanityCheckFailure("should have not entered buffer refill loop")
case 0b0111:
_lookaheadFlags |= 0b0100_0000
case 0b0011:
_lookaheadFlags |= 0b0010_0000
case 0b0001:
_lookaheadFlags |= 0b0001_0000
case 0b0000:
_lookaheadFlags |= 0b1000_0000
return .EmptyInput
default:
_sanityCheckFailure("bad value in _lookaheadFlags")
}
break
}
}
}
// No valid bytes left in the buffer (only reachable once EOF was seen).
if _slowPath(_lookaheadFlags & 0b0000_1111 == 0) {
return .EmptyInput
}
if _slowPath(_lookaheadFlags & 0b1111_0000 != 0) {
// Reached EOF. Restore the invariant: first unread byte is always at
// MSB.
switch _lookaheadFlags & 0b1111_0000 {
case 0b1000_0000:
break
case 0b0100_0000:
_decodeLookahead <<= 1 * 8
case 0b0010_0000:
_decodeLookahead <<= 2 * 8
case 0b0001_0000:
_decodeLookahead <<= 3 * 8
default:
_sanityCheckFailure("bad value in _lookaheadFlags")
}
// The buffer is aligned again, so reset the EOF bit to the "no shift
// needed" position while keeping the byte count.
_lookaheadFlags = (_lookaheadFlags & 0b0000_1111) | 0b1000_0000
}
// The first byte to read is located at MSB of `_decodeLookahead`. Get a
// representation of the buffer where we can read bytes starting from LSB.
var buffer = _decodeLookahead.byteSwapped
if _slowPath(!UTF8._isValidUTF8(buffer, validBytes: _lookaheadFlags)) {
// The code unit sequence is ill-formed. According to Unicode
// recommendation, replace the maximal subpart of ill-formed sequence
// with one replacement character.
// Shifting the flags right by the subpart length marks that many bytes
// as consumed.
_lookaheadFlags >>=
UTF8._findMaximalSubpartOfIllFormedUTF8Sequence(buffer,
validBytes: _lookaheadFlags)
return .Error
}
// At this point we know that `buffer` starts with a well-formed code unit
// sequence. Decode it.
//
// When consuming bytes from the `buffer`, we just need to update
// `_lookaheadFlags`. The stored buffer in `_decodeLookahead` will be
// shifted at the beginning of the next decoding cycle.
let cu0 = UInt8(buffer & 0xff)
buffer >>= 8
_lookaheadFlags >>= 1
if cu0 < 0x80 {
// 1-byte sequences.
return .Result(UnicodeScalar(UInt32(cu0)))
}
// Start with octet 1 (we'll mask off high bits later).
var result = UInt32(cu0)
let cu1 = UInt8(buffer & 0xff)
buffer >>= 8
_lookaheadFlags >>= 1
// Each continuation byte contributes its low six bits.
result = (result << 6) | UInt32(cu1 & 0x3f)
if cu0 < 0xe0 {
// 2-byte sequences.
return .Result(UnicodeScalar(result & 0x000007ff)) // 11 bits
}
let cu2 = UInt8(buffer & 0xff)
buffer >>= 8
_lookaheadFlags >>= 1
result = (result << 6) | UInt32(cu2 & 0x3f)
if cu0 < 0xf0 {
// 3-byte sequences.
return .Result(UnicodeScalar(result & 0x0000ffff)) // 16 bits
}
// 4-byte sequences.
let cu3 = UInt8(buffer & 0xff)
_lookaheadFlags >>= 1
result = (result << 6) | UInt32(cu3 & 0x3f)
return .Result(UnicodeScalar(result & 0x001fffff)) // 21 bits
}
/// Encode a `UnicodeScalar` as a series of `CodeUnit`s by
/// calling `output` on each `CodeUnit`.
///
/// The code units are passed to `output` in sequence order, first byte
/// first; one to four code units are produced depending on the scalar
/// value.
public static func encode(
  input: UnicodeScalar,
  output put: (CodeUnit) -> Void
) {
  let value = UInt32(input)

  // Continuation bytes (10xxxxxx) carrying bits 0-5, 6-11 and 12-17 of the
  // scalar value. Not every one of them is used by every sequence length.
  let tail0 = UInt8(value & 0x3F) | 0x80
  let tail1 = UInt8((value >> 6) & 0x3F) | 0x80
  let tail2 = UInt8((value >> 12) & 0x3F) | 0x80

  if value < UInt32(1<<7) {
    // 0xxxxxxx
    put(UInt8(value))
  } else if value < UInt32(1<<11) {
    put(UInt8(value >> 6) | 0xC0) // 110xxxxx
    put(tail0)
  } else if value < UInt32(1<<16) {
    put(UInt8(value >> 12) | 0xE0) // 1110xxxx
    put(tail1)
    put(tail0)
  } else {
    put(UInt8(value >> 18) | 0xF0) // 11110xxx
    put(tail2)
    put(tail1)
    put(tail0)
  }
}
/// Return `true` if `byte` is a continuation byte of the form
/// `0b10xxxxxx`.
@warn_unused_result
public static func isContinuation(byte: CodeUnit) -> Bool {
  // Keep only the two most significant bits and compare them with `10`.
  let topTwoBits = byte & 0b1100_0000
  return topTwoBits == 0b1000_0000
}
// NOTE(review): nothing in the code visible here reads or writes `_value`;
// presumably it is kept for layout or legacy reasons -- confirm before
// removing.
var _value = UInt8()
}
/// A codec for [UTF-16](http://www.unicode.org/glossary/#UTF_16).
public struct UTF16 : UnicodeCodecType {
/// A type that can hold [code unit](http://www.unicode.org/glossary/#code_unit) values for this
/// encoding.
public typealias CodeUnit = UInt16
public init() {}
/// A lookahead buffer for one UTF-16 code unit.
///
/// `decode` stores here a code unit that followed a high-surrogate but was
/// not a low-surrogate, so that the next call starts from that code unit.
var _decodeLookahead: UInt32 = 0
/// Flags with layout: `0b0000_00xy`.
///
/// `y` (`0b01`) is the EOF flag.
///
/// `x` (`0b10`) is set when `_decodeLookahead` contains a code unit.
var _lookaheadFlags: UInt8 = 0
/// Start or continue decoding a UTF sequence.
///
/// In order to decode a code unit sequence completely, this function should
/// be called repeatedly until it returns `UnicodeDecodingResult.EmptyInput`.
/// Checking that the generator was exhausted is not sufficient. The decoder
/// can hold one code unit that was already taken from the input generator.
///
/// Because of buffering, it is impossible to find the corresponding position
/// in the generator for a given returned `UnicodeScalar` or an error.
///
/// - parameter input: A *generator* of code units to be decoded.
/// - returns: `.Result` for a non-surrogate code unit or a well-formed
///   surrogate pair, `.Error` for an unpaired surrogate, or `.EmptyInput`
///   once the input has ended.
public mutating func decode<
  G : GeneratorType where G.Element == CodeUnit
>(inout input: G) -> UnicodeDecodingResult {
  // Bits of `_lookaheadFlags`.
  let eofBit: UInt8 = 0b01
  let bufferedBit: UInt8 = 0b10

  if _lookaheadFlags & eofBit != 0 {
    return .EmptyInput
  }

  // Note: maximal subpart of ill-formed sequence for UTF-16 can only have
  // length 1.  Length 0 does not make sense.  Neither does length 2 -- in
  // that case the sequence is valid.

  // Obtain the first code unit, either fresh from the input or from the
  // lookahead buffer.
  let lead: UInt32
  if _fastPath(_lookaheadFlags & bufferedBit == 0) {
    guard let fresh = input.next() else {
      // Set EOF flag.
      _lookaheadFlags |= eofBit
      return .EmptyInput
    }
    lead = UInt32(fresh)
  } else {
    // The buffered code unit is consumed now; only the EOF bit survives.
    lead = _decodeLookahead
    _lookaheadFlags &= eofBit
  }

  // A well-formed pair of surrogates looks like this:
  //     [1101 10ww wwxx xxxx] [1101 11xx xxxx xxxx]

  if _fastPath((lead >> 11) != 0b1101_1) {
    // Not a surrogate at all -- a sequence of 1 code unit that maps
    // directly to a scalar.
    return .Result(UnicodeScalar(lead))
  }

  if _slowPath((lead >> 10) == 0b1101_11) {
    // A low-surrogate cannot come first. Ill-formed sequence.
    return .Error
  }

  // From here on `lead` is known to be a high-surrogate.
  guard let following = input.next() else {
    // EOF reached right after a high-surrogate: remember EOF and report
    // the ill-formed sequence.
    _lookaheadFlags |= eofBit
    return .Error
  }
  let trail = UInt32(following)

  if _fastPath((trail >> 10) == 0b1101_11) {
    // High-surrogate followed by low-surrogate: a well-formed pair.
    let result = 0x10000 + (((lead & 0x03ff) << 10) | (trail & 0x03ff))
    return .Result(UnicodeScalar(result))
  }

  // Otherwise the sequence is ill-formed. Either `trail` is another
  // high-surrogate, or it is not a surrogate at all. In both cases `trail`
  // may start a new sequence, so keep it in the lookahead buffer for the
  // next call.
  _decodeLookahead = trail
  _lookaheadFlags |= bufferedBit
  return .Error
}
/// Try to decode one Unicode scalar, and return the actual number of code
/// units it spanned in the input.  This function may consume more code
/// units than required for this scalar.
///
/// The reported span is `UTF16.width` of the scalar for a successful
/// decode, 0 for empty input and 1 for an error.
mutating func _decodeOne<
  G : GeneratorType where G.Element == CodeUnit
>(inout input: G) -> (UnicodeDecodingResult, Int) {
  let decoded = decode(&input)
  let spannedCodeUnits: Int
  switch decoded {
  case .Result(let scalar):
    spannedCodeUnits = UTF16.width(scalar)
  case .EmptyInput:
    spannedCodeUnits = 0
  case .Error:
    spannedCodeUnits = 1
  }
  return (decoded, spannedCodeUnits)
}
/// Encode a `UnicodeScalar` as a series of `CodeUnit`s by
/// calling `output` on each `CodeUnit`.
///
/// Scalars that fit in one code unit are emitted as is; all others are
/// emitted as a surrogate pair, high-surrogate first.
public static func encode(
  input: UnicodeScalar,
  output put: (CodeUnit) -> Void
) {
  let value: UInt32 = UInt32(input)

  guard value > UInt32(UInt16.max) else {
    put(UInt16(value))
    return
  }

  // 0xd800 with the 0x10000 bias (already shifted down by 10) folded in,
  // so that the high ten bits can be added without a separate subtraction.
  let leadBias = UInt32(0xd800) - UInt32(0x10000 >> 10)
  put(UInt16(leadBias + (value >> 10)))
  put(UInt16(0xdc00 + (value & 0x3ff)))
}
// NOTE(review): nothing in the code visible here reads or writes `_value`;
// presumably it is kept for layout or legacy reasons -- confirm before
// removing.
var _value = UInt16()
}
/// A codec for [UTF-32](http://www.unicode.org/glossary/#UTF_32).
public struct UTF32 : UnicodeCodecType {
  /// A type that can hold [code unit](http://www.unicode.org/glossary/#code_unit) values for this
  /// encoding.
  public typealias CodeUnit = UInt32

  public init() {}

  /// Start or continue decoding a UTF sequence.
  ///
  /// In order to decode a code unit sequence completely, this function should
  /// be called repeatedly until it returns `UnicodeDecodingResult.EmptyInput`.
  ///
  /// Each call takes at most one code unit from `input`.
  ///
  /// - parameter input: A *generator* of code units to be decoded.
  /// - returns: `.Result` for a code unit that is a valid scalar value,
  ///   `.Error` for a surrogate or a value above `0x10ffff`, or
  ///   `.EmptyInput` when `input` is exhausted.
  public mutating func decode<
    G : GeneratorType where G.Element == CodeUnit
  >(inout input: G) -> UnicodeDecodingResult {
    return UTF32._decode(&input)
  }

  /// Stateless implementation of `decode`: take one code unit from `input`
  /// and classify it.
  static func _decode<
    G : GeneratorType where G.Element == CodeUnit
  >(inout input: G) -> UnicodeDecodingResult {
    guard let unit = input.next() else {
      return .EmptyInput
    }
    // Surrogates (top five of sixteen bits are 11011) and values beyond
    // 0x10ffff are not scalar values.
    let isSurrogate = (unit >> 11) == 0b1101_1
    let isInRange = unit <= 0x10ffff
    guard _fastPath(!isSurrogate && isInRange) else {
      return .Error
    }
    return .Result(UnicodeScalar(unit))
  }

  /// Encode a `UnicodeScalar` as a series of `CodeUnit`s by
  /// calling `output` on each `CodeUnit`.
  ///
  /// UTF-32 always produces exactly one code unit: the scalar value itself.
  public static func encode(
    input: UnicodeScalar,
    output put: (CodeUnit) -> Void
  ) {
    let scalarValue = UInt32(input)
    put(scalarValue)
  }
}
/// Translate `input`, in the given `InputEncoding`, into `output`, in
/// the given `OutputEncoding`.
///
/// - parameter inputEncoding: The codec type used to decode `input`.
/// - parameter outputEncoding: The codec type used to encode each scalar.
/// - parameter input: A *generator* of code units to be transcoded.
/// - parameter output: A function that receives each encoded code unit.
/// - parameter stopOnError: Causes encoding to stop when an encoding
///   error is detected in `input`, if `true`.  Otherwise, U+FFFD
///   replacement characters are inserted for each detected error.
/// - returns: `true` if at least one decoding error was detected in
///   `input`, `false` otherwise.
public func transcode<
  Input : GeneratorType,
  InputEncoding : UnicodeCodecType,
  OutputEncoding : UnicodeCodecType
  where InputEncoding.CodeUnit == Input.Element>(
  inputEncoding: InputEncoding.Type, _ outputEncoding: OutputEncoding.Type,
  _ input: Input, _ output: (OutputEncoding.CodeUnit) -> Void,
  stopOnError: Bool
) -> Bool {
  var input = input

  // NB.  It is not possible to optimize this routine to a memcpy if
  // InputEncoding == OutputEncoding.  The reason is that memcpy will not
  // substitute U+FFFD replacement characters for ill-formed sequences.

  var inputDecoder = inputEncoding.init()
  var hadError = false

  // Decode until the decoder reports empty input. Checking the generator
  // alone is not enough because the decoder may hold buffered code units.
  decoding:
  while true {
    switch inputDecoder.decode(&input) {
    case .Result(let us):
      OutputEncoding.encode(us, output: output)
    case .EmptyInput:
      break decoding
    case .Error:
      if stopOnError {
        return true
      }
      OutputEncoding.encode("\u{fffd}", output: output)
      hadError = true
    }
  }
  return hadError
}
/// Transcode UTF-16 to UTF-8, replacing ill-formed sequences with U+FFFD.
///
/// Transcoding starts at `startIndex` and stops when the end of `input` is
/// reached, when the chunk is full, or when the UTF-8 encoding of the next
/// scalar would not fit into the remaining space of the chunk.
///
/// The UTF-8 bytes are packed into the chunk starting from the least
/// significant byte. Bytes of the chunk that were not filled are set to
/// `0xff`.
///
/// Returns the index of the first unhandled code unit and the UTF-8 data
/// that was encoded.
@warn_unused_result
internal func _transcodeSomeUTF16AsUTF8<
Input : CollectionType
where Input.Generator.Element == UInt16>(
input: Input, _ startIndex: Input.Index
) -> (Input.Index, _StringCore.UTF8Chunk) {
typealias UTF8Chunk = _StringCore.UTF8Chunk
let endIndex = input.endIndex
// Capacity of the chunk, in bytes.
let utf8Max = sizeof(UTF8Chunk.self)
var result: UTF8Chunk = 0
var utf8Count = 0
var nextIndex = startIndex
while nextIndex != input.endIndex && utf8Count != utf8Max {
let u = UInt(input[nextIndex])
// Bit position of the next free byte in `result`.
let shift = UTF8Chunk(utf8Count * 8)
// Number of UTF-16 code units consumed by this iteration.
var utf16Length: Input.Index.Distance = 1
if _fastPath(u <= 0x7f) {
// ASCII: one code unit becomes one byte.
result |= UTF8Chunk(u) << shift
utf8Count += 1
} else {
var scalarUtf8Length: Int
// UTF-8 bytes of the current scalar, first byte in the least significant
// position.
var r: UInt
if _fastPath((u >> 11) != 0b1101_1) {
// Neither high-surrogate, nor low-surrogate -- well-formed sequence
// of 1 code unit, decoding is trivial.
if u < 0x800 {
// 2-byte sequence: 110xxxxx 10xxxxxx.
r = 0b10__00_0000__110__0_0000
r |= u >> 6
r |= (u & 0b11_1111) << 8
scalarUtf8Length = 2
}
else {
// 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx.
r = 0b10__00_0000__10__00_0000__1110__0000
r |= u >> 12
r |= ((u >> 6) & 0b11_1111) << 8
r |= (u & 0b11_1111) << 16
scalarUtf8Length = 3
}
} else {
let unit0 = u
if _slowPath((unit0 >> 10) == 0b1101_11) {
// `unit0` is a low-surrogate. We have an ill-formed sequence.
// Replace it with U+FFFD.
// (0xbdbfef is the byte sequence EF BF BD with the first byte lowest.)
r = 0xbdbfef
scalarUtf8Length = 3
} else if _slowPath(nextIndex.advancedBy(1) == endIndex) {
// We have seen a high-surrogate and EOF, so we have an ill-formed
// sequence. Replace it with U+FFFD.
r = 0xbdbfef
scalarUtf8Length = 3
} else {
let unit1 = UInt(input[nextIndex.advancedBy(1)])
if _fastPath((unit1 >> 10) == 0b1101_11) {
// `unit1` is a low-surrogate. We have a well-formed surrogate
// pair.
let v = 0x10000 + (((unit0 & 0x03ff) << 10) | (unit1 & 0x03ff))
// 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
r = 0b10__00_0000__10__00_0000__10__00_0000__1111_0__000
r |= v >> 18
r |= ((v >> 12) & 0b11_1111) << 8
r |= ((v >> 6) & 0b11_1111) << 16
r |= (v & 0b11_1111) << 24
scalarUtf8Length = 4
utf16Length = 2
} else {
// Otherwise, we have an ill-formed sequence. Replace it with
// U+FFFD.
// Only the high-surrogate is consumed; `unit1` is looked at again by
// the next iteration.
r = 0xbdbfef
scalarUtf8Length = 3
}
}
}
// Don't overrun the buffer
// `nextIndex` is left pointing at the current code unit, so the caller
// can resume from it.
if utf8Count + scalarUtf8Length > utf8Max {
break
}
result |= numericCast(r) << shift
utf8Count += scalarUtf8Length
}
nextIndex = nextIndex.advancedBy(utf16Length)
}
// FIXME: Annoying check, courtesy of <rdar://problem/16740169>
// Fill the unused high bytes of the chunk with 0xff. The check skips the
// shift when the chunk is completely full.
if utf8Count < sizeofValue(result) {
result |= ~0 << numericCast(utf8Count * 8)
}
return (nextIndex, result)
}
/// Instances of conforming types are used in internal `String`
/// representation.
///
/// A conforming type provides conversions between itself and a UTF-16 code
/// unit.
public // @testable
protocol _StringElementType {
/// Return the UTF-16 code unit corresponding to the given element.
@warn_unused_result
static func _toUTF16CodeUnit(_: Self) -> UTF16.CodeUnit
/// Return the element corresponding to the UTF-16 code unit `utf16`.
@warn_unused_result
static func _fromUTF16CodeUnit(utf16: UTF16.CodeUnit) -> Self
}
// UTF-16 code units are their own UTF-16 representation, so both
// conversions are the identity.
extension UTF16.CodeUnit : _StringElementType {
/// Return `x` unchanged.
public // @testable
static func _toUTF16CodeUnit(x: UTF16.CodeUnit) -> UTF16.CodeUnit {
return x
}
/// Return `utf16` unchanged.
public // @testable
static func _fromUTF16CodeUnit(
utf16: UTF16.CodeUnit
) -> UTF16.CodeUnit {
return utf16
}
}
// UTF-8 code units convert to and from UTF-16 code units by plain widening
// and narrowing; both directions sanity-check that the value is ASCII.
extension UTF8.CodeUnit : _StringElementType {
/// Return `x` widened to a UTF-16 code unit. `x` is expected to be ASCII
/// (`x <= 0x7f`).
public // @testable
static func _toUTF16CodeUnit(x: UTF8.CodeUnit) -> UTF16.CodeUnit {
_sanityCheck(x <= 0x7f, "should only be doing this with ASCII")
return UTF16.CodeUnit(x)
}
/// Return `utf16` narrowed to a UTF-8 code unit. `utf16` is expected to be
/// ASCII (`utf16 <= 0x7f`).
public // @testable
static func _fromUTF16CodeUnit(
utf16: UTF16.CodeUnit
) -> UTF8.CodeUnit {
_sanityCheck(utf16 <= 0x7f, "should only be doing this with ASCII")
return UTF8.CodeUnit(utf16)
}
}
extension UTF16 {
/// Return the number of code units required to encode `x`.
///
/// The result is 1 for scalar values up to `0xFFFF` and 2 for all others.
@warn_unused_result
public static func width(x: UnicodeScalar) -> Int {
  if x.value > 0xFFFF {
    return 2
  }
  return 1
}
/// Return the high surrogate code unit of a [surrogate pair](http://www.unicode.org/glossary/#surrogate_pair) representing
/// `x`.
///
/// - Requires: `width(x) == 2`.
@warn_unused_result
public static func leadSurrogate(x: UnicodeScalar) -> UTF16.CodeUnit {
  _precondition(width(x) == 2)
  // The high surrogate carries the upper ten bits of the offset from
  // 0x1_0000.
  let offset = x.value - 0x1_0000
  let upperTenBits = offset >> (10 as UInt32)
  return UTF16.CodeUnit(upperTenBits) + 0xD800
}
/// Return the low surrogate code unit of a [surrogate pair](http://www.unicode.org/glossary/#surrogate_pair) representing
/// `x`.
///
/// - Requires: `width(x) == 2`.
@warn_unused_result
public static func trailSurrogate(x: UnicodeScalar) -> UTF16.CodeUnit {
  _precondition(width(x) == 2)
  // The low surrogate carries the lower ten bits of the offset from
  // 0x1_0000.
  let offset = x.value - 0x1_0000
  let lowerTenBitsMask = ((1 as UInt32) << 10) - 1
  return UTF16.CodeUnit(offset & lowerTenBitsMask) + 0xDC00
}
/// Return `true` if `x` is a high (lead) surrogate code unit, i.e. lies in
/// `0xD800...0xDBFF`.
@warn_unused_result
public static func isLeadSurrogate(x: CodeUnit) -> Bool {
  return x >= 0xD800 && x <= 0xDBFF
}
/// Return `true` if `x` is a low (trail) surrogate code unit, i.e. lies in
/// `0xDC00...0xDFFF`.
@warn_unused_result
public static func isTrailSurrogate(x: CodeUnit) -> Bool {
  return x >= 0xDC00 && x <= 0xDFFF
}
/// Copy `count` elements from `source` to `destination`, converting each
/// element from `T` to `U` through a UTF-16 code unit.
///
/// When `T` and `U` have the same stride the elements are copied as raw
/// memory instead of being converted one by one.
public // @testable
static func _copy<T : _StringElementType, U : _StringElementType>(
  source: UnsafeMutablePointer<T>,
  destination: UnsafeMutablePointer<U>, count: Int
) {
  let sameStride = strideof(T.self) == strideof(U.self)
  guard !sameStride else {
    _memcpy(
      dest: UnsafeMutablePointer(destination),
      src: UnsafeMutablePointer(source),
      size: UInt(count) * UInt(strideof(U.self)))
    return
  }

  // Different element sizes: convert element by element.
  for i in 0..<count {
    destination[i] = U._fromUTF16CodeUnit(T._toUTF16CodeUnit(source[i]))
  }
}
/// Returns the number of UTF-16 code units required for the given code unit
/// sequence when transcoded to UTF-16, and a bit describing if the sequence
/// was found to contain only ASCII characters.
///
/// If `repairIllFormedSequences` is `true`, the function always succeeds,
/// and every ill-formed sequence is counted as one U+FFFD replacement
/// character. If it is `false`, `nil` is returned if an ill-formed code
/// unit sequence is found in `input`.
@warn_unused_result
public static func measure<
  Encoding : UnicodeCodecType, Input : GeneratorType
  where Encoding.CodeUnit == Input.Element
>(
  _: Encoding.Type, input: Input, repairIllFormedSequences: Bool
) -> (Int, Bool)? {
  var generator = input
  var decoder = Encoding()
  var utf16Count = 0
  var sawNonASCII = false

  // Width of the replacement character substituted for each ill-formed
  // sequence.
  let replacementWidth = width(UnicodeScalar(0xfffd))

  decoding:
  while true {
    switch decoder.decode(&generator) {
    case .EmptyInput:
      break decoding
    case .Error:
      guard repairIllFormedSequences else {
        return nil
      }
      sawNonASCII = true
      utf16Count += replacementWidth
    case .Result(let scalar):
      if scalar.value > 0x7f {
        sawNonASCII = true
      }
      utf16Count += width(scalar)
    }
  }
  return (utf16Count, !sawNonASCII)
}
}