| //===----------------------------------------------------------------------===// |
| // |
| // This source file is part of the Swift.org open source project |
| // |
| // Copyright (c) 2014 - 2016 Apple Inc. and the Swift project authors |
| // Licensed under Apache License v2.0 with Runtime Library Exception |
| // |
| // See http://swift.org/LICENSE.txt for license information |
| // See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors |
| // |
| //===----------------------------------------------------------------------===// |
| |
| |
// Conversions between different Unicode encodings. The UTF-8, UTF-16 and
// UTF-32 decoders below all report ill-formed input as
// `UnicodeDecodingResult.Error`.
| |
/// The outcome of a single Unicode decoding step.
///
/// Holds exactly one of: a decoded unicode scalar value, an indication that
/// no more unicode scalars are available, or an indication of a decoding
/// error.
public enum UnicodeDecodingResult {
  /// A unicode scalar value was decoded.
  case Result(UnicodeScalar)
  /// No more unicode scalars are available.
  case EmptyInput
  /// A decoding error was encountered.
  case Error

  /// Return true if `self` indicates no more unicode scalars are
  /// available.
  @warn_unused_result
  public func isEmptyInput() -> Bool {
    if case .EmptyInput = self {
      return true
    }
    return false
  }
}
| |
/// A Unicode [encoding scheme](http://www.unicode.org/glossary/#character_encoding_scheme).
///
/// A conforming type names its underlying
/// [code unit](http://www.unicode.org/glossary/#code_unit) type and supplies
/// the two functions that translate between sequences of those code units and
/// [unicode scalar values](http://www.unicode.org/glossary/#unicode_scalar_value).
public protocol UnicodeCodecType {

  /// A type that can hold
  /// [code unit](http://www.unicode.org/glossary/#code_unit) values for this
  /// encoding.
  typealias CodeUnit

  /// Create a decoder in its initial state.
  init()

  /// Start or continue decoding a UTF sequence.
  ///
  /// To decode a code unit sequence completely, call this function
  /// repeatedly until it returns `UnicodeDecodingResult.EmptyInput`.
  /// Observing that the generator is exhausted is not sufficient: the
  /// decoder can have an internal buffer that is pre-filled with data from
  /// the input generator.
  ///
  /// Because of that buffering, it is impossible to find the corresponding
  /// position in the generator for a given returned `UnicodeScalar` or an
  /// error.
  ///
  /// - parameter next: A *generator* of code units to be decoded.
  mutating func decode<G : GeneratorType where G.Element == CodeUnit>(
    inout next: G
  ) -> UnicodeDecodingResult

  /// Encode a `UnicodeScalar` as a series of `CodeUnit`s by
  /// calling `output` on each `CodeUnit`.
  static func encode(input: UnicodeScalar, output: (CodeUnit) -> Void)
}
| |
| /// A codec for [UTF-8](http://www.unicode.org/glossary/#UTF_8). |
| public struct UTF8 : UnicodeCodecType { |
| |
| /// A type that can hold [code unit](http://www.unicode.org/glossary/#code_unit) values for this |
| /// encoding. |
| public typealias CodeUnit = UInt8 |
| |
| public init() {} |
| |
/// Returns how many trailing bytes are expected after the first byte `cu0`:
/// 0, 1, 2 or 3.  Returns 4 if `cu0` cannot start a valid UTF-8 code unit
/// sequence.
@warn_unused_result
public static func _numTrailingBytes(cu0: CodeUnit) -> UInt8 {
  // 0x00 -- 0x7f: 1-byte sequences, no trailing bytes.
  if _fastPath(cu0 < 0x80) {
    return 0
  }

  // Classification of the remaining first bytes:
  //
  // 0x80 -- 0xc1: invalid first byte.
  // 0xc2 -- 0xdf: 2-byte sequences.
  // 0xe0 -- 0xef: 3-byte sequences.
  // 0xf0 -- 0xf4: 4-byte sequences.
  // 0xf5 -- 0xff: invalid first byte.
  //
  // The classification is stored as a branch-free lookup table made of two
  // 64-bit words indexed by `cu0 - 0xc0` (bytes below 0xc0 are clamped to
  // index 0, which is marked invalid).  `high` supplies the high bit of the
  // two-bit code and `low` supplies the low bit:
  //
  //   high | low | meaning
  //   -----+-----+----------------
  //     0  |  0  | 2-byte sequence
  //     0  |  1  | 3-byte sequence
  //     1  |  0  | 4-byte sequence
  //     1  |  1  | invalid

  //    ---------0xf?-------  ---------0xe?-------  ---------0xd?-------  ---------0xc?-------
  let low: UInt64 =
    0b1111_1111__1110_0000__1111_1111__1111_1111__0000_0000__0000_0000__0000_0000__0000_0011
  let high: UInt64 =
    0b1111_1111__1111_1111__0000_0000__0000_0000__0000_0000__0000_0000__0000_0000__0000_0011

  let tableIndex: UInt64 = cu0 > 0xc0 ? UInt64(cu0 - 0xc0) : 0
  let code = (((high >> tableIndex) & 1) << 1) | ((low >> tableIndex) & 1)

  // The two-bit code is one less than the number of trailing bytes.
  return UInt8(code + 1)
}
| |
| /// Lookahead buffer used for UTF-8 decoding. New bytes are inserted at LSB, |
| /// and bytes are read at MSB. |
| var _decodeLookahead: UInt32 = 0 |
| |
| /// Flags with layout: `0bxxxx_yyyy`. |
| /// |
| /// `xxxx` is the EOF flag. It means that the input generator has signaled |
| /// end of sequence. Out of the four bits, only one bit can be set. The bit |
| /// position specifies how many bytes have been consumed from the lookahead |
| /// buffer already. A value of `1000` means that there are `yyyy` bytes in |
| /// the buffer, `0100` means that there are `yyyy - 1` bytes, `0010` -- |
| /// `yyyy - 2`, `0001` -- `yyyy - 3`. |
| /// |
| /// `yyyy` specifies how many bytes are valid in the lookahead buffer. Value |
| /// is expressed in unary code. Valid values: `1111` (4), `0111` (3), |
| /// `0011` (2), `0001` (1), `0000` (0). |
| /// |
| /// This representation is crafted to allow one to consume a byte from a |
| /// buffer with a shift, and update flags with a single-bit right shift. |
| var _lookaheadFlags: UInt8 = 0 |
| |
/// Return `true` if the `length` least significant bytes of `buffer` form a
/// well-formed UTF-8 code unit sequence.
///
/// The first code unit is in the least significant byte.  `length` must be
/// 2, 3 or 4; one-byte sequences and invalid first bytes are expected to be
/// filtered out by the caller.
@warn_unused_result
static func _isValidUTF8Impl(buffer: UInt32, length: UInt8) -> Bool {
  switch length {
  case 2, 3, 4:
    break
  default:
    _sanityCheckFailure("one-byte sequences should be handled in the caller")
  }

  // The fourth and third code units, when present, only need to be ordinary
  // continuation bytes (0x80 -- 0xbf).
  if length == 4 {
    let cu3 = UInt8((buffer >> 24) & 0xff)
    if !(cu3 >= 0x80 && cu3 <= 0xbf) {
      return false
    }
  }
  if length >= 3 {
    let cu2 = UInt8((buffer >> 16) & 0xff)
    if !(cu2 >= 0x80 && cu2 <= 0xbf) {
      return false
    }
  }

  // The permitted range of the second code unit depends on the first one.
  let cu0 = UInt8(buffer & 0xff)
  let cu1 = UInt8((buffer >> 8) & 0xff)
  let lowest: UInt8
  let highest: UInt8
  switch cu0 {
  case 0xe0:
    lowest = 0xa0
    highest = 0xbf
  case 0xed:
    lowest = 0x80
    highest = 0x9f
  case 0xf0:
    lowest = 0x90
    highest = 0xbf
  case 0xf4:
    lowest = 0x80
    highest = 0x8f
  default:
    _sanityCheck(cu0 >= 0xc2 && cu0 <= 0xf4,
      "invalid first bytes should be handled in the caller")
    lowest = 0x80
    highest = 0xbf
  }
  return cu1 >= lowest && cu1 <= highest
}
| |
/// Return `true` if `buffer` starts, at its least significant byte, with a
/// well-formed UTF-8 code unit sequence.
///
/// `validBytes` carries the lookahead flags; its low four bits must not all
/// be zero (the buffer must not be empty).
@warn_unused_result
static func _isValidUTF8(buffer: UInt32, validBytes: UInt8) -> Bool {
  _sanityCheck(validBytes & 0b0000_1111 != 0,
    "input buffer should not be empty")

  let firstByte = UInt8(buffer & 0xff)
  let trailingBytes = _numTrailingBytes(firstByte)

  // A one-byte sequence is always well-formed.
  if trailingBytes == 0 {
    return true
  }

  // The first byte cannot start any well-formed sequence.
  if trailingBytes > 3 {
    return false
  }

  // We *don't* need to check that the buffer actually contains at least
  // `trailingBytes` more bytes.  Here's why.
  //
  // If the buffer is not full -- contains fewer than 4 bytes -- we are at
  // EOF, and the buffer will be padded with 0x00.  An incomplete code unit
  // sequence just before EOF is therefore seen by the code below as padded
  // with nuls, and `_isValidUTF8Impl` rejects it because the nul byte is not
  // a valid continuation byte for UTF-8.
  return _isValidUTF8Impl(buffer, length: trailingBytes + 1)
}
| |
/// Given an ill-formed sequence at the least significant end of `buffer`,
/// return the length (1, 2 or 3) of its maximal subpart.
///
/// `validBytes` carries the lookahead flags; only its low four bits (the
/// unary count of valid bytes) are used.
@inline(never)
@warn_unused_result
static func _findMaximalSubpartOfIllFormedUTF8Sequence(
  buffer: UInt32, validBytes: UInt8) -> UInt8 {
  // This function is '@inline(never)' because it is used only in the error
  // handling path.

  // Drop the EOF flag, it is irrelevant here.  What remains is the unary
  // count of bytes available in the buffer.
  var remaining = validBytes & 0b0000_1111

  _sanityCheck(remaining != 0,
    "input buffer should not be empty")
  _sanityCheck(!UTF8._isValidUTF8(buffer, validBytes: remaining),
    "input sequence should be ill-formed UTF-8")

  // Unicode 6.3.0, D93b:
  //
  //     Maximal subpart of an ill-formed subsequence: The longest code unit
  //     subsequence starting at an unconvertible offset that is either:
  //     a. the initial subsequence of a well-formed code unit sequence, or
  //     b. a subsequence of length one.
  //
  // The case analysis below follows Unicode 6.3.0, Table 3-7. Well-Formed
  // UTF-8 Byte Sequences.

  let cu0 = UInt8(buffer & 0xff)
  remaining >>= 1

  // A valid 2-byte lead: since the whole sequence is known to be invalid,
  // the maximal subpart has to end right after the first byte.
  if cu0 >= 0xc2 && cu0 <= 0xdf {
    return 1
  }

  // Nothing follows the first byte.
  if remaining == 0 {
    return 1
  }

  let cu1 = UInt8((buffer >> 8) & 0xff)
  remaining >>= 1

  // For each possible lead byte: the permitted range of the second byte, and
  // whether the lead starts a 4-byte sequence (in which case a third byte
  // may also belong to the maximal subpart).
  let lowest: UInt8
  let highest: UInt8
  let startsFourByteSequence: Bool
  switch cu0 {
  case 0xe0:
    lowest = 0xa0
    highest = 0xbf
    startsFourByteSequence = false
  case 0xe1...0xec, 0xee...0xef:
    lowest = 0x80
    highest = 0xbf
    startsFourByteSequence = false
  case 0xed:
    lowest = 0x80
    highest = 0x9f
    startsFourByteSequence = false
  case 0xf0:
    lowest = 0x90
    highest = 0xbf
    startsFourByteSequence = true
  case 0xf1...0xf3:
    lowest = 0x80
    highest = 0xbf
    startsFourByteSequence = true
  case 0xf4:
    lowest = 0x80
    highest = 0x8f
    startsFourByteSequence = true
  default:
    _sanityCheck((cu0 >= 0x80 && cu0 <= 0xc1) || cu0 >= 0xf5,
      "case analysis above should have handled all valid first bytes")

    // There are no well-formed sequences that start with these bytes.
    // Maximal subpart is defined to have length 1 in these cases.
    return 1
  }

  if cu1 < lowest || cu1 > highest {
    return 1
  }

  // For a 3-byte lead the third byte must be the offender; for a 4-byte lead
  // with nothing more in the buffer the subpart ends here as well.
  if !startsFourByteSequence || remaining == 0 {
    return 2
  }

  let cu2 = UInt8((buffer >> 16) & 0xff)
  return (cu2 >= 0x80 && cu2 <= 0xbf) ? 3 : 2
}
| |
/// Start or continue decoding a UTF sequence.
///
/// To decode a code unit sequence completely, call this function repeatedly
/// until it returns `UnicodeDecodingResult.EmptyInput`.  Observing that the
/// generator is exhausted is not sufficient: the decoder keeps a lookahead
/// buffer that is pre-filled with data from the input generator.
///
/// Because of that buffering, it is impossible to find the corresponding
/// position in the generator for a given returned `UnicodeScalar` or an
/// error.
///
/// - parameter next: A *generator* of code units to be decoded.
public mutating func decode<
  G : GeneratorType where G.Element == CodeUnit
>(inout next: G) -> UnicodeDecodingResult {
  let eofMask: UInt8 = 0b1111_0000
  let validBytesMask: UInt8 = 0b0000_1111

  // Unless EOF has already been seen, top the lookahead buffer up to 4 bytes
  // from the input generator.
  if _lookaheadFlags & eofMask == 0 {
    refill:
    while _lookaheadFlags != validBytesMask {
      guard let codeUnit = next.next() else {
        // The generator is exhausted.  Record EOF, choosing the EOF bit from
        // the number of valid bytes currently held.
        switch _lookaheadFlags & validBytesMask {
        case 0b1111:
          _sanityCheckFailure("should have not entered buffer refill loop")
        case 0b0111:
          _lookaheadFlags |= 0b0100_0000
        case 0b0011:
          _lookaheadFlags |= 0b0010_0000
        case 0b0001:
          _lookaheadFlags |= 0b0001_0000
        case 0b0000:
          _lookaheadFlags |= 0b1000_0000
          return .EmptyInput
        default:
          _sanityCheckFailure("bad value in _lookaheadFlags")
        }
        break refill
      }
      _decodeLookahead = (_decodeLookahead << 8) | UInt32(codeUnit)
      _lookaheadFlags = (_lookaheadFlags << 1) | 1
    }
  }

  // Nothing left in the buffer.
  if _slowPath(_lookaheadFlags & validBytesMask == 0) {
    return .EmptyInput
  }

  let eofBits = _lookaheadFlags & eofMask
  if _slowPath(eofBits != 0) {
    // Reached EOF.  Restore the invariant that the first unread byte is
    // always at MSB; the position of the EOF bit says by how many bytes the
    // buffer has to be shifted.
    switch eofBits {
    case 0b1000_0000:
      break
    case 0b0100_0000:
      _decodeLookahead <<= 8
    case 0b0010_0000:
      _decodeLookahead <<= 16
    case 0b0001_0000:
      _decodeLookahead <<= 24
    default:
      _sanityCheckFailure("bad value in _lookaheadFlags")
    }
    _lookaheadFlags = (_lookaheadFlags & validBytesMask) | 0b1000_0000
  }

  // The first byte to read is located at MSB of `_decodeLookahead`.  Swap
  // the bytes so that code units can be read starting from LSB.
  let buffer = _decodeLookahead.byteSwapped

  if _slowPath(!UTF8._isValidUTF8(buffer, validBytes: _lookaheadFlags)) {
    // The code unit sequence is ill-formed.  According to Unicode
    // recommendation, the maximal subpart of the ill-formed sequence is
    // consumed and reported as a single error.
    let subpartLength = UTF8._findMaximalSubpartOfIllFormedUTF8Sequence(
      buffer, validBytes: _lookaheadFlags)
    _lookaheadFlags >>= subpartLength
    return .Error
  }

  // From here on `buffer` is known to start with a well-formed code unit
  // sequence.
  //
  // Consuming bytes only requires updating `_lookaheadFlags` (one right
  // shift per byte).  The stored buffer in `_decodeLookahead` is shifted at
  // the beginning of the next decoding cycle.
  let cu0 = UInt8(buffer & 0xff)
  if cu0 < 0x80 {
    // 1-byte sequences.
    _lookaheadFlags >>= 1
    return .Result(UnicodeScalar(UInt32(cu0)))
  }

  // Accumulate 6 payload bits per continuation byte; the marker bits of the
  // first byte are masked off when the final value is produced.
  var scalarValue = (UInt32(cu0) << 6) | ((buffer >> 8) & 0x3f)
  if cu0 < 0xe0 {
    // 2-byte sequences.
    _lookaheadFlags >>= 2
    return .Result(UnicodeScalar(scalarValue & 0x000007ff)) // 11 bits
  }

  scalarValue = (scalarValue << 6) | ((buffer >> 16) & 0x3f)
  if cu0 < 0xf0 {
    // 3-byte sequences.
    _lookaheadFlags >>= 3
    return .Result(UnicodeScalar(scalarValue & 0x0000ffff)) // 16 bits
  }

  // 4-byte sequences.
  scalarValue = (scalarValue << 6) | ((buffer >> 24) & 0x3f)
  _lookaheadFlags >>= 4
  return .Result(UnicodeScalar(scalarValue & 0x001fffff)) // 21 bits
}
| |
/// Encode a `UnicodeScalar` as a series of `CodeUnit`s by
/// calling `output` on each `CodeUnit`.
///
/// The code units are passed to `output` in sequence order, first byte
/// first.
public static func encode(
  input: UnicodeScalar,
  output put: (CodeUnit) -> Void
) {
  let value = UInt32(input)

  // 1-byte sequence: 0xxxxxxx
  if value < 0x80 {
    put(UInt8(value))
    return
  }

  // 2-byte sequence: 110xxxxx 10xxxxxx
  if value < 0x800 {
    put(UInt8(value >> 6) | 0xC0)
    put(UInt8(value & 0x3F) | 0x80)
    return
  }

  // 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
  if value < 0x1_0000 {
    put(UInt8(value >> 12) | 0xE0)
    put(UInt8((value >> 6) & 0x3F) | 0x80)
    put(UInt8(value & 0x3F) | 0x80)
    return
  }

  // 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  put(UInt8((value >> 18) | 0xF0))
  put(UInt8((value >> 12) & 0x3F) | 0x80)
  put(UInt8((value >> 6) & 0x3F) | 0x80)
  put(UInt8(value & 0x3F) | 0x80)
}
| |
/// Return `true` if `byte` is a continuation byte, i.e. its two most
/// significant bits are `10` (`0b10xxxxxx`).
@warn_unused_result
public static func isContinuation(byte: CodeUnit) -> Bool {
  let topTwoBits = byte & 0xC0
  return topTwoBits == 0x80
}
| |
| var _value = UInt8() |
| } |
| |
| /// A codec for [UTF-16](http://www.unicode.org/glossary/#UTF_16). |
| public struct UTF16 : UnicodeCodecType { |
| /// A type that can hold [code unit](http://www.unicode.org/glossary/#code_unit) values for this |
| /// encoding. |
| public typealias CodeUnit = UInt16 |
| |
| public init() {} |
| |
| /// A lookahead buffer for one UTF-16 code unit. |
| var _decodeLookahead: UInt32 = 0 |
| |
| /// Flags with layout: `0b0000_00xy`. |
| /// |
| /// `y` is the EOF flag. |
| /// |
| /// `x` is set when `_decodeLookahead` contains a code unit. |
| var _lookaheadFlags: UInt8 = 0 |
| |
/// Start or continue decoding a UTF sequence.
///
/// To decode a code unit sequence completely, call this function repeatedly
/// until it returns `UnicodeDecodingResult.EmptyInput`.  Observing that the
/// generator is exhausted is not sufficient: the decoder can hold one code
/// unit in its lookahead buffer.
///
/// Because of that buffering, it is impossible to find the corresponding
/// position in the generator for a given returned `UnicodeScalar` or an
/// error.
///
/// - parameter input: A *generator* of code units to be decoded.
public mutating func decode<
  G : GeneratorType where G.Element == CodeUnit
>(inout input: G) -> UnicodeDecodingResult {
  let eofFlag: UInt8 = 0b01
  let lookaheadFlag: UInt8 = 0b10

  if _lookaheadFlags & eofFlag != 0 {
    return .EmptyInput
  }

  // Note: maximal subpart of ill-formed sequence for UTF-16 can only have
  // length 1.  Length 0 does not make sense.  Neither does length 2 -- in
  // that case the sequence is valid.

  let unit0: UInt32
  if _fastPath(_lookaheadFlags & lookaheadFlag == 0) {
    guard let first = input.next() else {
      // Set EOF flag.
      _lookaheadFlags |= eofFlag
      return .EmptyInput
    }
    unit0 = UInt32(first)
  } else {
    // Take the code unit out of the lookahead buffer and clear the
    // corresponding flag.
    unit0 = _decodeLookahead
    _lookaheadFlags &= eofFlag
  }

  // A well-formed pair of surrogates looks like this:
  //     [1101 10ww wwxx xxxx] [1101 11xx xxxx xxxx]

  if _fastPath(unit0 & 0xf800 != 0xd800) {
    // Neither high-surrogate, nor low-surrogate -- sequence of 1 code unit,
    // decoding is trivial.
    return .Result(UnicodeScalar(unit0))
  }

  if _slowPath(unit0 & 0xfc00 == 0xdc00) {
    // `unit0` is a low-surrogate.  We have an ill-formed sequence.
    return .Error
  }

  // At this point we know that `unit0` is a high-surrogate.

  guard let second = input.next() else {
    // EOF reached.  Set EOF flag.
    _lookaheadFlags |= eofFlag

    // We have seen a high-surrogate and EOF, so we have an ill-formed
    // sequence.
    return .Error
  }
  let unit1 = UInt32(second)

  if _fastPath(unit1 & 0xfc00 == 0xdc00) {
    // `unit1` is a low-surrogate.  We have a well-formed surrogate pair.
    let highBits = (unit0 & 0x03ff) << 10
    let lowBits = unit1 & 0x03ff
    return .Result(UnicodeScalar(0x10000 + (highBits | lowBits)))
  }

  // Otherwise, we have an ill-formed sequence.  These are the possible
  // cases:
  //
  // * `unit1` is a high-surrogate, so we have a pair of two high-surrogates.
  //
  // * `unit1` is not a surrogate.  We have an ill-formed sequence:
  //   high-surrogate followed by a non-surrogate.
  //
  // Either way only `unit0` is reported as the error; `unit1` is saved in
  // the lookahead buffer for the next call.
  _decodeLookahead = unit1
  _lookaheadFlags |= lookaheadFlag
  return .Error
}
| |
/// Try to decode one Unicode scalar, and return the decoding result together
/// with the number of code units it spanned in the input: the UTF-16 width
/// of a decoded scalar, 0 for empty input, 1 for an error.  This function
/// may consume more code units than required for this scalar.
mutating func _decodeOne<
  G : GeneratorType where G.Element == CodeUnit
>(inout input: G) -> (UnicodeDecodingResult, Int) {
  let result = decode(&input)

  let spannedCodeUnits: Int
  switch result {
  case .Result(let scalar):
    spannedCodeUnits = UTF16.width(scalar)
  case .EmptyInput:
    spannedCodeUnits = 0
  case .Error:
    spannedCodeUnits = 1
  }
  return (result, spannedCodeUnits)
}
| |
/// Encode a `UnicodeScalar` as a series of `CodeUnit`s by
/// calling `output` on each `CodeUnit`.
///
/// Scalars up to U+FFFF produce one code unit; all others produce a
/// surrogate pair, high-surrogate first.
public static func encode(
  input: UnicodeScalar,
  output put: (CodeUnit) -> Void
) {
  let scalarValue = UInt32(input)

  guard scalarValue > UInt32(UInt16.max) else {
    put(UInt16(scalarValue))
    return
  }

  // Split the 20-bit offset above U+10000 into two 10-bit halves.
  let offset = scalarValue - 0x1_0000
  put(UInt16(0xd800 + (offset >> 10)))
  put(UInt16(0xdc00 + (offset & 0x3ff)))
}
| |
| var _value = UInt16() |
| } |
| |
/// A codec for [UTF-32](http://www.unicode.org/glossary/#UTF_32).
public struct UTF32 : UnicodeCodecType {
  /// A type that can hold
  /// [code unit](http://www.unicode.org/glossary/#code_unit) values for this
  /// encoding.
  public typealias CodeUnit = UInt32

  public init() {}

  /// Start or continue decoding a UTF sequence.
  ///
  /// To decode a code unit sequence completely, call this function
  /// repeatedly until it returns `UnicodeDecodingResult.EmptyInput`.
  ///
  /// - parameter input: A *generator* of code units to be decoded.
  public mutating func decode<
    G : GeneratorType where G.Element == CodeUnit
  >(inout input: G) -> UnicodeDecodingResult {
    return UTF32._decode(&input)
  }

  /// Decode the next code unit of `input`.
  ///
  /// Returns `.EmptyInput` when `input` is exhausted, `.Error` when the code
  /// unit is a surrogate or is greater than 0x10ffff, and the corresponding
  /// scalar otherwise.
  static func _decode<
    G : GeneratorType where G.Element == CodeUnit
  >(inout input: G) -> UnicodeDecodingResult {
    guard let unit = input.next() else {
      return .EmptyInput
    }

    // Surrogates (0xd800 -- 0xdfff) all share the top bits `1101_1`.
    let isSurrogate = (unit >> 11) == 0b1101_1
    if _slowPath(isSurrogate || unit > 0x10ffff) {
      return .Error
    }
    return .Result(UnicodeScalar(unit))
  }

  /// Encode a `UnicodeScalar` as a series of `CodeUnit`s by
  /// calling `output` on each `CodeUnit`.
  public static func encode(
    input: UnicodeScalar,
    output put: (CodeUnit) -> Void
  ) {
    let codeUnit = UInt32(input)
    put(codeUnit)
  }
}
| |
/// Translate `input`, in the given `InputEncoding`, into `output`, in
/// the given `OutputEncoding`.
///
/// - parameter inputEncoding: The encoding used to decode `input`.
/// - parameter outputEncoding: The encoding used to produce the code units
///   passed to `output`.
/// - parameter input: A *generator* of code units to be transcoded.
/// - parameter output: Called once for each produced code unit, in order.
/// - parameter stopOnError: Causes encoding to stop when an encoding
///   error is detected in `input`, if `true`.  Otherwise, U+FFFD
///   replacement characters are inserted for each detected error.
/// - returns: `true` if an encoding error was detected in `input`, `false`
///   otherwise.
public func transcode<
  Input : GeneratorType,
  InputEncoding : UnicodeCodecType,
  OutputEncoding : UnicodeCodecType
  where InputEncoding.CodeUnit == Input.Element>(
  inputEncoding: InputEncoding.Type, _ outputEncoding: OutputEncoding.Type,
  _ input: Input, _ output: (OutputEncoding.CodeUnit) -> Void,
  stopOnError: Bool
) -> Bool {

  var input = input

  // NB.  It is not possible to optimize this routine to a memcpy if
  // InputEncoding == OutputEncoding.  The reason is that memcpy will not
  // substitute U+FFFD replacement characters for ill-formed sequences.

  var inputDecoder = inputEncoding.init()
  var hadError = false

  // Decode until the decoder itself reports empty input; the decoder may
  // buffer code units, so the generator alone cannot tell us when to stop.
  loop:
  while true {
    switch inputDecoder.decode(&input) {
    case .Result(let us):
      OutputEncoding.encode(us, output: output)
    case .EmptyInput:
      break loop
    case .Error:
      hadError = true
      if stopOnError {
        break loop
      }
      OutputEncoding.encode("\u{fffd}", output: output)
    }
  }
  return hadError
}
| |
/// Transcode UTF-16 to UTF-8, replacing ill-formed sequences with U+FFFD.
///
/// Code units are read from `input` starting at `startIndex` until either
/// the end of `input` is reached or the next scalar would not fit into one
/// `_StringCore.UTF8Chunk`.  The UTF-8 bytes are packed into the chunk
/// starting at its least significant byte; unused high bytes are set to
/// 0xff.
///
/// Returns the index of the first unhandled code unit and the UTF-8 data
/// that was encoded.
@warn_unused_result
internal func _transcodeSomeUTF16AsUTF8<
  Input : CollectionType
  where Input.Generator.Element == UInt16>(
  input: Input, _ startIndex: Input.Index
) -> (Input.Index, _StringCore.UTF8Chunk) {
  typealias UTF8Chunk = _StringCore.UTF8Chunk

  // UTF-8 encoding of U+FFFD (EF BF BD), first byte in the least significant
  // position.
  let replacementCharacterUTF8: UInt = 0xbdbfef

  let endIndex = input.endIndex
  let utf8Max = sizeof(UTF8Chunk.self)
  var result: UTF8Chunk = 0
  var utf8Count = 0
  var nextIndex = startIndex
  while nextIndex != endIndex && utf8Count != utf8Max {
    let u = UInt(input[nextIndex])
    let shift = UTF8Chunk(utf8Count * 8)
    var utf16Length: Input.Index.Distance = 1

    if _fastPath(u <= 0x7f) {
      // ASCII: one UTF-8 byte, identical to the code unit.
      result |= UTF8Chunk(u) << shift
      utf8Count += 1
    } else {
      // `r` holds the UTF-8 bytes of the scalar, first byte in the least
      // significant position.
      let scalarUtf8Length: Int
      var r: UInt
      if _fastPath((u >> 11) != 0b1101_1) {
        // Neither high-surrogate, nor low-surrogate -- well-formed sequence
        // of 1 code unit, decoding is trivial.
        if u < 0x800 {
          r = 0b10__00_0000__110__0_0000
          r |= u >> 6
          r |= (u & 0b11_1111) << 8
          scalarUtf8Length = 2
        }
        else {
          r = 0b10__00_0000__10__00_0000__1110__0000
          r |= u >> 12
          r |= ((u >> 6) & 0b11_1111) << 8
          r |= (u & 0b11_1111) << 16
          scalarUtf8Length = 3
        }
      } else if _slowPath((u >> 10) == 0b1101_11) {
        // `u` is a low-surrogate.  We have an ill-formed sequence.
        // Replace it with U+FFFD.
        r = replacementCharacterUTF8
        scalarUtf8Length = 3
      } else {
        // `u` is a high-surrogate; look at the code unit that follows it.
        let unit0 = u
        let unit1Index = nextIndex.advancedBy(1)
        if _slowPath(unit1Index == endIndex) {
          // We have seen a high-surrogate and EOF, so we have an ill-formed
          // sequence.  Replace it with U+FFFD.
          r = replacementCharacterUTF8
          scalarUtf8Length = 3
        } else {
          let unit1 = UInt(input[unit1Index])
          if _fastPath((unit1 >> 10) == 0b1101_11) {
            // `unit1` is a low-surrogate.  We have a well-formed surrogate
            // pair.
            let v = 0x10000 + (((unit0 & 0x03ff) << 10) | (unit1 & 0x03ff))

            r = 0b10__00_0000__10__00_0000__10__00_0000__1111_0__000
            r |= v >> 18
            r |= ((v >> 12) & 0b11_1111) << 8
            r |= ((v >> 6) & 0b11_1111) << 16
            r |= (v & 0b11_1111) << 24
            scalarUtf8Length = 4
            utf16Length = 2
          } else {
            // Otherwise, we have an ill-formed sequence.  Replace the
            // high-surrogate with U+FFFD; `unit1` is handled by the next
            // iteration.
            r = replacementCharacterUTF8
            scalarUtf8Length = 3
          }
        }
      }
      // Don't overrun the buffer
      if utf8Count + scalarUtf8Length > utf8Max {
        break
      }
      result |= numericCast(r) << shift
      utf8Count += scalarUtf8Length
    }
    nextIndex = nextIndex.advancedBy(utf16Length)
  }
  // FIXME: Annoying check, courtesy of <rdar://problem/16740169>
  if utf8Count < utf8Max {
    // Fill the unused high bytes of the chunk with 0xff.
    result |= ~0 << numericCast(utf8Count * 8)
  }
  return (nextIndex, result)
}
| |
/// Instances of conforming types are used in internal `String`
/// representation.
///
/// A conforming type provides conversions to and from UTF-16 code units.
public // @testable
protocol _StringElementType {
  /// Return the UTF-16 code unit corresponding to the given element.
  @warn_unused_result
  static func _toUTF16CodeUnit(_: Self) -> UTF16.CodeUnit

  /// Return the element corresponding to the UTF-16 code unit `utf16`.
  @warn_unused_result
  static func _fromUTF16CodeUnit(utf16: UTF16.CodeUnit) -> Self
}
| |
extension UTF16.CodeUnit : _StringElementType {
  /// Return `codeUnit` unchanged: it already is a UTF-16 code unit.
  public // @testable
  static func _toUTF16CodeUnit(codeUnit: UTF16.CodeUnit) -> UTF16.CodeUnit {
    return codeUnit
  }

  /// Return `codeUnit` unchanged: it already is a UTF-16 code unit.
  public // @testable
  static func _fromUTF16CodeUnit(
    codeUnit: UTF16.CodeUnit
  ) -> UTF16.CodeUnit {
    return codeUnit
  }
}
| |
extension UTF8.CodeUnit : _StringElementType {
  /// Widen the UTF-8 code unit `codeUnit` to a UTF-16 code unit.
  ///
  /// `codeUnit` is expected to be ASCII (at most 0x7f).
  public // @testable
  static func _toUTF16CodeUnit(codeUnit: UTF8.CodeUnit) -> UTF16.CodeUnit {
    _sanityCheck(codeUnit <= 0x7f, "should only be doing this with ASCII")
    let widened = UTF16.CodeUnit(codeUnit)
    return widened
  }

  /// Narrow the UTF-16 code unit `codeUnit` to a UTF-8 code unit.
  ///
  /// `codeUnit` is expected to be ASCII (at most 0x7f).
  public // @testable
  static func _fromUTF16CodeUnit(
    codeUnit: UTF16.CodeUnit
  ) -> UTF8.CodeUnit {
    _sanityCheck(codeUnit <= 0x7f, "should only be doing this with ASCII")
    let narrowed = UTF8.CodeUnit(codeUnit)
    return narrowed
  }
}
| |
extension UTF16 {
  /// Return the number of code units required to encode `x`: 1 for scalars
  /// up to U+FFFF, 2 otherwise.
  @warn_unused_result
  public static func width(x: UnicodeScalar) -> Int {
    if x.value > 0xFFFF {
      return 2
    }
    return 1
  }

  /// Return the high surrogate code unit of a
  /// [surrogate pair](http://www.unicode.org/glossary/#surrogate_pair)
  /// representing `x`.
  ///
  /// - Requires: `width(x) == 2`.
  @warn_unused_result
  public static func leadSurrogate(x: UnicodeScalar) -> UTF16.CodeUnit {
    _precondition(width(x) == 2)
    let offset: UInt32 = x.value - 0x1_0000
    return UTF16.CodeUnit(offset >> 10) + 0xD800
  }

  /// Return the low surrogate code unit of a
  /// [surrogate pair](http://www.unicode.org/glossary/#surrogate_pair)
  /// representing `x`.
  ///
  /// - Requires: `width(x) == 2`.
  @warn_unused_result
  public static func trailSurrogate(x: UnicodeScalar) -> UTF16.CodeUnit {
    _precondition(width(x) == 2)
    let offset: UInt32 = x.value - 0x1_0000
    // Keep the low 10 bits of the offset.
    return UTF16.CodeUnit(offset & 0x3FF) + 0xDC00
  }

  /// Return `true` if `x` lies in the range 0xD800 through 0xDBFF.
  @warn_unused_result
  public static func isLeadSurrogate(x: CodeUnit) -> Bool {
    return x >= 0xD800 && x <= 0xDBFF
  }

  /// Return `true` if `x` lies in the range 0xDC00 through 0xDFFF.
  @warn_unused_result
  public static func isTrailSurrogate(x: CodeUnit) -> Bool {
    return x >= 0xDC00 && x <= 0xDFFF
  }

  /// Copy `count` elements from `source` to `destination`.
  ///
  /// When `T` and `U` have the same stride the elements are copied with a
  /// single `_memcpy`; otherwise each element is converted through its
  /// UTF-16 code unit representation.
  public // @testable
  static func _copy<T : _StringElementType, U : _StringElementType>(
    source: UnsafeMutablePointer<T>,
    destination: UnsafeMutablePointer<U>, count: Int
  ) {
    guard strideof(T.self) != strideof(U.self) else {
      _memcpy(
        dest: UnsafeMutablePointer(destination),
        src: UnsafeMutablePointer(source),
        size: UInt(count) * UInt(strideof(U.self)))
      return
    }

    for offset in 0..<count {
      let codeUnit = T._toUTF16CodeUnit(source[offset])
      destination[offset] = U._fromUTF16CodeUnit(codeUnit)
    }
  }

  /// Returns the number of UTF-16 code units required for the given code unit
  /// sequence when transcoded to UTF-16, and a bit describing if the sequence
  /// was found to contain only ASCII characters.
  ///
  /// If `repairIllFormedSequences` is `true`, the function always succeeds,
  /// counting each ill-formed sequence as one U+FFFD (which also makes the
  /// result non-ASCII).  If it is `false`, `nil` is returned if an ill-formed
  /// code unit sequence is found in `input`.
  @warn_unused_result
  public static func measure<
    Encoding : UnicodeCodecType, Input : GeneratorType
    where Encoding.CodeUnit == Input.Element
  >(
    _: Encoding.Type, input: Input, repairIllFormedSequences: Bool
  ) -> (Int, Bool)? {
    var input = input
    var utf16Count = 0
    var isAscii = true

    var decoder = Encoding()
    while true {
      switch decoder.decode(&input) {
      case .EmptyInput:
        return (utf16Count, isAscii)
      case .Error:
        guard repairIllFormedSequences else {
          return nil
        }
        isAscii = false
        utf16Count += width(UnicodeScalar(0xfffd))
      case .Result(let scalar):
        if scalar.value > 0x7f {
          isAscii = false
        }
        utf16Count += width(scalar)
      }
    }
  }
}
| |