blob: e19b28551f40a7099b29d584bb9b2e46b71b7ad2 [file] [log] [blame]
//===--- UnicodeDecoders.swift --------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//
// RUN: %target-build-swift %s -g -Onone -o %t
// RUN: %target-codesign %t
// RUN: %target-run %t
// REQUIRES: executable_test
// Benchmarking: use the following script with your swift-4-enabled swiftc.
// The BASELINE timings come from the existing standard library Codecs
/*
for x in BASELINE FORWARD REVERSE SEQUENCE COLLECTION REVERSE_COLLECTION ; do
echo $x
swiftc -DBENCHMARK -D$x -O -swift-version 4 UnicodeDecoders.swift -o /tmp/u3-$x
for i in {1..3}; do
(time nice -19 /tmp/u3-$x) 2>&1 | grep user
done
done
*/
//===----------------------------------------------------------------------===//
extension Unicode.Scalar {
// Hack providing an efficient API that is available to the standard library
@usableFromInline
@inline(__always)
init(_unchecked x: UInt32) { self = unsafeBitCast(x, to: Unicode.Scalar.self) }
}
//===----------------------------------------------------------------------===//
extension Unicode {
@_fixed_layout
public // @testable
struct _ParsingIterator<
CodeUnitIterator : IteratorProtocol,
Parser: Unicode.Parser
> where Parser.Encoding.CodeUnit == CodeUnitIterator.Element {
@inline(__always)
@inlinable
public init(codeUnits: CodeUnitIterator, parser: Parser) {
self.codeUnits = codeUnits
self.parser = parser
}
public var codeUnits: CodeUnitIterator
public var parser: Parser
}
}
extension Unicode._ParsingIterator : IteratorProtocol, Sequence {
@inline(__always)
@inlinable
public mutating func next() -> Parser.Encoding.EncodedScalar? {
switch parser.parseScalar(from: &codeUnits) {
case let .valid(scalarContent): return scalarContent
case .error: return Parser.Encoding.encodedReplacementCharacter
case .emptyInput: return nil
}
}
}
extension _UnicodeParser {
@inlinable // FIXME(sil-serialize-all)
@inline(__always)
@discardableResult
internal static func _parse<I: IteratorProtocol>(
_ input: inout I,
repairingIllFormedSequences makeRepairs: Bool = true,
into output: (Encoding.EncodedScalar)->Void
) -> Int
where I.Element == Encoding.CodeUnit
{
var errorCount = 0
var d = Self()
while true {
switch d.parseScalar(from: &input) {
case let .valid(scalarContent):
output(scalarContent)
case .error:
if _slowPath(!makeRepairs) { return 1 }
errorCount += 1
output(Encoding.encodedReplacementCharacter)
case .emptyInput:
return errorCount
}
}
}
@inlinable // FIXME(sil-serialize-all)
@inline(__always)
@discardableResult
public static func _decode<I: IteratorProtocol>(
_ input: inout I,
repairingIllFormedSequences makeRepairs: Bool,
into output: (Unicode.Scalar)->Void
) -> Int
where I.Element == Encoding.CodeUnit
{
return _parse(&input, repairingIllFormedSequences: makeRepairs) {
output(Encoding.decode($0))
}
}
}
extension Unicode {
struct DefaultScalarView<
CodeUnits: BidirectionalCollection,
Encoding: Unicode.Encoding
> where CodeUnits.Element == Encoding.CodeUnit {
var codeUnits: CodeUnits
init(
_ codeUnits: CodeUnits,
fromEncoding _: Encoding.Type = Encoding.self) {
self.codeUnits = codeUnits
}
}
}
extension Unicode.DefaultScalarView : Sequence {
struct Iterator {
var parsing: Unicode._ParsingIterator<
CodeUnits.Iterator, Encoding.ForwardParser>
}
func makeIterator() -> Iterator {
return Iterator(
parsing: Unicode._ParsingIterator(
codeUnits: codeUnits.makeIterator(),
parser: Encoding.ForwardParser()
))
}
}
extension Unicode.DefaultScalarView.Iterator : IteratorProtocol, Sequence {
mutating func next() -> Unicode.Scalar? {
return parsing.next().map { Encoding.decode($0) }
}
}
extension Unicode.DefaultScalarView {
struct Index {
var codeUnitIndex: CodeUnits.Index
var scalar: Unicode.Scalar
var stride: UInt8
}
}
extension Unicode.DefaultScalarView.Index : Comparable {
@inline(__always)
public static func < (
lhs: Unicode.DefaultScalarView<CodeUnits,Encoding>.Index,
rhs: Unicode.DefaultScalarView<CodeUnits,Encoding>.Index
) -> Bool {
return lhs.codeUnitIndex < rhs.codeUnitIndex
}
@inline(__always)
public static func == (
lhs: Unicode.DefaultScalarView<CodeUnits,Encoding>.Index,
rhs: Unicode.DefaultScalarView<CodeUnits,Encoding>.Index
) -> Bool {
return lhs.codeUnitIndex == rhs.codeUnitIndex
}
}
extension Unicode.DefaultScalarView : Collection {
public var startIndex: Index {
@inline(__always)
get {
return index(
after: Index(
codeUnitIndex: codeUnits.startIndex,
scalar: Unicode.Scalar(_unchecked: 0),
stride: 0)
)
}
}
public var endIndex: Index {
@inline(__always)
get {
return Index(
codeUnitIndex: codeUnits.endIndex,
scalar: Unicode.Scalar(_unchecked: 0),
stride: 0)
}
}
public subscript(i: Index) -> Unicode.Scalar {
@inline(__always) get { return i.scalar }
}
@inline(__always)
public func index(after i: Index) -> Index {
let nextPosition = codeUnits.index(
i.codeUnitIndex, offsetBy: numericCast(i.stride))
var i = IndexingIterator(
_elements: codeUnits, _position: nextPosition
)
var d = Encoding.ForwardParser()
switch d.parseScalar(from: &i) {
case .valid(let scalarContent):
return Index(
codeUnitIndex: nextPosition,
scalar: Encoding.decode(scalarContent),
stride: numericCast(scalarContent.count))
case .error(let stride):
return Index(
codeUnitIndex: nextPosition,
scalar: Unicode.Scalar(_unchecked: 0xfffd),
stride: numericCast(stride))
case .emptyInput:
return endIndex
}
}
}
extension Unicode.DefaultScalarView : BidirectionalCollection {
@inline(__always)
public func index(before i: Index) -> Index {
var parser = Encoding.ReverseParser()
var more = codeUnits[..<i.codeUnitIndex].reversed().makeIterator()
switch parser.parseScalar(from: &more) {
case .valid(let scalarContent):
let d: Int = -scalarContent.count
return Index(
codeUnitIndex: codeUnits.index(i.codeUnitIndex, offsetBy: d),
scalar: Encoding.decode(scalarContent),
stride: numericCast(scalarContent.count))
case .error(let stride):
let d: Int = -stride
return Index(
codeUnitIndex: codeUnits.index(i.codeUnitIndex, offsetBy: d) ,
scalar: Unicode.Scalar(_unchecked: 0xfffd),
stride: numericCast(stride))
case .emptyInput: fatalError("index out of bounds.")
}
}
}
#if !BENCHMARK
//===--- testing ----------------------------------------------------------===//
import StdlibUnittest
import SwiftPrivate
func utf32<S : StringProtocol>(_ s: S) -> [UInt32] {
return s.unicodeScalars.map { $0.value }
}
func checkStringProtocol<S : StringProtocol, Encoding: Unicode.Encoding>(
_ s: S,
_ utfStr: [Encoding.CodeUnit],
encodedAs: Encoding.Type,
expectingUTF32 expected: [UInt32]
) {
expectEqualSequence(
expected, utf32(S(decoding: utfStr, as: Encoding.self)),
"\(S.self) init(decoding:as:)")
if !utfStr.contains(0) {
if Encoding.self == Unicode.UTF8.self {
var ntbs = utfStr.map { CChar(truncatingIfNeeded: $0) }
ntbs.append(0)
expectEqualSequence(
expected, utf32(S(cString: ntbs)), "\(S.self) init(cString:)")
}
var ntbs = Array(utfStr); ntbs.append(0)
expectEqualSequence(
expected, utf32(S(decodingCString: ntbs, as: Encoding.self)),
"\(S.self) init(cString:encoding:)"
)
s.withCString {
expectEqual(s, S(cString: $0), "\(S.self) withCString(_:)")
}
s.withCString(encodedAs: Encoding.self) {
expectEqual(s, S(decodingCString: $0, as: Encoding.self),
"\(S.self) withCString(encoding:_:)")
}
}
}
func checkDecodeUTF<Codec : UnicodeCodec>(
_ codec: Codec.Type, _ expectedHead: [UInt32],
_ expectedRepairedTail: [UInt32], _ utfStr: [Codec.CodeUnit]
) -> AssertionResult {
var decoded = [UInt32]()
var expected = expectedHead
func output(_ scalar: UInt32) {
decoded.append(scalar)
expectEqual(
Unicode.Scalar(scalar),
Codec.decode(Codec.encode(Unicode.Scalar(scalar)!)!))
}
func output1(_ scalar: Unicode.Scalar) {
decoded.append(scalar.value)
expectEqual(scalar, Codec.decode(Codec.encode(scalar)!))
}
var result = assertionSuccess()
func check<C: Collection>(_ expected: C, _ description: String)
where C.Element == UInt32
{
if !expected.elementsEqual(decoded) {
if result.description == "" { result = assertionFailure() }
result = result.withDescription(" [\(description)]\n")
.withDescription("expected: \(asHex(expectedHead))\n")
.withDescription("actual: \(asHex(decoded))")
}
decoded.removeAll(keepingCapacity: true)
}
//===--- Tests without repairs ------------------------------------------===//
do {
let iterator = utfStr.makeIterator()
_ = transcode(
iterator, from: codec, to: Unicode.UTF32.self,
stoppingOnError: true, into: output)
}
check(expected, "legacy, repairing: false")
do {
var iterator = utfStr.makeIterator()
let errorCount = Codec.ForwardParser._decode(
&iterator, repairingIllFormedSequences: false, into: output1)
expectEqual(expectedRepairedTail.isEmpty ? 0 : 1, errorCount)
}
check(expected, "forward, repairing: false")
do {
var iterator = utfStr.reversed().makeIterator()
let errorCount = Codec.ReverseParser._decode(
&iterator, repairingIllFormedSequences: false, into: output1)
if expectedRepairedTail.isEmpty {
expectEqual(0, errorCount)
check(expected.reversed(), "reverse, repairing: false")
}
else {
expectEqual(1, errorCount)
let x = (expected + expectedRepairedTail).reversed()
expectTrue(
x.starts(with: decoded),
"reverse, repairing: false\n\t\(Array(x)) does not start with \(decoded)")
decoded.removeAll(keepingCapacity: true)
}
}
//===--- Tests with repairs ------------------------------------------===//
expected += expectedRepairedTail
do {
let iterator = utfStr.makeIterator()
_ = transcode(iterator, from: codec, to: Unicode.UTF32.self,
stoppingOnError: false, into: output)
}
check(expected, "legacy, repairing: true")
do {
var iterator = utfStr.makeIterator()
let errorCount = Codec.ForwardParser._decode(
&iterator, repairingIllFormedSequences: true, into: output1)
if expectedRepairedTail.isEmpty { expectEqual(0, errorCount) }
else { expectNotEqual(0, errorCount) }
}
check(expected, "forward, repairing: true")
do {
var iterator = utfStr.reversed().makeIterator()
let errorCount = Codec.ReverseParser._decode(
&iterator, repairingIllFormedSequences: true, into: output1)
if expectedRepairedTail.isEmpty { expectEqual(0, errorCount) }
else { expectNotEqual(0, errorCount) }
}
check(expected.reversed(), "reverse, repairing: true")
//===--- String/Substring Construction and C-String interop -------------===//
do {
let s = String(decoding: utfStr, as: Codec.self)
checkStringProtocol(
s, utfStr, encodedAs: Codec.self, expectingUTF32: expected)
}
do {
let s0 = "\n" + String(decoding: utfStr, as: Codec.self) + "\n"
let s = s0.dropFirst().dropLast()
expectEqualSequence(expected, utf32(s), "Sliced Substring")
checkStringProtocol(
s0.dropFirst().dropLast(),
utfStr, encodedAs: Codec.self, expectingUTF32: expected)
}
//===--- Transcoded Scalars ---------------------------------------------===//
for x in decoded.lazy.map({ Unicode.Scalar($0)! }) {
expectEqualSequence(
Unicode.UTF8.encode(x)!,
Unicode.UTF8.transcode(
Codec.encode(x)!, from: Codec.self)!
)
expectEqualSequence(
Unicode.UTF16.encode(x)!,
Unicode.UTF16.transcode(
Codec.encode(x)!, from: Codec.self)!
)
expectEqualSequence(
Unicode.UTF32.encode(x)!,
Unicode.UTF32.transcode(
Codec.encode(x)!, from: Codec.self)!
)
}
//===--- Scalar View ----------------------------------------------------===//
let scalars = Unicode.DefaultScalarView(utfStr, fromEncoding: Codec.self)
expectEqualSequence(expected, scalars.map { $0.value })
expectEqualSequence(
expected.reversed(),
scalars.reversed().map { $0.value })
do {
var x = scalars.makeIterator()
var j = scalars.startIndex
while (j != scalars.endIndex) {
expectEqual(x.next()!, scalars[j])
j = scalars.index(after: j)
}
expectNil(x.next())
}
return result
}
func checkDecodeUTF8(
_ expectedHead: [UInt32],
_ expectedRepairedTail: [UInt32], _ utf8Str: [UInt8]
) -> AssertionResult {
return checkDecodeUTF(Unicode.UTF8.self, expectedHead, expectedRepairedTail, utf8Str)
}
func checkDecodeUTF16(
_ expectedHead: [UInt32],
_ expectedRepairedTail: [UInt32], _ utf16Str: [UInt16]
) -> AssertionResult {
return checkDecodeUTF(Unicode.UTF16.self, expectedHead, expectedRepairedTail,
utf16Str)
}
func checkDecodeUTF32(
_ expectedHead: [UInt32],
_ expectedRepairedTail: [UInt32], _ utf32Str: [UInt32]
) -> AssertionResult {
return checkDecodeUTF(Unicode.UTF32.self, expectedHead, expectedRepairedTail,
utf32Str)
}
func checkEncodeUTF8(_ expected: [UInt8],
_ scalars: [UInt32]) -> AssertionResult {
var encoded = [UInt8]()
let output: (UInt8) -> Void = { encoded.append($0) }
let iterator = scalars.makeIterator()
let hadError = transcode(
iterator,
from: Unicode.UTF32.self,
to: Unicode.UTF8.self,
stoppingOnError: true,
into: output)
expectFalse(hadError)
if expected != encoded {
return assertionFailure()
.withDescription("\n")
.withDescription("expected: \(asHex(expected))\n")
.withDescription("actual: \(asHex(encoded))")
}
return assertionSuccess()
}
//===----------------------------------------------------------------------===//
var UTF32Decoder = TestSuite("UTF32Decoder")
UTF32Decoder.test("Empty") {
expectTrue(checkDecodeUTF32([], [], []))
}
UTF32Decoder.test("SmokeTest") {
// U+0041 LATIN CAPITAL LETTER A
expectTrue(checkDecodeUTF32([ 0x0041 ], [], [ 0x0000_0041 ]))
// U+0041 LATIN CAPITAL LETTER A
// U+0042 LATIN CAPITAL LETTER B
expectTrue(checkDecodeUTF32(
[ 0x0041, 0x0042 ], [],
[ 0x0000_0041, 0x0000_0042 ]))
// U+0000 NULL
// U+0041 LATIN CAPITAL LETTER A
// U+0042 LATIN CAPITAL LETTER B
// U+0000 NULL
expectTrue(checkDecodeUTF32(
[ 0x0000, 0x0041, 0x0042, 0x0000 ], [],
[ 0x0000_0000, 0x0000_0041, 0x0000_0042, 0x0000_0000 ]))
// U+0283 LATIN SMALL LETTER ESH
expectTrue(checkDecodeUTF32([ 0x0283 ], [], [ 0x0000_0283 ]))
// U+03BA GREEK SMALL LETTER KAPPA
// U+1F79 GREEK SMALL LETTER OMICRON WITH OXIA
// U+03C3 GREEK SMALL LETTER SIGMA
// U+03BC GREEK SMALL LETTER MU
// U+03B5 GREEK SMALL LETTER EPSILON
expectTrue(checkDecodeUTF32(
[ 0x03ba, 0x1f79, 0x03c3, 0x03bc, 0x03b5 ], [],
[ 0x0000_03ba, 0x0000_1f79, 0x0000_03c3, 0x0000_03bc, 0x0000_03b5 ]))
// U+4F8B CJK UNIFIED IDEOGRAPH-4F8B
// U+6587 CJK UNIFIED IDEOGRAPH-6587
expectTrue(checkDecodeUTF32(
[ 0x4f8b, 0x6587 ], [],
[ 0x0000_4f8b, 0x0000_6587 ]))
// U+D55C HANGUL SYLLABLE HAN
// U+AE00 HANGUL SYLLABLE GEUL
expectTrue(checkDecodeUTF32(
[ 0xd55c, 0xae00 ], [],
[ 0x0000_d55c, 0x0000_ae00 ]))
// U+1112 HANGUL CHOSEONG HIEUH
// U+1161 HANGUL JUNGSEONG A
// U+11AB HANGUL JONGSEONG NIEUN
// U+1100 HANGUL CHOSEONG KIYEOK
// U+1173 HANGUL JUNGSEONG EU
// U+11AF HANGUL JONGSEONG RIEUL
expectTrue(checkDecodeUTF32(
[ 0x1112, 0x1161, 0x11ab, 0x1100, 0x1173, 0x11af ], [],
[ 0x0000_1112, 0x0000_1161, 0x0000_11ab, 0x0000_1100, 0x0000_1173,
0x0000_11af ]))
// U+D7FF (unassigned)
expectTrue(checkDecodeUTF16([ 0xd7ff ], [], [ 0x0000_d7ff ]))
// U+E000 (private use)
expectTrue(checkDecodeUTF16([ 0xe000 ], [], [ 0x0000_e000 ]))
// U+FFFD REPLACEMENT CHARACTER
expectTrue(checkDecodeUTF16([ 0xfffd ], [], [ 0x0000_fffd ]))
// U+FFFF (noncharacter)
expectTrue(checkDecodeUTF16([ 0xffff ], [], [ 0x0000_ffff ]))
// U+10000 LINEAR B SYLLABLE B008 A
expectTrue(checkDecodeUTF32([ 0x00010000 ], [], [ 0x0001_0000 ]))
// U+10100 AEGEAN WORD SEPARATOR LINE
expectTrue(checkDecodeUTF32([ 0x00010100 ], [], [ 0x0001_0100 ]))
// U+103FF (unassigned)
expectTrue(checkDecodeUTF32([ 0x000103ff ], [], [ 0x0001_03ff ]))
// U+1D800 (unassigned)
expectTrue(checkDecodeUTF32([ 0x0001d800 ], [], [ 0x0001_d800 ]))
// U+E0000 (unassigned)
expectTrue(checkDecodeUTF32([ 0x000e0000 ], [], [ 0x000e_0000 ]))
// U+E0100 VARIATION SELECTOR-17
expectTrue(checkDecodeUTF32([ 0x000e0100 ], [], [ 0x000e_0100 ]))
// U+E03FF (unassigned)
expectTrue(checkDecodeUTF32([ 0x000e03ff ], [], [ 0x000e_03ff ]))
// U+10FC00 (private use)
expectTrue(checkDecodeUTF32([ 0x0010fc00 ], [], [ 0x0010_fc00 ]))
// U+10FD00 (private use)
expectTrue(checkDecodeUTF32([ 0x0010fd00 ], [], [ 0x0010_fd00 ]))
// U+10FFFF (private use, noncharacter)
expectTrue(checkDecodeUTF32([ 0x0010ffff ], [], [ 0x0010_ffff ]))
}
UTF32Decoder.test("IllFormed") {
// U+D800 (high-surrogate)
expectTrue(checkDecodeUTF32([], [ 0xfffd ], [ 0x0000_d800 ]))
// U+DB40 (high-surrogate)
expectTrue(checkDecodeUTF32([], [ 0xfffd ], [ 0x0000_db40 ]))
// U+DBFF (high-surrogate)
expectTrue(checkDecodeUTF32([], [ 0xfffd ], [ 0x0000_dbff ]))
// U+DC00 (low-surrogate)
expectTrue(checkDecodeUTF32([], [ 0xfffd ], [ 0x0000_dc00 ]))
// U+DD00 (low-surrogate)
expectTrue(checkDecodeUTF32([], [ 0xfffd ], [ 0x0000_dd00 ]))
// U+DFFF (low-surrogate)
expectTrue(checkDecodeUTF32([], [ 0xfffd ], [ 0x0000_dfff ]))
// U+110000 (invalid)
expectTrue(checkDecodeUTF32([], [ 0xfffd ], [ 0x0011_0000 ]))
// U+1000000 (invalid)
expectTrue(checkDecodeUTF32([], [ 0xfffd ], [ 0x0100_0000 ]))
// U+80000000 (invalid)
expectTrue(checkDecodeUTF32([], [ 0xfffd ], [ 0x8000_0000 ]))
// U+FFFF0000 (invalid)
expectTrue(checkDecodeUTF32([], [ 0xfffd ], [ 0xffff_0000 ]))
// U+FFFFFFFF (invalid)
expectTrue(checkDecodeUTF32([], [ 0xfffd ], [ 0xffff_ffff ]))
}
var UTF8Decoder = TestSuite("UTF8Decoder")
//===----------------------------------------------------------------------===//
public struct UTFTest {
public struct Flags : OptionSet {
public let rawValue: Int
public init(rawValue: Int) {
self.rawValue = rawValue
}
public static let utf8IsInvalid = Flags(rawValue: 1 << 0)
public static let utf16IsInvalid = Flags(rawValue: 1 << 1)
}
public let string: String
public let utf8: [UInt8]
public let utf16: [UInt16]
public let unicodeScalars: [Unicode.Scalar]
public let unicodeScalarsRepairedTail: [Unicode.Scalar]
public let flags: Flags
public let loc: SourceLoc
public var utf32: [UInt32] {
return unicodeScalars.map(UInt32.init)
}
public var utf32RepairedTail: [UInt32] {
return unicodeScalarsRepairedTail.map(UInt32.init)
}
public init(
string: String,
utf8: [UInt8],
utf16: [UInt16],
scalars: [UInt32],
scalarsRepairedTail: [UInt32] = [],
flags: Flags = [],
file: String = #file, line: UInt = #line
) {
self.string = string
self.utf8 = utf8
self.utf16 = utf16
self.unicodeScalars = scalars.map { Unicode.Scalar($0)! }
self.unicodeScalarsRepairedTail =
scalarsRepairedTail.map { Unicode.Scalar($0)! }
self.flags = flags
self.loc = SourceLoc(file, line, comment: "test data")
}
}
public var utfTests: [UTFTest] = []
//
// Empty sequence.
//
utfTests.append(
UTFTest(
string: "",
utf8: [],
utf16: [],
scalars: []))
//
// 1-byte sequences.
//
// U+0000 NULL
utfTests.append(
UTFTest(
string: "\u{0000}",
utf8: [ 0x00 ],
utf16: [ 0x00 ],
scalars: [ 0x00 ]))
// U+0041 LATIN CAPITAL LETTER A
utfTests.append(
UTFTest(
string: "A",
utf8: [ 0x41 ],
utf16: [ 0x41 ],
scalars: [ 0x41 ]))
// U+0041 LATIN CAPITAL LETTER A
// U+0042 LATIN CAPITAL LETTER B
utfTests.append(
UTFTest(
string: "AB",
utf8: [ 0x41, 0x42 ],
utf16: [ 0x41, 0x42 ],
scalars: [ 0x41, 0x42 ]))
// U+0061 LATIN SMALL LETTER A
// U+0062 LATIN SMALL LETTER B
// U+0063 LATIN SMALL LETTER C
utfTests.append(
UTFTest(
string: "ABC",
utf8: [ 0x41, 0x42, 0x43 ],
utf16: [ 0x41, 0x42, 0x43 ],
scalars: [ 0x41, 0x42, 0x43 ]))
// U+0000 NULL
// U+0041 LATIN CAPITAL LETTER A
// U+0042 LATIN CAPITAL LETTER B
// U+0000 NULL
utfTests.append(
UTFTest(
string: "\u{0000}AB\u{0000}",
utf8: [ 0x00, 0x41, 0x42, 0x00 ],
utf16: [ 0x00, 0x41, 0x42, 0x00 ],
scalars: [ 0x00, 0x41, 0x42, 0x00 ]))
// U+007F DELETE
utfTests.append(
UTFTest(
string: "\u{007F}",
utf8: [ 0x7F ],
utf16: [ 0x7F ],
scalars: [ 0x7F ]))
//
// 2-byte sequences.
//
// U+0283 LATIN SMALL LETTER ESH
utfTests.append(
UTFTest(
string: "\u{0283}",
utf8: [ 0xCA, 0x83 ],
utf16: [ 0x0283 ],
scalars: [ 0x0283 ]))
// U+03BA GREEK SMALL LETTER KAPPA
// U+1F79 GREEK SMALL LETTER OMICRON WITH OXIA
// U+03C3 GREEK SMALL LETTER SIGMA
// U+03BC GREEK SMALL LETTER MU
// U+03B5 GREEK SMALL LETTER EPSILON
utfTests.append(
UTFTest(
string: "\u{03BA}\u{1F79}\u{03C3}\u{03BC}\u{03B5}",
utf8: [ 0xCE, 0xBA, 0xE1, 0xBD, 0xB9, 0xCF, 0x83, 0xCE, 0xBC, 0xCE, 0xB5 ],
utf16: [ 0x03BA, 0x1F79, 0x03C3, 0x03BC, 0x03B5 ],
scalars: [ 0x03BA, 0x1F79, 0x03C3, 0x03BC, 0x03B5 ]))
// U+0430 CYRILLIC SMALL LETTER A
// U+0431 CYRILLIC SMALL LETTER BE
// U+0432 CYRILLIC SMALL LETTER VE
utfTests.append(
UTFTest(
string: "\u{0430}\u{0431}\u{0432}",
utf8: [ 0xD0, 0xB0, 0xD0, 0xB1, 0xD0, 0xB2 ],
utf16: [ 0x0430, 0x0431, 0x0432 ],
scalars: [ 0x0430, 0x0431, 0x0432 ]))
//
// 3-byte sequences.
//
// U+4F8B CJK UNIFIED IDEOGRAPH-4F8B
// U+6587 CJK UNIFIED IDEOGRAPH-6587
utfTests.append(
UTFTest(
string: "\u{4F8b}\u{6587}",
utf8: [ 0xE4, 0xBE, 0x8B, 0xE6, 0x96, 0x87 ],
utf16: [ 0x4F8B, 0x6587 ],
scalars: [ 0x4F8B, 0x6587 ]))
// U+D55C HANGUL SYLLABLE HAN
// U+AE00 HANGUL SYLLABLE GEUL
utfTests.append(
UTFTest(
string: "\u{d55c}\u{ae00}",
utf8: [ 0xED, 0x95, 0x9C, 0xEA, 0xB8, 0x80 ],
utf16: [ 0xD55C, 0xAE00 ],
scalars: [ 0xD55C, 0xAE00 ]))
// U+1112 HANGUL CHOSEONG HIEUH
// U+1161 HANGUL JUNGSEONG A
// U+11AB HANGUL JONGSEONG NIEUN
// U+1100 HANGUL CHOSEONG KIYEOK
// U+1173 HANGUL JUNGSEONG EU
// U+11AF HANGUL JONGSEONG RIEUL
utfTests.append(
UTFTest(
string: "\u{1112}\u{1161}\u{11ab}\u{1100}\u{1173}\u{11af}",
utf8:
[ 0xE1, 0x84, 0x92, 0xE1, 0x85, 0xA1, 0xE1, 0x86, 0xAB,
0xE1, 0x84, 0x80, 0xE1, 0x85, 0xB3, 0xE1, 0x86, 0xAF ],
utf16: [ 0x1112, 0x1161, 0x11AB, 0x1100, 0x1173, 0x11AF ],
scalars: [ 0x1112, 0x1161, 0x11AB, 0x1100, 0x1173, 0x11AF ]))
// U+3042 HIRAGANA LETTER A
// U+3044 HIRAGANA LETTER I
// U+3046 HIRAGANA LETTER U
// U+3048 HIRAGANA LETTER E
// U+304A HIRAGANA LETTER O
utfTests.append(
UTFTest(
string: "\u{3042}\u{3044}\u{3046}\u{3048}\u{304a}",
utf8:
[ 0xE3, 0x81, 0x82, 0xE3, 0x81, 0x84, 0xE3, 0x81, 0x86,
0xE3, 0x81, 0x88, 0xE3, 0x81, 0x8A ],
utf16: [ 0x3042, 0x3044, 0x3046, 0x3048, 0x304A ],
scalars: [ 0x3042, 0x3044, 0x3046, 0x3048, 0x304A ]))
// U+D7FF (unassigned)
utfTests.append(
UTFTest(
string: "\u{D7FF}",
utf8: [ 0xED, 0x9F, 0xBF ],
utf16: [ 0xD7FF ],
scalars: [ 0xD7FF ]))
// U+E000 (private use)
utfTests.append(
UTFTest(
string: "\u{E000}",
utf8: [ 0xEE, 0x80, 0x80 ],
utf16: [ 0xE000 ],
scalars: [ 0xE000 ]))
// U+FFFD REPLACEMENT CHARACTER
utfTests.append(
UTFTest(
string: "\u{FFFD}",
utf8: [ 0xEF, 0xBF, 0xBD ],
utf16: [ 0xFFFD ],
scalars: [ 0xFFFD ]))
// U+FFFF (noncharacter)
utfTests.append(
UTFTest(
string: "\u{FFFF}",
utf8: [ 0xEF, 0xBF, 0xBF ],
utf16: [ 0xFFFF ],
scalars: [ 0xFFFF ]))
//
// 4-byte sequences.
//
// U+1F425 FRONT-FACING BABY CHICK
utfTests.append(
UTFTest(
string: "\u{1F425}",
utf8: [ 0xF0, 0x9F, 0x90, 0xA5 ],
utf16: [ 0xD83D, 0xDC25 ],
scalars: [ 0x0001_F425 ]))
// U+0041 LATIN CAPITAL LETTER A
// U+1F425 FRONT-FACING BABY CHICK
utfTests.append(
UTFTest(
string: "A\u{1F425}",
utf8: [ 0x41, 0xF0, 0x9F, 0x90, 0xA5 ],
utf16: [ 0x41, 0xD83D, 0xDC25 ],
scalars: [ 0x41, 0x0001_F425 ]))
// U+0041 LATIN CAPITAL LETTER A
// U+0042 LATIN CAPITAL LETTER B
// U+1F425 FRONT-FACING BABY CHICK
utfTests.append(
UTFTest(
string: "AB\u{1F425}",
utf8: [ 0x41, 0x42, 0xF0, 0x9F, 0x90, 0xA5 ],
utf16: [ 0x41, 0x42, 0xD83D, 0xDC25 ],
scalars: [ 0x41, 0x42, 0x0001_F425 ]))
// U+0041 LATIN CAPITAL LETTER A
// U+0042 LATIN CAPITAL LETTER B
// U+0043 LATIN CAPITAL LETTER C
// U+1F425 FRONT-FACING BABY CHICK
utfTests.append(
UTFTest(
string: "ABC\u{1F425}",
utf8: [ 0x41, 0x42, 0x43, 0xF0, 0x9F, 0x90, 0xA5 ],
utf16: [ 0x41, 0x42, 0x43, 0xD83D, 0xDC25 ],
scalars: [ 0x41, 0x42, 0x43, 0x0001_F425 ]))
// U+0041 LATIN CAPITAL LETTER A
// U+0042 LATIN CAPITAL LETTER B
// U+0043 LATIN CAPITAL LETTER C
// U+0044 LATIN CAPITAL LETTER D
// U+1F425 FRONT-FACING BABY CHICK
utfTests.append(
UTFTest(
string: "ABCD\u{1F425}",
utf8: [ 0x41, 0x42, 0x43, 0x44, 0xF0, 0x9F, 0x90, 0xA5 ],
utf16: [ 0x41, 0x42, 0x43, 0x44, 0xD83D, 0xDC25 ],
scalars: [ 0x41, 0x42, 0x43, 0x44, 0x0001_F425 ]))
// U+0041 LATIN CAPITAL LETTER A
// U+0042 LATIN CAPITAL LETTER B
// U+0043 LATIN CAPITAL LETTER C
// U+0044 LATIN CAPITAL LETTER D
// U+0045 LATIN CAPITAL LETTER E
// U+1F425 FRONT-FACING BABY CHICK
utfTests.append(
UTFTest(
string: "ABCDE\u{1F425}",
utf8: [ 0x41, 0x42, 0x43, 0x44, 0x45, 0xF0, 0x9F, 0x90, 0xA5 ],
utf16: [ 0x41, 0x42, 0x43, 0x44, 0x45, 0xD83D, 0xDC25 ],
scalars: [ 0x41, 0x42, 0x43, 0x44, 0x45, 0x0001_F425 ]))
// U+0041 LATIN CAPITAL LETTER A
// U+0042 LATIN CAPITAL LETTER B
// U+0043 LATIN CAPITAL LETTER C
// U+0044 LATIN CAPITAL LETTER D
// U+0045 LATIN CAPITAL LETTER E
// U+0046 LATIN CAPITAL LETTER F
// U+1F425 FRONT-FACING BABY CHICK
utfTests.append(
UTFTest(
string: "ABCDEF\u{1F425}",
utf8: [ 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0xF0, 0x9F, 0x90, 0xA5 ],
utf16: [ 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0xD83D, 0xDC25 ],
scalars: [ 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x0001_F425 ]))
// U+0041 LATIN CAPITAL LETTER A
// U+0042 LATIN CAPITAL LETTER B
// U+0043 LATIN CAPITAL LETTER C
// U+0044 LATIN CAPITAL LETTER D
// U+0045 LATIN CAPITAL LETTER E
// U+0046 LATIN CAPITAL LETTER F
// U+0047 LATIN CAPITAL LETTER G
// U+1F425 FRONT-FACING BABY CHICK
utfTests.append(
UTFTest(
string: "ABCDEFG\u{1F425}",
utf8: [ 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0xF0, 0x9F, 0x90, 0xA5 ],
utf16: [ 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0xD83D, 0xDC25 ],
scalars: [ 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x0001_F425 ]))
// U+0041 LATIN CAPITAL LETTER A
// U+0042 LATIN CAPITAL LETTER B
// U+0043 LATIN CAPITAL LETTER C
// U+0044 LATIN CAPITAL LETTER D
// U+0045 LATIN CAPITAL LETTER E
// U+0046 LATIN CAPITAL LETTER F
// U+0047 LATIN CAPITAL LETTER G
// U+0048 LATIN CAPITAL LETTER H
// U+1F425 FRONT-FACING BABY CHICK
utfTests.append(
UTFTest(
string: "ABCDEFGH\u{1F425}",
utf8:
[ 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48,
0xF0, 0x9F, 0x90, 0xA5 ],
utf16:
[ 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48,
0xD83D, 0xDC25 ],
scalars:
[ 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x0001_F425 ]))
// U+0041 LATIN CAPITAL LETTER A
// U+0042 LATIN CAPITAL LETTER B
// U+0043 LATIN CAPITAL LETTER C
// U+0044 LATIN CAPITAL LETTER D
// U+0045 LATIN CAPITAL LETTER E
// U+0046 LATIN CAPITAL LETTER F
// U+0047 LATIN CAPITAL LETTER G
// U+0048 LATIN CAPITAL LETTER H
// U+0049 LATIN CAPITAL LETTER I
// U+1F425 FRONT-FACING BABY CHICK
utfTests.append(
UTFTest(
string: "ABCDEFGHI\u{1F425}",
utf8:
[ 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49,
0xF0, 0x9F, 0x90, 0xA5 ],
utf16:
[ 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49,
0xD83D, 0xDC25 ],
scalars:
[ 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x0001_F425 ]))
// U+10000 LINEAR B SYLLABLE B008 A
utfTests.append(
UTFTest(
string: "\u{10000}",
utf8: [ 0xF0, 0x90, 0x80, 0x80 ],
utf16: [ 0xD800, 0xDC00 ],
scalars: [ 0x0001_0000 ]))
// U+10100 AEGEAN WORD SEPARATOR LINE
utfTests.append(
UTFTest(
string: "\u{10100}",
utf8: [ 0xF0, 0x90, 0x84, 0x80 ],
utf16: [ 0xD800, 0xDD00 ],
scalars: [ 0x0001_0100 ]))
// U+103FF (unassigned)
utfTests.append(
UTFTest(
string: "\u{103FF}",
utf8: [ 0xF0, 0x90, 0x8F, 0xBF ],
utf16: [ 0xD800, 0xDFFF ],
scalars: [ 0x0001_03FF ]))
// U+E0000 (unassigned)
utfTests.append(
UTFTest(
string: "\u{E0000}",
utf8: [ 0xF3, 0xA0, 0x80, 0x80 ],
utf16: [ 0xDB40, 0xDC00 ],
scalars: [ 0x000E_0000 ]))
// U+E0100 VARIATION SELECTOR-17
utfTests.append(
UTFTest(
string: "\u{E0100}",
utf8: [ 0xF3, 0xA0, 0x84, 0x80 ],
utf16: [ 0xDB40, 0xDD00 ],
scalars: [ 0x000E_0100 ]))
// U+E03FF (unassigned)
utfTests.append(
UTFTest(
string: "\u{E03FF}",
utf8: [ 0xF3, 0xA0, 0x8F, 0xBF ],
utf16: [ 0xDB40, 0xDFFF ],
scalars: [ 0x000E_03FF ]))
// U+10FC00 (private use)
utfTests.append(
UTFTest(
string: "\u{10FC00}",
utf8: [ 0xF4, 0x8F, 0xB0, 0x80 ],
utf16: [ 0xDBFF, 0xDC00 ],
scalars: [ 0x0010_FC00 ]))
// U+10FD00 (private use)
utfTests.append(
UTFTest(
string: "\u{10FD00}",
utf8: [ 0xF4, 0x8F, 0xB4, 0x80 ],
utf16: [ 0xDBFF, 0xDD00 ],
scalars: [ 0x0010_FD00 ]))
// U+10FFFF (private use, noncharacter)
utfTests.append(
UTFTest(
string: "\u{10FFFF}",
utf8: [ 0xF4, 0x8F, 0xBF, 0xBF ],
utf16: [ 0xDBFF, 0xDFFF ],
scalars: [ 0x0010_FFFF ]))
//===----------------------------------------------------------------------===//
UTF8Decoder.test("SmokeTest").forEach(in: utfTests) {
test in
expectTrue(
checkDecodeUTF8(test.utf32, [], test.utf8),
stackTrace: test.loc.withCurrentLoc())
return ()
}
UTF8Decoder.test("FirstPossibleSequence") {
//
// First possible sequence of a certain length
//
// U+0000 NULL
expectTrue(checkDecodeUTF8([ 0x0000 ], [], [ 0x00 ]))
// U+0080 PADDING CHARACTER
expectTrue(checkDecodeUTF8([ 0x0080 ], [], [ 0xc2, 0x80 ]))
// U+0800 SAMARITAN LETTER ALAF
expectTrue(checkDecodeUTF8(
[ 0x0800 ], [],
[ 0xe0, 0xa0, 0x80 ]))
// U+10000 LINEAR B SYLLABLE B008 A
expectTrue(checkDecodeUTF8(
[ 0x10000 ], [],
[ 0xf0, 0x90, 0x80, 0x80 ]))
// U+200000 (invalid)
expectTrue(checkDecodeUTF8(
[], [ 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd ],
[ 0xf8, 0x88, 0x80, 0x80, 0x80 ]))
// U+4000000 (invalid)
expectTrue(checkDecodeUTF8(
[], [ 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd ],
[ 0xfc, 0x84, 0x80, 0x80, 0x80, 0x80 ]))
}
UTF8Decoder.test("LastPossibleSequence") {
//
// Last possible sequence of a certain length
//
// U+007F DELETE
expectTrue(checkDecodeUTF8([ 0x007f ], [], [ 0x7f ]))
// U+07FF (unassigned)
expectTrue(checkDecodeUTF8([ 0x07ff ], [], [ 0xdf, 0xbf ]))
// U+FFFF (noncharacter)
expectTrue(checkDecodeUTF8(
[ 0xffff ], [],
[ 0xef, 0xbf, 0xbf ]))
// U+1FFFFF (invalid)
expectTrue(checkDecodeUTF8(
[], [ 0xfffd, 0xfffd, 0xfffd, 0xfffd ],
[ 0xf7, 0xbf, 0xbf, 0xbf ]))
// U+3FFFFFF (invalid)
expectTrue(checkDecodeUTF8(
[], [ 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd ],
[ 0xfb, 0xbf, 0xbf, 0xbf, 0xbf ]))
// U+7FFFFFFF (invalid)
expectTrue(checkDecodeUTF8(
[], [ 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd ],
[ 0xfd, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf ]))
}
UTF8Decoder.test("CodeSpaceBoundaryConditions") {
//
// Other boundary conditions
//
// U+D7FF (unassigned)
expectTrue(checkDecodeUTF8([ 0xd7ff ], [], [ 0xed, 0x9f, 0xbf ]))
// U+E000 (private use)
expectTrue(checkDecodeUTF8([ 0xe000 ], [], [ 0xee, 0x80, 0x80 ]))
// U+FFFD REPLACEMENT CHARACTER
expectTrue(checkDecodeUTF8([ 0xfffd ], [], [ 0xef, 0xbf, 0xbd ]))
// U+10FFFF (noncharacter)
expectTrue(checkDecodeUTF8([ 0x10ffff ], [], [ 0xf4, 0x8f, 0xbf, 0xbf ]))
// U+110000 (invalid)
expectTrue(checkDecodeUTF8(
[], [ 0xfffd, 0xfffd, 0xfffd, 0xfffd ],
[ 0xf4, 0x90, 0x80, 0x80 ]))
}
UTF8Decoder.test("UnexpectedContinuationBytes") {
//
// Unexpected continuation bytes
//
// A sequence of unexpected continuation bytes that don't follow a first
// byte, every byte is a maximal subpart.
expectTrue(checkDecodeUTF8([], [ 0xfffd ], [ 0x80 ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd ], [ 0xbf ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd, 0xfffd ], [ 0x80, 0x80 ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd, 0xfffd ], [ 0x80, 0xbf ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd, 0xfffd ], [ 0xbf, 0x80 ]))
expectTrue(checkDecodeUTF8(
[], [ 0xfffd, 0xfffd, 0xfffd ],
[ 0x80, 0xbf, 0x80 ]))
expectTrue(checkDecodeUTF8(
[], [ 0xfffd, 0xfffd, 0xfffd, 0xfffd ],
[ 0x80, 0xbf, 0x80, 0xbf ]))
expectTrue(checkDecodeUTF8(
[], [ 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd ],
[ 0x80, 0xbf, 0x82, 0xbf, 0xaa ]))
expectTrue(checkDecodeUTF8(
[], [ 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd ],
[ 0xaa, 0xb0, 0xbb, 0xbf, 0xaa, 0xa0 ]))
expectTrue(checkDecodeUTF8(
[], [ 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd ],
[ 0xaa, 0xb0, 0xbb, 0xbf, 0xaa, 0xa0, 0x8f ]))
// All continuation bytes (0x80--0xbf).
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd ],
[ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf ]))
}
UTF8Decoder.test("LonelyStartBytes") {
//
// Lonely start bytes
//
// Start bytes of 2-byte sequences (0xc0--0xdf).
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd ],
[ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf ]))
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020,
0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020,
0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020,
0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020,
0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020,
0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020,
0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020,
0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020 ],
[ 0xc0, 0x20, 0xc1, 0x20, 0xc2, 0x20, 0xc3, 0x20,
0xc4, 0x20, 0xc5, 0x20, 0xc6, 0x20, 0xc7, 0x20,
0xc8, 0x20, 0xc9, 0x20, 0xca, 0x20, 0xcb, 0x20,
0xcc, 0x20, 0xcd, 0x20, 0xce, 0x20, 0xcf, 0x20,
0xd0, 0x20, 0xd1, 0x20, 0xd2, 0x20, 0xd3, 0x20,
0xd4, 0x20, 0xd5, 0x20, 0xd6, 0x20, 0xd7, 0x20,
0xd8, 0x20, 0xd9, 0x20, 0xda, 0x20, 0xdb, 0x20,
0xdc, 0x20, 0xdd, 0x20, 0xde, 0x20, 0xdf, 0x20 ]))
// Start bytes of 3-byte sequences (0xe0--0xef).
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd ],
[ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef ]))
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020,
0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020,
0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020,
0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020 ],
[ 0xe0, 0x20, 0xe1, 0x20, 0xe2, 0x20, 0xe3, 0x20,
0xe4, 0x20, 0xe5, 0x20, 0xe6, 0x20, 0xe7, 0x20,
0xe8, 0x20, 0xe9, 0x20, 0xea, 0x20, 0xeb, 0x20,
0xec, 0x20, 0xed, 0x20, 0xee, 0x20, 0xef, 0x20 ]))
// Start bytes of 4-byte sequences (0xf0--0xf7).
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd ],
[ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7 ]))
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020,
0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020 ],
[ 0xf0, 0x20, 0xf1, 0x20, 0xf2, 0x20, 0xf3, 0x20,
0xf4, 0x20, 0xf5, 0x20, 0xf6, 0x20, 0xf7, 0x20 ]))
// Start bytes of 5-byte sequences (0xf8--0xfb).
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd, 0xfffd ],
[ 0xf8, 0xf9, 0xfa, 0xfb ]))
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020 ],
[ 0xf8, 0x20, 0xf9, 0x20, 0xfa, 0x20, 0xfb, 0x20 ]))
// Start bytes of 6-byte sequences (0xfc--0xfd).
expectTrue(checkDecodeUTF8([], [ 0xfffd, 0xfffd ], [ 0xfc, 0xfd ]))
expectTrue(checkDecodeUTF8(
[], [ 0xfffd, 0x0020, 0xfffd, 0x0020 ],
[ 0xfc, 0x20, 0xfd, 0x20 ]))
}
UTF8Decoder.test("InvalidStartBytes") {
//
// Other bytes (0xc0--0xc1, 0xfe--0xff).
//
expectTrue(checkDecodeUTF8([], [ 0xfffd ], [ 0xc0 ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd ], [ 0xc1 ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd ], [ 0xfe ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd ], [ 0xff ]))
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd, 0xfffd ],
[ 0xc0, 0xc1, 0xfe, 0xff ]))
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd, 0xfffd ],
[ 0xfe, 0xfe, 0xff, 0xff ]))
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd ],
[ 0xfe, 0x80, 0x80, 0x80, 0x80, 0x80 ]))
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd ],
[ 0xff, 0x80, 0x80, 0x80, 0x80, 0x80 ]))
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020 ],
[ 0xc0, 0x20, 0xc1, 0x20, 0xfe, 0x20, 0xff, 0x20 ]))
}
UTF8Decoder.test("MissingContinuationBytes") {
//
// Sequences with one continuation byte missing
//
expectTrue(checkDecodeUTF8([], [ 0xfffd ], [ 0xc2 ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd ], [ 0xdf ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd, 0x0041 ], [ 0xc2, 0x41 ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd, 0x0041 ], [ 0xdf, 0x41 ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd ], [ 0xe0, 0xa0 ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd ], [ 0xe0, 0xbf ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd, 0x0041 ], [ 0xe0, 0xa0, 0x41 ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd, 0x0041 ], [ 0xe0, 0xbf, 0x41 ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd ], [ 0xe1, 0x80 ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd ], [ 0xec, 0xbf ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd, 0x0041 ], [ 0xe1, 0x80, 0x41 ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd, 0x0041 ], [ 0xec, 0xbf, 0x41 ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd ], [ 0xed, 0x80 ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd ], [ 0xed, 0x9f ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd, 0x0041 ], [ 0xed, 0x80, 0x41 ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd, 0x0041 ], [ 0xed, 0x9f, 0x41 ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd ], [ 0xee, 0x80 ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd ], [ 0xef, 0xbf ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd, 0x0041 ], [ 0xee, 0x80, 0x41 ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd, 0x0041 ], [ 0xef, 0xbf, 0x41 ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd ], [ 0xf0, 0x90, 0x80 ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd ], [ 0xf0, 0xbf, 0xbf ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd, 0x0041 ], [ 0xf0, 0x90, 0x80, 0x41 ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd, 0x0041 ], [ 0xf0, 0xbf, 0xbf, 0x41 ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd ], [ 0xf1, 0x80, 0x80 ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd ], [ 0xf3, 0xbf, 0xbf ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd, 0x0041 ], [ 0xf1, 0x80, 0x80, 0x41 ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd, 0x0041 ], [ 0xf3, 0xbf, 0xbf, 0x41 ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd ], [ 0xf4, 0x80, 0x80 ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd ], [ 0xf4, 0x8f, 0xbf ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd, 0x0041 ], [ 0xf4, 0x80, 0x80, 0x41 ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd, 0x0041 ], [ 0xf4, 0x8f, 0xbf, 0x41 ]))
// Overlong sequences with one trailing byte missing.
expectTrue(checkDecodeUTF8([], [ 0xfffd ], [ 0xc0 ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd ], [ 0xc1 ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd, 0xfffd ], [ 0xe0, 0x80 ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd, 0xfffd ], [ 0xe0, 0x9f ]))
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd ],
[ 0xf0, 0x80, 0x80 ]))
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd ],
[ 0xf0, 0x8f, 0x80 ]))
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd, 0xfffd ],
[ 0xf8, 0x80, 0x80, 0x80 ]))
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd ],
[ 0xfc, 0x80, 0x80, 0x80, 0x80 ]))
// Sequences that represent surrogates with one trailing byte missing.
// High-surrogates
expectTrue(checkDecodeUTF8([], [ 0xfffd, 0xfffd ], [ 0xed, 0xa0 ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd, 0xfffd ], [ 0xed, 0xac ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd, 0xfffd ], [ 0xed, 0xaf ]))
// Low-surrogates
expectTrue(checkDecodeUTF8([], [ 0xfffd, 0xfffd ], [ 0xed, 0xb0 ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd, 0xfffd ], [ 0xed, 0xb4 ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd, 0xfffd ], [ 0xed, 0xbf ]))
// Ill-formed 4-byte sequences.
// 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
// U+1100xx (invalid)
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd ],
[ 0xf4, 0x90, 0x80 ]))
// U+13FBxx (invalid)
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd ],
[ 0xf4, 0xbf, 0xbf ]))
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd ],
[ 0xf5, 0x80, 0x80 ]))
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd ],
[ 0xf6, 0x80, 0x80 ]))
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd ],
[ 0xf7, 0x80, 0x80 ]))
// U+1FFBxx (invalid)
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd ],
[ 0xf7, 0xbf, 0xbf ]))
// Ill-formed 5-byte sequences.
// 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
// U+2000xx (invalid)
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd, 0xfffd ],
[ 0xf8, 0x88, 0x80, 0x80 ]))
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd, 0xfffd ],
[ 0xf8, 0xbf, 0xbf, 0xbf ]))
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd, 0xfffd ],
[ 0xf9, 0x80, 0x80, 0x80 ]))
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd, 0xfffd ],
[ 0xfa, 0x80, 0x80, 0x80 ]))
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd, 0xfffd ],
[ 0xfb, 0x80, 0x80, 0x80 ]))
// U+3FFFFxx (invalid)
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd, 0xfffd ],
[ 0xfb, 0xbf, 0xbf, 0xbf ]))
// Ill-formed 6-byte sequences.
// 1111110u 10uuuuuu 10uzzzzz 10zzzyyyy 10yyyyxx 10xxxxxx
// U+40000xx (invalid)
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd ],
[ 0xfc, 0x84, 0x80, 0x80, 0x80 ]))
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd ],
[ 0xfc, 0xbf, 0xbf, 0xbf, 0xbf ]))
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd ],
[ 0xfd, 0x80, 0x80, 0x80, 0x80 ]))
// U+7FFFFFxx (invalid)
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd ],
[ 0xfd, 0xbf, 0xbf, 0xbf, 0xbf ]))
//
// Sequences with two continuation bytes missing
//
expectTrue(checkDecodeUTF8([], [ 0xfffd ], [ 0xf0, 0x90 ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd ], [ 0xf0, 0xbf ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd ], [ 0xf1, 0x80 ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd ], [ 0xf3, 0xbf ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd ], [ 0xf4, 0x80 ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd ], [ 0xf4, 0x8f ]))
// Overlong sequences with two trailing byte missing.
expectTrue(checkDecodeUTF8([], [ 0xfffd ], [ 0xe0 ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd, 0xfffd ], [ 0xf0, 0x80 ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd, 0xfffd ], [ 0xf0, 0x8f ]))
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd ],
[ 0xf8, 0x80, 0x80 ]))
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd, 0xfffd ],
[ 0xfc, 0x80, 0x80, 0x80 ]))
// Sequences that represent surrogates with two trailing bytes missing.
expectTrue(checkDecodeUTF8([], [ 0xfffd ], [ 0xed ]))
// Ill-formed 4-byte sequences.
// 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
// U+110yxx (invalid)
expectTrue(checkDecodeUTF8([], [ 0xfffd, 0xfffd ], [ 0xf4, 0x90 ]))
// U+13Fyxx (invalid)
expectTrue(checkDecodeUTF8([], [ 0xfffd, 0xfffd ], [ 0xf4, 0xbf ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd, 0xfffd ], [ 0xf5, 0x80 ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd, 0xfffd ], [ 0xf6, 0x80 ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd, 0xfffd ], [ 0xf7, 0x80 ]))
// U+1FFyxx (invalid)
expectTrue(checkDecodeUTF8([], [ 0xfffd, 0xfffd ], [ 0xf7, 0xbf ]))
// Ill-formed 5-byte sequences.
// 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
// U+200yxx (invalid)
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd ],
[ 0xf8, 0x88, 0x80 ]))
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd ],
[ 0xf8, 0xbf, 0xbf ]))
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd ],
[ 0xf9, 0x80, 0x80 ]))
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd ],
[ 0xfa, 0x80, 0x80 ]))
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd ],
[ 0xfb, 0x80, 0x80 ]))
// U+3FFFyxx (invalid)
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd ],
[ 0xfb, 0xbf, 0xbf ]))
// Ill-formed 6-byte sequences.
// 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
// U+4000yxx (invalid)
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd, 0xfffd ],
[ 0xfc, 0x84, 0x80, 0x80 ]))
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd, 0xfffd ],
[ 0xfc, 0xbf, 0xbf, 0xbf ]))
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd, 0xfffd ],
[ 0xfd, 0x80, 0x80, 0x80 ]))
// U+7FFFFyxx (invalid)
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd, 0xfffd ],
[ 0xfd, 0xbf, 0xbf, 0xbf ]))
//
// Sequences with three continuation bytes missing
//
expectTrue(checkDecodeUTF8([], [ 0xfffd ], [ 0xf0 ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd ], [ 0xf1 ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd ], [ 0xf2 ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd ], [ 0xf3 ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd ], [ 0xf4 ]))
// Broken overlong sequences.
expectTrue(checkDecodeUTF8([], [ 0xfffd ], [ 0xf0 ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd, 0xfffd ], [ 0xf8, 0x80 ]))
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd ],
[ 0xfc, 0x80, 0x80 ]))
// Ill-formed 4-byte sequences.
// 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
// U+14yyxx (invalid)
expectTrue(checkDecodeUTF8([], [ 0xfffd ], [ 0xf5 ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd ], [ 0xf6 ]))
// U+1Cyyxx (invalid)
expectTrue(checkDecodeUTF8([], [ 0xfffd ], [ 0xf7 ]))
// Ill-formed 5-byte sequences.
// 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
// U+20yyxx (invalid)
expectTrue(checkDecodeUTF8([], [ 0xfffd, 0xfffd ], [ 0xf8, 0x88 ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd, 0xfffd ], [ 0xf8, 0xbf ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd, 0xfffd ], [ 0xf9, 0x80 ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd, 0xfffd ], [ 0xfa, 0x80 ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd, 0xfffd ], [ 0xfb, 0x80 ]))
// U+3FCyyxx (invalid)
expectTrue(checkDecodeUTF8([], [ 0xfffd, 0xfffd ], [ 0xfb, 0xbf ]))
// Ill-formed 6-byte sequences.
// 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
// U+400yyxx (invalid)
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd ],
[ 0xfc, 0x84, 0x80 ]))
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd ],
[ 0xfc, 0xbf, 0xbf ]))
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd ],
[ 0xfd, 0x80, 0x80 ]))
// U+7FFCyyxx (invalid)
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd ],
[ 0xfd, 0xbf, 0xbf ]))
//
// Sequences with four continuation bytes missing
//
// Ill-formed 5-byte sequences.
// 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
// U+uzyyxx (invalid)
expectTrue(checkDecodeUTF8([], [ 0xfffd ], [ 0xf8 ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd ], [ 0xf9 ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd ], [ 0xfa ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd ], [ 0xfb ]))
// U+3zyyxx (invalid)
expectTrue(checkDecodeUTF8([], [ 0xfffd ], [ 0xfb ]))
// Broken overlong sequences.
expectTrue(checkDecodeUTF8([], [ 0xfffd ], [ 0xf8 ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd, 0xfffd ], [ 0xfc, 0x80 ]))
// Ill-formed 6-byte sequences.
// 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
// U+uzzyyxx (invalid)
expectTrue(checkDecodeUTF8([], [ 0xfffd, 0xfffd ], [ 0xfc, 0x84 ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd, 0xfffd ], [ 0xfc, 0xbf ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd, 0xfffd ], [ 0xfd, 0x80 ]))
// U+7Fzzyyxx (invalid)
expectTrue(checkDecodeUTF8([], [ 0xfffd, 0xfffd ], [ 0xfd, 0xbf ]))
//
// Sequences with five continuation bytes missing
//
// Ill-formed 6-byte sequences.
// 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
// U+uzzyyxx (invalid)
expectTrue(checkDecodeUTF8([], [ 0xfffd ], [ 0xfc ]))
// U+uuzzyyxx (invalid)
expectTrue(checkDecodeUTF8([], [ 0xfffd ], [ 0xfd ]))
//
// Consecutive sequences with trailing bytes missing
//
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, /**/ 0xfffd, 0xfffd, /**/ 0xfffd, 0xfffd, 0xfffd,
0xfffd, 0xfffd, 0xfffd, 0xfffd,
0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
0xfffd, /**/ 0xfffd, /**/ 0xfffd, 0xfffd, 0xfffd,
0xfffd, 0xfffd, 0xfffd, 0xfffd,
0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd ],
[ 0xc0, /**/ 0xe0, 0x80, /**/ 0xf0, 0x80, 0x80,
0xf8, 0x80, 0x80, 0x80,
0xfc, 0x80, 0x80, 0x80, 0x80,
0xdf, /**/ 0xef, 0xbf, /**/ 0xf7, 0xbf, 0xbf,
0xfb, 0xbf, 0xbf, 0xbf,
0xfd, 0xbf, 0xbf, 0xbf, 0xbf ]))
}
UTF8Decoder.test("OverlongSequences") {
//
// Overlong UTF-8 sequences
//
// U+002F SOLIDUS
expectTrue(checkDecodeUTF8([ 0x002f ], [], [ 0x2f ]))
// Overlong sequences of the above.
expectTrue(checkDecodeUTF8([], [ 0xfffd, 0xfffd ], [ 0xc0, 0xaf ]))
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd ],
[ 0xe0, 0x80, 0xaf ]))
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd, 0xfffd ],
[ 0xf0, 0x80, 0x80, 0xaf ]))
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd ],
[ 0xf8, 0x80, 0x80, 0x80, 0xaf ]))
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd ],
[ 0xfc, 0x80, 0x80, 0x80, 0x80, 0xaf ]))
// U+0000 NULL
expectTrue(checkDecodeUTF8([ 0x0000 ], [], [ 0x00 ]))
// Overlong sequences of the above.
expectTrue(checkDecodeUTF8([], [ 0xfffd, 0xfffd ], [ 0xc0, 0x80 ]))
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd ],
[ 0xe0, 0x80, 0x80 ]))
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd, 0xfffd ],
[ 0xf0, 0x80, 0x80, 0x80 ]))
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd ],
[ 0xf8, 0x80, 0x80, 0x80, 0x80 ]))
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd ],
[ 0xfc, 0x80, 0x80, 0x80, 0x80, 0x80 ]))
// Other overlong and ill-formed sequences.
expectTrue(checkDecodeUTF8([], [ 0xfffd, 0xfffd ], [ 0xc0, 0xbf ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd, 0xfffd ], [ 0xc1, 0x80 ]))
expectTrue(checkDecodeUTF8([], [ 0xfffd, 0xfffd ], [ 0xc1, 0xbf ]))
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd ],
[ 0xe0, 0x9f, 0xbf ]))
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd ],
[ 0xed, 0xa0, 0x80 ]))
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd ],
[ 0xed, 0xbf, 0xbf ]))
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd, 0xfffd ],
[ 0xf0, 0x8f, 0x80, 0x80 ]))
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd, 0xfffd ],
[ 0xf0, 0x8f, 0xbf, 0xbf ]))
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd ],
[ 0xf8, 0x87, 0xbf, 0xbf, 0xbf ]))
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd ],
[ 0xfc, 0x83, 0xbf, 0xbf, 0xbf, 0xbf ]))
}
UTF8Decoder.test("IsolatedSurrogates") {
// Unicode 6.3.0:
//
// D71. High-surrogate code point: A Unicode code point in the range
// U+D800 to U+DBFF.
//
// D73. Low-surrogate code point: A Unicode code point in the range
// U+DC00 to U+DFFF.
// Note: U+E0100 is <DB40 DD00> in UTF-16.
// High-surrogates
// U+D800
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd ],
[ 0xed, 0xa0, 0x80 ]))
expectTrue(checkDecodeUTF8(
[ 0x0041 ],
[ 0xfffd, 0xfffd, 0xfffd, 0x0041 ],
[ 0x41, 0xed, 0xa0, 0x80, 0x41 ]))
// U+DB40
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd ],
[ 0xed, 0xac, 0xa0 ]))
// U+DBFF
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd ],
[ 0xed, 0xaf, 0xbf ]))
// Low-surrogates
// U+DC00
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd ],
[ 0xed, 0xb0, 0x80 ]))
// U+DD00
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd ],
[ 0xed, 0xb4, 0x80 ]))
// U+DFFF
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd ],
[ 0xed, 0xbf, 0xbf ]))
}
UTF8Decoder.test("SurrogatePairs") {
// Surrogate pairs
// U+D800 U+DC00
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd ],
[ 0xed, 0xa0, 0x80, 0xed, 0xb0, 0x80 ]))
// U+D800 U+DD00
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd ],
[ 0xed, 0xa0, 0x80, 0xed, 0xb4, 0x80 ]))
// U+D800 U+DFFF
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd ],
[ 0xed, 0xa0, 0x80, 0xed, 0xbf, 0xbf ]))
// U+DB40 U+DC00
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd ],
[ 0xed, 0xac, 0xa0, 0xed, 0xb0, 0x80 ]))
// U+DB40 U+DD00
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd ],
[ 0xed, 0xac, 0xa0, 0xed, 0xb4, 0x80 ]))
// U+DB40 U+DFFF
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd ],
[ 0xed, 0xac, 0xa0, 0xed, 0xbf, 0xbf ]))
// U+DBFF U+DC00
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd ],
[ 0xed, 0xaf, 0xbf, 0xed, 0xb0, 0x80 ]))
// U+DBFF U+DD00
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd ],
[ 0xed, 0xaf, 0xbf, 0xed, 0xb4, 0x80 ]))
// U+DBFF U+DFFF
expectTrue(checkDecodeUTF8(
[],
[ 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd ],
[ 0xed, 0xaf, 0xbf, 0xed, 0xbf, 0xbf ]))
}
UTF8Decoder.test("Noncharacters") {
//
// Noncharacters
//
// Unicode 6.3.0:
//
// D14. Noncharacter: A code point that is permanently reserved for
// internal use and that should never be interchanged. Noncharacters
// consist of the values U+nFFFE and U+nFFFF (where n is from 0 to 1016)
// and the values U+FDD0..U+FDEF.
// U+FFFE
expectTrue(checkDecodeUTF8([ 0xfffe ], [], [ 0xef, 0xbf, 0xbe ]))
// U+FFFF
expectTrue(checkDecodeUTF8([ 0xffff ], [], [ 0xef, 0xbf, 0xbf ]))
// U+1FFFE
expectTrue(checkDecodeUTF8([ 0x1fffe ], [], [ 0xf0, 0x9f, 0xbf, 0xbe ]))
// U+1FFFF
expectTrue(checkDecodeUTF8([ 0x1ffff ], [], [ 0xf0, 0x9f, 0xbf, 0xbf ]))
// U+2FFFE
expectTrue(checkDecodeUTF8([ 0x2fffe ], [], [ 0xf0, 0xaf, 0xbf, 0xbe ]))
// U+2FFFF
expectTrue(checkDecodeUTF8([ 0x2ffff ], [], [ 0xf0, 0xaf, 0xbf, 0xbf ]))
// U+3FFFE
expectTrue(checkDecodeUTF8([ 0x3fffe ], [], [ 0xf0, 0xbf, 0xbf, 0xbe ]))
// U+3FFFF
expectTrue(checkDecodeUTF8([ 0x3ffff ], [], [ 0xf0, 0xbf, 0xbf, 0xbf ]))
// U+4FFFE
expectTrue(checkDecodeUTF8([ 0x4fffe ], [], [ 0xf1, 0x8f, 0xbf, 0xbe ]))
// U+4FFFF
expectTrue(checkDecodeUTF8([ 0x4ffff ], [], [ 0xf1, 0x8f, 0xbf, 0xbf ]))
// U+5FFFE
expectTrue(checkDecodeUTF8([ 0x5fffe ], [], [ 0xf1, 0x9f, 0xbf, 0xbe ]))
// U+5FFFF
expectTrue(checkDecodeUTF8([ 0x5ffff ], [], [ 0xf1, 0x9f, 0xbf, 0xbf ]))
// U+6FFFE
expectTrue(checkDecodeUTF8([ 0x6fffe ], [], [ 0xf1, 0xaf, 0xbf, 0xbe ]))
// U+6FFFF
expectTrue(checkDecodeUTF8([ 0x6ffff ], [], [ 0xf1, 0xaf, 0xbf, 0xbf ]))
// U+7FFFE
expectTrue(checkDecodeUTF8([ 0x7fffe ], [], [ 0xf1, 0xbf, 0xbf, 0xbe ]))
// U+7FFFF
expectTrue(checkDecodeUTF8([ 0x7ffff ], [], [ 0xf1, 0xbf, 0xbf, 0xbf ]))
// U+8FFFE
expectTrue(checkDecodeUTF8([ 0x8fffe ], [], [ 0xf2, 0x8f, 0xbf, 0xbe ]))
// U+8FFFF
expectTrue(checkDecodeUTF8([ 0x8ffff ], [], [ 0xf2, 0x8f, 0xbf, 0xbf ]))
// U+9FFFE
expectTrue(checkDecodeUTF8([ 0x9fffe ], [], [ 0xf2, 0x9f, 0xbf, 0xbe ]))
// U+9FFFF
expectTrue(checkDecodeUTF8([ 0x9ffff ], [], [ 0xf2, 0x9f, 0xbf, 0xbf ]))
// U+AFFFE
expectTrue(checkDecodeUTF8([ 0xafffe ], [], [ 0xf2, 0xaf, 0xbf, 0xbe ]))
// U+AFFFF
expectTrue(checkDecodeUTF8([ 0xaffff ], [], [ 0xf2, 0xaf, 0xbf, 0xbf ]))
// U+BFFFE
expectTrue(checkDecodeUTF8([ 0xbfffe ], [], [ 0xf2, 0xbf, 0xbf, 0xbe ]))
// U+BFFFF
expectTrue(checkDecodeUTF8([ 0xbffff ], [], [ 0xf2, 0xbf, 0xbf, 0xbf ]))
// U+CFFFE
expectTrue(checkDecodeUTF8([ 0xcfffe ], [], [ 0xf3, 0x8f, 0xbf, 0xbe ]))
// U+CFFFF
expectTrue(checkDecodeUTF8([ 0xcfffF ], [], [ 0xf3, 0x8f, 0xbf, 0xbf ]))
// U+DFFFE
expectTrue(checkDecodeUTF8([ 0xdfffe ], [], [ 0xf3, 0x9f, 0xbf, 0xbe ]))
// U+DFFFF
expectTrue(checkDecodeUTF8([ 0xdffff ], [], [ 0xf3, 0x9f, 0xbf, 0xbf ]))
// U+EFFFE
expectTrue(checkDecodeUTF8([ 0xefffe ], [], [ 0xf3, 0xaf, 0xbf, 0xbe ]))
// U+EFFFF
expectTrue(checkDecodeUTF8([ 0xeffff ], [], [ 0xf3, 0xaf, 0xbf, 0xbf ]))
// U+FFFFE
expectTrue(checkDecodeUTF8([ 0xffffe ], [], [ 0xf3, 0xbf, 0xbf, 0xbe ]))
// U+FFFFF
expectTrue(checkDecodeUTF8([ 0xfffff ], [], [ 0xf3, 0xbf, 0xbf, 0xbf ]))
// U+10FFFE
expectTrue(checkDecodeUTF8([ 0x10fffe ], [], [ 0xf4, 0x8f, 0xbf, 0xbe ]))
// U+10FFFF
expectTrue(checkDecodeUTF8([ 0x10ffff ], [], [ 0xf4, 0x8f, 0xbf, 0xbf ]))
// U+FDD0
expectTrue(checkDecodeUTF8([ 0xfdd0 ], [], [ 0xef, 0xb7, 0x90 ]))
// U+FDD1
expectTrue(checkDecodeUTF8([ 0xfdd1 ], [], [ 0xef, 0xb7, 0x91 ]))
// U+FDD2
expectTrue(checkDecodeUTF8([ 0xfdd2 ], [], [ 0xef, 0xb7, 0x92 ]))
// U+FDD3
expectTrue(checkDecodeUTF8([ 0xfdd3 ], [], [ 0xef, 0xb7, 0x93 ]))
// U+FDD4
expectTrue(checkDecodeUTF8([ 0xfdd4 ], [], [ 0xef, 0xb7, 0x94 ]))
// U+FDD5
expectTrue(checkDecodeUTF8([ 0xfdd5 ], [], [ 0xef, 0xb7, 0x95 ]))
// U+FDD6
expectTrue(checkDecodeUTF8([ 0xfdd6 ], [], [ 0xef, 0xb7, 0x96 ]))
// U+FDD7
expectTrue(checkDecodeUTF8([ 0xfdd7 ], [], [ 0xef, 0xb7, 0x97 ]))
// U+FDD8
expectTrue(checkDecodeUTF8([ 0xfdd8 ], [], [ 0xef, 0xb7, 0x98 ]))
// U+FDD9
expectTrue(checkDecodeUTF8([ 0xfdd9 ], [], [ 0xef, 0xb7, 0x99 ]))
// U+FDDA
expectTrue(checkDecodeUTF8([ 0xfdda ], [], [ 0xef, 0xb7, 0x9a ]))
// U+FDDB
expectTrue(checkDecodeUTF8([ 0xfddb ], [], [ 0xef, 0xb7, 0x9b ]))
// U+FDDC
expectTrue(checkDecodeUTF8([ 0xfddc ], [], [ 0xef, 0xb7, 0x9c ]))
// U+FDDD
expectTrue(checkDecodeUTF8([ 0xfddd ], [], [ 0xef, 0xb7, 0x9d ]))
// U+FDDE
expectTrue(checkDecodeUTF8([ 0xfdde ], [], [ 0xef, 0xb7, 0x9e ]))
// U+FDDF
expectTrue(checkDecodeUTF8([ 0xfddf ], [], [ 0xef, 0xb7, 0x9f ]))
// U+FDE0
expectTrue(checkDecodeUTF8([ 0xfde0 ], [], [ 0xef, 0xb7, 0xa0 ]))
// U+FDE1
expectTrue(checkDecodeUTF8([ 0xfde1 ], [], [ 0xef, 0xb7, 0xa1 ]))
// U+FDE2
expectTrue(checkDecodeUTF8([ 0xfde2 ], [], [ 0xef, 0xb7, 0xa2 ]))
// U+FDE3
expectTrue(checkDecodeUTF8([ 0xfde3 ], [], [ 0xef, 0xb7, 0xa3 ]))
// U+FDE4
expectTrue(checkDecodeUTF8([ 0xfde4 ], [], [ 0xef, 0xb7, 0xa4 ]))
// U+FDE5
expectTrue(checkDecodeUTF8([ 0xfde5 ], [], [ 0xef, 0xb7, 0xa5 ]))
// U+FDE6
expectTrue(checkDecodeUTF8([ 0xfde6 ], [], [ 0xef, 0xb7, 0xa6 ]))
// U+FDE7
expectTrue(checkDecodeUTF8([ 0xfde7 ], [], [ 0xef, 0xb7, 0xa7 ]))
// U+FDE8
expectTrue(checkDecodeUTF8([ 0xfde8 ], [], [ 0xef, 0xb7, 0xa8 ]))
// U+FDE9
expectTrue(checkDecodeUTF8([ 0xfde9 ], [], [ 0xef, 0xb7, 0xa9 ]))
// U+FDEA
expectTrue(checkDecodeUTF8([ 0xfdea ], [], [ 0xef, 0xb7, 0xaa ]))
// U+FDEB
expectTrue(checkDecodeUTF8([ 0xfdeb ], [], [ 0xef, 0xb7, 0xab ]))
// U+FDEC
expectTrue(checkDecodeUTF8([ 0xfdec ], [], [ 0xef, 0xb7, 0xac ]))
// U+FDED
expectTrue(checkDecodeUTF8([ 0xfded ], [], [ 0xef, 0xb7, 0xad ]))
// U+FDEE
expectTrue(checkDecodeUTF8([ 0xfdee ], [], [ 0xef, 0xb7, 0xae ]))
// U+FDEF
expectTrue(checkDecodeUTF8([ 0xfdef ], [], [ 0xef, 0xb7, 0xaf ]))
// U+FDF0
expectTrue(checkDecodeUTF8([ 0xfdf0 ], [], [ 0xef, 0xb7, 0xb0 ]))
// U+FDF1
expectTrue(checkDecodeUTF8([ 0xfdf1 ], [], [ 0xef, 0xb7, 0xb1 ]))
// U+FDF2
expectTrue(checkDecodeUTF8([ 0xfdf2 ], [], [ 0xef, 0xb7, 0xb2 ]))
// U+FDF3
expectTrue(checkDecodeUTF8([ 0xfdf3 ], [], [ 0xef, 0xb7, 0xb3 ]))
// U+FDF4
expectTrue(checkDecodeUTF8([ 0xfdf4 ], [], [ 0xef, 0xb7, 0xb4 ]))
// U+FDF5
expectTrue(checkDecodeUTF8([ 0xfdf5 ], [], [ 0xef, 0xb7, 0xb5 ]))
// U+FDF6
expectTrue(checkDecodeUTF8([ 0xfdf6 ], [], [ 0xef, 0xb7, 0xb6 ]))
// U+FDF7
expectTrue(checkDecodeUTF8([ 0xfdf7 ], [], [ 0xef, 0xb7, 0xb7 ]))
// U+FDF8
expectTrue(checkDecodeUTF8([ 0xfdf8 ], [], [ 0xef, 0xb7, 0xb8 ]))
// U+FDF9
expectTrue(checkDecodeUTF8([ 0xfdf9 ], [], [ 0xef, 0xb7, 0xb9 ]))
// U+FDFA
expectTrue(checkDecodeUTF8([ 0xfdfa ], [], [ 0xef, 0xb7, 0xba ]))
// U+FDFB
expectTrue(checkDecodeUTF8([ 0xfdfb ], [], [ 0xef, 0xb7, 0xbb ]))
// U+FDFC
expectTrue(checkDecodeUTF8([ 0xfdfc ], [], [ 0xef, 0xb7, 0xbc ]))
// U+FDFD
expectTrue(checkDecodeUTF8([ 0xfdfd ], [], [ 0xef, 0xb7, 0xbd ]))
// U+FDFE
expectTrue(checkDecodeUTF8([ 0xfdfe ], [], [ 0xef, 0xb7, 0xbe ]))
// U+FDFF
expectTrue(checkDecodeUTF8([ 0xfdff ], [], [ 0xef, 0xb7, 0xbf ]))
}
var UTF16Decoder = TestSuite("UTF16Decoder")
UTF16Decoder.test("UTF16.transcodedLength") {
do {
let u8: [Unicode.UTF8.CodeUnit] = [ 0, 1, 2, 3, 4, 5 ]
let (count, isASCII) = Unicode.UTF16.transcodedLength(
of: u8.makeIterator(),
decodedAs: Unicode.UTF8.self,
repairingIllFormedSequences: false)!
expectEqual(6, count)
expectTrue(isASCII)
}
do {
// "€" == U+20AC.
let u8: [Unicode.UTF8.CodeUnit] = [ 0xF0, 0xA4, 0xAD, 0xA2 ]
let (count, isASCII) = Unicode.UTF16.transcodedLength(
of: u8.makeIterator(),
decodedAs: Unicode.UTF8.self,
repairingIllFormedSequences: false)!
expectEqual(2, count)
expectFalse(isASCII)
}
do {
let u16: [Unicode.UTF16.CodeUnit] = [ 6, 7, 8, 9, 10, 11 ]
let (count, isASCII) = Unicode.UTF16.transcodedLength(
of: u16.makeIterator(),
decodedAs: Unicode.UTF16.self,
repairingIllFormedSequences: false)!
expectEqual(6, count)
expectTrue(isASCII)
}
}
UTF16Decoder.test("Decoding1").forEach(in: utfTests) {
test in
expectTrue(
checkDecodeUTF16(
test.utf32, test.utf32RepairedTail, test.utf16),
stackTrace: test.loc.withCurrentLoc())
return ()
}
UTF16Decoder.test("Decoding2") {
for (name, batch) in utf16Tests {
print("Batch: \(name)")
for test in batch {
expectTrue(checkDecodeUTF16(test.scalarsHead, test.scalarsRepairedTail,
test.encoded), stackTrace: test.loc.withCurrentLoc())
}
}
}
public struct UTF16Test {
public let scalarsHead: [UInt32]
public let scalarsRepairedTail: [UInt32]
public let encoded: [UInt16]
public let loc: SourceLoc
public init(
_ scalarsHead: [UInt32], _ scalarsRepairedTail: [UInt32],
_ encoded: [UInt16],
file: String = #file, line: UInt = #line
) {
self.scalarsHead = scalarsHead
self.scalarsRepairedTail = scalarsRepairedTail
self.encoded = encoded
self.loc = SourceLoc(file, line, comment: "test data")
}
}
public let utf16Tests = [
"Incomplete": [
//
// Incomplete sequences that end right before EOF.
//
// U+D800 (high-surrogate)
UTF16Test([], [ 0xFFFD ], [ 0xD800 ]),
// U+D800 (high-surrogate)
// U+D800 (high-surrogate)
UTF16Test([], [ 0xFFFD, 0xFFFD ], [ 0xD800, 0xD800 ]),
// U+0041 LATIN CAPITAL LETTER A
// U+D800 (high-surrogate)
UTF16Test([ 0x0041 ], [ 0xFFFD ], [ 0x0041, 0xD800 ]),
// U+10000 LINEAR B SYLLABLE B008 A
// U+D800 (high-surrogate)
UTF16Test(
[ 0x0001_0000 ], [ 0xFFFD ],
[ 0xD800, 0xDC00, 0xD800 ]),
//
// Incomplete sequences with more code units following them.
//
// U+D800 (high-surrogate)
// U+0041 LATIN CAPITAL LETTER A
UTF16Test([], [ 0xFFFD, 0x0041 ], [ 0xD800, 0x0041 ]),
// U+D800 (high-surrogate)
// U+10000 LINEAR B SYLLABLE B008 A
UTF16Test(
[], [ 0xFFFD, 0x0001_0000 ],
[ 0xD800, 0xD800, 0xDC00 ]),
// U+0041 LATIN CAPITAL LETTER A
// U+D800 (high-surrogate)
// U+0041 LATIN CAPITAL LETTER A
UTF16Test(
[ 0x0041 ], [ 0xFFFD, 0x0041 ],
[ 0x0041, 0xD800, 0x0041 ]),
// U+0041 LATIN CAPITAL LETTER A
// U+D800 (high-surrogate)
// U+10000 LINEAR B SYLLABLE B008 A
UTF16Test(
[ 0x0041 ], [ 0xFFFD, 0x0001_0000 ],
[ 0x0041, 0xD800, 0xD800, 0xDC00 ]),
// U+0041 LATIN CAPITAL LETTER A
// U+D800 (high-surrogate)
// U+DB40 (high-surrogate)
// U+0041 LATIN CAPITAL LETTER A
UTF16Test(
[ 0x0041 ], [ 0xFFFD, 0xFFFD, 0x0041 ],
[ 0x0041, 0xD800, 0xDB40, 0x0041 ]),
// U+0041 LATIN CAPITAL LETTER A
// U+D800 (high-surrogate)
// U+DB40 (high-surrogate)
// U+10000 LINEAR B SYLLABLE B008 A
UTF16Test(
[ 0x0041 ], [ 0xFFFD, 0xFFFD, 0x0001_0000 ],
[ 0x0041, 0xD800, 0xDB40, 0xD800, 0xDC00 ]),
// U+0041 LATIN CAPITAL LETTER A
// U+D800 (high-surrogate)
// U+DB40 (high-surrogate)
// U+DBFF (high-surrogate)
// U+0041 LATIN CAPITAL LETTER A
UTF16Test(
[ 0x0041 ], [ 0xFFFD, 0xFFFD, 0xFFFD, 0x0041 ],
[ 0x0041, 0xD800, 0xDB40, 0xDBFF, 0x0041 ]),
// U+0041 LATIN CAPITAL LETTER A
// U+D800 (high-surrogate)
// U+DB40 (high-surrogate)
// U+DBFF (high-surrogate)
// U+10000 LINEAR B SYLLABLE B008 A
UTF16Test(
[ 0x0041 ], [ 0xFFFD, 0xFFFD, 0xFFFD, 0x0001_0000 ],
[ 0x0041, 0xD800, 0xDB40, 0xDBFF, 0xD800, 0xDC00 ]),
],
"IllFormed": [
//
// Low-surrogate right before EOF.
//
// U+DC00 (low-surrogate)
UTF16Test([], [ 0xFFFD ], [ 0xDC00 ]),
// U+DC00 (low-surrogate)
// U+DC00 (low-surrogate)
UTF16Test([], [ 0xFFFD, 0xFFFD ], [ 0xDC00, 0xDC00 ]),
// U+0041 LATIN CAPITAL LETTER A
// U+DC00 (low-surrogate)
UTF16Test([ 0x0041 ], [ 0xFFFD ], [ 0x0041, 0xDC00 ]),
// U+10000 LINEAR B SYLLABLE B008 A
// U+DC00 (low-surrogate)
UTF16Test(
[ 0x0001_0000 ], [ 0xFFFD ],
[ 0xD800, 0xDC00, 0xDC00 ]),
//
// Low-surrogate with more code units following it.
//
// U+DC00 (low-surrogate)
// U+0041 LATIN CAPITAL LETTER A
UTF16Test([], [ 0xFFFD, 0x0041 ], [ 0xDC00, 0x0041 ]),
// U+DC00 (low-surrogate)
// U+10000 LINEAR B SYLLABLE B008 A
UTF16Test(
[], [ 0xFFFD, 0x0001_0000 ],
[ 0xDC00, 0xD800, 0xDC00 ]),