blob: 0cd324d44f3ca348f4cb80cc3000dc014bba6506 [file] [log] [blame]
//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//
import StdlibUnittest
/// One row of a collation table: a sequence of Unicode scalars together with
/// the collation elements assigned to that sequence.
struct CollationTableEntry {
  /// The scalars this entry applies to.
  let scalars: [Unicode.Scalar]
  /// The collation elements for `scalars`, each packed into a `UInt64`.
  let collationElements: [UInt64]
  /// Free-form description of the entry (the table stores character names).
  let comment: String

  /// Creates an entry from raw code point values.
  ///
  /// - Parameters:
  ///   - scalars: Code point values; each must be a valid Unicode scalar
  ///     value, otherwise initialization traps.
  ///   - collationElements: The collation elements, stored unchanged.
  ///   - comment: Description of the entry, stored unchanged.
  init(
    _ scalars: [UInt32],
    _ collationElements: [UInt64],
    _ comment: String
  ) {
    var decoded: [Unicode.Scalar] = []
    for codePoint in scalars {
      // An invalid code point in the table is a programmer error, so the
      // force-unwrap is intentional.
      decoded.append(Unicode.Scalar(codePoint)!)
    }
    self.comment = comment
    self.collationElements = collationElements
    self.scalars = decoded
  }
}
/// An excerpt from the DUCET (Default Unicode Collation Element Table).
///
/// The data was extracted from
/// http://www.unicode.org/Public/UCA/9.0.0/allkeys.txt.
///
/// Each entry lists the scalar(s) it applies to, the collation elements for
/// those scalars, and the character name as a comment.
///
/// A collation element is packed into a `UInt64` and written here as
/// `0xPPPP_SSSS_TTTT`: the 16-bit weight in bits 32...47 is the first-level
/// weight, bits 16...31 hold the second-level weight and bits 0...15 the
/// third-level weight.  This is the layout that
/// `sortKey(forCollationElements:)` unpacks.
///
/// An entry whose first collation element is `0x0000_0000_0000` is treated as
/// completely ignorable: `String._collationElements` contributes nothing for
/// such scalars.
let ducetExtractData: [CollationTableEntry] = [
CollationTableEntry([0x00], [0x0000_0000_0000], "NULL"),
CollationTableEntry([0x01], [0x0000_0000_0000], "START OF HEADING"),
CollationTableEntry([0x02], [0x0000_0000_0000], "START OF TEXT"),
CollationTableEntry([0x03], [0x0000_0000_0000], "END OF TEXT"),
CollationTableEntry([0x04], [0x0000_0000_0000], "END OF TRANSMISSION"),
CollationTableEntry([0x05], [0x0000_0000_0000], "ENQUIRY"),
CollationTableEntry([0x06], [0x0000_0000_0000], "ACKNOWLEDGE"),
CollationTableEntry([0x07], [0x0000_0000_0000], "BELL"),
CollationTableEntry([0x08], [0x0000_0000_0000], "BACKSPACE"),
CollationTableEntry([0x09], [0x0201_0020_0002], "HORIZONTAL TABULATION"),
CollationTableEntry([0x0A], [0x0202_0020_0002], "LINE FEED"),
CollationTableEntry([0x0B], [0x0203_0020_0002], "VERTICAL TABULATION"),
CollationTableEntry([0x0C], [0x0204_0020_0002], "FORM FEED"),
CollationTableEntry([0x0D], [0x0205_0020_0002], "CARRIAGE RETURN"),
CollationTableEntry([0x0E], [0x0000_0000_0000], "SHIFT OUT"),
CollationTableEntry([0x0F], [0x0000_0000_0000], "SHIFT IN"),
CollationTableEntry([0x10], [0x0000_0000_0000], "DATA LINK ESCAPE"),
CollationTableEntry([0x11], [0x0000_0000_0000], "DEVICE CONTROL ONE"),
CollationTableEntry([0x12], [0x0000_0000_0000], "DEVICE CONTROL TWO"),
CollationTableEntry([0x13], [0x0000_0000_0000], "DEVICE CONTROL THREE"),
CollationTableEntry([0x14], [0x0000_0000_0000], "DEVICE CONTROL FOUR"),
CollationTableEntry([0x15], [0x0000_0000_0000], "NEGATIVE ACKNOWLEDGE"),
CollationTableEntry([0x16], [0x0000_0000_0000], "SYNCHRONOUS IDLE"),
CollationTableEntry([0x17], [0x0000_0000_0000], "END OF TRANSMISSION BLOCK"),
CollationTableEntry([0x18], [0x0000_0000_0000], "CANCEL"),
CollationTableEntry([0x19], [0x0000_0000_0000], "END OF MEDIUM"),
CollationTableEntry([0x1A], [0x0000_0000_0000], "SUBSTITUTE"),
CollationTableEntry([0x1B], [0x0000_0000_0000], "ESCAPE"),
CollationTableEntry([0x1C], [0x0000_0000_0000], "FILE SEPARATOR"),
CollationTableEntry([0x1D], [0x0000_0000_0000], "GROUP SEPARATOR"),
CollationTableEntry([0x1E], [0x0000_0000_0000], "RECORD SEPARATOR"),
CollationTableEntry([0x1F], [0x0000_0000_0000], "UNIT SEPARATOR"),
CollationTableEntry([0x20], [0x0209_0020_0002], "SPACE"),
CollationTableEntry([0x21], [0x0260_0020_0002], "EXCLAMATION MARK"),
CollationTableEntry([0x22], [0x030C_0020_0002], "QUOTATION MARK"),
CollationTableEntry([0x23], [0x0398_0020_0002], "NUMBER SIGN"),
CollationTableEntry([0x24], [0x1C12_0020_0002], "DOLLAR SIGN"),
CollationTableEntry([0x25], [0x0399_0020_0002], "PERCENT SIGN"),
CollationTableEntry([0x26], [0x0396_0020_0002], "AMPERSAND"),
CollationTableEntry([0x27], [0x0305_0020_0002], "APOSTROPHE"),
CollationTableEntry([0x28], [0x0317_0020_0002], "LEFT PARENTHESIS"),
CollationTableEntry([0x29], [0x0318_0020_0002], "RIGHT PARENTHESIS"),
CollationTableEntry([0x2A], [0x038F_0020_0002], "ASTERISK"),
CollationTableEntry([0x2B], [0x0616_0020_0002], "PLUS SIGN"),
CollationTableEntry([0x2C], [0x0222_0020_0002], "COMMA"),
CollationTableEntry([0x2D], [0x020D_0020_0002], "HYPHEN-MINUS"),
CollationTableEntry([0x2E], [0x0277_0020_0002], "FULL STOP"),
CollationTableEntry([0x2F], [0x0394_0020_0002], "SOLIDUS"),
CollationTableEntry([0x30], [0x1C3D_0020_0002], "DIGIT ZERO"),
CollationTableEntry([0x31], [0x1C3E_0020_0002], "DIGIT ONE"),
CollationTableEntry([0x32], [0x1C3F_0020_0002], "DIGIT TWO"),
CollationTableEntry([0x33], [0x1C40_0020_0002], "DIGIT THREE"),
CollationTableEntry([0x34], [0x1C41_0020_0002], "DIGIT FOUR"),
CollationTableEntry([0x35], [0x1C42_0020_0002], "DIGIT FIVE"),
CollationTableEntry([0x36], [0x1C43_0020_0002], "DIGIT SIX"),
CollationTableEntry([0x37], [0x1C44_0020_0002], "DIGIT SEVEN"),
CollationTableEntry([0x38], [0x1C45_0020_0002], "DIGIT EIGHT"),
CollationTableEntry([0x39], [0x1C46_0020_0002], "DIGIT NINE"),
CollationTableEntry([0x3A], [0x0239_0020_0002], "COLON"),
CollationTableEntry([0x3B], [0x0234_0020_0002], "SEMICOLON"),
CollationTableEntry([0x3C], [0x061A_0020_0002], "LESS-THAN SIGN"),
CollationTableEntry([0x3D], [0x061B_0020_0002], "EQUALS SIGN"),
CollationTableEntry([0x3E], [0x061C_0020_0002], "GREATER-THAN SIGN"),
CollationTableEntry([0x3F], [0x0266_0020_0002], "QUESTION MARK"),
CollationTableEntry([0x40], [0x038E_0020_0002], "COMMERCIAL AT"),
CollationTableEntry([0x41], [0x1C47_0020_0008], "LATIN CAPITAL LETTER A"),
CollationTableEntry([0x42], [0x1C60_0020_0008], "LATIN CAPITAL LETTER B"),
CollationTableEntry([0x43], [0x1C7A_0020_0008], "LATIN CAPITAL LETTER C"),
CollationTableEntry([0x44], [0x1C8F_0020_0008], "LATIN CAPITAL LETTER D"),
CollationTableEntry([0x45], [0x1CAA_0020_0008], "LATIN CAPITAL LETTER E"),
CollationTableEntry([0x46], [0x1CE5_0020_0008], "LATIN CAPITAL LETTER F"),
CollationTableEntry([0x47], [0x1CF4_0020_0008], "LATIN CAPITAL LETTER G"),
CollationTableEntry([0x48], [0x1D18_0020_0008], "LATIN CAPITAL LETTER H"),
CollationTableEntry([0x49], [0x1D32_0020_0008], "LATIN CAPITAL LETTER I"),
CollationTableEntry([0x4A], [0x1D4C_0020_0008], "LATIN CAPITAL LETTER J"),
CollationTableEntry([0x4B], [0x1D65_0020_0008], "LATIN CAPITAL LETTER K"),
CollationTableEntry([0x4C], [0x1D77_0020_0008], "LATIN CAPITAL LETTER L"),
CollationTableEntry([0x4D], [0x1DAA_0020_0008], "LATIN CAPITAL LETTER M"),
CollationTableEntry([0x4E], [0x1DB9_0020_0008], "LATIN CAPITAL LETTER N"),
CollationTableEntry([0x4F], [0x1DDD_0020_0008], "LATIN CAPITAL LETTER O"),
CollationTableEntry([0x50], [0x1E0C_0020_0008], "LATIN CAPITAL LETTER P"),
CollationTableEntry([0x51], [0x1E21_0020_0008], "LATIN CAPITAL LETTER Q"),
CollationTableEntry([0x52], [0x1E33_0020_0008], "LATIN CAPITAL LETTER R"),
CollationTableEntry([0x53], [0x1E71_0020_0008], "LATIN CAPITAL LETTER S"),
CollationTableEntry([0x54], [0x1E95_0020_0008], "LATIN CAPITAL LETTER T"),
CollationTableEntry([0x55], [0x1EB5_0020_0008], "LATIN CAPITAL LETTER U"),
CollationTableEntry([0x56], [0x1EE3_0020_0008], "LATIN CAPITAL LETTER V"),
CollationTableEntry([0x57], [0x1EF5_0020_0008], "LATIN CAPITAL LETTER W"),
CollationTableEntry([0x58], [0x1EFF_0020_0008], "LATIN CAPITAL LETTER X"),
CollationTableEntry([0x59], [0x1F0B_0020_0008], "LATIN CAPITAL LETTER Y"),
CollationTableEntry([0x5A], [0x1F21_0020_0008], "LATIN CAPITAL LETTER Z"),
CollationTableEntry([0x5B], [0x0319_0020_0002], "LEFT SQUARE BRACKET"),
CollationTableEntry([0x5C], [0x0395_0020_0002], "REVERSE SOLIDUS"),
CollationTableEntry([0x5D], [0x031A_0020_0002], "RIGHT SQUARE BRACKET"),
CollationTableEntry([0x5E], [0x0485_0020_0002], "CIRCUMFLEX ACCENT"),
CollationTableEntry([0x5F], [0x020B_0020_0002], "LOW LINE"),
CollationTableEntry([0x60], [0x0482_0020_0002], "GRAVE ACCENT"),
CollationTableEntry([0x61], [0x1C47_0020_0002], "LATIN SMALL LETTER A"),
CollationTableEntry([0x62], [0x1C60_0020_0002], "LATIN SMALL LETTER B"),
CollationTableEntry([0x63], [0x1C7A_0020_0002], "LATIN SMALL LETTER C"),
CollationTableEntry([0x64], [0x1C8F_0020_0002], "LATIN SMALL LETTER D"),
CollationTableEntry([0x65], [0x1CAA_0020_0002], "LATIN SMALL LETTER E"),
CollationTableEntry([0x66], [0x1CE5_0020_0002], "LATIN SMALL LETTER F"),
CollationTableEntry([0x67], [0x1CF4_0020_0002], "LATIN SMALL LETTER G"),
CollationTableEntry([0x68], [0x1D18_0020_0002], "LATIN SMALL LETTER H"),
CollationTableEntry([0x69], [0x1D32_0020_0002], "LATIN SMALL LETTER I"),
CollationTableEntry([0x6A], [0x1D4C_0020_0002], "LATIN SMALL LETTER J"),
CollationTableEntry([0x6B], [0x1D65_0020_0002], "LATIN SMALL LETTER K"),
CollationTableEntry([0x6C], [0x1D77_0020_0002], "LATIN SMALL LETTER L"),
CollationTableEntry([0x6D], [0x1DAA_0020_0002], "LATIN SMALL LETTER M"),
CollationTableEntry([0x6E], [0x1DB9_0020_0002], "LATIN SMALL LETTER N"),
CollationTableEntry([0x6F], [0x1DDD_0020_0002], "LATIN SMALL LETTER O"),
CollationTableEntry([0x70], [0x1E0C_0020_0002], "LATIN SMALL LETTER P"),
CollationTableEntry([0x71], [0x1E21_0020_0002], "LATIN SMALL LETTER Q"),
CollationTableEntry([0x72], [0x1E33_0020_0002], "LATIN SMALL LETTER R"),
CollationTableEntry([0x73], [0x1E71_0020_0002], "LATIN SMALL LETTER S"),
CollationTableEntry([0x74], [0x1E95_0020_0002], "LATIN SMALL LETTER T"),
CollationTableEntry([0x75], [0x1EB5_0020_0002], "LATIN SMALL LETTER U"),
CollationTableEntry([0x76], [0x1EE3_0020_0002], "LATIN SMALL LETTER V"),
CollationTableEntry([0x77], [0x1EF5_0020_0002], "LATIN SMALL LETTER W"),
CollationTableEntry([0x78], [0x1EFF_0020_0002], "LATIN SMALL LETTER X"),
CollationTableEntry([0x79], [0x1F0B_0020_0002], "LATIN SMALL LETTER Y"),
CollationTableEntry([0x7A], [0x1F21_0020_0002], "LATIN SMALL LETTER Z"),
CollationTableEntry([0x7B], [0x031B_0020_0002], "LEFT CURLY BRACKET"),
CollationTableEntry([0x7C], [0x061E_0020_0002], "VERTICAL LINE"),
CollationTableEntry([0x7D], [0x031C_0020_0002], "RIGHT CURLY BRACKET"),
CollationTableEntry([0x7E], [0x0620_0020_0002], "TILDE"),
CollationTableEntry([0x7F], [0x0000_0000_0000], "DELETE"),
// When String starts to use Latin-1 as one of its in-memory representations,
// this table should be extended to cover all scalars in U+0080 ... U+00FF.
CollationTableEntry([0x80], [0x0000_0000_0000], "<control>"),
CollationTableEntry([0xE1], [0x1C47_0020_0002, 0x0000_0024_0002], "LATIN SMALL LETTER A WITH ACUTE"),
CollationTableEntry([0xE2], [0x1C47_0020_0002, 0x0000_0027_0002], "LATIN SMALL LETTER A WITH CIRCUMFLEX"),
CollationTableEntry([0xFF], [0x1F0B_0020_0002, 0x0000_002B_0002], "LATIN SMALL LETTER Y WITH DIAERESIS"),
CollationTableEntry([0x3041], [0x3D5A_0020_000D], "HIRAGANA LETTER SMALL A"),
CollationTableEntry([0x3042], [0x3D5A_0020_000E], "HIRAGANA LETTER A"),
CollationTableEntry([0x30A1], [0x3D5A_0020_000F], "KATAKANA LETTER SMALL A"),
CollationTableEntry([0xFF67], [0x3D5A_0020_0010], "HALFWIDTH KATAKANA LETTER SMALL A"),
CollationTableEntry([0x30A2], [0x3D5A_0020_0011], "KATAKANA LETTER A"),
CollationTableEntry([0xFF71], [0x3D5A_0020_0012], "HALFWIDTH KATAKANA LETTER A"),
CollationTableEntry([0xFE00], [0x0000_0000_0000], "VARIATION SELECTOR-1"),
CollationTableEntry([0xFE01], [0x0000_0000_0000], "VARIATION SELECTOR-2"),
CollationTableEntry([0xE01EE], [0x0000_0000_0000], "VARIATION SELECTOR-255"),
CollationTableEntry([0xE01EF], [0x0000_0000_0000], "VARIATION SELECTOR-256"),
]
/// Lookup table over `ducetExtractData`, keyed by each entry's scalar
/// sequence.
///
/// If two entries share the same scalar sequence, the one that appears later
/// in `ducetExtractData` wins.
let ducetExtract: [[Unicode.Scalar]: CollationTableEntry] =
  ducetExtractData.reduce(into: [:]) { table, entry in
    table[entry.scalars] = entry
  }
extension String {
  /// Calculate collation elements for trivial cases.
  ///
  /// Warning: this implementation does not conform to Unicode TR10!
  /// It is a gross oversimplification that is only used to reduce the repetition
  /// of test inputs in this file. Among other things, this algorithm does not
  /// handle contractions in the collation table, does not perform string
  /// normalization, does not synthesize derived collation weights etc.
  ///
  /// Every scalar of the string is looked up on its own in `ducetExtract`.
  /// Entries whose first collation element is zero (completely ignorable
  /// scalars) contribute nothing to the result; all other entries contribute
  /// their collation elements in order.
  ///
  /// - Precondition: every scalar of the string has an entry in
  ///   `ducetExtract`.  A missing entry stops the program with a message
  ///   naming the offending scalar.
  internal var _collationElements: [UInt64] {
    var result: [UInt64] = []
    for scalar in unicodeScalars {
      guard let entry = ducetExtract[[scalar]] else {
        // Name the scalar so that whoever added the test input knows which
        // row is missing from the table.
        let codePoint = String(scalar.value, radix: 16, uppercase: true)
        fatalError(
          "No entry for U+\(codePoint) in ducetExtractData; "
          + "add one before using this scalar in test inputs")
      }
      let elements = entry.collationElements
      // `first` rather than `[0]`: an entry without collation elements has
      // nothing to contribute and must not trap.
      if let leading = elements.first, leading != 0 {
        result += elements
      }
    }
    return result
  }
}
/// A string together with the collation elements used to decide where it
/// sorts relative to other test strings.
public struct StringComparisonTest {
  /// The string under test.
  public let string: String
  /// The collation elements associated with `string`.
  public let collationElements: [UInt64]
  /// Where this test case was defined.
  public let loc: SourceLoc
  /// Ordering slot for this test; starts out `nil` because no initializer
  /// assigns it.
  public var order: Int?

  /// Creates a test whose collation elements are derived from `string` via
  /// `String._collationElements`.
  ///
  /// - Parameters:
  ///   - string: The string under test.
  ///   - inferCollationElements: Marker argument that selects this
  ///     initializer; pass `()`.
  ///   - file: File recorded in `loc`; defaults to the caller's file.
  ///   - line: Line recorded in `loc`; defaults to the caller's line.
  public init(
    _ string: String,
    inferCollationElements: Void,
    file: String = #file, line: UInt = #line
  ) {
    self.init(string, string._collationElements, file: file, line: line)
  }

  /// Creates a test from explicit collation elements and an already built
  /// source location.
  public init(
    _ string: String,
    _ collationElements: [UInt64],
    sourceLocation: SourceLoc
  ) {
    self.loc = sourceLocation
    self.collationElements = collationElements
    self.string = string
  }

  /// Creates a test from explicit collation elements, recording the given
  /// file and line (the caller's, by default) as its source location.
  public init(
    _ string: String,
    _ collationElements: [UInt64],
    file: String = #file, line: UInt = #line
  ) {
    let location = SourceLoc(file, line, comment: "test data")
    self.init(string, collationElements, sourceLocation: location)
  }

  /// One test per `ducetExtractData` entry, skipping the entries whose first
  /// collation element is zero.
  public static let testsFromDUCET: [StringComparisonTest] = {
    var tests: [StringComparisonTest] = []
    for entry in ducetExtractData where entry.collationElements[0] != 0 {
      // Build the string directly from the entry's scalars.
      let text = String(String.UnicodeScalarView(entry.scalars))
      tests.append(StringComparisonTest(text, entry.collationElements))
    }
    return tests
  }()

  /// Hand-written tests, mostly covering completely ignorable scalars placed
  /// around and between other scalars.
  public static let hardcodedTests: [StringComparisonTest] = [
    StringComparisonTest("", inferCollationElements: ()),

    // Completely ignorable characters in ASCII strings.
    StringComparisonTest("\u{00}\u{61}", inferCollationElements: ()),
    StringComparisonTest("\u{61}\u{00}", inferCollationElements: ()),
    StringComparisonTest("\u{00}\u{61}\u{00}", inferCollationElements: ()),
    StringComparisonTest("\u{61}\u{00}\u{62}", inferCollationElements: ()),

    // Completely ignorable characters in Latin-1 strings.
    StringComparisonTest("\u{00}\u{E1}", inferCollationElements: ()),
    StringComparisonTest("\u{E1}\u{00}", inferCollationElements: ()),
    StringComparisonTest("\u{00}\u{E1}\u{00}", inferCollationElements: ()),
    StringComparisonTest("\u{E1}\u{00}\u{E2}", inferCollationElements: ()),

    // Completely ignorable characters in non-Latin-1 strings.
    StringComparisonTest("\u{0000}\u{3041}", inferCollationElements: ()),
    StringComparisonTest("\u{3041}\u{0000}", inferCollationElements: ()),
    StringComparisonTest("\u{0000}\u{3041}\u{0000}", inferCollationElements: ()),
    StringComparisonTest("\u{3041}\u{0000}\u{3042}", inferCollationElements: ()),
    StringComparisonTest("\u{FE00}\u{3041}", inferCollationElements: ()),
    StringComparisonTest("\u{3041}\u{FE00}", inferCollationElements: ()),
    StringComparisonTest("\u{FE00}\u{3041}\u{FE00}", inferCollationElements: ()),
    StringComparisonTest("\u{3041}\u{FE00}\u{3042}", inferCollationElements: ()),
    StringComparisonTest("\u{E01EF}\u{3041}", inferCollationElements: ()),
    StringComparisonTest("\u{03041}\u{E01EF}", inferCollationElements: ()),
    StringComparisonTest("\u{E01EF}\u{03041}\u{E01EF}", inferCollationElements: ()),
    StringComparisonTest("\u{03041}\u{E01EF}\u{03042}", inferCollationElements: ()),

    // U+0061 LATIN SMALL LETTER A
    // U+0301 COMBINING ACUTE ACCENT
    // U+00E1 LATIN SMALL LETTER A WITH ACUTE
    StringComparisonTest("\u{61}\u{301}", "\u{E1}"._collationElements),
  ]

  /// All tests: the table-derived ones followed by the hand-written ones.
  public static let allTests: [StringComparisonTest] =
    testsFromDUCET + hardcodedTests
}
/// Builds a three-level sort key from packed collation elements.
///
/// Each collation element carries three 16-bit weights: bits 32...47 hold the
/// first-level weight, bits 16...31 the second-level weight and bits 0...15
/// the third-level weight.  For every level the weights of all elements are
/// collected in order.
///
/// Zero weights are left out of the key, as the Unicode Collation Algorithm
/// requires when forming sort keys.  An element such as `0x0000_0024_0002`
/// (an accent) therefore contributes to the second and third levels only and
/// cannot disturb the first-level comparison of the letters around it.
///
/// - Parameter ces: The collation elements, in string order.
/// - Returns: The first-, second- and third-level weight sequences.
public func sortKey(forCollationElements ces: [UInt64]) -> ([UInt16], [UInt16], [UInt16]) {
  /// Non-zero 16-bit weights found `shift` bits up in each element.
  func weights(shiftedBy shift: UInt64) -> [UInt16] {
    var level: [UInt16] = []
    for ce in ces {
      let weight = UInt16(truncatingIfNeeded: ce &>> shift)
      if weight != 0 {
        level.append(weight)
      }
    }
    return level
  }

  return (
    weights(shiftedBy: 32),
    weights(shiftedBy: 16),
    weights(shiftedBy: 0))
}
/// Reports whether `lhs` orders strictly before `rhs`.
///
/// Both sequences are converted with `sortKey(forCollationElements:)` and the
/// resulting keys are compared level by level: the first level on which the
/// keys differ decides the result by lexicographic comparison.  If the first
/// two levels are equal, the third level decides.
///
/// - Parameters:
///   - lhs: Collation elements of the left-hand string.
///   - rhs: Collation elements of the right-hand string.
/// - Returns: `true` if `lhs` sorts before `rhs`; `false` otherwise,
///   including when all three levels are equal.
public func collationElements(
  _ lhs: [UInt64], areLessThan rhs: [UInt64]
) -> Bool {
  let (lhsFirst, lhsSecond, lhsThird) = sortKey(forCollationElements: lhs)
  let (rhsFirst, rhsSecond, rhsThird) = sortKey(forCollationElements: rhs)

  guard lhsFirst == rhsFirst else {
    return lhsFirst.lexicographicallyPrecedes(rhsFirst)
  }
  guard lhsSecond == rhsSecond else {
    return lhsSecond.lexicographicallyPrecedes(rhsSecond)
  }
  return lhsThird.lexicographicallyPrecedes(rhsThird)
}