| //===----------------------------------------------------------------------===// |
| // |
| // This source file is part of the Swift.org open source project |
| // |
| // Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors |
| // Licensed under Apache License v2.0 with Runtime Library Exception |
| // |
| // See https://swift.org/LICENSE.txt for license information |
| // See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors |
| // |
| //===----------------------------------------------------------------------===// |
| |
| import StdlibUnittest |
| |
/// One row of a collation table: a scalar sequence, the collation elements
/// assigned to it, and a human-readable description.
struct CollationTableEntry {
  /// The Unicode scalars this entry is keyed by.
  let scalars: [Unicode.Scalar]

  /// The collation elements assigned to `scalars`.
  let collationElements: [UInt64]

  /// Free-form description of the entry (the tables in this file use the
  /// character name).
  let comment: String

  /// Creates an entry from raw scalar values.
  ///
  /// Traps if any value in `scalarValues` is not a valid Unicode scalar
  /// value.
  init(
    _ scalarValues: [UInt32],
    _ collationElements: [UInt64],
    _ comment: String
  ) {
    var decoded: [Unicode.Scalar] = []
    decoded.reserveCapacity(scalarValues.count)
    for value in scalarValues {
      // An invalid value is a mistake in the test data, so crash right away.
      decoded.append(Unicode.Scalar(value)!)
    }
    self.scalars = decoded
    self.collationElements = collationElements
    self.comment = comment
  }
}
| |
/// A hand-picked subset of the DUCET (Default Unicode Collation Element
/// Table).
///
/// The rows were taken from
/// https://www.unicode.org/Public/UCA/9.0.0/allkeys.txt.
///
/// Every collation element is written as `0xPPPP_SSSS_TTTT`; `sortKey`
/// further down in this file reads those three 16-bit groups as the level 1,
/// level 2 and level 3 weights.  Rows whose first element is
/// `0x0000_0000_0000` are the ones that `String._collationElements` and
/// `StringComparisonTest.testsFromDUCET` leave out.
let ducetExtractData: [CollationTableEntry] = [
  // U+0000 ... U+001F: control characters.
  CollationTableEntry([0x00], [0x0000_0000_0000], "NULL"),
  CollationTableEntry([0x01], [0x0000_0000_0000], "START OF HEADING"),
  CollationTableEntry([0x02], [0x0000_0000_0000], "START OF TEXT"),
  CollationTableEntry([0x03], [0x0000_0000_0000], "END OF TEXT"),
  CollationTableEntry([0x04], [0x0000_0000_0000], "END OF TRANSMISSION"),
  CollationTableEntry([0x05], [0x0000_0000_0000], "ENQUIRY"),
  CollationTableEntry([0x06], [0x0000_0000_0000], "ACKNOWLEDGE"),
  CollationTableEntry([0x07], [0x0000_0000_0000], "BELL"),
  CollationTableEntry([0x08], [0x0000_0000_0000], "BACKSPACE"),
  CollationTableEntry([0x09], [0x0201_0020_0002], "HORIZONTAL TABULATION"),
  CollationTableEntry([0x0A], [0x0202_0020_0002], "LINE FEED"),
  CollationTableEntry([0x0B], [0x0203_0020_0002], "VERTICAL TABULATION"),
  CollationTableEntry([0x0C], [0x0204_0020_0002], "FORM FEED"),
  CollationTableEntry([0x0D], [0x0205_0020_0002], "CARRIAGE RETURN"),
  CollationTableEntry([0x0E], [0x0000_0000_0000], "SHIFT OUT"),
  CollationTableEntry([0x0F], [0x0000_0000_0000], "SHIFT IN"),
  CollationTableEntry([0x10], [0x0000_0000_0000], "DATA LINK ESCAPE"),
  CollationTableEntry([0x11], [0x0000_0000_0000], "DEVICE CONTROL ONE"),
  CollationTableEntry([0x12], [0x0000_0000_0000], "DEVICE CONTROL TWO"),
  CollationTableEntry([0x13], [0x0000_0000_0000], "DEVICE CONTROL THREE"),
  CollationTableEntry([0x14], [0x0000_0000_0000], "DEVICE CONTROL FOUR"),
  CollationTableEntry([0x15], [0x0000_0000_0000], "NEGATIVE ACKNOWLEDGE"),
  CollationTableEntry([0x16], [0x0000_0000_0000], "SYNCHRONOUS IDLE"),
  CollationTableEntry([0x17], [0x0000_0000_0000], "END OF TRANSMISSION BLOCK"),
  CollationTableEntry([0x18], [0x0000_0000_0000], "CANCEL"),
  CollationTableEntry([0x19], [0x0000_0000_0000], "END OF MEDIUM"),
  CollationTableEntry([0x1A], [0x0000_0000_0000], "SUBSTITUTE"),
  CollationTableEntry([0x1B], [0x0000_0000_0000], "ESCAPE"),
  CollationTableEntry([0x1C], [0x0000_0000_0000], "FILE SEPARATOR"),
  CollationTableEntry([0x1D], [0x0000_0000_0000], "GROUP SEPARATOR"),
  CollationTableEntry([0x1E], [0x0000_0000_0000], "RECORD SEPARATOR"),
  CollationTableEntry([0x1F], [0x0000_0000_0000], "UNIT SEPARATOR"),

  // U+0020 ... U+0040: space, punctuation, symbols and digits.
  CollationTableEntry([0x20], [0x0209_0020_0002], "SPACE"),
  CollationTableEntry([0x21], [0x0260_0020_0002], "EXCLAMATION MARK"),
  CollationTableEntry([0x22], [0x030C_0020_0002], "QUOTATION MARK"),
  CollationTableEntry([0x23], [0x0398_0020_0002], "NUMBER SIGN"),
  CollationTableEntry([0x24], [0x1C12_0020_0002], "DOLLAR SIGN"),
  CollationTableEntry([0x25], [0x0399_0020_0002], "PERCENT SIGN"),
  CollationTableEntry([0x26], [0x0396_0020_0002], "AMPERSAND"),
  CollationTableEntry([0x27], [0x0305_0020_0002], "APOSTROPHE"),
  CollationTableEntry([0x28], [0x0317_0020_0002], "LEFT PARENTHESIS"),
  CollationTableEntry([0x29], [0x0318_0020_0002], "RIGHT PARENTHESIS"),
  CollationTableEntry([0x2A], [0x038F_0020_0002], "ASTERISK"),
  CollationTableEntry([0x2B], [0x0616_0020_0002], "PLUS SIGN"),
  CollationTableEntry([0x2C], [0x0222_0020_0002], "COMMA"),
  CollationTableEntry([0x2D], [0x020D_0020_0002], "HYPHEN-MINUS"),
  CollationTableEntry([0x2E], [0x0277_0020_0002], "FULL STOP"),
  CollationTableEntry([0x2F], [0x0394_0020_0002], "SOLIDUS"),
  CollationTableEntry([0x30], [0x1C3D_0020_0002], "DIGIT ZERO"),
  CollationTableEntry([0x31], [0x1C3E_0020_0002], "DIGIT ONE"),
  CollationTableEntry([0x32], [0x1C3F_0020_0002], "DIGIT TWO"),
  CollationTableEntry([0x33], [0x1C40_0020_0002], "DIGIT THREE"),
  CollationTableEntry([0x34], [0x1C41_0020_0002], "DIGIT FOUR"),
  CollationTableEntry([0x35], [0x1C42_0020_0002], "DIGIT FIVE"),
  CollationTableEntry([0x36], [0x1C43_0020_0002], "DIGIT SIX"),
  CollationTableEntry([0x37], [0x1C44_0020_0002], "DIGIT SEVEN"),
  CollationTableEntry([0x38], [0x1C45_0020_0002], "DIGIT EIGHT"),
  CollationTableEntry([0x39], [0x1C46_0020_0002], "DIGIT NINE"),
  CollationTableEntry([0x3A], [0x0239_0020_0002], "COLON"),
  CollationTableEntry([0x3B], [0x0234_0020_0002], "SEMICOLON"),
  CollationTableEntry([0x3C], [0x061A_0020_0002], "LESS-THAN SIGN"),
  CollationTableEntry([0x3D], [0x061B_0020_0002], "EQUALS SIGN"),
  CollationTableEntry([0x3E], [0x061C_0020_0002], "GREATER-THAN SIGN"),
  CollationTableEntry([0x3F], [0x0266_0020_0002], "QUESTION MARK"),
  CollationTableEntry([0x40], [0x038E_0020_0002], "COMMERCIAL AT"),

  // U+0041 ... U+005A: capital letters.  Each one shares its first two
  // weights with the matching small letter below and differs only in the
  // last group (0008 versus 0002).
  CollationTableEntry([0x41], [0x1C47_0020_0008], "LATIN CAPITAL LETTER A"),
  CollationTableEntry([0x42], [0x1C60_0020_0008], "LATIN CAPITAL LETTER B"),
  CollationTableEntry([0x43], [0x1C7A_0020_0008], "LATIN CAPITAL LETTER C"),
  CollationTableEntry([0x44], [0x1C8F_0020_0008], "LATIN CAPITAL LETTER D"),
  CollationTableEntry([0x45], [0x1CAA_0020_0008], "LATIN CAPITAL LETTER E"),
  CollationTableEntry([0x46], [0x1CE5_0020_0008], "LATIN CAPITAL LETTER F"),
  CollationTableEntry([0x47], [0x1CF4_0020_0008], "LATIN CAPITAL LETTER G"),
  CollationTableEntry([0x48], [0x1D18_0020_0008], "LATIN CAPITAL LETTER H"),
  CollationTableEntry([0x49], [0x1D32_0020_0008], "LATIN CAPITAL LETTER I"),
  CollationTableEntry([0x4A], [0x1D4C_0020_0008], "LATIN CAPITAL LETTER J"),
  CollationTableEntry([0x4B], [0x1D65_0020_0008], "LATIN CAPITAL LETTER K"),
  CollationTableEntry([0x4C], [0x1D77_0020_0008], "LATIN CAPITAL LETTER L"),
  CollationTableEntry([0x4D], [0x1DAA_0020_0008], "LATIN CAPITAL LETTER M"),
  CollationTableEntry([0x4E], [0x1DB9_0020_0008], "LATIN CAPITAL LETTER N"),
  CollationTableEntry([0x4F], [0x1DDD_0020_0008], "LATIN CAPITAL LETTER O"),
  CollationTableEntry([0x50], [0x1E0C_0020_0008], "LATIN CAPITAL LETTER P"),
  CollationTableEntry([0x51], [0x1E21_0020_0008], "LATIN CAPITAL LETTER Q"),
  CollationTableEntry([0x52], [0x1E33_0020_0008], "LATIN CAPITAL LETTER R"),
  CollationTableEntry([0x53], [0x1E71_0020_0008], "LATIN CAPITAL LETTER S"),
  CollationTableEntry([0x54], [0x1E95_0020_0008], "LATIN CAPITAL LETTER T"),
  CollationTableEntry([0x55], [0x1EB5_0020_0008], "LATIN CAPITAL LETTER U"),
  CollationTableEntry([0x56], [0x1EE3_0020_0008], "LATIN CAPITAL LETTER V"),
  CollationTableEntry([0x57], [0x1EF5_0020_0008], "LATIN CAPITAL LETTER W"),
  CollationTableEntry([0x58], [0x1EFF_0020_0008], "LATIN CAPITAL LETTER X"),
  CollationTableEntry([0x59], [0x1F0B_0020_0008], "LATIN CAPITAL LETTER Y"),
  CollationTableEntry([0x5A], [0x1F21_0020_0008], "LATIN CAPITAL LETTER Z"),

  // U+005B ... U+0060: punctuation and symbols.
  CollationTableEntry([0x5B], [0x0319_0020_0002], "LEFT SQUARE BRACKET"),
  CollationTableEntry([0x5C], [0x0395_0020_0002], "REVERSE SOLIDUS"),
  CollationTableEntry([0x5D], [0x031A_0020_0002], "RIGHT SQUARE BRACKET"),
  CollationTableEntry([0x5E], [0x0485_0020_0002], "CIRCUMFLEX ACCENT"),
  CollationTableEntry([0x5F], [0x020B_0020_0002], "LOW LINE"),
  CollationTableEntry([0x60], [0x0482_0020_0002], "GRAVE ACCENT"),

  // U+0061 ... U+007A: small letters.
  CollationTableEntry([0x61], [0x1C47_0020_0002], "LATIN SMALL LETTER A"),
  CollationTableEntry([0x62], [0x1C60_0020_0002], "LATIN SMALL LETTER B"),
  CollationTableEntry([0x63], [0x1C7A_0020_0002], "LATIN SMALL LETTER C"),
  CollationTableEntry([0x64], [0x1C8F_0020_0002], "LATIN SMALL LETTER D"),
  CollationTableEntry([0x65], [0x1CAA_0020_0002], "LATIN SMALL LETTER E"),
  CollationTableEntry([0x66], [0x1CE5_0020_0002], "LATIN SMALL LETTER F"),
  CollationTableEntry([0x67], [0x1CF4_0020_0002], "LATIN SMALL LETTER G"),
  CollationTableEntry([0x68], [0x1D18_0020_0002], "LATIN SMALL LETTER H"),
  CollationTableEntry([0x69], [0x1D32_0020_0002], "LATIN SMALL LETTER I"),
  CollationTableEntry([0x6A], [0x1D4C_0020_0002], "LATIN SMALL LETTER J"),
  CollationTableEntry([0x6B], [0x1D65_0020_0002], "LATIN SMALL LETTER K"),
  CollationTableEntry([0x6C], [0x1D77_0020_0002], "LATIN SMALL LETTER L"),
  CollationTableEntry([0x6D], [0x1DAA_0020_0002], "LATIN SMALL LETTER M"),
  CollationTableEntry([0x6E], [0x1DB9_0020_0002], "LATIN SMALL LETTER N"),
  CollationTableEntry([0x6F], [0x1DDD_0020_0002], "LATIN SMALL LETTER O"),
  CollationTableEntry([0x70], [0x1E0C_0020_0002], "LATIN SMALL LETTER P"),
  CollationTableEntry([0x71], [0x1E21_0020_0002], "LATIN SMALL LETTER Q"),
  CollationTableEntry([0x72], [0x1E33_0020_0002], "LATIN SMALL LETTER R"),
  CollationTableEntry([0x73], [0x1E71_0020_0002], "LATIN SMALL LETTER S"),
  CollationTableEntry([0x74], [0x1E95_0020_0002], "LATIN SMALL LETTER T"),
  CollationTableEntry([0x75], [0x1EB5_0020_0002], "LATIN SMALL LETTER U"),
  CollationTableEntry([0x76], [0x1EE3_0020_0002], "LATIN SMALL LETTER V"),
  CollationTableEntry([0x77], [0x1EF5_0020_0002], "LATIN SMALL LETTER W"),
  CollationTableEntry([0x78], [0x1EFF_0020_0002], "LATIN SMALL LETTER X"),
  CollationTableEntry([0x79], [0x1F0B_0020_0002], "LATIN SMALL LETTER Y"),
  CollationTableEntry([0x7A], [0x1F21_0020_0002], "LATIN SMALL LETTER Z"),

  // U+007B ... U+007F: remaining ASCII.
  CollationTableEntry([0x7B], [0x031B_0020_0002], "LEFT CURLY BRACKET"),
  CollationTableEntry([0x7C], [0x061E_0020_0002], "VERTICAL LINE"),
  CollationTableEntry([0x7D], [0x031C_0020_0002], "RIGHT CURLY BRACKET"),
  CollationTableEntry([0x7E], [0x0620_0020_0002], "TILDE"),
  CollationTableEntry([0x7F], [0x0000_0000_0000], "DELETE"),

  // Only a sample of U+0080 ... U+00FF is listed.  Once String adopts
  // Latin-1 as one of its in-memory representations, extend this part of the
  // table to cover every scalar in that range.
  CollationTableEntry([0x80], [0x0000_0000_0000], "<control>"),
  CollationTableEntry([0xE1], [0x1C47_0020_0002, 0x0000_0024_0002], "LATIN SMALL LETTER A WITH ACUTE"),
  CollationTableEntry([0xE2], [0x1C47_0020_0002, 0x0000_0027_0002], "LATIN SMALL LETTER A WITH CIRCUMFLEX"),
  CollationTableEntry([0xFF], [0x1F0B_0020_0002, 0x0000_002B_0002], "LATIN SMALL LETTER Y WITH DIAERESIS"),

  // Kana forms of "a", listed in increasing weight order rather than code
  // point order, followed by a few variation selectors.
  CollationTableEntry([0x3041], [0x3D5A_0020_000D], "HIRAGANA LETTER SMALL A"),
  CollationTableEntry([0x3042], [0x3D5A_0020_000E], "HIRAGANA LETTER A"),
  CollationTableEntry([0x30A1], [0x3D5A_0020_000F], "KATAKANA LETTER SMALL A"),
  CollationTableEntry([0xFF67], [0x3D5A_0020_0010], "HALFWIDTH KATAKANA LETTER SMALL A"),
  CollationTableEntry([0x30A2], [0x3D5A_0020_0011], "KATAKANA LETTER A"),
  CollationTableEntry([0xFF71], [0x3D5A_0020_0012], "HALFWIDTH KATAKANA LETTER A"),
  CollationTableEntry([0xFE00], [0x0000_0000_0000], "VARIATION SELECTOR-1"),
  CollationTableEntry([0xFE01], [0x0000_0000_0000], "VARIATION SELECTOR-2"),
  CollationTableEntry([0xE01EE], [0x0000_0000_0000], "VARIATION SELECTOR-255"),
  CollationTableEntry([0xE01EF], [0x0000_0000_0000], "VARIATION SELECTOR-256"),
]
| |
/// `ducetExtractData` indexed by scalar sequence.
///
/// If two entries share the same scalars, the one that appears later in
/// `ducetExtractData` wins.
let ducetExtract: [[Unicode.Scalar]: CollationTableEntry] =
  ducetExtractData.reduce(into: [:]) { table, entry in
    table[entry.scalars] = entry
  }
| |
extension String {
  /// Collation elements for strings made only of scalars listed in
  /// `ducetExtract`.
  ///
  /// Warning: this is NOT a conforming implementation of Unicode TR10!  It is
  /// a deliberately crude shortcut that exists only so the test inputs in
  /// this file do not have to spell out their collation elements by hand.
  /// In particular it ignores contractions in the collation table, performs
  /// no string normalization, and never synthesizes derived collation
  /// weights.
  ///
  /// Each scalar is looked up on its own.  Entries whose first collation
  /// element is zero contribute nothing to the result.  Traps if a scalar has
  /// no entry in `ducetExtract`.
  internal var _collationElements: [UInt64] {
    return unicodeScalars.flatMap { scalar -> [UInt64] in
      // A missing entry means the test data is incomplete, so crash here.
      let elements = ducetExtract[[scalar]]!.collationElements
      return elements[0] == 0 ? [] : elements
    }
  }
}
| |
/// A string together with the collation elements it is expected to have,
/// plus the source location where the test case was written down.
public struct StringComparisonTest {
  /// The string under test.
  public let string: String

  /// The collation elements associated with `string`.
  public let collationElements: [UInt64]

  /// Where this test case was declared.
  public let loc: SourceLoc

  /// Not set by any initializer of this type, so it starts out as `nil`.
  /// NOTE(review): presumably filled in later by the code that consumes
  /// these tests; confirm against the callers.
  public var order: Int?

  /// Creates a test whose collation elements are derived from `string` via
  /// `String._collationElements`.
  ///
  /// - Parameters:
  ///   - string: The string under test.
  ///   - inferCollationElements: Marker argument that selects this
  ///     initializer; always pass `()`.
  ///   - file: Recorded in `loc`; defaults to the caller's file.
  ///   - line: Recorded in `loc`; defaults to the caller's line.
  public init(
    _ string: String,
    inferCollationElements: Void,
    file: String = #file, line: UInt = #line
  ) {
    self.init(
      string,
      string._collationElements,
      sourceLocation: SourceLoc(file, line, comment: "test data"))
  }

  /// Creates a test from explicit collation elements and an explicit source
  /// location.
  public init(
    _ string: String,
    _ collationElements: [UInt64],
    sourceLocation: SourceLoc
  ) {
    self.string = string
    self.collationElements = collationElements
    self.loc = sourceLocation
  }

  /// Creates a test from explicit collation elements, recording the caller's
  /// file and line as the source location.
  public init(
    _ string: String,
    _ collationElements: [UInt64],
    file: String = #file, line: UInt = #line
  ) {
    let location = SourceLoc(file, line, comment: "test data")
    self.init(string, collationElements, sourceLocation: location)
  }

  /// One test per `ducetExtractData` entry, skipping entries whose first
  /// collation element is zero.
  public static let testsFromDUCET: [StringComparisonTest] =
    ducetExtractData.compactMap { entry -> StringComparisonTest? in
      let text = entry.scalars.reduce(into: "") { partial, scalar in
        partial.append(Character(scalar))
      }
      guard entry.collationElements[0] != 0 else { return nil }
      return StringComparisonTest(text, entry.collationElements)
    }

  /// Hand-written tests.  Each one is constructed on its own line so that
  /// `loc` identifies the individual test case.
  public static let hardcodedTests: [StringComparisonTest] = [
    StringComparisonTest("", inferCollationElements: ()),

    // ASCII strings containing completely ignorable characters.
    StringComparisonTest("\u{00}\u{61}", inferCollationElements: ()),
    StringComparisonTest("\u{61}\u{00}", inferCollationElements: ()),
    StringComparisonTest("\u{00}\u{61}\u{00}", inferCollationElements: ()),
    StringComparisonTest("\u{61}\u{00}\u{62}", inferCollationElements: ()),

    // Latin-1 strings containing completely ignorable characters.
    StringComparisonTest("\u{00}\u{E1}", inferCollationElements: ()),
    StringComparisonTest("\u{E1}\u{00}", inferCollationElements: ()),
    StringComparisonTest("\u{00}\u{E1}\u{00}", inferCollationElements: ()),
    StringComparisonTest("\u{E1}\u{00}\u{E2}", inferCollationElements: ()),

    // Non-Latin-1 strings containing completely ignorable characters.
    StringComparisonTest("\u{0000}\u{3041}", inferCollationElements: ()),
    StringComparisonTest("\u{3041}\u{0000}", inferCollationElements: ()),
    StringComparisonTest("\u{0000}\u{3041}\u{0000}", inferCollationElements: ()),
    StringComparisonTest("\u{3041}\u{0000}\u{3042}", inferCollationElements: ()),
    StringComparisonTest("\u{FE00}\u{3041}", inferCollationElements: ()),
    StringComparisonTest("\u{3041}\u{FE00}", inferCollationElements: ()),
    StringComparisonTest("\u{FE00}\u{3041}\u{FE00}", inferCollationElements: ()),
    StringComparisonTest("\u{3041}\u{FE00}\u{3042}", inferCollationElements: ()),
    StringComparisonTest("\u{E01EF}\u{3041}", inferCollationElements: ()),
    StringComparisonTest("\u{03041}\u{E01EF}", inferCollationElements: ()),
    StringComparisonTest("\u{E01EF}\u{03041}\u{E01EF}", inferCollationElements: ()),
    StringComparisonTest("\u{03041}\u{E01EF}\u{03042}", inferCollationElements: ()),

    // The decomposed spelling of U+00E1 is given that character's elements:
    //   U+0061 LATIN SMALL LETTER A
    //   U+0301 COMBINING ACUTE ACCENT
    //   U+00E1 LATIN SMALL LETTER A WITH ACUTE
    StringComparisonTest("\u{61}\u{301}", "\u{E1}"._collationElements),
  ]

  /// `testsFromDUCET` followed by `hardcodedTests`.
  public static let allTests: [StringComparisonTest] =
    testsFromDUCET + hardcodedTests
}
| |
/// Builds the per-level sort key for a sequence of collation elements.
///
/// Each collation element is read as three 16-bit weights: level 1 in bits
/// 32...47, level 2 in bits 16...31 and level 3 in bits 0...15.
///
/// Following the "Form Sort Keys" step of UTS #10, a weight is appended to
/// the key for its level only when it is non-zero.  Without this, an element
/// that is ignorable at some level (for example a combining accent, whose
/// level 1 weight is zero) would insert a zero into that level's key and
/// make the string sort before strings it should merely tie with at that
/// level.
///
/// - Parameter ces: The collation elements, in string order.
/// - Returns: The level 1, level 2 and level 3 weights, each in element
///   order with zero weights removed.  The three arrays may therefore differ
///   in length from each other and from `ces`.
public func sortKey(forCollationElements ces: [UInt64]) -> ([UInt16], [UInt16], [UInt16]) {
  /// Non-zero 16-bit weights found `shift` bits above the bottom of each
  /// collation element.
  func weights(shiftedBy shift: UInt64) -> [UInt16] {
    return ces
      .map { UInt16(truncatingIfNeeded: $0 &>> shift) }
      .filter { $0 != 0 }
  }

  return (
    weights(shiftedBy: 32),
    weights(shiftedBy: 16),
    weights(shiftedBy: 0))
}
| |
/// Returns whether `lhs` orders strictly before `rhs`.
///
/// Both sequences are turned into sort keys with
/// `sortKey(forCollationElements:)`.  The level 1 keys are compared first;
/// if they are equal the level 2 keys decide, and if those are equal too the
/// level 3 keys decide.  Every comparison is lexicographic.
public func collationElements(
  _ lhs: [UInt64], areLessThan rhs: [UInt64]
) -> Bool {
  let left = sortKey(forCollationElements: lhs)
  let right = sortKey(forCollationElements: rhs)

  // The first of the two leading levels that differs settles the order.
  let leadingLevels = [(left.0, right.0), (left.1, right.1)]
  for (leftWeights, rightWeights) in leadingLevels
  where leftWeights != rightWeights {
    return leftWeights.lexicographicallyPrecedes(rightWeights)
  }

  return left.2.lexicographicallyPrecedes(right.2)
}