// blob: f592819c5c85223913ec0ebf2cca4d48f224eae1 [file] [log] [blame]
//===--- UnicodeTrie.swift.gyb --------------------------------*- swift -*-===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//
// RUN: rm -rf %t && mkdir -p %t && %gyb -DunicodeGraphemeBreakPropertyFile=%utils/UnicodeData/GraphemeBreakProperty.txt -DunicodeGraphemeBreakTestFile=%utils/UnicodeData/GraphemeBreakTest.txt %s -o %t/UnicodeTrie.swift
// RUN: %line-directive %t/UnicodeTrie.swift -- %target-build-swift %t/UnicodeTrie.swift -o %t/a.out -g -Xfrontend -disable-access-control
// RUN: %line-directive %t/UnicodeTrie.swift -- %target-run %t/a.out
// REQUIRES: executable_test
// FIXME: rdar://problem/19648117 Needs splitting objc parts out
// XFAIL: linux
// gyb setup: constructs a GraphemeClusterBreakPropertyTable from the path
// supplied on the gyb command line as -DunicodeGraphemeBreakPropertyFile.
%{
from GYBUnicodeDataUtils import *
grapheme_cluster_break_property_table = \
GraphemeClusterBreakPropertyTable(unicodeGraphemeBreakPropertyFile)
}%
import SwiftPrivate
import StdlibUnittest
import StdlibCollectionUnittest
import Darwin
import Foundation
// Reference data for the trie test: one (first code point, last code point,
// property value) tuple per entry of the gyb-side table's
// property_value_ranges.  The trie test treats each range as inclusive.
var graphemeBreakPropertyTable = [
// 'as Int' annotations are needed to help prevent the type-checker from
// blowing the stack. <rdar://problem/17539704>
% for start_code_point, end_code_point, value in grapheme_cluster_break_property_table.property_value_ranges:
(${start_code_point} as Int, ${end_code_point} as Int, _GraphemeClusterBreakPropertyValue.${value}),
% end
]
var UnicodeTrie = TestSuite("UnicodeTrie")

UnicodeTrie.test("_UnicodeGraphemeClusterBreakPropertyTrie") {
  // Exhaustive check: for every code point in 0...0x10ffff, the trie must
  // report the same property value as the generated reference table.
  var trie = _UnicodeGraphemeClusterBreakPropertyTrie()

  // Start with `.Other` everywhere, then overlay each listed range.
  var referenceValues = [_GraphemeClusterBreakPropertyValue](
    repeating: .Other, count: 0x110000)
  for (first, last, propertyValue) in graphemeBreakPropertyTable {
    for index in first...last {
      referenceValues[index] = propertyValue
    }
  }

  for (index, expectedValue) in referenceValues.enumerated() {
    let codePoint = UInt32(index)
    // Print a progress marker every 0x10000 code points.
    if codePoint % 0x10000 == 0 {
      print("\(codePoint)...")
    }
    expectEqual(
      expectedValue, trie.getPropertyValue(codePoint),
      "code point \(codePoint)")
  }
}
// gyb setup: loads the grapheme cluster break test cases from the path
// supplied on the gyb command line as -DunicodeGraphemeBreakTestFile.  The
// UnicodeSpec test below unpacks each case as
// (code_points, expected_boundaries).
%{
grapheme_cluster_break_tests = \
get_grapheme_cluster_break_tests_as_unicode_scalars(
unicodeGraphemeBreakTestFile)
}%
/// The simplest possible `NSString` subclass that CoreFoundation does not
/// know about: its contents live in a Swift array of UTF-16 code units and
/// are exposed only through the `length` / `character(at:)` overrides.
class NonContiguousNSString : NSString {
  /// The UTF-16 code units that make up the string.
  var _value: [UInt16]

  /// Creates an empty string.
  override init() {
    _value = []
    super.init()
  }

  /// Decoding is not supported; calling this initializer traps.
  required init(coder aDecoder: NSCoder) {
    fatalError("don't call this initializer")
  }

  /// Creates a string whose contents are exactly the given UTF-16 code
  /// units (no validation is performed here).
  @nonobjc
  init(_ value: [UInt16]) {
    _value = value
    super.init()
  }

  /// Creates a string by transcoding the given UTF-32 scalar values to
  /// UTF-16.  A transcoding error is reported as a test expectation failure.
  @nonobjc
  convenience init(_ scalars: [UInt32]) {
    var utf16: [UInt16] = []
    let transcodingFailed = transcode(
      scalars.makeIterator(),
      from: UTF32.self,
      to: UTF16.self,
      stoppingOnError: true,
      into: { utf16.append($0) })
    expectFalse(transcodingFailed)
    self.init(utf16)
  }

  @objc(copyWithZone:)
  override func copy(with zone: NSZone?) -> Any {
    // Return the receiver itself so that a copy is still an instance of a
    // class that CoreFoundation does not know about.
    return self
  }

  /// The number of UTF-16 code units in the string.
  @objc override var length: Int {
    return _value.count
  }

  /// The UTF-16 code unit stored at `index`.
  @objc override func character(at index: Int) -> unichar {
    return _value[index]
  }
}
/// Verify that extended grapheme cluster boundaries in `subject` occur at
/// positions specified in `expectedBoundaries`.
///
/// Boundaries are measured in Unicode scalars from the start of the string.
/// The computed list always starts with `0` and gains one entry per
/// `Character`, so it ends with the total scalar count.  The `characters`
/// view is additionally run through `checkBidirectionalCollection`.
///
/// - Parameters:
///   - expectedBoundaries: Expected boundary offsets, in Unicode scalars.
///   - subject: The string whose segmentation is being checked.
///   - stackTrace: Call-site locations; forwarded to the expectation
///     helpers so that a failure points back at the test case that
///     triggered it rather than only at this function.
func checkGraphemeClusterSegmentation(
  _ expectedBoundaries: [Int], _ subject: String, _ stackTrace: SourceLocStack
) {
  var actualBoundaries: [Int] = [ 0 ]
  var unicodeScalarCount = 0
  for c in subject.characters {
    // Each Character is one extended grapheme cluster; advance the running
    // offset by the number of scalars it contains.
    unicodeScalarCount += String(c).unicodeScalars.count
    actualBoundaries.append(unicodeScalarCount)
  }
  expectEqual(
    expectedBoundaries, actualBoundaries,
    "scalars: \(asHex(Array(subject.unicodeScalars.lazy.map { $0.value })))",
    stackTrace: stackTrace.withCurrentLoc()
  )

  let expectedCharacters: [Character] = Array(subject.characters)
  checkBidirectionalCollection(
    expectedCharacters, subject.characters,
    stackTrace: stackTrace.withCurrentLoc())
}
/// Convenience overload: wraps the given Unicode scalar values in a
/// `NonContiguousNSString`, bridges it to `String`, and checks its
/// segmentation against `expectedBoundaries`.
func checkGraphemeClusterSegmentation(
  _ expectedBoundaries: [Int], scalars: [UInt32], _ stackTrace: SourceLocStack
) {
  let nsSubject = NonContiguousNSString(scalars)
  checkGraphemeClusterSegmentation(
    expectedBoundaries,
    nsSubject as String,
    stackTrace.withCurrentLoc())
}
/// Convenience overload: wraps the given UTF-16 code units (which may be
/// ill-formed) in a `NonContiguousNSString`, bridges it to `String`, and
/// checks its segmentation against `expectedBoundaries`.
func checkGraphemeClusterSegmentation(
  _ expectedBoundaries: [Int], codeUnits: [UInt16], _ stackTrace: SourceLocStack
) {
  let nsSubject = NonContiguousNSString(codeUnits)
  checkGraphemeClusterSegmentation(
    expectedBoundaries,
    nsSubject as String,
    stackTrace.withCurrentLoc())
}
// Generated test: gyb expands the loop below into one check per entry of
// grapheme_cluster_break_tests.
UnicodeTrie.test("GraphemeClusterSegmentation/UnicodeSpec") {
// Test segmentation algorithm using test data from the Unicode
// specification.
% for code_points, expected_boundaries in grapheme_cluster_break_tests:
// Each case lives in its own `do` scope so the generated local names can
// repeat from one case to the next.
do {
let scalars: [UInt32] =
[ ${", ".join([str(cp) for cp in code_points])} ]
let expectedBoundaries: [Int] =
[ ${", ".join([str(x) for x in expected_boundaries])} ]
checkGraphemeClusterSegmentation(expectedBoundaries, scalars: scalars,
SourceLocStack().withCurrentLoc())
}
% end
}
UnicodeTrie.test("GraphemeClusterSegmentation/Extra") {
  // Additional cases for input strings that contain ill-formed code unit
  // sequences (unpaired high surrogates).  Each case is scoped in its own
  // `do` block, in the same shape as the generated UnicodeSpec cases.

  do {
    // U+D800 (high-surrogate)
    let codeUnits: [UInt16] = [ 0xd800 ]
    let expectedBoundaries: [Int] = [ 0, 1 ]
    checkGraphemeClusterSegmentation(
      expectedBoundaries, codeUnits: codeUnits,
      SourceLocStack().withCurrentLoc())
  }

  do {
    // U+D800 (high-surrogate)
    // U+D800 (high-surrogate)
    let codeUnits: [UInt16] = [ 0xd800, 0xd800 ]
    let expectedBoundaries: [Int] = [ 0, 1, 2 ]
    checkGraphemeClusterSegmentation(
      expectedBoundaries, codeUnits: codeUnits,
      SourceLocStack().withCurrentLoc())
  }

  do {
    // U+0041 LATIN CAPITAL LETTER A
    // U+D800 (high-surrogate)
    let codeUnits: [UInt16] = [ 0x0041, 0xd800 ]
    let expectedBoundaries: [Int] = [ 0, 1, 2 ]
    checkGraphemeClusterSegmentation(
      expectedBoundaries, codeUnits: codeUnits,
      SourceLocStack().withCurrentLoc())
  }

  do {
    // U+D800 (high-surrogate)
    // U+0041 LATIN CAPITAL LETTER A
    let codeUnits: [UInt16] = [ 0xd800, 0x0041 ]
    let expectedBoundaries: [Int] = [ 0, 1, 2 ]
    checkGraphemeClusterSegmentation(
      expectedBoundaries, codeUnits: codeUnits,
      SourceLocStack().withCurrentLoc())
  }

  do {
    // U+0041 LATIN CAPITAL LETTER A
    // U+0301 COMBINING ACUTE ACCENT
    // U+D800 (high-surrogate)
    let codeUnits: [UInt16] = [ 0x0041, 0x0301, 0xd800 ]
    let expectedBoundaries: [Int] = [ 0, 2, 3 ]
    checkGraphemeClusterSegmentation(
      expectedBoundaries, codeUnits: codeUnits,
      SourceLocStack().withCurrentLoc())
  }

  do {
    // U+D800 (high-surrogate)
    // U+0041 LATIN CAPITAL LETTER A
    // U+0301 COMBINING ACUTE ACCENT
    let codeUnits: [UInt16] = [ 0xd800, 0x0041, 0x0301 ]
    let expectedBoundaries: [Int] = [ 0, 1, 3 ]
    checkGraphemeClusterSegmentation(
      expectedBoundaries, codeUnits: codeUnits,
      SourceLocStack().withCurrentLoc())
  }
}
UnicodeTrie.test("GraphemeClusterSegmentation/Unicode_7_0_0") {
  // Guards against regressing to pre-7.0.0 Unicode data tables: under
  // Unicode 6.3.0 the sequence below was split into two grapheme clusters,
  // whereas it is expected to form a single cluster here.
  //
  // U+0041 LATIN CAPITAL LETTER A
  // U+1122C KHOJKI VOWEL SIGN AA
  let scalars: [UInt32] = [ 0x0041, 0x1122c ]
  let expectedBoundaries: [Int] = [ 0, 2 ]
  checkGraphemeClusterSegmentation(
    expectedBoundaries, scalars: scalars,
    SourceLocStack().withCurrentLoc())
}
UnicodeTrie.test("GraphemeClusterSegmentation/Unicode_8_0_0") {
  // Guards against regressing to pre-8.0.0 Unicode data tables: under
  // Unicode 7.0.0 the sequence below was split into two grapheme clusters,
  // whereas it is expected to form a single cluster here.
  //
  // U+0041 LATIN CAPITAL LETTER A
  // U+11720 AHOM VOWEL SIGN A
  let scalars: [UInt32] = [ 0x0041, 0x11720 ]
  let expectedBoundaries: [Int] = [ 0, 2 ]
  checkGraphemeClusterSegmentation(
    expectedBoundaries, scalars: scalars,
    SourceLocStack().withCurrentLoc())
}
// Entry point: kick off the test run.  `runAllTests` is not defined in this
// file; presumably it comes from StdlibUnittest and runs the tests registered
// on the suite above -- confirm against that module.
runAllTests()