| //===----------------------------------------------------------------------===// |
| // |
| // This source file is part of the Swift.org open source project |
| // |
| // Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors |
| // Licensed under Apache License v2.0 with Runtime Library Exception |
| // |
| // See https://swift.org/LICENSE.txt for license information |
| // See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors |
| // |
| //===----------------------------------------------------------------------===// |
| |
| import SwiftShims |
| |
/// Carriage return (U+000D). CR and LF are common special cases in grapheme
/// breaking logic.
private var _CR: UInt8 { return UInt8(ascii: "\r") }

/// Line feed (U+000A), the scalar that pairs with CR in the CR-LF special
/// case.
private var _LF: UInt8 { return UInt8(ascii: "\n") }
| |
/// Fast-path check for a grapheme break between two adjacent scalars.
///
/// - Parameters:
///   - lhs: The scalar on the left of the candidate boundary.
///   - rhs: The scalar on the right of the candidate boundary.
/// - Returns: `true` only when the pair is not CR-LF and both scalars fall in
///   one of the ranges listed below. A `false` result means this check could
///   not establish a break, not that there is none.
private func _hasGraphemeBreakBetween(
  _ lhs: Unicode.Scalar, _ rhs: Unicode.Scalar
) -> Bool {

  // CR immediately followed by LF is the one pair inside the ranges below
  // that must not be split, so reject it up front.
  if lhs.value == UInt32(_CR), rhs.value == UInt32(_LF) { return false }

  // Answers whether `scalar`, when it sits next to another scalar for which
  // this also answers `true`, has a grapheme break between the two.
  func breaksWhenPaired(_ scalar: Unicode.Scalar) -> Bool {
    // TODO: This doesn't generate optimal code, tune/re-write at a lower
    // level.
    //
    // NOTE: Order of the ranges affects codegen, and thus performance. All
    // things being equal, keep the existing order below.
    switch scalar.value {
    case
      // Unified CJK Han ideographs, common and some supplemental, amongst
      // others: U+3400 ~ U+A4CF
      0x3400...0xa4cf,

      // Repeat sub-300 check, this is beneficial for common cases of Latin
      // characters embedded within non-Latin script (e.g. newlines, spaces,
      // proper nouns and/or jargon, punctuation).
      //
      // NOTE: CR-LF special case has already been checked.
      0x0000...0x02ff,

      // Non-combining kana: U+3041 ~ U+3096 and U+30A1 ~ U+30FC
      0x3041...0x3096,
      0x30a1...0x30fc,

      // Non-combining modern (and some archaic) Cyrillic:
      // U+0400 ~ U+0482 (first half of Cyrillic block)
      0x0400...0x0482,

      // Modern Arabic, excluding extenders and prependers:
      // U+061D ~ U+064A
      0x061d...0x064a,

      // Precomposed Hangul syllables: U+AC00 ~ U+D7AF
      0xac00...0xd7af,

      // Common general use punctuation, excluding extenders:
      // U+2010 ~ U+2029
      0x2010...0x2029,

      // CJK punctuation characters, excluding extenders: U+3000 ~ U+3029
      0x3000...0x3029,

      // Full-width forms: U+FF01 ~ U+FF9D
      0xFF01...0xFF9D:
      return true

    default:
      return false
    }
  }

  // Both sides have to qualify; the right-hand scalar is only inspected when
  // the left-hand one already did.
  guard breaksWhenPaired(lhs) else { return false }
  return breaksWhenPaired(rhs)
}
| |
/// Slow path: asks ICU for the length, in UTF-8 code units, of the grapheme
/// cluster that starts at offset `i` of `utf8`.
///
/// - Parameters:
///   - utf8: The code units handed to the break iterator obtained from
///     `_ThreadLocalStorage`.
///   - i: The code unit offset at which the grapheme starts.
/// - Returns: The distance from `i` to the boundary ICU reports after `i`, or
///   the distance from `i` to the end of `utf8` when ICU reports none.
@inline(never) // slow-path
@_effects(releasenone)
private func _measureCharacterStrideICU(
  of utf8: UnsafeBufferPointer<UInt8>, startingAt i: Int
) -> Int {
  let breakIterator = _ThreadLocalStorage.getUBreakIterator(utf8)
  let boundary = __swift_stdlib_ubrk_following(
    breakIterator, Int32(truncatingIfNeeded: i))

  // ubrk_following returns -1 (UBRK_DONE) when it hits the end of the buffer;
  // in that case the grapheme runs through the last code unit.
  guard _fastPath(boundary != -1) else {
    _internalInvariant(utf8.count > i)
    return utf8.count &- i
  }

  // The boundary is an offset into our buffer, so the stride is its distance
  // from `i`.
  _internalInvariant(boundary > i, "zero-sized grapheme?")
  return Int(truncatingIfNeeded: boundary) &- i
}
| |
/// Slow path: asks ICU for the length, in UTF-16 code units, of the grapheme
/// cluster that starts at offset `i` of `utf16`.
///
/// - Parameters:
///   - utf16: The code units handed to the break iterator obtained from
///     `_ThreadLocalStorage`.
///   - i: The code unit offset at which the grapheme starts.
/// - Returns: The distance from `i` to the boundary ICU reports after `i`, or
///   the distance from `i` to the end of `utf16` when ICU reports none.
@inline(never) // slow-path
@_effects(releasenone)
private func _measureCharacterStrideICU(
  of utf16: UnsafeBufferPointer<UInt16>, startingAt i: Int
) -> Int {
  let iterator = _ThreadLocalStorage.getUBreakIterator(utf16)
  let offset = __swift_stdlib_ubrk_following(
    iterator, Int32(truncatingIfNeeded: i))
  // ubrk_following returns -1 (UBRK_DONE) when it hits the end of the buffer.
  if _fastPath(offset != -1) {
    // The offset into our buffer is the distance.
    _internalInvariant(offset > i, "zero-sized grapheme?")
    return Int(truncatingIfNeeded: offset) &- i
  }
  // Same check as the UTF-8 overload: the remaining stride must be non-empty.
  _internalInvariant(utf16.count > i)
  return utf16.count &- i
}
| |
/// Slow path: asks ICU for the length, in UTF-8 code units, of the grapheme
/// cluster that ends at offset `i` of `utf8`.
///
/// - Parameters:
///   - utf8: The code units handed to the break iterator obtained from
///     `_ThreadLocalStorage`.
///   - i: The code unit offset at which the grapheme ends.
/// - Returns: The distance from the boundary ICU reports before `i` up to
///   `i`, or the distance from the start of `utf8` to `i` when ICU reports
///   none.
@inline(never) // slow-path
@_effects(releasenone)
private func _measureCharacterStrideICU(
  of utf8: UnsafeBufferPointer<UInt8>, endingAt i: Int
) -> Int {
  let iterator = _ThreadLocalStorage.getUBreakIterator(utf8)
  let offset = __swift_stdlib_ubrk_preceding(
    iterator, Int32(truncatingIfNeeded: i))
  // ubrk_preceding returns -1 (UBRK_DONE) when there is no boundary before
  // the given offset, i.e. it hit the start of the buffer.
  if _fastPath(offset != -1) {
    // The offset into our buffer is the distance.
    _internalInvariant(offset < i, "zero-sized grapheme?")
    return i &- Int(truncatingIfNeeded: offset)
  }
  // No earlier boundary: the grapheme reaches back to offset 0, so the stride
  // is the whole prefix. (Subtracting `utf8.count` here, as the forward
  // variant does for the suffix, would yield a zero or negative stride.)
  return i
}
| |
/// Slow path: asks ICU for the length, in UTF-16 code units, of the grapheme
/// cluster that ends at offset `i` of `utf16`.
///
/// - Parameters:
///   - utf16: The code units handed to the break iterator obtained from
///     `_ThreadLocalStorage`.
///   - i: The code unit offset at which the grapheme ends.
/// - Returns: The distance from the boundary ICU reports before `i` up to
///   `i`, or the distance from the start of `utf16` to `i` when ICU reports
///   none.
@inline(never) // slow-path
@_effects(releasenone)
private func _measureCharacterStrideICU(
  of utf16: UnsafeBufferPointer<UInt16>, endingAt i: Int
) -> Int {
  let iterator = _ThreadLocalStorage.getUBreakIterator(utf16)
  let offset = __swift_stdlib_ubrk_preceding(
    iterator, Int32(truncatingIfNeeded: i))
  // ubrk_preceding returns -1 (UBRK_DONE) when there is no boundary before
  // the given offset, i.e. it hit the start of the buffer.
  if _fastPath(offset != -1) {
    // The offset into our buffer is the distance.
    _internalInvariant(offset < i, "zero-sized grapheme?")
    return i &- Int(truncatingIfNeeded: offset)
  }
  // No earlier boundary: the grapheme reaches back to offset 0, so the stride
  // is the whole prefix. (Subtracting `utf16.count` here, as the forward
  // variant does for the suffix, would yield a zero or negative stride.)
  return i
}
| |
extension _StringGuts {
  /// Reports whether `i` lies on a grapheme cluster boundary of this string.
  ///
  /// An index with a non-zero transcoded offset is never on a boundary; the
  /// offsets `0` and `count` always are; an index that is not on a Unicode
  /// scalar boundary is rejected. Any other index is on a boundary exactly
  /// when stepping to the next index and back again returns to `i`.
  @usableFromInline @inline(never)
  @_effects(releasenone)
  internal func isOnGraphemeClusterBoundary(_ i: String.Index) -> Bool {
    if i.transcodedOffset != 0 { return false }

    let position = i._encodedOffset
    guard position != 0 && position != self.count else { return true }

    if !isOnUnicodeScalarBoundary(i) { return false }

    // Round-trip through the following index; landing back on `i` means `i`
    // starts a grapheme cluster.
    let string = String(self)
    let following = string.index(after: i)
    return i == string.index(before: following)
  }

  /// Returns the length, in code units, of the grapheme cluster that starts
  /// at code unit offset `i`.
  ///
  /// Foreign strings are forwarded to `_foreignOpaqueCharacterStride`. For
  /// fast UTF-8 the first scalar is decoded and, when it is the last scalar
  /// or the quick pairwise check finds a break after it, its length is the
  /// answer; otherwise ICU measures the stride.
  @usableFromInline @inline(never)
  @_effects(releasenone)
  internal func _opaqueCharacterStride(startingAt i: Int) -> Int {
    if _slowPath(isForeign) {
      return _foreignOpaqueCharacterStride(startingAt: i)
    }

    return self.withFastUTF8 { utf8 in
      let (leading, width) = _decodeScalar(utf8, startingAt: i)
      let next = i &+ width

      // Last scalar is last grapheme
      guard next != utf8.endIndex else { return width }

      let (trailing, _) = _decodeScalar(utf8, startingAt: next)
      if _fastPath(_hasGraphemeBreakBetween(leading, trailing)) {
        return width
      }

      return _measureCharacterStrideICU(of: utf8, startingAt: i)
    }
  }

  /// Foreign-string counterpart of `_opaqueCharacterStride(startingAt:)`.
  ///
  /// Performs the same single-scalar and pairwise quick checks using
  /// error-corrected foreign scalars, then falls back to ICU over the
  /// string's UTF-16 code units: directly when a character pointer is
  /// available, otherwise over a copy made into a temporary array.
  @inline(never)
  @_effects(releasenone)
  private func _foreignOpaqueCharacterStride(startingAt i: Int) -> Int {
#if _runtime(_ObjC)
    _internalInvariant(isForeign)

    // TODO(String performance): Faster to do it from a pointer directly
    let codeUnitCount = _object.largeCount
    let cocoaString = _object.cocoaObject

    let start = String.Index(_encodedOffset: i)
    let (leading, width) = foreignErrorCorrectedScalar(startingAt: start)

    // Last scalar is last grapheme
    guard i &+ width != codeUnitCount else { return width }

    let (trailing, _) = foreignErrorCorrectedScalar(
      startingAt: start.encoded(offsetBy: width))
    if _fastPath(_hasGraphemeBreakBetween(leading, trailing)) {
      return width
    }

    // Measure in place when the code units are directly addressable.
    if let pointer = _stdlib_binary_CFStringGetCharactersPtr(cocoaString) {
      let contiguous = UnsafeBufferPointer(
        start: pointer, count: codeUnitCount)
      return _measureCharacterStrideICU(of: contiguous, startingAt: i)
    }

    // TODO(String performance): Local small stack first, before making large
    // array. Also, make a smaller initial array and grow over time.
    var scratch = [UInt16](repeating: 0, count: codeUnitCount)

    scratch.withUnsafeMutableBufferPointer { destination in
      _cocoaStringCopyCharacters(
        from: cocoaString,
        range: 0..<codeUnitCount,
        into: destination.baseAddress._unsafelyUnwrappedUnchecked)
    }
    return scratch.withUnsafeBufferPointer { copied in
      _measureCharacterStrideICU(of: copied, startingAt: i)
    }
#else
    fatalError("No foreign strings on Linux in this version of Swift")
#endif
  }

  /// Returns the length, in code units, of the grapheme cluster that ends at
  /// code unit offset `i`.
  ///
  /// Foreign strings are forwarded to `_foreignOpaqueCharacterStride`. For
  /// fast UTF-8 the scalar ending at `i` is decoded and, when it is the first
  /// scalar or the quick pairwise check finds a break before it, its length
  /// is the answer; otherwise ICU measures the stride.
  @usableFromInline @inline(never)
  @_effects(releasenone)
  internal func _opaqueCharacterStride(endingAt i: Int) -> Int {
    if _slowPath(isForeign) {
      return _foreignOpaqueCharacterStride(endingAt: i)
    }

    return self.withFastUTF8 { utf8 in
      let (trailing, width) = _decodeScalar(utf8, endingAt: i)
      let previousEnd = i &- width

      // First scalar is first grapheme
      guard previousEnd != utf8.startIndex else { return width }

      let (leading, _) = _decodeScalar(utf8, endingAt: previousEnd)
      if _fastPath(_hasGraphemeBreakBetween(leading, trailing)) {
        return width
      }
      return _measureCharacterStrideICU(of: utf8, endingAt: i)
    }
  }

  /// Foreign-string counterpart of `_opaqueCharacterStride(endingAt:)`.
  ///
  /// Performs the same single-scalar and pairwise quick checks using
  /// error-corrected foreign scalars, then falls back to ICU over the
  /// string's UTF-16 code units: directly when a character pointer is
  /// available, otherwise over a copy made into a temporary array.
  @inline(never)
  @_effects(releasenone)
  private func _foreignOpaqueCharacterStride(endingAt i: Int) -> Int {
#if _runtime(_ObjC)
    _internalInvariant(isForeign)

    // TODO(String performance): Faster to do it from a pointer directly
    let codeUnitCount = _object.largeCount
    let cocoaString = _object.cocoaObject

    let end = String.Index(_encodedOffset: i)
    let (trailing, width) = foreignErrorCorrectedScalar(endingAt: end)

    // First scalar is first grapheme
    guard i &- width != 0 else { return width }

    let (leading, _) = foreignErrorCorrectedScalar(
      endingAt: end.encoded(offsetBy: -width))
    if _fastPath(_hasGraphemeBreakBetween(leading, trailing)) {
      return width
    }

    // Measure in place when the code units are directly addressable.
    if let pointer = _stdlib_binary_CFStringGetCharactersPtr(cocoaString) {
      let contiguous = UnsafeBufferPointer(
        start: pointer, count: codeUnitCount)
      return _measureCharacterStrideICU(of: contiguous, endingAt: i)
    }

    // TODO(String performance): Local small stack first, before making large
    // array. Also, make a smaller initial array and grow over time.
    var scratch = [UInt16](repeating: 0, count: codeUnitCount)

    scratch.withUnsafeMutableBufferPointer { destination in
      _cocoaStringCopyCharacters(
        from: cocoaString,
        range: 0..<codeUnitCount,
        into: destination.baseAddress._unsafelyUnwrappedUnchecked)
    }
    return scratch.withUnsafeBufferPointer { copied in
      _measureCharacterStrideICU(of: copied, endingAt: i)
    }
#else
    fatalError("No foreign strings on Linux in this version of Swift")
#endif
  }
}
| |