// blob: 668c1eea0234b61875ccce9fb7ac816ace331242 [file] [log] [blame]
//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//
//
// Low-level helper functions and utilities for interpreting Unicode
//
/// Bit pattern matched by `_isLeadingSurrogate`, and the bias subtracted from
/// a leading surrogate in `_decodeSurrogatePair` to recover its payload bits.
internal let _leadingSurrogateBias: UInt16 = 0xD800
/// Bit pattern matched by `_isTrailingSurrogate`, and the bias subtracted from
/// a trailing surrogate in `_decodeSurrogatePair` to recover its payload bits.
internal let _trailingSurrogateBias: UInt16 = 0xDC00
/// Mask selecting the top six bits of a UTF-16 code unit; the surrogate
/// predicates compare the masked value against one of the biases above.
internal let _surrogateMask: UInt16 = 0xFC00
/// Returns `true` when the bits of `cu` selected by `_surrogateMask` equal
/// `_trailingSurrogateBias`.
@inline(__always)
internal func _isTrailingSurrogate(_ cu: UInt16) -> Bool {
  let maskedBits = cu & _surrogateMask
  return maskedBits == _trailingSurrogateBias
}
/// Returns `true` when the bits of `cu` selected by `_surrogateMask` equal
/// `_leadingSurrogateBias`.
@inline(__always)
internal func _isLeadingSurrogate(_ cu: UInt16) -> Bool {
  let maskedBits = cu & _surrogateMask
  return maskedBits == _leadingSurrogateBias
}
/// Returns `true` when `cu` is either a leading or a trailing surrogate.
@inline(__always)
internal func _isSurrogate(_ cu: UInt16) -> Bool {
  // TODO(String micro-performance): check codegen
  if _isLeadingSurrogate(cu) {
    return true
  }
  return _isTrailingSurrogate(cu)
}
/// Returns `true` when the high bit of `x` is clear, i.e. `x` is in the
/// range 0x00...0x7F.
@inlinable @inline(__always)
internal func _isASCII(_ x: UInt8) -> Bool {
  let highBit: UInt8 = 0x80
  return (x & highBit) == 0
}
/// Decodes a one-byte UTF-8 sequence.
///
/// The caller must pass an ASCII byte; this is only checked by an internal
/// invariant, and the scalar is formed without validation.
@inlinable
@inline(__always)
internal func _decodeUTF8(_ x: UInt8) -> Unicode.Scalar {
  _internalInvariant(_isASCII(x))
  let value = UInt32(x)
  return Unicode.Scalar(_unchecked: value)
}
/// Decodes a two-byte UTF-8 sequence.
///
/// `x` must be the lead byte of a two-byte sequence and `y` a continuation
/// byte; both are only checked by internal invariants.
@inlinable
@inline(__always)
internal func _decodeUTF8(_ x: UInt8, _ y: UInt8) -> Unicode.Scalar {
  _internalInvariant(_utf8ScalarLength(x) == 2)
  _internalInvariant(_isContinuation(y))
  // Low five bits of the lead byte, followed by six payload bits.
  let leadBits = UInt32(x) & 0b0001_1111
  let tailBits = _continuationPayload(y)
  return Unicode.Scalar(_unchecked: (leadBits &<< 6) | tailBits)
}
/// Decodes a three-byte UTF-8 sequence.
///
/// `x` must be the lead byte of a three-byte sequence and `y`, `z`
/// continuation bytes; this is only checked by internal invariants.
@inlinable
@inline(__always)
internal func _decodeUTF8(
  _ x: UInt8, _ y: UInt8, _ z: UInt8
) -> Unicode.Scalar {
  _internalInvariant(_utf8ScalarLength(x) == 3)
  _internalInvariant(_isContinuation(y) && _isContinuation(z))
  // Low four bits of the lead byte, then six payload bits per continuation.
  let top = (UInt32(x) & 0b0000_1111) &<< 12
  let middle = _continuationPayload(y) &<< 6
  let bottom = _continuationPayload(z)
  return Unicode.Scalar(_unchecked: top | middle | bottom)
}
/// Decodes a four-byte UTF-8 sequence.
///
/// `x` must be the lead byte of a four-byte sequence and `y`, `z`, `w`
/// continuation bytes; this is only checked by internal invariants.
@inlinable
@inline(__always)
internal func _decodeUTF8(
  _ x: UInt8, _ y: UInt8, _ z: UInt8, _ w: UInt8
) -> Unicode.Scalar {
  _internalInvariant(_utf8ScalarLength(x) == 4)
  _internalInvariant(
    _isContinuation(y) && _isContinuation(z) && _isContinuation(w))
  // Masked lead-byte bits, then six payload bits per continuation byte.
  let b0 = (UInt32(x) & 0b0000_1111) &<< 18
  let b1 = _continuationPayload(y) &<< 12
  let b2 = _continuationPayload(z) &<< 6
  let b3 = _continuationPayload(w)
  return Unicode.Scalar(_unchecked: b0 | b1 | b2 | b3)
}
/// Decodes the scalar that starts at code unit offset `i` of `utf16`.
///
/// Returns the scalar together with the number of code units it occupies
/// (1, or 2 for a surrogate pair). The contents are assumed to be
/// well-formed; ill-formed input is only caught by internal invariants.
internal func _decodeScalar(
  _ utf16: UnsafeBufferPointer<UInt16>, startingAt i: Int
) -> (Unicode.Scalar, scalarLength: Int) {
  let first = utf16[i]
  let isLastCodeUnit = i + 1 >= utf16.count
  if isLastCodeUnit || !_isLeadingSurrogate(first) {
    // A code unit decoded on its own must not be a surrogate of either kind.
    _internalInvariant(!_isLeadingSurrogate(first))
    _internalInvariant(!_isTrailingSurrogate(first))
    return (Unicode.Scalar(_unchecked: UInt32(first)), 1)
  }
  let second = utf16[i+1]
  _internalInvariant(_isLeadingSurrogate(first))
  _internalInvariant(_isTrailingSurrogate(second))
  let value = _decodeSurrogatePair(leading: first, trailing: second)
  return (Unicode.Scalar(_unchecked: value), 2)
}
/// Decodes the scalar whose lead byte is at offset `i` of `utf8`.
///
/// Returns the scalar together with its length in bytes. Bytes are read
/// without bounds checks, and any length outside 1...4 is treated as
/// unreachable.
@inlinable
internal func _decodeScalar(
  _ utf8: UnsafeBufferPointer<UInt8>, startingAt i: Int
) -> (Unicode.Scalar, scalarLength: Int) {
  let lead = utf8[_unchecked: i]
  let len = _utf8ScalarLength(lead)
  let scalar: Unicode.Scalar
  switch len {
  case 1:
    scalar = _decodeUTF8(lead)
  case 2:
    scalar = _decodeUTF8(lead, utf8[_unchecked: i &+ 1])
  case 3:
    scalar = _decodeUTF8(
      lead, utf8[_unchecked: i &+ 1], utf8[_unchecked: i &+ 2])
  case 4:
    scalar = _decodeUTF8(
      lead,
      utf8[_unchecked: i &+ 1],
      utf8[_unchecked: i &+ 2],
      utf8[_unchecked: i &+ 3])
  default:
    Builtin.unreachable()
  }
  return (scalar, len)
}
/// Decodes the scalar that ends immediately before offset `i` of `utf8`.
///
/// Finds the scalar's length by scanning backwards from `i`, then decodes
/// forwards from its first byte. Returns the scalar and its length in bytes.
@inlinable
internal func _decodeScalar(
  _ utf8: UnsafeBufferPointer<UInt8>, endingAt i: Int
) -> (Unicode.Scalar, scalarLength: Int) {
  let len = _utf8ScalarLength(utf8, endingAt: i)
  let decoded = _decodeScalar(utf8, startingAt: i &- len)
  // The backward scan and the forward decode must agree on the length.
  _internalInvariant(len == decoded.scalarLength)
  return (decoded.0, len)
}
/// Returns the length in bytes of the UTF-8 sequence introduced by lead
/// byte `x`.
///
/// `x` must not be a continuation byte (checked by an internal invariant).
/// For a non-ASCII lead byte, the length is the count of its leading one
/// bits.
@inlinable @inline(__always)
internal func _utf8ScalarLength(_ x: UInt8) -> Int {
  _internalInvariant(!_isContinuation(x))
  guard !_isASCII(x) else { return 1 }
  // TODO(String micro-performance): check codegen
  let inverted = ~x
  return inverted.leadingZeroBitCount
}
/// Returns the length in bytes of the scalar that ends immediately before
/// offset `i` of `utf8`.
///
/// Walks backwards over continuation bytes, without bounds checks, until it
/// reaches a non-continuation byte.
@inlinable @inline(__always)
internal func _utf8ScalarLength(
  _ utf8: UnsafeBufferPointer<UInt8>, endingAt i: Int
) -> Int {
  var leadOffset = i &- 1
  while _isContinuation(utf8[_unchecked: leadOffset]) {
    leadOffset &-= 1
  }
  let len = i &- leadOffset
  // The lead byte we stopped on must describe the same length.
  _internalInvariant(len == _utf8ScalarLength(utf8[leadOffset]))
  return len
}
/// Returns `true` when the top two bits of `x` are `10`, the pattern of a
/// UTF-8 continuation byte.
@inlinable @inline(__always)
internal func _isContinuation(_ x: UInt8) -> Bool {
  let topTwoBits = x & 0xC0
  return topTwoBits == 0x80
}
/// Returns the low six bits of `x`, widened to `UInt32`.
@inlinable
@inline(__always)
internal func _continuationPayload(_ x: UInt8) -> UInt32 {
  let payloadMask: UInt32 = 0b0011_1111
  return UInt32(x) & payloadMask
}
/// Combines a leading and a trailing surrogate into the scalar value they
/// encode.
///
/// Each surrogate contributes the ten bits left after subtracting its bias;
/// the 20-bit result is offset by 0x10000. Both arguments are only checked
/// by internal invariants.
@inline(__always)
internal func _decodeSurrogatePair(
  leading high: UInt16, trailing low: UInt16
) -> UInt32 {
  _internalInvariant(_isLeadingSurrogate(high) && _isTrailingSurrogate(low))
  let leadingBias = UInt32(_leadingSurrogateBias)
  let trailingBias = UInt32(_trailingSurrogateBias)
  let hi10 = UInt32(high) &- leadingBias
  _internalInvariant(hi10 < 1<<10, "I said high 10. Not high, like, 20 or something")
  let lo10 = UInt32(low) &- trailingBias
  _internalInvariant(lo10 < 1<<10, "I said low 10. Not low, like, 20 or something")
  let payload = (hi10 &<< 10) | lo10
  return payload &+ 0x1_00_00
}
/// Returns how many UTF-8 code units are needed to encode `scalar`:
/// 1 below 0x80, 2 below 0x800, 3 below 0x10000, and 4 otherwise.
@inline(__always)
internal func _numUTF8CodeUnits(_ scalar: Unicode.Scalar) -> Int {
  let value = scalar.value
  if value < 0x80 { return 1 }
  if value < 0x0800 { return 2 }
  if value < 0x1_0000 { return 3 }
  return 4
}
/// Returns how many UTF-16 code units are needed to encode `scalar`:
/// 1 when its value fits in a `UInt16`, otherwise 2.
@inline(__always)
internal func _numUTF16CodeUnits(_ scalar: Unicode.Scalar) -> Int {
  let fitsInOneCodeUnit = scalar.value <= UInt32(UInt16.max)
  if fitsInOneCodeUnit {
    return 1
  }
  return 2
}
/// Returns the offset of the first byte of the scalar that contains byte
/// offset `idx` in `utf8`.
///
/// If `idx` already addresses a non-continuation byte, or is the end
/// position `utf8.count`, it is returned unchanged. Otherwise the offset is
/// walked backwards over continuation bytes until a lead byte is found.
///
/// - Parameters:
///   - utf8: The UTF-8 contents to inspect.
///   - idx: A byte offset in `0...utf8.count`.
/// - Returns: The scalar-aligned offset at or before `idx`.
@inlinable @inline(__always)
internal func _scalarAlign(
  _ utf8: UnsafeBufferPointer<UInt8>, _ idx: Int
) -> Int {
  // The end position is always scalar aligned, and there is no byte at
  // `utf8.count` to inspect: reading it would go past the buffer.
  guard _fastPath(idx != utf8.count) else { return idx }
  var i = idx
  while _slowPath(_isContinuation(utf8[_unchecked: i])) {
    i &-= 1
    _internalInvariant(i >= 0,
      "Malformed contents: starts with continuation byte")
  }
  return i
}
//
// Scalar helpers
//
extension _StringGuts {
  /// Returns `idx` rounded down to the nearest Unicode scalar boundary.
  ///
  /// Indices with a non-zero transcoded offset, or at encoded offset zero,
  /// are returned as a plain index at the same encoded offset. Foreign
  /// strings are handled by `foreignScalarAlign`; otherwise the fast UTF-8
  /// contents are scanned backwards with `_scalarAlign`.
  @inlinable
  @inline(__always) // fast-path: fold common fastUTF8 check
  internal func scalarAlign(_ idx: Index) -> Index {
    // TODO(String performance): isASCII check
    if _slowPath(idx.transcodedOffset != 0 || idx._encodedOffset == 0) {
      // Transcoded indices are already scalar aligned
      return String.Index(_encodedOffset: idx._encodedOffset)
    }
    if _slowPath(self.isForeign) {
      // The end position is always scalar aligned. `foreignScalarAlign`
      // fetches the code unit at the index, and there is none at `count`, so
      // it must not be asked about the end position.
      guard idx._encodedOffset != self.count else { return idx }
      return foreignScalarAlign(idx)
    }
    return self.withFastUTF8 { utf8 in
      let i = _scalarAlign(utf8, idx._encodedOffset)
      // If no alignment is performed, keep grapheme cache
      if i == idx._encodedOffset {
        return idx
      }
      return Index(_encodedOffset: i)
    }
  }

  /// Returns the length in bytes of the scalar whose lead byte is at offset
  /// `i`. Requires fast UTF-8 contents.
  @inlinable
  internal func fastUTF8ScalarLength(startingAt i: Int) -> Int {
    _internalInvariant(isFastUTF8)
    let len = _utf8ScalarLength(self.withFastUTF8 { $0[i] })
    _internalInvariant((1...4) ~= len)
    return len
  }

  /// Returns the length in bytes of the scalar that ends immediately before
  /// offset `i`. Requires fast UTF-8 contents, and `i` must itself be on a
  /// scalar boundary (checked by an internal invariant).
  @inlinable
  internal func fastUTF8ScalarLength(endingAt i: Int) -> Int {
    _internalInvariant(isFastUTF8)
    return self.withFastUTF8 { utf8 in
      _internalInvariant(i == utf8.count || !_isContinuation(utf8[i]))
      // Walk backwards over continuation bytes until the lead byte.
      var len = 1
      while _isContinuation(utf8[i &- len]) {
        _internalInvariant(i &- len > 0)
        len += 1
      }
      _internalInvariant(len <= 4)
      return len
    }
  }

  /// Decodes and returns the scalar whose lead byte is at offset `i`.
  /// Requires fast UTF-8 contents.
  @inlinable
  internal func fastUTF8Scalar(startingAt i: Int) -> Unicode.Scalar {
    _internalInvariant(isFastUTF8)
    return self.withFastUTF8 { _decodeScalar($0, startingAt: i).0 }
  }

  /// Returns `true` when `i` falls on a Unicode scalar boundary.
  @usableFromInline
  @_effects(releasenone)
  internal func isOnUnicodeScalarBoundary(_ i: String.Index) -> Bool {
    // TODO(String micro-performance): check isASCII
    // Beginning and end are always scalar aligned; mid-scalar never is
    guard i.transcodedOffset == 0 else { return false }
    if i == self.startIndex || i == self.endIndex { return true }
    if _fastPath(isFastUTF8) {
      return self.withFastUTF8 { return !_isContinuation($0[i._encodedOffset]) }
    }
    // Foreign: aligned exactly when aligning is a no-op.
    return i == foreignScalarAlign(i)
  }
}
//
// Error-correcting helpers (U+FFFD for unpaired surrogates) for accessing
// contents of foreign strings
//
extension _StringGuts {
  /// Returns the raw UTF-16 code unit at offset `i` of a foreign string,
  /// without any error correction.
  @_effects(releasenone)
  private func _getForeignCodeUnit(at i: Int) -> UInt16 {
#if _runtime(_ObjC)
    // Currently, foreign means NSString
    return _cocoaStringSubscript(_object.cocoaObject, i)
#else
    fatalError("No foreign strings on Linux in this version of Swift")
#endif
  }

  /// Decodes the scalar starting at `idx` in a foreign string.
  ///
  /// Returns the scalar and the number of UTF-16 code units it occupies. An
  /// unpaired surrogate yields the replacement character with length 1.
  @usableFromInline
  @_effects(releasenone)
  internal func foreignErrorCorrectedScalar(
    startingAt idx: String.Index
  ) -> (Unicode.Scalar, scalarLength: Int) {
    _internalInvariant(idx.transcodedOffset == 0)
    _internalInvariant(idx._encodedOffset < self.count)
    let start = idx._encodedOffset
    let leading = _getForeignCodeUnit(at: start)
    if _fastPath(!_isSurrogate(leading)) {
      return (Unicode.Scalar(_unchecked: UInt32(leading)), 1)
    }
    // Validate foreign strings on-read: trailing surrogates are invalid,
    // leading need following trailing
    //
    // TODO(String performance): Consider having a valid performance flag
    // available to check, and assert it's not set in the condition here.
    let nextOffset = start &+ 1
    if _slowPath(_isTrailingSurrogate(leading) || nextOffset == self.count) {
      return (Unicode.Scalar._replacementCharacter, 1)
    }
    let trailing = _getForeignCodeUnit(at: nextOffset)
    if _slowPath(!_isTrailingSurrogate(trailing)) {
      return (Unicode.Scalar._replacementCharacter, 1)
    }
    return (Unicode.Scalar(
      _unchecked: _decodeSurrogatePair(leading: leading, trailing: trailing)),
      2)
  }

  /// Decodes the scalar ending immediately before `idx` in a foreign string.
  ///
  /// Returns the scalar and the number of UTF-16 code units it occupies. An
  /// unpaired surrogate yields the replacement character with length 1.
  @_effects(releasenone)
  internal func foreignErrorCorrectedScalar(
    endingAt idx: String.Index
  ) -> (Unicode.Scalar, scalarLength: Int) {
    _internalInvariant(idx.transcodedOffset == 0)
    _internalInvariant(idx._encodedOffset <= self.count)
    _internalInvariant(idx._encodedOffset > 0)
    let end = idx._encodedOffset
    let trailing = _getForeignCodeUnit(at: end &- 1)
    if _fastPath(!_isSurrogate(trailing)) {
      return (Unicode.Scalar(_unchecked: UInt32(trailing)), 1)
    }
    // Validate foreign strings on-read: trailing surrogates are invalid,
    // leading need following trailing
    //
    // TODO(String performance): Consider having a valid performance flag
    // available to check, and assert it's not set in the condition here.
    let priorOffset = end &- 2
    if _slowPath(_isLeadingSurrogate(trailing) || priorOffset < 0) {
      return (Unicode.Scalar._replacementCharacter, 1)
    }
    let leading = _getForeignCodeUnit(at: priorOffset)
    if _slowPath(!_isLeadingSurrogate(leading)) {
      return (Unicode.Scalar._replacementCharacter, 1)
    }
    return (Unicode.Scalar(
      _unchecked: _decodeSurrogatePair(leading: leading, trailing: trailing)),
      2)
  }

  /// Returns the UTF-16 code unit at `idx` in a foreign string, substituting
  /// the replacement code unit for a surrogate that is not part of a
  /// well-formed pair.
  @_effects(releasenone)
  internal func foreignErrorCorrectedUTF16CodeUnit(
    at idx: String.Index
  ) -> UInt16 {
    _internalInvariant(idx.transcodedOffset == 0)
    _internalInvariant(idx._encodedOffset < self.count)
    let start = idx._encodedOffset
    let cu = _getForeignCodeUnit(at: start)
    if _fastPath(!_isSurrogate(cu)) {
      return cu
    }
    // Validate foreign strings on-read: trailing surrogates are invalid,
    // leading need following trailing
    //
    // TODO(String performance): Consider having a valid performance flag
    // available to check, and assert it's not set in the condition here.
    if _isLeadingSurrogate(cu) {
      let nextOffset = start &+ 1
      guard nextOffset < self.count,
            _isTrailingSurrogate(_getForeignCodeUnit(at: nextOffset))
      else { return UTF16._replacementCodeUnit }
    } else {
      let priorOffset = start &- 1
      guard priorOffset >= 0,
            _isLeadingSurrogate(_getForeignCodeUnit(at: priorOffset))
      else { return UTF16._replacementCodeUnit }
    }
    return cu
  }

  /// Returns `idx` rounded down to the nearest scalar boundary in a foreign
  /// string.
  ///
  /// An index addressing the trailing half of a well-formed surrogate pair
  /// is moved back by one code unit; any other index, including the end
  /// position, is returned unchanged.
  @usableFromInline @inline(never) // slow-path
  @_effects(releasenone)
  internal func foreignScalarAlign(_ idx: Index) -> Index {
    // The end position is always scalar aligned and has no code unit to
    // fetch; asking the foreign string for it would read out of bounds.
    guard idx._encodedOffset != self.count else { return idx }
    _internalInvariant(idx._encodedOffset < self.count)
    let ecCU = foreignErrorCorrectedUTF16CodeUnit(at: idx)
    if _fastPath(!_isTrailingSurrogate(ecCU)) {
      return idx
    }
    _internalInvariant(idx._encodedOffset > 0,
      "Error-correction shouldn't give trailing surrogate at position zero")
    return String.Index(_encodedOffset: idx._encodedOffset &- 1)
  }

  /// Builds the `Character` covering UTF-16 offsets `start..<end` of a
  /// foreign string.
  ///
  /// A single-code-unit range goes through `foreignErrorCorrectedScalar`, so
  /// an isolated surrogate is error-corrected. Longer ranges are copied into
  /// a temporary buffer and converted without further validation.
  @usableFromInline @inline(never)
  @_effects(releasenone)
  internal func foreignErrorCorrectedGrapheme(
    startingAt start: Int, endingAt end: Int
  ) -> Character {
#if _runtime(_ObjC)
    _internalInvariant(self.isForeign)
    // Both a fast-path for single-code-unit graphemes and validation:
    // ICU treats isolated surrogates as isolated graphemes
    let count = end &- start
    // Compare the length, not `start &- end`: that difference is negative
    // for any non-empty range and so could never equal 1.
    if count == 1 {
      return Character(String(self.foreignErrorCorrectedScalar(
        startingAt: String.Index(_encodedOffset: start)
      ).0))
    }
    // TODO(String performance): Stack buffer if small enough
    var cus = [UInt16](repeating: 0, count: count)
    cus.withUnsafeMutableBufferPointer {
      _cocoaStringCopyCharacters(
        from: self._object.cocoaObject,
        range: start..<end,
        into: $0.baseAddress._unsafelyUnwrappedUnchecked)
    }
    return cus.withUnsafeBufferPointer {
      return Character(String._uncheckedFromUTF16($0))
    }
#else
    fatalError("No foreign strings on Linux in this version of Swift")
#endif
  }
}
// Higher level aggregate operations. These should only be called when the
// result is the sole operation done by a caller, otherwise it's always more
// efficient to use `withFastUTF8` in the caller.
extension _StringGuts {
  /// Decodes the scalar starting at encoded offset `i`, returning it with
  /// its length in code units.
  ///
  /// Fast UTF-8 contents are decoded directly; anything else goes through
  /// `foreignErrorCorrectedScalar(startingAt:)`.
  @inlinable @inline(__always)
  internal func errorCorrectedScalar(
    startingAt i: Int
  ) -> (Unicode.Scalar, scalarLength: Int) {
    guard _fastPath(isFastUTF8) else {
      let foreignIndex = String.Index(_encodedOffset: i)
      return foreignErrorCorrectedScalar(startingAt: foreignIndex)
    }
    return withFastUTF8 { utf8 in
      _decodeScalar(utf8, startingAt: i)
    }
  }

  /// Builds the `Character` covering encoded offsets `start..<end`.
  ///
  /// Fast UTF-8 contents are wrapped without validation; anything else goes
  /// through `foreignErrorCorrectedGrapheme(startingAt:endingAt:)`.
  @inlinable @inline(__always)
  internal func errorCorrectedCharacter(
    startingAt start: Int, endingAt end: Int
  ) -> Character {
    guard _fastPath(isFastUTF8) else {
      return foreignErrorCorrectedGrapheme(startingAt: start, endingAt: end)
    }
    let graphemeRange = start..<end
    return withFastUTF8(range: graphemeRange) { bytes in
      let contents = String._uncheckedFromUTF8(bytes)
      return Character(unchecked: contents)
    }
  }
}