blob: d0addb39ddb233a34b13f1d0916ac3136bc7b9a6 [file] [log] [blame]
// This source file is part of the Swift.org open source project
// Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
// Low-level helper functions and utilities for interpreting Unicode
/// Decodes a one-byte UTF-8 sequence into a Unicode scalar.
///
/// - Parameter x: The only code unit of the sequence; expected to be ASCII.
/// - Returns: The scalar whose value is `x`, built with the unchecked
///   initializer (no validation is performed on the result).
internal func _decodeUTF8(_ x: UInt8) -> Unicode.Scalar {
  // Mirror the lead-byte checks made by the multi-byte overloads.
  _internalInvariant(UTF8.isASCII(x))
  return Unicode.Scalar(_unchecked: UInt32(x))
}
/// Decodes a two-byte UTF-8 sequence into a Unicode scalar.
///
/// - Parameters:
///   - x: The lead byte; `_utf8ScalarLength(x)` must be 2.
///   - y: The continuation byte.
/// - Returns: The scalar formed from the low five bits of `x` followed by the
///   six payload bits of `y`, built with the unchecked initializer.
internal func _decodeUTF8(_ x: UInt8, _ y: UInt8) -> Unicode.Scalar {
  _internalInvariant(_utf8ScalarLength(x) == 2)
  // Same continuation check the three- and four-byte overloads perform.
  _internalInvariant(UTF8.isContinuation(y))
  let leadPayload = UInt32(x) & 0b0001_1111
  let value = (leadPayload &<< 6) | _continuationPayload(y)
  return Unicode.Scalar(_unchecked: value)
}
/// Decodes a three-byte UTF-8 sequence into a Unicode scalar.
///
/// - Parameters:
///   - x: The lead byte; `_utf8ScalarLength(x)` must be 3.
///   - y: The first continuation byte.
///   - z: The second continuation byte.
/// - Returns: The scalar formed from the low four bits of `x` followed by the
///   six payload bits of `y` and of `z`, built with the unchecked initializer.
internal func _decodeUTF8(
  _ x: UInt8, _ y: UInt8, _ z: UInt8
) -> Unicode.Scalar {
  _internalInvariant(_utf8ScalarLength(x) == 3)
  _internalInvariant(UTF8.isContinuation(y) && UTF8.isContinuation(z))
  let leadPayload = UInt32(x) & 0b0000_1111
  let value = (leadPayload &<< 12)
            | (_continuationPayload(y) &<< 6)
            | _continuationPayload(z)
  return Unicode.Scalar(_unchecked: value)
}
/// Decodes a four-byte UTF-8 sequence into a Unicode scalar.
///
/// - Parameters:
///   - x: The lead byte; `_utf8ScalarLength(x)` must be 4.
///   - y: The first continuation byte.
///   - z: The second continuation byte.
///   - w: The third continuation byte.
/// - Returns: The scalar formed from the lead-byte payload followed by the
///   six payload bits of `y`, `z`, and `w`, built with the unchecked
///   initializer.
internal func _decodeUTF8(
  _ x: UInt8, _ y: UInt8, _ z: UInt8, _ w: UInt8
) -> Unicode.Scalar {
  _internalInvariant(_utf8ScalarLength(x) == 4)
  _internalInvariant(
    UTF8.isContinuation(y) && UTF8.isContinuation(z)
    && UTF8.isContinuation(w))
  // When the length invariant holds, the lead byte is `11110xxx`, so bit 3 is
  // clear and the four-bit mask extracts exactly the three payload bits.
  let leadPayload = UInt32(x) & 0b0000_1111
  let value = (leadPayload &<< 18)
            | (_continuationPayload(y) &<< 12)
            | (_continuationPayload(z) &<< 6)
            | _continuationPayload(w)
  return Unicode.Scalar(_unchecked: value)
}
/// Decodes the scalar that begins at offset `i` of a UTF-16 buffer.
///
/// No validation happens here: a code unit that is not a lead surrogate, or
/// that is the last code unit in the buffer, is returned as a scalar through
/// the unchecked initializer; otherwise the code unit and its successor are
/// handed to `UTF16._decodeSurrogates` without checking the successor.
///
/// - Parameters:
///   - utf16: The buffer to read from.
///   - i: The offset of the first code unit of the scalar.
/// - Returns: The decoded scalar and the number of code units consumed
///   (1 or 2).
internal func _decodeScalar(
  _ utf16: UnsafeBufferPointer<UInt16>, startingAt i: Int
) -> (Unicode.Scalar, scalarLength: Int) {
  let high = utf16[i]

  // A pair is only decoded when a following code unit exists and `high` is a
  // lead surrogate; everything else is treated as a single code unit.
  guard i + 1 < utf16.count, UTF16.isLeadSurrogate(high) else {
    return (Unicode.Scalar(_unchecked: UInt32(high)), 1)
  }

  let low = utf16[i + 1]
  return (UTF16._decodeSurrogates(high, low), 2)
}
/// Decodes the scalar that begins at offset `i` of a UTF-8 buffer.
///
/// The length is derived from the lead byte via `_utf8ScalarLength`, and the
/// following code units are read with unchecked subscripts, so the buffer is
/// expected to hold a complete sequence starting at `i`.
///
/// - Parameters:
///   - utf8: The buffer to read from.
///   - i: The offset of the lead byte of the scalar.
/// - Returns: The decoded scalar and its length in code units (1 through 4).
internal func _decodeScalar(
  _ utf8: UnsafeBufferPointer<UInt8>, startingAt i: Int
) -> (Unicode.Scalar, scalarLength: Int) {
  let cu0 = utf8[_unchecked: i]
  let len = _utf8ScalarLength(cu0)
  switch len {
  case 1: return (_decodeUTF8(cu0), len)
  case 2: return (_decodeUTF8(cu0, utf8[_unchecked: i &+ 1]), len)
  case 3: return (_decodeUTF8(
    cu0, utf8[_unchecked: i &+ 1], utf8[_unchecked: i &+ 2]), len)
  case 4:
    // The lead byte must be passed along with the three continuation bytes,
    // and the tuple carries `len` like every other case.
    return (_decodeUTF8(
      cu0,
      utf8[_unchecked: i &+ 1],
      utf8[_unchecked: i &+ 2],
      utf8[_unchecked: i &+ 3]),
    len)
  default: Builtin.unreachable()
  }
}
/// Decodes the scalar that ends just before offset `i` of a UTF-8 buffer.
///
/// The scalar's length is found by scanning backward from `i`, then the
/// scalar is decoded forward from its first code unit.
///
/// - Parameters:
///   - utf8: The buffer to read from.
///   - i: The offset one past the last code unit of the scalar.
/// - Returns: The decoded scalar and its length in code units.
internal func _decodeScalar(
  _ utf8: UnsafeBufferPointer<UInt8>, endingAt i: Int
) -> (Unicode.Scalar, scalarLength: Int) {
  let len = _utf8ScalarLength(utf8, endingAt: i)
  let (scalar, scalarLen) = _decodeScalar(utf8, startingAt: i &- len)
  // The backward scan and the lead byte must agree on the length.
  _internalInvariant(len == scalarLen)
  return (scalar, len)
}
/// Returns the length, in code units, of the UTF-8 sequence introduced by `x`.
///
/// ASCII bytes yield 1. For any other byte the result is the number of
/// leading one bits in `x`, which for a multi-byte lead byte is the sequence
/// length. Callers are expected to pass the first code unit of a scalar.
///
/// - Parameter x: The first code unit of a UTF-8 encoded scalar.
/// - Returns: The number of code units in the encoded scalar.
@inlinable @inline(__always)
internal func _utf8ScalarLength(_ x: UInt8) -> Int {
  if UTF8.isASCII(x) { return 1 }
  // TODO(String micro-performance): check codegen
  // Leading zeros of the complement == leading ones of `x`.
  return (~x).leadingZeroBitCount
}
/// Returns the length, in code units, of the UTF-8 scalar that ends just
/// before offset `i`.
///
/// Scans backward from `i` over continuation bytes using unchecked
/// subscripts, so the buffer is expected to contain a lead byte before `i`.
///
/// - Parameters:
///   - utf8: The buffer to read from.
///   - i: The offset one past the last code unit of the scalar.
/// - Returns: The number of code units from the lead byte up to `i`.
@inlinable @inline(__always)
internal func _utf8ScalarLength(
  _ utf8: UnsafeBufferPointer<UInt8>, endingAt i: Int
) -> Int {
  var len = 1
  while UTF8.isContinuation(utf8[_unchecked: i &- len]) {
    len &+= 1
  }
  // The byte the scan stopped on must be a lead byte announcing `len` units.
  _internalInvariant(len == _utf8ScalarLength(utf8[i &- len]))
  return len
}
/// Extracts the six payload bits of a UTF-8 continuation byte.
///
/// - Parameter x: A code unit; only its low six bits are used.
/// - Returns: `x & 0x3F`, widened to `UInt32`.
internal func _continuationPayload(_ x: UInt8) -> UInt32 {
  return UInt32(x & 0x3F)
}
/// Rounds `idx` down to the offset of the nearest scalar's first code unit.
///
/// Walks backward from `idx` for as long as the code unit at the current
/// offset is a UTF-8 continuation byte. Reads use unchecked subscripts.
///
/// - Parameters:
///   - utf8: The buffer to read from.
///   - idx: The offset to align; it must address a code unit in `utf8`.
/// - Returns: The greatest offset not exceeding `idx` whose code unit is not
///   a continuation byte.
@inlinable @inline(__always)
internal func _scalarAlign(
  _ utf8: UnsafeBufferPointer<UInt8>, _ idx: Int
) -> Int {
  var i = idx
  while _slowPath(UTF8.isContinuation(utf8[_unchecked: i])) {
    i &-= 1
    // Checked before the next read so the scan never runs off the front.
    _internalInvariant(i >= 0,
      "Malformed contents: starts with continuation byte")
  }
  return i
}
// Scalar helpers
extension _StringGuts {
  /// Rounds `idx` down to the nearest Unicode scalar boundary.
  ///
  /// Indices with a non-zero transcoded offset, or at encoded offset zero,
  /// are returned as a fresh index at the same encoded offset. Foreign
  /// strings are delegated to `foreignScalarAlign`. Otherwise the UTF-8
  /// contents are scanned backward with `_scalarAlign`.
  ///
  /// - Parameter idx: The index to align.
  /// - Returns: `idx` itself when it is already aligned in fast UTF-8
  ///   storage, otherwise an index at the aligned encoded offset.
  @inline(__always) // fast-path: fold common fastUTF8 check
  internal func scalarAlign(_ idx: Index) -> Index {
    // TODO(String performance): isASCII check
    if _slowPath(idx.transcodedOffset != 0 || idx._encodedOffset == 0) {
      // Transcoded indices are already scalar aligned
      return String.Index(_encodedOffset: idx._encodedOffset)
    }
    if _slowPath(self.isForeign) {
      return foreignScalarAlign(idx)
    }

    return self.withFastUTF8 { utf8 in
      let i = _scalarAlign(utf8, idx._encodedOffset)

      // If no alignment is performed, keep grapheme cache
      if i == idx._encodedOffset {
        return idx
      }

      return Index(_encodedOffset: i)
    }
  }

  /// Returns the length, in code units, of the UTF-8 scalar whose lead byte
  /// is at offset `i` of the fast UTF-8 contents.
  ///
  /// - Parameter i: The offset of the scalar's first code unit.
  /// - Returns: A length in `1...4`.
  internal func fastUTF8ScalarLength(startingAt i: Int) -> Int {
    let len = _utf8ScalarLength(self.withFastUTF8 { $0[i] })
    _internalInvariant((1...4) ~= len)
    return len
  }

  /// Returns the length, in code units, of the UTF-8 scalar that ends just
  /// before offset `i` of the fast UTF-8 contents.
  ///
  /// - Parameter i: The offset one past the scalar's last code unit; it must
  ///   be the end of the contents or the start of another scalar.
  /// - Returns: A length of at most 4.
  internal func fastUTF8ScalarLength(endingAt i: Int) -> Int {
    return self.withFastUTF8 { utf8 in
      _internalInvariant(i == utf8.count || !UTF8.isContinuation(utf8[i]))
      var len = 1
      while UTF8.isContinuation(utf8[i &- len]) {
        // A continuation byte at offset zero would mean malformed contents.
        _internalInvariant(i &- len > 0)
        len += 1
      }
      _internalInvariant(len <= 4)
      return len
    }
  }

  /// Decodes the scalar whose lead byte is at offset `i` of the fast UTF-8
  /// contents.
  ///
  /// - Parameter i: The offset of the scalar's first code unit.
  /// - Returns: The decoded scalar.
  internal func fastUTF8Scalar(startingAt i: Int) -> Unicode.Scalar {
    return self.withFastUTF8 { _decodeScalar($0, startingAt: i).0 }
  }

  /// Returns whether `i` lies on a Unicode scalar boundary.
  ///
  /// - Parameter i: The index to test.
  /// - Returns: `false` for any index with a non-zero transcoded offset;
  ///   `true` for the start and end indices; otherwise whether the code unit
  ///   at `i` begins a scalar.
  internal func isOnUnicodeScalarBoundary(_ i: String.Index) -> Bool {
    // TODO(String micro-performance): check isASCII

    // Beginning and end are always scalar aligned; mid-scalar never is
    guard i.transcodedOffset == 0 else { return false }
    if i == self.startIndex || i == self.endIndex { return true }

    if _fastPath(isFastUTF8) {
      return self.withFastUTF8 {
        return !UTF8.isContinuation($0[i._encodedOffset])
      }
    }

    // Foreign contents: aligned exactly when alignment leaves `i` unchanged.
    return i == foreignScalarAlign(i)
  }
}
// Error-correcting helpers (U+FFFD for unpaired surrogates) for accessing
// contents of foreign strings
extension _StringGuts {
  /// Reads the UTF-16 code unit at offset `i` of a foreign string.
  ///
  /// - Parameter i: The UTF-16 offset to read.
  /// - Returns: The raw code unit, with no error correction applied.
  private func _getForeignCodeUnit(at i: Int) -> UInt16 {
#if _runtime(_ObjC)
    // Currently, foreign means NSString
    return _cocoaStringSubscript(_object.cocoaObject, i)
#else
    fatalError("No foreign strings on Linux in this version of Swift")
#endif
  }

  /// Decodes the scalar starting at `idx` in a foreign string, substituting
  /// the replacement character for an unpaired surrogate.
  ///
  /// - Parameter idx: An index with zero transcoded offset, before the end.
  /// - Returns: The scalar and the number of UTF-16 code units it spans
  ///   (2 for a valid surrogate pair, otherwise 1).
  internal func foreignErrorCorrectedScalar(
    startingAt idx: String.Index
  ) -> (Unicode.Scalar, scalarLength: Int) {
    _internalInvariant(idx.transcodedOffset == 0)
    _internalInvariant(idx._encodedOffset < self.count)

    let start = idx._encodedOffset
    let leading = _getForeignCodeUnit(at: start)

    if _fastPath(!UTF16.isSurrogate(leading)) {
      return (Unicode.Scalar(_unchecked: UInt32(leading)), 1)
    }

    // Validate foreign strings on-read: trailing surrogates are invalid,
    // leading need following trailing
    //
    // TODO(String performance): Consider having a valid performance flag
    // available to check, and assert it's not set in the condition here.
    let nextOffset = start &+ 1
    if _slowPath(UTF16.isTrailSurrogate(leading) || nextOffset == self.count) {
      return (Unicode.Scalar._replacementCharacter, 1)
    }
    let trailing = _getForeignCodeUnit(at: nextOffset)
    if _slowPath(!UTF16.isTrailSurrogate(trailing)) {
      return (Unicode.Scalar._replacementCharacter, 1)
    }

    return (UTF16._decodeSurrogates(leading, trailing), 2)
  }

  /// Decodes the scalar ending just before `idx` in a foreign string,
  /// substituting the replacement character for an unpaired surrogate.
  ///
  /// - Parameter idx: An index with zero transcoded offset, after the start
  ///   and no later than the end.
  /// - Returns: The scalar and the number of UTF-16 code units it spans
  ///   (2 for a valid surrogate pair, otherwise 1).
  internal func foreignErrorCorrectedScalar(
    endingAt idx: String.Index
  ) -> (Unicode.Scalar, scalarLength: Int) {
    _internalInvariant(idx.transcodedOffset == 0)
    _internalInvariant(idx._encodedOffset <= self.count)
    _internalInvariant(idx._encodedOffset > 0)

    let end = idx._encodedOffset
    let trailing = _getForeignCodeUnit(at: end &- 1)
    if _fastPath(!UTF16.isSurrogate(trailing)) {
      return (Unicode.Scalar(_unchecked: UInt32(trailing)), 1)
    }

    // Validate foreign strings on-read: trailing surrogates are invalid,
    // leading need following trailing
    //
    // TODO(String performance): Consider having a valid performance flag
    // available to check, and assert it's not set in the condition here.
    let priorOffset = end &- 2
    if _slowPath(UTF16.isLeadSurrogate(trailing) || priorOffset < 0) {
      return (Unicode.Scalar._replacementCharacter, 1)
    }
    let leading = _getForeignCodeUnit(at: priorOffset)
    if _slowPath(!UTF16.isLeadSurrogate(leading)) {
      return (Unicode.Scalar._replacementCharacter, 1)
    }

    return (UTF16._decodeSurrogates(leading, trailing), 2)
  }

  /// Reads the UTF-16 code unit at `idx` in a foreign string, substituting
  /// the replacement code unit for an unpaired surrogate.
  ///
  /// A lead surrogate is kept only when followed by a trail surrogate; a
  /// trail surrogate is kept only when preceded by a lead surrogate.
  ///
  /// - Parameter idx: An index with zero transcoded offset, before the end.
  /// - Returns: The code unit at `idx`, or `UTF16._replacementCodeUnit`.
  internal func foreignErrorCorrectedUTF16CodeUnit(
    at idx: String.Index
  ) -> UInt16 {
    _internalInvariant(idx.transcodedOffset == 0)
    _internalInvariant(idx._encodedOffset < self.count)

    let start = idx._encodedOffset
    let cu = _getForeignCodeUnit(at: start)
    if _fastPath(!UTF16.isSurrogate(cu)) {
      return cu
    }

    // Validate foreign strings on-read: trailing surrogates are invalid,
    // leading need following trailing
    //
    // TODO(String performance): Consider having a valid performance flag
    // available to check, and assert it's not set in the condition here.
    if UTF16.isLeadSurrogate(cu) {
      let nextOffset = start &+ 1
      guard nextOffset < self.count,
        UTF16.isTrailSurrogate(_getForeignCodeUnit(at: nextOffset))
      else { return UTF16._replacementCodeUnit }
    } else {
      let priorOffset = start &- 1
      guard priorOffset >= 0,
        UTF16.isLeadSurrogate(_getForeignCodeUnit(at: priorOffset))
      else { return UTF16._replacementCodeUnit }
    }

    return cu
  }

  /// Rounds `idx` down to the nearest scalar boundary in a foreign string.
  ///
  /// - Parameter idx: An index before the end of the string.
  /// - Returns: `idx` unless its error-corrected code unit is a trail
  ///   surrogate, in which case the index one code unit earlier.
  @usableFromInline @inline(never) // slow-path
  internal func foreignScalarAlign(_ idx: Index) -> Index {
    _internalInvariant(idx._encodedOffset < self.count)

    let ecCU = foreignErrorCorrectedUTF16CodeUnit(at: idx)
    if _fastPath(!UTF16.isTrailSurrogate(ecCU)) {
      return idx
    }
    _internalInvariant(idx._encodedOffset > 0,
      "Error-correction shouldn't give trailing surrogate at position zero")
    return String.Index(_encodedOffset: idx._encodedOffset &- 1)
  }

  /// Builds the `Character` spanning UTF-16 offsets `start..<end` of a
  /// foreign string.
  ///
  /// - Parameters:
  ///   - start: The offset of the grapheme's first code unit.
  ///   - end: The offset one past the grapheme's last code unit.
  /// - Returns: The grapheme as a `Character`; a single-code-unit grapheme is
  ///   produced through `foreignErrorCorrectedScalar(startingAt:)`.
  @usableFromInline @inline(never)
  internal func foreignErrorCorrectedGrapheme(
    startingAt start: Int, endingAt end: Int
  ) -> Character {
#if _runtime(_ObjC)
    // Both a fast-path for single-code-unit graphemes and validation:
    // ICU treats isolated surrogates as isolated graphemes
    let count = end &- start
    // Compare the code-unit count itself: `start &- end` is negative for any
    // non-empty range, so testing it against 1 would never take this path.
    if count == 1 {
      return Character(String(self.foreignErrorCorrectedScalar(
        startingAt: String.Index(_encodedOffset: start)
      ).0))
    }

    // TODO(String performance): Stack buffer if small enough
    var cus = Array<UInt16>(repeating: 0, count: count)
    cus.withUnsafeMutableBufferPointer {
      // NOTE(review): the callee name was missing from this call and has been
      // reconstructed from its argument labels; confirm it matches the Cocoa
      // bridging helper declared elsewhere in the module.
      _cocoaStringCopyCharacters(
        from: self._object.cocoaObject,
        range: start..<end,
        into: $0.baseAddress._unsafelyUnwrappedUnchecked)
    }
    return cus.withUnsafeBufferPointer {
      return Character(String._uncheckedFromUTF16($0))
    }
#else
    fatalError("No foreign strings on Linux in this version of Swift")
#endif
  }
}
// Higher level aggregate operations. These should only be called when the
// result is the sole operation done by a caller, otherwise it's always more
// efficient to use `withFastUTF8` in the caller.
extension _StringGuts {
  /// Decodes the scalar starting at code-unit offset `i`.
  ///
  /// Fast UTF-8 contents are decoded directly with `_decodeScalar`; anything
  /// else goes through `foreignErrorCorrectedScalar(startingAt:)`.
  ///
  /// - Parameter i: The encoded offset of the scalar's first code unit.
  /// - Returns: The scalar and its length in code units of the underlying
  ///   storage.
  @inlinable @inline(__always)
  internal func errorCorrectedScalar(
    startingAt i: Int
  ) -> (Unicode.Scalar, scalarLength: Int) {
    if _fastPath(isFastUTF8) {
      return withFastUTF8 { _decodeScalar($0, startingAt: i) }
    }
    return foreignErrorCorrectedScalar(
      startingAt: String.Index(_encodedOffset: i))
  }

  /// Builds the `Character` spanning code-unit offsets `start..<end`.
  ///
  /// Fast UTF-8 contents are wrapped without validation; anything else goes
  /// through `foreignErrorCorrectedGrapheme(startingAt:endingAt:)`.
  ///
  /// - Parameters:
  ///   - start: The encoded offset of the grapheme's first code unit.
  ///   - end: The encoded offset one past the grapheme's last code unit.
  /// - Returns: The grapheme as a `Character`.
  @inlinable @inline(__always)
  internal func errorCorrectedCharacter(
    startingAt start: Int, endingAt end: Int
  ) -> Character {
    if _fastPath(isFastUTF8) {
      return withFastUTF8(range: start..<end) { utf8 in
        return Character(unchecked: String._uncheckedFromUTF8(utf8))
      }
    }
    return foreignErrorCorrectedGrapheme(startingAt: start, endingAt: end)
  }
}