blob: bd0ec9f72c5044498503560720d7c4cf5e862eaa [file] [log] [blame]
private func _isUTF8MultiByteLeading(_ x: UInt8) -> Bool {
return (0xC2...0xF4).contains(x)
}
private func _isNotOverlong_F0(_ x: UInt8) -> Bool {
return (0x90...0xBF).contains(x)
}
private func _isNotOverlong_F4(_ x: UInt8) -> Bool {
return UTF8.isContinuation(x) && x <= 0x8F
}
private func _isNotOverlong_E0(_ x: UInt8) -> Bool {
return (0xA0...0xBF).contains(x)
}
private func _isNotOverlong_ED(_ x: UInt8) -> Bool {
return UTF8.isContinuation(x) && x <= 0x9F
}
internal struct UTF8ExtraInfo: Equatable {
public var isASCII: Bool
}
internal enum UTF8ValidationResult {
case success(UTF8ExtraInfo)
case error(toBeReplaced: Range<Int>)
}
extension UTF8ValidationResult: Equatable {}
private struct UTF8ValidationError: Error {}
internal func validateUTF8(_ buf: UnsafeBufferPointer<UInt8>) -> UTF8ValidationResult {
if _allASCII(buf) {
return .success(UTF8ExtraInfo(isASCII: true))
}
var iter = buf.makeIterator()
var lastValidIndex = buf.startIndex
@inline(__always) func guaranteeIn(_ f: (UInt8) -> Bool) throws {
guard let cu = iter.next() else { throw UTF8ValidationError() }
guard f(cu) else { throw UTF8ValidationError() }
}
@inline(__always) func guaranteeContinuation() throws {
try guaranteeIn(UTF8.isContinuation)
}
func _legacyInvalidLengthCalculation(_ _buffer: (_storage: UInt32, ())) -> Int {
// function body copied from UTF8.ForwardParser._invalidLength
if _buffer._storage & 0b0__1100_0000__1111_0000
== 0b0__1000_0000__1110_0000 {
// 2-byte prefix of 3-byte sequence. The top 5 bits of the decoded result
// must be nonzero and not a surrogate
let top5Bits = _buffer._storage & 0b0__0010_0000__0000_1111
if top5Bits != 0 && top5Bits != 0b0__0010_0000__0000_1101 { return 2 }
}
else if _buffer._storage & 0b0__1100_0000__1111_1000
== 0b0__1000_0000__1111_0000
{
// Prefix of 4-byte sequence. The top 5 bits of the decoded result
// must be nonzero and no greater than 0b0__0100_0000
let top5bits = UInt16(_buffer._storage & 0b0__0011_0000__0000_0111)
if top5bits != 0 && top5bits.byteSwapped <= 0b0__0000_0100__0000_0000 {
return _buffer._storage & 0b0__1100_0000__0000_0000__0000_0000
== 0b0__1000_0000__0000_0000__0000_0000 ? 3 : 2
}
}
return 1
}
func _legacyNarrowIllegalRange(buf: Slice<UnsafeBufferPointer<UInt8>>) -> Range<Int> {
var reversePacked: UInt32 = 0
if let third = buf.dropFirst(2).first {
reversePacked |= UInt32(third)
reversePacked <<= 8
}
if let second = buf.dropFirst().first {
reversePacked |= UInt32(second)
reversePacked <<= 8
}
reversePacked |= UInt32(buf.first!)
let _buffer: (_storage: UInt32, x: ()) = (reversePacked, ())
let invalids = _legacyInvalidLengthCalculation(_buffer)
return buf.startIndex ..< buf.startIndex + invalids
}
func findInvalidRange(_ buf: Slice<UnsafeBufferPointer<UInt8>>) -> Range<Int> {
var endIndex = buf.startIndex
var iter = buf.makeIterator()
_ = iter.next()
while let cu = iter.next(), UTF8.isContinuation(cu) {
endIndex += 1
}
let illegalRange = Range(buf.startIndex...endIndex)
_internalInvariant(illegalRange.clamped(to: (buf.startIndex..<buf.endIndex)) == illegalRange,
"illegal range out of full range")
// FIXME: Remove the call to `_legacyNarrowIllegalRange` and return `illegalRange` directly
return _legacyNarrowIllegalRange(buf: buf[illegalRange])
}
do {
var isASCII = true
while let cu = iter.next() {
if UTF8.isASCII(cu) { lastValidIndex &+= 1; continue }
isASCII = false
if _slowPath(!_isUTF8MultiByteLeading(cu)) {
throw UTF8ValidationError()
}
switch cu {
case 0xC2...0xDF:
try guaranteeContinuation()
lastValidIndex &+= 2
case 0xE0:
try guaranteeIn(_isNotOverlong_E0)
try guaranteeContinuation()
lastValidIndex &+= 3
case 0xE1...0xEC:
try guaranteeContinuation()
try guaranteeContinuation()
lastValidIndex &+= 3
case 0xED:
try guaranteeIn(_isNotOverlong_ED)
try guaranteeContinuation()
lastValidIndex &+= 3
case 0xEE...0xEF:
try guaranteeContinuation()
try guaranteeContinuation()
lastValidIndex &+= 3
case 0xF0:
try guaranteeIn(_isNotOverlong_F0)
try guaranteeContinuation()
try guaranteeContinuation()
lastValidIndex &+= 4
case 0xF1...0xF3:
try guaranteeContinuation()
try guaranteeContinuation()
try guaranteeContinuation()
lastValidIndex &+= 4
case 0xF4:
try guaranteeIn(_isNotOverlong_F4)
try guaranteeContinuation()
try guaranteeContinuation()
lastValidIndex &+= 4
default:
Builtin.unreachable()
}
}
return .success(UTF8ExtraInfo(isASCII: isASCII))
} catch {
return .error(toBeReplaced: findInvalidRange(buf[lastValidIndex...]))
}
}
internal func repairUTF8(_ input: UnsafeBufferPointer<UInt8>, firstKnownBrokenRange: Range<Int>) -> String {
_internalInvariant(input.count > 0, "empty input doesn't need to be repaired")
_internalInvariant(firstKnownBrokenRange.clamped(to: input.indices) == firstKnownBrokenRange)
// During this process, `remainingInput` contains the remaining bytes to process. It's split into three
// non-overlapping sub-regions:
//
// 1. `goodChunk` (may be empty) containing bytes that are known good UTF-8 and can be copied into the output String
// 2. `brokenRange` (never empty) the next range of broken bytes,
// 3. the remainder (implicit, will become the next `remainingInput`)
//
// At the beginning of the process, the `goodChunk` starts at the beginning and extends to just before the first
// known broken byte. The known broken bytes are covered in the `brokenRange` and everything following that is
// the remainder.
// We then copy the `goodChunk` into the target buffer and append a UTF8 replacement character. `brokenRange` is
// skipped (replaced by the replacement character) and we restart the same process. This time, `goodChunk` extends
// from the byte after the previous `brokenRange` to the next `brokenRange`.
var result = _StringGuts()
let replacementCharacterCount = Unicode.Scalar._replacementCharacter.withUTF8CodeUnits { $0.count }
result.reserveCapacity(input.count + 5 * replacementCharacterCount) // extra space for some replacement characters
var brokenRange: Range<Int> = firstKnownBrokenRange
var remainingInput = input
repeat {
_internalInvariant(brokenRange.count > 0, "broken range empty")
_internalInvariant(remainingInput.count > 0, "empty remaining input doesn't need to be repaired")
let goodChunk = remainingInput[..<brokenRange.startIndex]
// very likely this capacity reservation does not actually do anything because we reserved space for the entire
// input plus up to five replacement characters up front
result.reserveCapacity(result.count + remainingInput.count + replacementCharacterCount)
// we can now safely append the next known good bytes and a replacement character
result.appendInPlace(UnsafeBufferPointer(rebasing: goodChunk),
isASCII: false /* appending replacement character anyway, so let's not bother */)
Unicode.Scalar._replacementCharacter.withUTF8CodeUnits {
result.appendInPlace($0, isASCII: false)
}
remainingInput = UnsafeBufferPointer(rebasing: remainingInput[brokenRange.endIndex...])
switch validateUTF8(remainingInput) {
case .success:
result.appendInPlace(remainingInput, isASCII: false)
return String(result)
case .error(let newBrokenRange):
brokenRange = newBrokenRange
}
} while remainingInput.count > 0
return String(result)
}