// stdlib/public/core/StringUTF8Validation.swift - third_party/swift - Git at Google

 /// Returns whether `x` can start a multi-byte UTF-8 sequence, i.e. lies in `0xC2...0xF4`.
 private func _isUTF8MultiByteLeading(_ x: UInt8) -> Bool {
   return x >= 0xC2 && x <= 0xF4
 }

 /// Returns whether `x` is an acceptable second byte after the lead byte `0xF0`, i.e. lies in
 /// `0x90...0xBF`. Smaller continuation bytes would form an overlong encoding.
 private func _isNotOverlong_F0(_ x: UInt8) -> Bool {
   return x >= 0x90 && x <= 0xBF
 }

 /// Returns whether `x` is an acceptable second byte after the lead byte `0xF4`: a continuation
 /// byte that is no greater than `0x8F`.
 private func _isNotOverlong_F4(_ x: UInt8) -> Bool {
   guard x <= 0x8F else { return false }
   return UTF8.isContinuation(x)
 }

 /// Returns whether `x` is an acceptable second byte after the lead byte `0xE0`, i.e. lies in
 /// `0xA0...0xBF`. Smaller continuation bytes would form an overlong encoding.
 private func _isNotOverlong_E0(_ x: UInt8) -> Bool {
   return x >= 0xA0 && x <= 0xBF
 }

 /// Returns whether `x` is an acceptable second byte after the lead byte `0xED`: a continuation
 /// byte that is no greater than `0x9F`.
 private func _isNotOverlong_ED(_ x: UInt8) -> Bool {
   guard x <= 0x9F else { return false }
   return UTF8.isContinuation(x)
 }

 /// Additional information reported alongside a successful UTF-8 validation.
 internal struct UTF8ExtraInfo {
   /// Whether the validated bytes were found to be ASCII only.
   public var isASCII: Bool
 }

 extension UTF8ExtraInfo: Equatable {}

 /// Outcome of validating a buffer of UTF-8 code units.
 internal enum UTF8ValidationResult {
   /// The buffer validated successfully; the payload carries extra facts such as `isASCII`.
   case success(UTF8ExtraInfo)
   /// An ill-formed sequence was found; `toBeReplaced` is the index range of the offending bytes.
   case error(toBeReplaced: Range<Int>)
 }

 // Equatable conformance is synthesized by the compiler.
 extension UTF8ValidationResult: Equatable {}

 /// Payload-free error thrown inside `validateUTF8` to unwind out of the decoding loop.
 private struct UTF8ValidationError: Error {}

 /// Checks whether `buf` holds well-formed UTF-8.
 ///
 /// - Parameter buf: The code units to validate.
 /// - Returns: `.success` with an `isASCII` flag when no ill-formed sequence is found, otherwise
 ///   `.error(toBeReplaced:)` with the range, in `buf`'s indices, of the first ill-formed sequence.
 internal func validateUTF8(_ buf: UnsafeBufferPointer<UInt8>) -> UTF8ValidationResult {
   if _allASCII(buf) {
     return .success(UTF8ExtraInfo(isASCII: true))
   }

   var iter = buf.makeIterator()
   // Index just past the last fully validated scalar; on failure the ill-formed sequence starts here.
   var lastValidIndex = buf.startIndex

   // Consumes the next code unit and throws unless it exists and satisfies `f`.
   @inline(__always) func guaranteeIn(_ f: (UInt8) -> Bool) throws {
     guard let cu = iter.next() else { throw UTF8ValidationError() }
     guard f(cu) else { throw UTF8ValidationError() }
   }
   // Consumes the next code unit and throws unless it is a continuation byte.
   @inline(__always) func guaranteeContinuation() throws {
     try guaranteeIn(UTF8.isContinuation)
   }

   // Returns how many leading bytes (1...3) of the packed, first-byte-lowest `_buffer` are to be
   // treated as one invalid unit.
   func _legacyInvalidLengthCalculation(_ _buffer: (_storage: UInt32, ())) -> Int {
     // function body copied from UTF8.ForwardParser._invalidLength
     if _buffer._storage               & 0b0__1100_0000__1111_0000
                                      == 0b0__1000_0000__1110_0000 {
       // 2-byte prefix of 3-byte sequence. The top 5 bits of the decoded result
       // must be nonzero and not a surrogate
       let top5Bits = _buffer._storage & 0b0__0010_0000__0000_1111
       if top5Bits != 0 && top5Bits   != 0b0__0010_0000__0000_1101 { return 2 }
     }
     else if _buffer._storage                & 0b0__1100_0000__1111_1000
                                            == 0b0__1000_0000__1111_0000
     {
       // Prefix of 4-byte sequence. The top 5 bits of the decoded result
       // must be nonzero and no greater than 0b0__0100_0000
       let top5bits = UInt16(_buffer._storage & 0b0__0011_0000__0000_0111)
       if top5bits != 0 && top5bits.byteSwapped <= 0b0__0000_0100__0000_0000 {
         return _buffer._storage   & 0b0__1100_0000__0000_0000__0000_0000
                                  == 0b0__1000_0000__0000_0000__0000_0000 ? 3 : 2
       }
     }
     return 1
   }

   // Packs up to the first three bytes of `buf` (first byte in the lowest 8 bits) and narrows the
   // range to the length computed by `_legacyInvalidLengthCalculation`. `buf` must be non-empty.
   func _legacyNarrowIllegalRange(buf: Slice<UnsafeBufferPointer<UInt8>>) -> Range<Int> {
     var reversePacked: UInt32 = 0
     if let third = buf.dropFirst(2).first {
       reversePacked |= UInt32(third)
       reversePacked <<= 8
     }
     if let second = buf.dropFirst().first {
       reversePacked |= UInt32(second)
       reversePacked <<= 8
     }
     reversePacked |= UInt32(buf.first!)
     let _buffer: (_storage: UInt32, x: ()) = (reversePacked, ())
     let invalids = _legacyInvalidLengthCalculation(_buffer)
     return buf.startIndex ..< buf.startIndex + invalids
   }

   // Determines the range of the ill-formed sequence that starts at `buf.startIndex`.
   func findInvalidRange(_ buf: Slice<UnsafeBufferPointer<UInt8>>) -> Range<Int> {
     // `_legacyNarrowIllegalRange` only inspects the first three bytes, so the scan stops once the
     // first byte plus two continuation bytes are covered. An unbounded scan made repairing a long
     // run of continuation bytes quadratic: each replaced byte walked the whole remaining run again.
     let maxTrailingBytes = 2
     var endIndex = buf.startIndex
     var iter = buf.makeIterator()
     _ = iter.next()
     while endIndex - buf.startIndex < maxTrailingBytes,
           let cu = iter.next(), UTF8.isContinuation(cu) {
       endIndex += 1
     }
     let illegalRange = Range(buf.startIndex...endIndex)
     _internalInvariant(illegalRange.clamped(to: (buf.startIndex..<buf.endIndex)) == illegalRange,
                  "illegal range out of full range")
     // FIXME: Remove the call to `_legacyNarrowIllegalRange` and return `illegalRange` directly
     return _legacyNarrowIllegalRange(buf: buf[illegalRange])
   }

   do {
     var isASCII = true
     while let cu = iter.next() {
       if UTF8.isASCII(cu) { lastValidIndex &+= 1; continue }
       isASCII = false
       if _slowPath(!_isUTF8MultiByteLeading(cu)) {
         throw UTF8ValidationError()
       }
       // The lead byte fixes the sequence length and any extra constraint on the second byte.
       switch cu {
       case 0xC2...0xDF:
         try guaranteeContinuation()
         lastValidIndex &+= 2
       case 0xE0:
         try guaranteeIn(_isNotOverlong_E0)
         try guaranteeContinuation()
         lastValidIndex &+= 3
       case 0xE1...0xEC:
         try guaranteeContinuation()
         try guaranteeContinuation()
         lastValidIndex &+= 3
       case 0xED:
         try guaranteeIn(_isNotOverlong_ED)
         try guaranteeContinuation()
         lastValidIndex &+= 3
       case 0xEE...0xEF:
         try guaranteeContinuation()
         try guaranteeContinuation()
         lastValidIndex &+= 3
       case 0xF0:
         try guaranteeIn(_isNotOverlong_F0)
         try guaranteeContinuation()
         try guaranteeContinuation()
         lastValidIndex &+= 4
       case 0xF1...0xF3:
         try guaranteeContinuation()
         try guaranteeContinuation()
         try guaranteeContinuation()
         lastValidIndex &+= 4
       case 0xF4:
         try guaranteeIn(_isNotOverlong_F4)
         try guaranteeContinuation()
         try guaranteeContinuation()
         lastValidIndex &+= 4
       default:
         // Every byte accepted by `_isUTF8MultiByteLeading` (0xC2...0xF4) is handled above.
         Builtin.unreachable()
       }
     }
     return .success(UTF8ExtraInfo(isASCII: isASCII))
   } catch {
     return .error(toBeReplaced: findInvalidRange(buf[lastValidIndex...]))
   }
 }

 /// Builds a `String` from `input` in which every ill-formed byte range is replaced by a single
 /// replacement character.
 ///
 /// - Parameters:
 ///   - input: The bytes to repair; must not be empty.
 ///   - firstKnownBrokenRange: The first ill-formed range, which must lie within `input.indices`.
 /// - Returns: The repaired string.
 internal func repairUTF8(_ input: UnsafeBufferPointer<UInt8>, firstKnownBrokenRange: Range<Int>) -> String {
   _internalInvariant(input.count > 0, "empty input doesn't need to be repaired")
   _internalInvariant(firstKnownBrokenRange.clamped(to: input.indices) == firstKnownBrokenRange)

   // Each pass looks at `pending` as three consecutive pieces:
   //
   //  1. a valid prefix (possibly empty) that is copied to the output unchanged,
   //  2. `badRange` (never empty), which is dropped and represented by one replacement character,
   //  3. whatever follows, which becomes `pending` for the next pass.
   //
   // The first pass uses the caller-supplied broken range; later passes obtain theirs by
   // re-validating the rebased remainder, so each new range is relative to the new `pending`.
   let replacementLength = Unicode.Scalar._replacementCharacter.withUTF8CodeUnits { $0.count }
   var repaired = _StringGuts()
   // Room for the whole input plus a handful of replacement characters.
   repaired.reserveCapacity(input.count + 5 * replacementLength)

   var pending = input
   var badRange: Range<Int> = firstKnownBrokenRange
   repeat {
     _internalInvariant(badRange.count > 0, "broken range empty")
     _internalInvariant(pending.count > 0, "empty remaining input doesn't need to be repaired")

     // Usually a no-op thanks to the up-front reservation above.
     repaired.reserveCapacity(repaired.count + pending.count + replacementLength)

     // Copy the valid prefix, then stand in for the broken bytes with a replacement character.
     // `isASCII` is false because a replacement character is appended regardless.
     let validPrefix = UnsafeBufferPointer(rebasing: pending[..<badRange.startIndex])
     repaired.appendInPlace(validPrefix, isASCII: false)
     Unicode.Scalar._replacementCharacter.withUTF8CodeUnits { replacement in
       repaired.appendInPlace(replacement, isASCII: false)
     }

     pending = UnsafeBufferPointer(rebasing: pending[badRange.endIndex...])
     guard case .error(let nextBadRange) = validateUTF8(pending) else {
       // The rest is well-formed: append it and finish.
       repaired.appendInPlace(pending, isASCII: false)
       return String(repaired)
     }
     badRange = nextBadRange
   } while pending.count > 0
   return String(repaired)
 }
	/// Returns whether `x` can start a multi-byte UTF-8 sequence, i.e. lies in `0xC2...0xF4`.
	private func _isUTF8MultiByteLeading(_ x: UInt8) -> Bool {
	  return x >= 0xC2 && x <= 0xF4
	}

	/// Returns whether `x` is an acceptable second byte after the lead byte `0xF0`, i.e. lies in
	/// `0x90...0xBF`. Smaller continuation bytes would form an overlong encoding.
	private func _isNotOverlong_F0(_ x: UInt8) -> Bool {
	  return x >= 0x90 && x <= 0xBF
	}

	/// Returns whether `x` is an acceptable second byte after the lead byte `0xF4`: a continuation
	/// byte that is no greater than `0x8F`.
	private func _isNotOverlong_F4(_ x: UInt8) -> Bool {
	  guard x <= 0x8F else { return false }
	  return UTF8.isContinuation(x)
	}

	/// Returns whether `x` is an acceptable second byte after the lead byte `0xE0`, i.e. lies in
	/// `0xA0...0xBF`. Smaller continuation bytes would form an overlong encoding.
	private func _isNotOverlong_E0(_ x: UInt8) -> Bool {
	  return x >= 0xA0 && x <= 0xBF
	}

	/// Returns whether `x` is an acceptable second byte after the lead byte `0xED`: a continuation
	/// byte that is no greater than `0x9F`.
	private func _isNotOverlong_ED(_ x: UInt8) -> Bool {
	  guard x <= 0x9F else { return false }
	  return UTF8.isContinuation(x)
	}

	/// Additional information reported alongside a successful UTF-8 validation.
	internal struct UTF8ExtraInfo {
	  /// Whether the validated bytes were found to be ASCII only.
	  public var isASCII: Bool
	}

	extension UTF8ExtraInfo: Equatable {}

	/// Outcome of validating a buffer of UTF-8 code units.
	internal enum UTF8ValidationResult {
	/// The buffer validated successfully; the payload carries extra facts such as `isASCII`.
	case success(UTF8ExtraInfo)
	/// An ill-formed sequence was found; `toBeReplaced` is the index range of the offending bytes.
	case error(toBeReplaced: Range<Int>)
	}

	// Equatable conformance is synthesized by the compiler.
	extension UTF8ValidationResult: Equatable {}

	/// Payload-free error thrown inside `validateUTF8` to unwind out of the decoding loop.
	private struct UTF8ValidationError: Error {}

	/// Checks whether `buf` holds well-formed UTF-8.
	///
	/// - Parameter buf: The code units to validate.
	/// - Returns: `.success` with an `isASCII` flag when no ill-formed sequence is found, otherwise
	///   `.error(toBeReplaced:)` with the range, in `buf`'s indices, of the first ill-formed sequence.
	internal func validateUTF8(_ buf: UnsafeBufferPointer<UInt8>) -> UTF8ValidationResult {
	  if _allASCII(buf) {
	    return .success(UTF8ExtraInfo(isASCII: true))
	  }

	  var iter = buf.makeIterator()
	  // Index just past the last fully validated scalar; on failure the ill-formed sequence starts here.
	  var lastValidIndex = buf.startIndex

	  // Consumes the next code unit and throws unless it exists and satisfies `f`.
	  @inline(__always) func guaranteeIn(_ f: (UInt8) -> Bool) throws {
	    guard let cu = iter.next() else { throw UTF8ValidationError() }
	    guard f(cu) else { throw UTF8ValidationError() }
	  }
	  // Consumes the next code unit and throws unless it is a continuation byte.
	  @inline(__always) func guaranteeContinuation() throws {
	    try guaranteeIn(UTF8.isContinuation)
	  }

	  // Returns how many leading bytes (1...3) of the packed, first-byte-lowest `_buffer` are to be
	  // treated as one invalid unit.
	  func _legacyInvalidLengthCalculation(_ _buffer: (_storage: UInt32, ())) -> Int {
	    // function body copied from UTF8.ForwardParser._invalidLength
	    if _buffer._storage & 0b0__1100_0000__1111_0000
	                       == 0b0__1000_0000__1110_0000 {
	      // 2-byte prefix of 3-byte sequence. The top 5 bits of the decoded result
	      // must be nonzero and not a surrogate
	      let top5Bits = _buffer._storage & 0b0__0010_0000__0000_1111
	      if top5Bits != 0 && top5Bits != 0b0__0010_0000__0000_1101 { return 2 }
	    }
	    else if _buffer._storage & 0b0__1100_0000__1111_1000
	                            == 0b0__1000_0000__1111_0000
	    {
	      // Prefix of 4-byte sequence. The top 5 bits of the decoded result
	      // must be nonzero and no greater than 0b0__0100_0000
	      let top5bits = UInt16(_buffer._storage & 0b0__0011_0000__0000_0111)
	      if top5bits != 0 && top5bits.byteSwapped <= 0b0__0000_0100__0000_0000 {
	        return _buffer._storage & 0b0__1100_0000__0000_0000__0000_0000
	                               == 0b0__1000_0000__0000_0000__0000_0000 ? 3 : 2
	      }
	    }
	    return 1
	  }

	  // Packs up to the first three bytes of `buf` (first byte in the lowest 8 bits) and narrows the
	  // range to the length computed by `_legacyInvalidLengthCalculation`. `buf` must be non-empty.
	  func _legacyNarrowIllegalRange(buf: Slice<UnsafeBufferPointer<UInt8>>) -> Range<Int> {
	    var reversePacked: UInt32 = 0
	    if let third = buf.dropFirst(2).first {
	      reversePacked |= UInt32(third)
	      reversePacked <<= 8
	    }
	    if let second = buf.dropFirst().first {
	      reversePacked |= UInt32(second)
	      reversePacked <<= 8
	    }
	    reversePacked |= UInt32(buf.first!)
	    let _buffer: (_storage: UInt32, x: ()) = (reversePacked, ())
	    let invalids = _legacyInvalidLengthCalculation(_buffer)
	    return buf.startIndex ..< buf.startIndex + invalids
	  }

	  // Determines the range of the ill-formed sequence that starts at `buf.startIndex`.
	  func findInvalidRange(_ buf: Slice<UnsafeBufferPointer<UInt8>>) -> Range<Int> {
	    // `_legacyNarrowIllegalRange` only inspects the first three bytes, so the scan stops once the
	    // first byte plus two continuation bytes are covered. An unbounded scan made repairing a long
	    // run of continuation bytes quadratic: each replaced byte walked the whole remaining run again.
	    let maxTrailingBytes = 2
	    var endIndex = buf.startIndex
	    var iter = buf.makeIterator()
	    _ = iter.next()
	    while endIndex - buf.startIndex < maxTrailingBytes,
	          let cu = iter.next(), UTF8.isContinuation(cu) {
	      endIndex += 1
	    }
	    let illegalRange = Range(buf.startIndex...endIndex)
	    _internalInvariant(illegalRange.clamped(to: (buf.startIndex..<buf.endIndex)) == illegalRange,
	                       "illegal range out of full range")
	    // FIXME: Remove the call to `_legacyNarrowIllegalRange` and return `illegalRange` directly
	    return _legacyNarrowIllegalRange(buf: buf[illegalRange])
	  }

	  do {
	    var isASCII = true
	    while let cu = iter.next() {
	      if UTF8.isASCII(cu) { lastValidIndex &+= 1; continue }
	      isASCII = false
	      if _slowPath(!_isUTF8MultiByteLeading(cu)) {
	        throw UTF8ValidationError()
	      }
	      // The lead byte fixes the sequence length and any extra constraint on the second byte.
	      switch cu {
	      case 0xC2...0xDF:
	        try guaranteeContinuation()
	        lastValidIndex &+= 2
	      case 0xE0:
	        try guaranteeIn(_isNotOverlong_E0)
	        try guaranteeContinuation()
	        lastValidIndex &+= 3
	      case 0xE1...0xEC:
	        try guaranteeContinuation()
	        try guaranteeContinuation()
	        lastValidIndex &+= 3
	      case 0xED:
	        try guaranteeIn(_isNotOverlong_ED)
	        try guaranteeContinuation()
	        lastValidIndex &+= 3
	      case 0xEE...0xEF:
	        try guaranteeContinuation()
	        try guaranteeContinuation()
	        lastValidIndex &+= 3
	      case 0xF0:
	        try guaranteeIn(_isNotOverlong_F0)
	        try guaranteeContinuation()
	        try guaranteeContinuation()
	        lastValidIndex &+= 4
	      case 0xF1...0xF3:
	        try guaranteeContinuation()
	        try guaranteeContinuation()
	        try guaranteeContinuation()
	        lastValidIndex &+= 4
	      case 0xF4:
	        try guaranteeIn(_isNotOverlong_F4)
	        try guaranteeContinuation()
	        try guaranteeContinuation()
	        lastValidIndex &+= 4
	      default:
	        // Every byte accepted by `_isUTF8MultiByteLeading` (0xC2...0xF4) is handled above.
	        Builtin.unreachable()
	      }
	    }
	    return .success(UTF8ExtraInfo(isASCII: isASCII))
	  } catch {
	    return .error(toBeReplaced: findInvalidRange(buf[lastValidIndex...]))
	  }
	}

	/// Builds a `String` from `input` in which every ill-formed byte range is replaced by a single
	/// replacement character.
	///
	/// - Parameters:
	///   - input: The bytes to repair; must not be empty.
	///   - firstKnownBrokenRange: The first ill-formed range, which must lie within `input.indices`.
	/// - Returns: The repaired string.
	internal func repairUTF8(_ input: UnsafeBufferPointer<UInt8>, firstKnownBrokenRange: Range<Int>) -> String {
	  _internalInvariant(input.count > 0, "empty input doesn't need to be repaired")
	  _internalInvariant(firstKnownBrokenRange.clamped(to: input.indices) == firstKnownBrokenRange)

	  // Each pass looks at `pending` as three consecutive pieces:
	  //
	  //  1. a valid prefix (possibly empty) that is copied to the output unchanged,
	  //  2. `badRange` (never empty), which is dropped and represented by one replacement character,
	  //  3. whatever follows, which becomes `pending` for the next pass.
	  //
	  // The first pass uses the caller-supplied broken range; later passes obtain theirs by
	  // re-validating the rebased remainder, so each new range is relative to the new `pending`.
	  let replacementLength = Unicode.Scalar._replacementCharacter.withUTF8CodeUnits { $0.count }
	  var repaired = _StringGuts()
	  // Room for the whole input plus a handful of replacement characters.
	  repaired.reserveCapacity(input.count + 5 * replacementLength)

	  var pending = input
	  var badRange: Range<Int> = firstKnownBrokenRange
	  repeat {
	    _internalInvariant(badRange.count > 0, "broken range empty")
	    _internalInvariant(pending.count > 0, "empty remaining input doesn't need to be repaired")

	    // Usually a no-op thanks to the up-front reservation above.
	    repaired.reserveCapacity(repaired.count + pending.count + replacementLength)

	    // Copy the valid prefix, then stand in for the broken bytes with a replacement character.
	    // `isASCII` is false because a replacement character is appended regardless.
	    let validPrefix = UnsafeBufferPointer(rebasing: pending[..<badRange.startIndex])
	    repaired.appendInPlace(validPrefix, isASCII: false)
	    Unicode.Scalar._replacementCharacter.withUTF8CodeUnits { replacement in
	      repaired.appendInPlace(replacement, isASCII: false)
	    }

	    pending = UnsafeBufferPointer(rebasing: pending[badRange.endIndex...])
	    guard case .error(let nextBadRange) = validateUTF8(pending) else {
	      // The rest is well-formed: append it and finish.
	      repaired.appendInPlace(pending, isASCII: false)
	      return String(repaired)
	    }
	    badRange = nextBadRange
	  } while pending.count > 0
	  return String(repaired)
	}