sdk/lib/utf-utils/internal/scalar.cc - fuchsia - Git at Google

 // Copyright 2022 The Fuchsia Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #include <lib/utf-utils/internal/scalar.h>

 #include <cstdint>
 #include <cstring>
 #ifdef __has_include
 #if __has_include(<version>)
 #include <version>
 #endif
 #endif

 namespace {

 // Validates that the bytes in `data[pos, size)` form well-formed UTF-8 according to the RFC 3629
 // grammar, decoding one sequence at a time. Bytes before `pos` are not examined.
 //
 // Returns true if every sequence starting at `pos` is complete and well-formed (trivially true
 // when `pos >= size`), and false at the first ill-formed or truncated sequence.
 bool IsValidUtf8ScalarFull(const char* data, size_t pos, const size_t size) {
   // Inclusive range check
   auto is_in_range = [](uint8_t byte, uint8_t lo, uint8_t hi) { return lo <= byte && byte <= hi; };

   // UTF8-tail = %x80-BF, i.e. the top two bits of the byte are `10`.
   auto is_tail = [](uint8_t byte) { return (byte & 0b11000000) == 0b10000000; };

   // The following comparisons rely on treating bytes as if they are unsigned 8-bit values.
   // However, both signed and unsigned char are allowed in the C++ spec, with x64 choosing signed
   // and arm64 choosing unsigned. We therefore force the byte to be treated as unsigned, since we
   // cannot rely on the default.
   const uint8_t* str = reinterpret_cast<const uint8_t*>(data);
   static_assert(sizeof(char) == sizeof(uint8_t), "char and uint8_t are not the same size!");

   while (pos < size) {
     // Table from https://datatracker.ietf.org/doc/html/rfc3629#section-4
     //
     // UTF8-1      = %x00-7F
     // UTF8-2      = %xC2-DF UTF8-tail
     // UTF8-3      = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
     //               %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
     // UTF8-4      = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
     //               %xF4 %x80-8F 2( UTF8-tail )
     // UTF8-tail   = %x80-BF

     const size_t remaining_size = size - pos;
     const uint8_t lead = str[pos];

     if (lead <= 0x7F) {
       // UTF8-1      = %x00-7F
       pos += 1;
     } else if (is_in_range(lead, 0xC2, 0xDF)) /* %xC2-DF */ {
       // UTF8-2      = %xC2-DF UTF8-tail
       if (remaining_size < 2) {
         // Sequence is truncated by the end of the buffer.
         return false;
       }
       if (!is_tail(str[pos + 1])) {
         // Not followed by continuation character.
         return false;
       }

       pos += 2;
     } else if (is_in_range(lead, 0xE0, 0xEF)) /* %xE0-EF */ {
       // UTF8-3      = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
       //               %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
       if (remaining_size < 3) {
         // Sequence is truncated by the end of the buffer.
         return false;
       }
       if (!is_tail(str[pos + 1]) || !is_tail(str[pos + 2])) {
         // Not followed by continuation characters.
         return false;
       }
       if (lead == 0xE0 && !is_in_range(str[pos + 1], 0xA0, 0xBF)) {
         // First byte is %xE0 but second byte is not in range %xA0-BF.
         return false;
       }
       if (lead == 0xED && !is_in_range(str[pos + 1], 0x80, 0x9F)) {
         // First byte is %xED but second byte is not in range %x80-9F.
         return false;
       }

       pos += 3;
     } else if (is_in_range(lead, 0xF0, 0xF4)) /* %xF0-F4 */ {
       // UTF8-4      = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
       //               %xF4 %x80-8F 2( UTF8-tail )
       if (remaining_size < 4) {
         // Sequence is truncated by the end of the buffer.
         return false;
       }
       // The tail bytes are inspected one at a time so that the result does not depend on the
       // byte order of the host.
       if (!is_tail(str[pos + 1]) || !is_tail(str[pos + 2]) || !is_tail(str[pos + 3])) {
         // Not followed by continuation characters.
         return false;
       }
       if (lead == 0xF0 && !is_in_range(str[pos + 1], 0x90, 0xBF)) {
         // First byte is %xF0 but second byte is not in range %x90-BF.
         return false;
       }
       if (lead == 0xF4 && !is_in_range(str[pos + 1], 0x80, 0x8F)) {
         // First byte is %xF4 but second byte is not in range %x80-8F.
         return false;
       }

       pos += 4;
     } else {
       // Bytes %x80-C1 and %xF5-FF cannot start any sequence in the table above.
       return false;
     }
   }

   return true;
 }

 }  // namespace

 namespace utfutils {
 namespace internal {

 // Checks whether `data[0, size)` is well-formed UTF-8.
 //
 // A null `data` is rejected regardless of `size`. Leading ASCII is skipped cheaply; as soon as a
 // byte with its high bit set is seen, the rest of the work is handed to IsValidUtf8ScalarFull.
 bool IsValidUtf8Scalar(const char* data, const size_t size) {
   if (!data) {
     return false;
   }

   constexpr size_t kWordBytes = sizeof(uint64_t);
   // One high bit per byte lane: any lane with it set holds a non-ASCII byte.
   constexpr uint64_t kNonAsciiBits = 0x8080'8080'8080'8080;
   // Largest multiple of the word size that fits in the input.
   const size_t word_end = size - (size % kWordBytes);

   size_t offset = 0;

   // ASCII fast path, one 8-byte word at a time.
   while (offset < word_end) {
     uint64_t word;
     memcpy(&word, data + offset, kWordBytes);
     if (word & kNonAsciiBits) {
       // Restart full validation at the beginning of the word that contains the non-ASCII byte.
       return IsValidUtf8ScalarFull(data, offset, size);
     }
     offset += kWordBytes;
   }

   // ASCII fast path for the trailing bytes that do not fill a whole word.
   while (offset < size) {
     if (static_cast<unsigned char>(data[offset]) >= 0x80) {
       return IsValidUtf8ScalarFull(data, offset, size);
     }
     ++offset;
   }

   // Everything was ASCII.
   return true;
 }

 // Validates `src[0, size)` with IsValidUtf8Scalar and, only if it is valid, copies those `size`
 // bytes to `dst`.
 //
 // Returns the validation result. `dst` is left untouched when validation fails.
 bool ValidateAndCopyUtf8Scalar(const char* src, char* dst, const size_t size) {
   if (!IsValidUtf8Scalar(src, size)) {
     return false;
   }

   // memcpy requires valid pointers even for a zero-length copy, so skip the call entirely for
   // empty input; this keeps an empty destination (possibly a null `dst`) well-defined.
   if (size != 0) {
     memcpy(dst, src, size);
   }

   return true;
 }

 }  // namespace internal
 }  // namespace utfutils
	// Copyright 2022 The Fuchsia Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	#include <lib/utf-utils/internal/scalar.h>

	#include <cstdint>
	#include <cstring>
	#ifdef __has_include
	#if __has_include(<version>)
	#include <version>
	#endif
	#endif

	namespace {

	// Validates that the bytes in `data[pos, size)` form well-formed UTF-8 according to the RFC 3629
	// grammar, decoding one sequence at a time. Bytes before `pos` are not examined.
	//
	// Returns true if every sequence starting at `pos` is complete and well-formed (trivially true
	// when `pos >= size`), and false at the first ill-formed or truncated sequence.
	bool IsValidUtf8ScalarFull(const char* data, size_t pos, const size_t size) {
	  // Inclusive range check
	  auto is_in_range = [](uint8_t byte, uint8_t lo, uint8_t hi) { return lo <= byte && byte <= hi; };

	  // UTF8-tail = %x80-BF, i.e. the top two bits of the byte are `10`.
	  auto is_tail = [](uint8_t byte) { return (byte & 0b11000000) == 0b10000000; };

	  // The following comparisons rely on treating bytes as if they are unsigned 8-bit values.
	  // However, both signed and unsigned char are allowed in the C++ spec, with x64 choosing signed
	  // and arm64 choosing unsigned. We therefore force the byte to be treated as unsigned, since we
	  // cannot rely on the default.
	  const uint8_t* str = reinterpret_cast<const uint8_t*>(data);
	  static_assert(sizeof(char) == sizeof(uint8_t), "char and uint8_t are not the same size!");

	  while (pos < size) {
	    // Table from https://datatracker.ietf.org/doc/html/rfc3629#section-4
	    //
	    // UTF8-1      = %x00-7F
	    // UTF8-2      = %xC2-DF UTF8-tail
	    // UTF8-3      = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
	    //               %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
	    // UTF8-4      = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
	    //               %xF4 %x80-8F 2( UTF8-tail )
	    // UTF8-tail   = %x80-BF

	    const size_t remaining_size = size - pos;
	    const uint8_t lead = str[pos];

	    if (lead <= 0x7F) {
	      // UTF8-1      = %x00-7F
	      pos += 1;
	    } else if (is_in_range(lead, 0xC2, 0xDF)) /* %xC2-DF */ {
	      // UTF8-2      = %xC2-DF UTF8-tail
	      if (remaining_size < 2) {
	        // Sequence is truncated by the end of the buffer.
	        return false;
	      }
	      if (!is_tail(str[pos + 1])) {
	        // Not followed by continuation character.
	        return false;
	      }

	      pos += 2;
	    } else if (is_in_range(lead, 0xE0, 0xEF)) /* %xE0-EF */ {
	      // UTF8-3      = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
	      //               %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
	      if (remaining_size < 3) {
	        // Sequence is truncated by the end of the buffer.
	        return false;
	      }
	      if (!is_tail(str[pos + 1]) || !is_tail(str[pos + 2])) {
	        // Not followed by continuation characters.
	        return false;
	      }
	      if (lead == 0xE0 && !is_in_range(str[pos + 1], 0xA0, 0xBF)) {
	        // First byte is %xE0 but second byte is not in range %xA0-BF.
	        return false;
	      }
	      if (lead == 0xED && !is_in_range(str[pos + 1], 0x80, 0x9F)) {
	        // First byte is %xED but second byte is not in range %x80-9F.
	        return false;
	      }

	      pos += 3;
	    } else if (is_in_range(lead, 0xF0, 0xF4)) /* %xF0-F4 */ {
	      // UTF8-4      = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
	      //               %xF4 %x80-8F 2( UTF8-tail )
	      if (remaining_size < 4) {
	        // Sequence is truncated by the end of the buffer.
	        return false;
	      }
	      // The tail bytes are inspected one at a time so that the result does not depend on the
	      // byte order of the host.
	      if (!is_tail(str[pos + 1]) || !is_tail(str[pos + 2]) || !is_tail(str[pos + 3])) {
	        // Not followed by continuation characters.
	        return false;
	      }
	      if (lead == 0xF0 && !is_in_range(str[pos + 1], 0x90, 0xBF)) {
	        // First byte is %xF0 but second byte is not in range %x90-BF.
	        return false;
	      }
	      if (lead == 0xF4 && !is_in_range(str[pos + 1], 0x80, 0x8F)) {
	        // First byte is %xF4 but second byte is not in range %x80-8F.
	        return false;
	      }

	      pos += 4;
	    } else {
	      // Bytes %x80-C1 and %xF5-FF cannot start any sequence in the table above.
	      return false;
	    }
	  }

	  return true;
	}

	} // namespace

	namespace utfutils {
	namespace internal {

	// Checks whether `data[0, size)` is well-formed UTF-8.
	//
	// A null `data` is rejected regardless of `size`. Leading ASCII is skipped cheaply; as soon as a
	// byte with its high bit set is seen, the rest of the work is handed to IsValidUtf8ScalarFull.
	bool IsValidUtf8Scalar(const char* data, const size_t size) {
	  if (!data) {
	    return false;
	  }

	  constexpr size_t kWordBytes = sizeof(uint64_t);
	  // One high bit per byte lane: any lane with it set holds a non-ASCII byte.
	  constexpr uint64_t kNonAsciiBits = 0x8080'8080'8080'8080;
	  // Largest multiple of the word size that fits in the input.
	  const size_t word_end = size - (size % kWordBytes);

	  size_t offset = 0;

	  // ASCII fast path, one 8-byte word at a time.
	  while (offset < word_end) {
	    uint64_t word;
	    memcpy(&word, data + offset, kWordBytes);
	    if (word & kNonAsciiBits) {
	      // Restart full validation at the beginning of the word that contains the non-ASCII byte.
	      return IsValidUtf8ScalarFull(data, offset, size);
	    }
	    offset += kWordBytes;
	  }

	  // ASCII fast path for the trailing bytes that do not fill a whole word.
	  while (offset < size) {
	    if (static_cast<unsigned char>(data[offset]) >= 0x80) {
	      return IsValidUtf8ScalarFull(data, offset, size);
	    }
	    ++offset;
	  }

	  // Everything was ASCII.
	  return true;
	}

	// Validates `src[0, size)` with IsValidUtf8Scalar and, only if it is valid, copies those `size`
	// bytes to `dst`.
	//
	// Returns the validation result. `dst` is left untouched when validation fails.
	bool ValidateAndCopyUtf8Scalar(const char* src, char* dst, const size_t size) {
	  if (!IsValidUtf8Scalar(src, size)) {
	    return false;
	  }

	  // memcpy requires valid pointers even for a zero-length copy, so skip the call entirely for
	  // empty input; this keeps an empty destination (possibly a null `dst`) well-defined.
	  if (size != 0) {
	    memcpy(dst, src, size);
	  }

	  return true;
	}

	} // namespace internal
	} // namespace utfutils