| // Copyright 2022 The Fuchsia Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include <lib/utf-utils/internal/scalar.h> |
| |
| #include <cstdint> |
| #include <cstring> |
| #ifdef __has_include |
| #if __has_include(<version>) |
| #include <version> |
| #endif |
| #endif |
| |
| namespace { |
| |
// Validates `data[pos, size)` against the UTF-8 grammar of RFC 3629, one encoded sequence at a
// time.
//
// `data` must point to at least `size` readable bytes and `pos` is the offset at which validation
// starts. Returns true when every sequence from `pos` to the end of the buffer is well formed
// (trivially true when `pos >= size`). Returns false at the first lead byte that is not allowed,
// the first missing or malformed continuation byte, the first second byte that lies outside the
// range permitted for its lead byte, or a sequence that is cut off by the end of the buffer.
bool IsValidUtf8ScalarFull(const char* data, size_t pos, const size_t size) {
  // Inclusive range check.
  auto is_in_range = [](uint8_t byte, uint8_t lo, uint8_t hi) { return lo <= byte && byte <= hi; };

  // The following comparisons rely on treating bytes as if they are unsigned 8-bit values.
  // However, both signed and unsigned char are allowed in the C++ spec, with x64 choosing signed
  // and arm64 choosing unsigned. We therefore force the byte to be treated as unsigned, since we
  // cannot rely on the default.
  const uint8_t* str = reinterpret_cast<const uint8_t*>(data);
  static_assert(sizeof(char) == sizeof(uint8_t), "char and uint8_t are not the same size!");

  while (pos < size) {
    // Table from https://datatracker.ietf.org/doc/html/rfc3629#section-4
    //
    // UTF8-1    = %x00-7F
    // UTF8-2    = %xC2-DF UTF8-tail
    // UTF8-3    = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
    //             %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
    // UTF8-4    = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
    //             %xF4 %x80-8F 2( UTF8-tail )
    // UTF8-tail = %x80-BF

    // Bytes available from `pos` to the end of the buffer; every multi-byte branch checks this
    // before reading past `str[pos]`.
    const size_t remaining_size = size - pos;

    if (str[pos] <= 0x7F) {
      // UTF8-1 = %x00-7F
      pos += 1;
    } else if (is_in_range(str[pos], 0xC2, 0xDF)) /* %xC2-DF */ {
      // UTF8-2 = %xC2-DF UTF8-tail
      if (remaining_size < 2) {
        return false;
      }
      if ((str[pos + 1] & 0b11000000) != 0b10000000) {
        // Not followed by continuation character.
        return false;
      }

      pos += 2;
    } else if (is_in_range(str[pos], 0xE0, 0xEF)) /* %xE0-EF */ {
      // UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
      //          %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
      if (remaining_size < 3) {
        return false;
      }

      // Both trailing bytes are tested with the same per-byte mask, so the order in which they
      // land in the 16-bit word does not matter.
      uint16_t continuations;
      memcpy(&continuations, &str[pos + 1], sizeof(continuations));
      if ((continuations & 0b11000000'11000000) != 0b10000000'10000000) {
        // Not followed by continuation characters.
        return false;
      }
      if (str[pos] == 0xE0 && !is_in_range(str[pos + 1], 0xA0, 0xBF)) {
        // First byte is %xE0 but second byte is not in range %xA0-BF.
        return false;
      }
      if (str[pos] == 0xED && !is_in_range(str[pos + 1], 0x80, 0x9F)) {
        // First byte is %xED but second byte is not in range %x80-9F.
        return false;
      }

      pos += 3;
    } else if (is_in_range(str[pos], 0xF0, 0xF4)) /* %xF0-F4 */ {
      // UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
      //          %xF4 %x80-8F 2( UTF8-tail )
      if (remaining_size < 4) {
        return false;
      }

      // Pack the three trailing bytes into one word in a fixed, most-significant-first order.
      // Because the packing is done with shifts rather than a raw memory load, the single mask
      // below is correct on every host and no byte-order query is required.
      constexpr uint32_t kTailMask = 0b11000000'11000000'11000000;
      constexpr uint32_t kTailBits = 0b10000000'10000000'10000000;
      const uint32_t tails = (static_cast<uint32_t>(str[pos + 1]) << 16) |
                             (static_cast<uint32_t>(str[pos + 2]) << 8) |
                             static_cast<uint32_t>(str[pos + 3]);
      if ((tails & kTailMask) != kTailBits) {
        // Not followed by continuation characters.
        return false;
      }

      if (str[pos] == 0xF0 && !is_in_range(str[pos + 1], 0x90, 0xBF)) {
        // First byte is %xF0 but second byte is not in range %x90-BF.
        return false;
      }
      if (str[pos] == 0xF4 && !is_in_range(str[pos + 1], 0x80, 0x8F)) {
        // First byte is %xF4 but second byte is not in range %x80-8F.
        return false;
      }

      pos += 4;
    } else {
      // %x80-C1 and %xF5-FF can never start a sequence.
      return false;
    }
  }

  return true;
}
| |
| } // namespace |
| |
| namespace utfutils { |
| namespace internal { |
| |
| bool IsValidUtf8Scalar(const char* data, const size_t size) { |
| if (data == nullptr) { |
| return false; |
| } |
| |
| size_t pos = 0; |
| // Fast path: read ASCII bytes in 8-byte chunks until a non-ASCII byte is encountered. |
| for (; pos < (size & ~7); pos += 8) { |
| uint64_t val; |
| memcpy(&val, &data[pos], sizeof(val)); |
| if ((val & 0x8080'8080'8080'8080) != 0) { |
| return IsValidUtf8ScalarFull(data, pos, size); |
| } |
| } |
| |
| // Fast path: drain loop for remaining chunk of ASCII (< 8-byte chunk) |
| for (; pos < size; ++pos) { |
| if ((data[pos] & 0x80) != 0) { |
| return IsValidUtf8ScalarFull(data, pos, size); |
| } |
| } |
| |
| return true; |
| } |
| |
| bool ValidateAndCopyUtf8Scalar(const char* src, char* dst, const size_t size) { |
| bool is_valid = IsValidUtf8Scalar(src, size); |
| if (is_valid) { |
| memcpy(dst, src, size); |
| } |
| |
| return is_valid; |
| } |
| |
| } // namespace internal |
| } // namespace utfutils |