| // Copyright 2022 The Fuchsia Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include <lib/utf-utils/utf-utils.h> |
| |
| #include <iostream> |
| #include <string> |
| |
| #include <zxtest/zxtest.h> |
| |
| #include "sdk/lib/utf-utils/internal/arm-neon.h" |
| #include "sdk/lib/utf-utils/internal/generic-simd.h" |
| #include "sdk/lib/utf-utils/internal/scalar.h" |
| #include "sdk/lib/utf-utils/internal/x86-avx2.h" |
| #include "sdk/lib/utf-utils/internal/x86-ssse3.h" |
| |
| [[maybe_unused]] constexpr char kTag[] = "[utf-utils-unit-tests] "; |
| |
| template <bool (*ValidateFn)(const char*, size_t), |
| bool (*ValidateAndCopyFn)(const char*, char*, size_t)> |
| bool TestUtf8(const char* data, size_t size) { |
| auto tmp = std::make_unique<char[]>(size); |
| bool validate_result = ValidateFn(data, size); |
| bool copy_result = ValidateAndCopyFn(data, tmp.get(), size); |
| if (copy_result && size > 0) { |
| EXPECT_BYTES_EQ(data, tmp.get(), size); |
| } |
| |
| EXPECT_EQ(validate_result, copy_result); |
| return validate_result; |
| } |
| |
| bool TestUtf8Scalar(const char* data, size_t size) { |
| return TestUtf8<utfutils::internal::IsValidUtf8Scalar, |
| utfutils::internal::ValidateAndCopyUtf8Scalar>(data, size); |
| } |
| |
| bool TestUtf8Neon(const char* data, size_t size, bool expectation) { |
| #ifdef __ARM_NEON |
| static bool printed_message = false; |
| if (!printed_message) { |
| std::cout << kTag << "Testing NEON extensions" << std::endl; |
| printed_message = true; |
| } |
| return TestUtf8<utfutils::internal::IsValidUtf8Simd<utfutils::internal::arm::Neon>, |
| utfutils::internal::ValidateAndCopyUtf8Simd<utfutils::internal::arm::Neon>>(data, |
| size); |
| #else |
| return expectation; |
| #endif |
| } |
| |
| bool TestUtf8Ssse3(const char* data, size_t size, bool expectation) { |
| #ifdef __x86_64__ |
| static bool printed_message = false; |
| if (__builtin_cpu_supports("ssse3")) { |
| if (!printed_message) { |
| std::cout << kTag << "Testing SSSE3 extensions" << std::endl; |
| printed_message = true; |
| } |
| return TestUtf8<utfutils::internal::IsValidUtf8Simd<utfutils::internal::x86::Ssse3>, |
| utfutils::internal::ValidateAndCopyUtf8Simd<utfutils::internal::x86::Ssse3>>( |
| data, size); |
| } |
| |
| return expectation; |
| #else |
| return expectation; |
| #endif |
| } |
| |
| bool TestUtf8Avx2(const char* data, size_t size, bool expectation) { |
| #ifdef __x86_64__ |
| static bool printed_message = false; |
| |
| if (__builtin_cpu_supports("avx2")) { |
| if (!printed_message) { |
| std::cout << kTag << "Testing AVX2 extensions" << std::endl; |
| printed_message = true; |
| } |
| return TestUtf8<utfutils::internal::IsValidUtf8Simd<utfutils::internal::x86::Avx2>, |
| utfutils::internal::ValidateAndCopyUtf8Simd<utfutils::internal::x86::Avx2>>( |
| data, size); |
| } |
| |
| return expectation; |
| #else |
| return expectation; |
| #endif |
| } |
| |
| constexpr size_t kTestVectorBoundaries[] = {7, 13, 14, 15, 29, 30, 31, 61, 62, 63, 79, 95, 127}; |
| |
| #define EXPECT_VALID_STRING_HELPER(bytes, num_bytes) \ |
| { \ |
| EXPECT_TRUE(utfutils_is_valid_utf8((bytes), (num_bytes))); \ |
| EXPECT_TRUE(TestUtf8Scalar((bytes), (num_bytes))); \ |
| EXPECT_TRUE(TestUtf8Neon((bytes), (num_bytes), true)); \ |
| EXPECT_TRUE(TestUtf8Ssse3((bytes), (num_bytes), true)); \ |
| EXPECT_TRUE(TestUtf8Avx2((bytes), (num_bytes), true)); \ |
| } |
| |
| #define EXPECT_INVALID_STRING_HELPER(bytes, num_bytes, explanation) \ |
| { \ |
| EXPECT_FALSE(utfutils_is_valid_utf8((bytes), (num_bytes)), explanation); \ |
| EXPECT_FALSE(TestUtf8Scalar((bytes), (num_bytes)), explanation); \ |
| EXPECT_FALSE(TestUtf8Neon((bytes), (num_bytes), false), explanation); \ |
| EXPECT_FALSE(TestUtf8Ssse3((bytes), (num_bytes), false), explanation); \ |
| EXPECT_FALSE(TestUtf8Avx2((bytes), (num_bytes), false), explanation); \ |
| } |
| |
| #define EXPECT_VALID_STRING(input) \ |
| { \ |
| const char* bytes = input; \ |
| size_t num_bytes = sizeof(input) - 1; \ |
| EXPECT_VALID_STRING_HELPER(bytes, num_bytes); \ |
| for (size_t prefix_size : kTestVectorBoundaries) { \ |
| std::string tmp = std::string(prefix_size, '\r') + std::string((bytes), (num_bytes)); \ |
| EXPECT_VALID_STRING_HELPER(tmp.data(), tmp.size()); \ |
| } \ |
| } |
| |
| #define EXPECT_INVALID_STRING(input, explanation) \ |
| { \ |
| const char* bytes = input; \ |
| size_t num_bytes = sizeof(input) - 1; \ |
| EXPECT_INVALID_STRING_HELPER(bytes, num_bytes, explanation); \ |
| for (size_t prefix_size : kTestVectorBoundaries) { \ |
| std::string tmp = std::string(prefix_size, '\r') + std::string((bytes), (num_bytes)); \ |
| EXPECT_INVALID_STRING_HELPER(tmp.data(), tmp.size(), explanation); \ |
| } \ |
| } |
| |
| TEST(ValidateUtf8, SafeOnNullptr) { EXPECT_FALSE(utfutils_is_valid_utf8(nullptr, 10)); } |
| |
| TEST(ValidateUtf8, MinMaxCodeUnitsAndMinusOneAndPlusOne) { |
| EXPECT_VALID_STRING("\x00"); // single byte, min: 0 |
| EXPECT_VALID_STRING("\x7f"); // single byte, max: 127 |
| EXPECT_VALID_STRING("\xc2\x80"); // two bytes, min: 128 |
| EXPECT_VALID_STRING("\xdf\xbf"); // two bytes, max: 2047 |
| EXPECT_VALID_STRING("\xe1\x80\x80"); // three bytes, min: 2048 |
| EXPECT_VALID_STRING("\xef\xbf\xbf"); // three bytes, max: 65535 |
| EXPECT_VALID_STRING("\xf0\x90\x80\x80"); // four bytes, min: 65536 |
| EXPECT_VALID_STRING("\xf4\x8f\xbf\xbf"); // four bytes, max: 1114111 |
| |
| EXPECT_INVALID_STRING("\x80", "1 above max single byte"); |
| EXPECT_INVALID_STRING("\xc2\x7f", "1 below min two bytes"); |
| EXPECT_INVALID_STRING("\xdf\xc0", "1 above max two bytes"); |
| EXPECT_INVALID_STRING("\xe1\x80\x7f", "1 below min three bytes"); |
| EXPECT_INVALID_STRING("\xef\xbf\xc0", "1 above max three bytes"); |
| EXPECT_INVALID_STRING("\xf0\x80\x80\x80", "1 below min four bytes"); |
| EXPECT_INVALID_STRING("\xf7\xbf\xbf\xc0", "1 above max four bytes"); |
| } |
| |
| TEST(ValidateUtf8, InvalidContinuations) { |
| // 1 test for the first following byte of an initial two byte value not having the high bit. |
| EXPECT_VALID_STRING("\xc2\x80"); |
| EXPECT_INVALID_STRING("\xc2\x7f", "first byte following two byte value not starting with 0b10"); |
| |
| // 2 tests for the first and second following byte of an initial three byte value not having the |
| // high bit set. |
| EXPECT_INVALID_STRING("\xe1\x7f\x80", |
| "first byte following three byte value not starting with 0b10"); |
| EXPECT_INVALID_STRING("\xe1\x80\x7f", |
| "second byte following three byte value not starting with 0b10"); |
| |
| // 3 tests for the first, second, and third following byte of an initial four byte value not |
| // having the high bit set. |
| EXPECT_VALID_STRING("\xf0\x90\x80\x80"); |
| EXPECT_INVALID_STRING("\xf0\x7f\x80\x80", |
| "first byte following four byte value not starting with 0b10"); |
| EXPECT_INVALID_STRING("\xf0\x90\x7f\x80", |
| "second byte following four byte value not starting with 0b10"); |
| EXPECT_INVALID_STRING("\xf0\x90\x80\x7f", |
| "third byte following four byte value not starting with 0b10"); |
| } |
| |
| TEST(ValidateUtf8, OnlyShortestEncodingIsValid) { |
| // All encodings of slash, only the shortest is valid. |
| // |
| // For further details, see "code unit" defined to be 'The minimal bit |
| // combination that can represent a unit of encoded text for processing or |
| // interchange.' |
| EXPECT_VALID_STRING("\x2f"); |
| EXPECT_INVALID_STRING("\xc0\xaf", "slash (2)"); |
| EXPECT_INVALID_STRING("\xe0\x80\xaf", "slash (3)"); |
| EXPECT_INVALID_STRING("\xf0\x80\x80\xaf", "slash (4)"); |
| } |
| |
| TEST(ValidateUtf8, ValidNoncharacterCodepoints) { |
| EXPECT_VALID_STRING("\xd8\x9d"); // U+061D |
| EXPECT_VALID_STRING("\xd7\xb6"); // U+05F6 |
| EXPECT_VALID_STRING("\xe0\xab\xb4"); // U+0AF4 |
| EXPECT_VALID_STRING("\xe0\xb1\x92"); // U+0C52 |
| EXPECT_VALID_STRING("\xf0\x9e\x91\x94"); // U+1E454 |
| EXPECT_VALID_STRING("\xf0\x9f\xa5\xb8"); // U+1F978 |
| } |
| |
| TEST(ValidateUtf8, Various) { |
| EXPECT_VALID_STRING(""); |
| EXPECT_VALID_STRING("a"); |
| EXPECT_VALID_STRING("€"); // \xe2\x82\xac |
| |
| // Mix and match from MinMaxCodeUnitsAndMinusOneAndPlusOne |
| EXPECT_VALID_STRING("\x00\xf4\x8f\xbf\xbf\x7f\xf0\x90\x80\x80\xc2\x80"); |
| EXPECT_VALID_STRING("\xdf\xbf\xef\xbf\xbf\xe1\x80\x80"); |
| |
| // UTF-8 BOM |
| EXPECT_VALID_STRING("\xef\xbb\xbf"); |
| EXPECT_INVALID_STRING("\xef", "Partial UTF-8 BOM (1)"); |
| EXPECT_INVALID_STRING("\xef\xbb", "Partial UTF-8 BOM (2)"); |
| |
| EXPECT_INVALID_STRING("\xdf\x80\x80", "invalid partial sequence"); |
| EXPECT_INVALID_STRING("\xe0\x80\x80", "long U+0000, non shortest form"); |
| EXPECT_VALID_STRING("\xe1\x80\x80"); |
| } |
| |
| TEST(ValidateUtf8, IncompleteCodepointEndOfString) { |
| EXPECT_INVALID_STRING("\xc2", "incomplete 2-byte codepoint"); |
| EXPECT_INVALID_STRING("\xd0", "incomplete 2-byte codepoint"); |
| |
| EXPECT_INVALID_STRING("\xe0", "incomplete 3-byte codepoint"); |
| EXPECT_INVALID_STRING("\xe0\xa9", "incomplete 3-byte codepoint"); |
| EXPECT_INVALID_STRING("\xed", "incomplete 3-byte codepoint"); |
| EXPECT_INVALID_STRING("\xed\x9f", "incomplete 3-byte codepoint"); |
| EXPECT_INVALID_STRING("\xea", "incomplete 3-byte codepoint"); |
| |
| EXPECT_INVALID_STRING("\xf0", "incomplete 4-byte codepoint"); |
| EXPECT_INVALID_STRING("\xf0\xaa", "incomplete 4-byte codepoint"); |
| EXPECT_INVALID_STRING("\xf0\xaa\x80", "incomplete 4-byte codepoint"); |
| EXPECT_INVALID_STRING("\xf4", "incomplete 4-byte codepoint"); |
| EXPECT_INVALID_STRING("\xf4\x8f", "incomplete 4-byte codepoint"); |
| EXPECT_INVALID_STRING("\xf4\xbf\xbf", "incomplete 4-byte codepoint"); |
| } |
| |
| TEST(ValidateUtf8, InvalidSpecialCharacterRanges) { |
| // [0x80, 0xC1] |
| EXPECT_INVALID_STRING("\x80", "invalid 1st byte character in range [0x80, 0xC1]"); |
| EXPECT_INVALID_STRING("\x9d\x9d\x0a", "invalid 1st byte character in range [0x80, 0xC1]"); |
| EXPECT_INVALID_STRING("\xa0", "invalid 1st byte character in range [0x80, 0xC1]"); |
| EXPECT_INVALID_STRING("\xb6", "invalid 1st byte character in range [0x80, 0xC1]"); |
| EXPECT_INVALID_STRING("\xc1", "invalid 1st byte character in range [0x80, 0xC1]"); |
| |
| // 0xE0 followed by something not in [0xA0, 0xBF] |
| EXPECT_INVALID_STRING("\xe0\x16\xc1", "0xE0 followed by something not in [0xA0, 0xBF]"); |
| EXPECT_INVALID_STRING("\xe0\xc0\xbf", "0xE0 followed by something not in [0xA0, 0xBF]"); |
| |
| // 0xED followed by something not in [0x80, 0x9F] |
| EXPECT_INVALID_STRING("\xed\x7f\xbf", "0xED followed by something not in [0x80, 0x9F]"); |
| EXPECT_INVALID_STRING("\xed\x7f\xbf", "0xED followed by something not in [0x80, 0x9F]"); |
| |
| // 0xF0 followed by something not in [0x90, 0xBF] |
| EXPECT_INVALID_STRING("\xf0\x8e\xbf", "0xF0 followed by something not in [0x90, 0xBF]"); |
| EXPECT_INVALID_STRING("\xf0\xc1\xbf", "0xF0 followed by something not in [0x90, 0xBF]"); |
| |
| // 0xF4 followed by something not in [0x80-0x8F] |
| EXPECT_INVALID_STRING("\xf4\x7d\xbc", "0xF4 followed by something not in [0x80, 0x8F]"); |
| EXPECT_INVALID_STRING("\xf4\x92\xa8", "0xF4 followed by something not in [0x80, 0x8F]"); |
| } |
| |
| // All the following test cases are taken from Chromium's |
| // streaming_utf8_validator_unittest.cc |
| // |
| // Some are duplicative to other tests, and have been kept to ease |
| // comparison and translation of the tests. |
| |
| TEST(ValidateUtf8, ChromiumSimple) { |
| EXPECT_VALID_STRING("\r"); |
| EXPECT_VALID_STRING("\n"); |
| EXPECT_VALID_STRING("a"); |
| EXPECT_VALID_STRING("\xc2\x81"); |
| EXPECT_VALID_STRING("\xe1\x80\xbf"); |
| EXPECT_VALID_STRING("\xf1\x80\xa0\xbf"); |
| EXPECT_VALID_STRING("\xef\xbb\xbf"); // UTF-8 BOM |
| } |
| |
| TEST(ValidateUtf8, ChromiumAlwaysInvalidBytes) { |
| EXPECT_INVALID_STRING("\xc0", ""); |
| EXPECT_INVALID_STRING("\xc1", ""); |
| EXPECT_INVALID_STRING("\xf5", ""); |
| EXPECT_INVALID_STRING("\xf6", ""); |
| EXPECT_INVALID_STRING("\xf7", ""); |
| EXPECT_INVALID_STRING("\xf8", ""); |
| EXPECT_INVALID_STRING("\xf9", ""); |
| EXPECT_INVALID_STRING("\xfa", ""); |
| EXPECT_INVALID_STRING("\xfb", ""); |
| EXPECT_INVALID_STRING("\xfc", ""); |
| EXPECT_INVALID_STRING("\xfd", ""); |
| EXPECT_INVALID_STRING("\xfe", ""); |
| EXPECT_INVALID_STRING("\xff", ""); |
| } |
| |
| TEST(ValidateUtf8, ChromiumSurrogateCodepoints) { |
| EXPECT_INVALID_STRING("\xed\xa0\x80", "U+D800, high surrogate, first"); |
| EXPECT_INVALID_STRING("\xed\xb0\x80", "low surrogate, first"); |
| EXPECT_INVALID_STRING("\xed\xbf\xbf", "low surrogate, last"); |
| } |
| |
| TEST(ValidateUtf8, ChromiumOverlongSequences) { |
| EXPECT_INVALID_STRING("\xc0\x80", "U+0000"); |
| EXPECT_INVALID_STRING("\xc1\x80", "\"A\""); |
| EXPECT_INVALID_STRING("\xc1\x81", "\"B\""); |
| EXPECT_INVALID_STRING("\xe0\x80\x80", "U+0000"); |
| EXPECT_INVALID_STRING("\xe0\x82\x80", "U+0080"); |
| EXPECT_INVALID_STRING("\xe0\x9f\xbf", "U+07ff"); |
| EXPECT_INVALID_STRING("\xf0\x80\x80\x8D", "U+000D"); |
| EXPECT_INVALID_STRING("\xf0\x80\x82\x91", "U+0091"); |
| EXPECT_INVALID_STRING("\xf0\x80\xa0\x80", "U+0800"); |
| EXPECT_INVALID_STRING("\xf0\x8f\xbb\xbf", "U+FEFF (BOM)"); |
| EXPECT_INVALID_STRING("\xf8\x80\x80\x80\xbf", "U+003F"); |
| EXPECT_INVALID_STRING("\xfc\x80\x80\x80\xa0\xa5", ""); |
| } |
| |
| TEST(ValidateUtf8, ChromiumBeyondU10FFFF) { |
| // Beyond U+10FFFF |
| EXPECT_INVALID_STRING("\xf4\x90\x80\x80", "U+110000"); |
| EXPECT_INVALID_STRING("\xf5\xaf\xb6\x96", "First byte beyond 0xF4"); |
| EXPECT_INVALID_STRING("\xf8\xa0\xbf\x80\xbf", "5 bytes"); |
| EXPECT_INVALID_STRING("\xfc\x9c\xbf\x80\xbf\x80", "6 bytes"); |
| } |
| |
| TEST(ValidateUtf8, ChromiumUtf16Boms) { |
| // BOMs in UTF-16(BE|LE) |
| EXPECT_INVALID_STRING("\xfe\xff", "BOMs in UTF-16 BE"); |
| EXPECT_INVALID_STRING("\xff\xfe", "BOMs in UTF-16 LE"); |
| } |