// blob: b0b9688ae15ad531bce958d38e6f6bd04ae88455 [file] [log] [blame]
// Copyright 2018 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "src/lib/utf_conversion/utf_conversion.h"
#include <lib/stdcompat/bit.h>
#include <stdio.h>
#include <iterator>
#include <fbl/algorithm.h>
#include <zxtest/zxtest.h>
namespace {
// True when the build target stores multi-byte integers most-significant byte
// first.
constexpr bool kNativeIsBigEndian = (cpp20::endian::native == cpp20::endian::big);

// Conversion flag naming the byte order that the host itself uses.
constexpr uint32_t HOST_ENDIAN_FLAG =
    kNativeIsBigEndian ? UTF_CONVERT_FLAG_FORCE_BIG_ENDIAN : UTF_CONVERT_FLAG_FORCE_LITTLE_ENDIAN;

// Conversion flag naming the byte order opposite to the host's.
constexpr uint32_t INVERT_ENDIAN_FLAG =
    kNativeIsBigEndian ? UTF_CONVERT_FLAG_FORCE_LITTLE_ENDIAN : UTF_CONVERT_FLAG_FORCE_BIG_ENDIAN;
// Checks one UTF-8 conversion result against its reference encoding.
//
//   expected, expected_len : reference bytes and how many of them to compare
//   actual, actual_bytes   : buffer that received the output and its size
//   enc_len                : length reported back by the conversion
//   msg                    : C string attached to each failure through "%s"
//
// Three assertions are made in order: the buffer size is at least
// expected_len, expected_len equals enc_len, and the first expected_len bytes
// of |expected| and |actual| are identical.
//
// The arguments are substituted without parentheses, and expected_len and msg
// each appear three times in the expansion, so pass simple, side-effect-free
// expressions. The do/while(false) wrapper lets the macro be used as a single
// statement.
#define ASSERT_UTF8_EQ(expected, expected_len, actual, actual_bytes, enc_len, msg) \
do { \
ASSERT_GE(actual_bytes, expected_len, "%s", msg); \
ASSERT_EQ(expected_len, enc_len, "%s", msg); \
ASSERT_BYTES_EQ(expected, actual, expected_len, "%s", msg); \
} while (false)
// Returns |x| with its high and low bytes exchanged.
constexpr uint16_t ByteSwap(uint16_t x) {
  const unsigned int high_moved_down = static_cast<unsigned int>(x) >> 8;
  const unsigned int low_moved_up = (static_cast<unsigned int>(x) & 0xffu) << 8;
  return static_cast<uint16_t>(high_moved_down | low_moved_up);
}
// Verifies that utf16_to_utf8 reports ZX_ERR_INVALID_ARGS for invalid argument
// combinations without modifying its outputs, and that a null destination is
// accepted when the destination length is zero.
TEST(UTF16To8TestCase, BadArgs) {
  // The source is given a defined value up front so that none of the calls
  // below is ever handed the address of an indeterminate code unit. 0xAB is
  // also the value the final (successful) conversion needs.
  uint16_t src = 0xAB;
  uint8_t dst = 0xFE;
  size_t dst_len = 1;
  zx_status_t res;

  // Bad destination buffer with non-zero destination length
  res = utf16_to_utf8(&src, 1, nullptr, &dst_len);
  ASSERT_EQ(ZX_ERR_INVALID_ARGS, res, "null dst should fail with INVALID_ARGS");
  ASSERT_EQ(1, dst_len, "dst_len modified after conversion with invalid args");

  // Bad dest len pointer
  res = utf16_to_utf8(&src, 1, &dst, nullptr);
  ASSERT_EQ(ZX_ERR_INVALID_ARGS, res, "null dst_len should fail with INVALID_ARGS");
  ASSERT_EQ(0xFE, dst, "dst modified after conversion with invalid args");

  // Bad (undefined) flags
  res = utf16_to_utf8(&src, 1, &dst, &dst_len, 0x80000000);
  ASSERT_EQ(ZX_ERR_INVALID_ARGS, res, "undefined flags should fail with INVALID_ARGS");
  ASSERT_EQ(1, dst_len, "dst_len modified after conversion with invalid args");
  ASSERT_EQ(0xFE, dst, "dst modified after conversion with invalid args");

  // A null dest buffer is allowed if (and only if) the dst_len is zero.
  // Practical use cases include using the converter to determine the length
  // needed to hold a converted string.
  dst_len = 0;
  res = utf16_to_utf8(&src, 1, nullptr, &dst_len);
  ASSERT_OK(res, "null dst with zero dst_len should succeed");
  ASSERT_EQ(2, dst_len, "encoded size of 0xAB should be 2!");
}
// Verifies that converting an empty source (zero length, or a null source
// pointer) succeeds, reports an encoded length of zero, and leaves the
// destination buffer untouched.
TEST(UTF16To8TestCase, EmptySource) {
  // Only the address of |src| is needed (it is passed with a length of 0), but
  // it is still given a defined value so the test never hands the converter
  // the address of an indeterminate object.
  uint16_t src = 0;
  static constexpr uint8_t kExpected[] = {0xA1, 0xB2, 0xC3, 0xD4};
  uint8_t actual[sizeof(kExpected)];
  size_t dst_len;
  zx_status_t res;

  // Pre-fill the destination with a known pattern so that any write by the
  // converter can be detected afterwards.
  memcpy(actual, kExpected, sizeof(actual));

  // Check to make sure that attempting to encode a zero length source results
  // in a length of zero and no changes to the destination buffer.
  dst_len = sizeof(actual);
  res = utf16_to_utf8(&src, 0, actual, &dst_len);
  ASSERT_OK(res, "zero length string conversion failed");
  ASSERT_EQ(0, dst_len, "dst_len should be zero after zero length string conversion");
  ASSERT_BYTES_EQ(kExpected, actual, sizeof(actual),
                  "dst buffer modified after zero length string conversion");

  // Same expectations when the source pointer is null, even though the source
  // length passed alongside it is non-zero.
  dst_len = sizeof(actual);
  res = utf16_to_utf8(nullptr, 1, actual, &dst_len);
  ASSERT_OK(res, "null source string conversion failed");
  ASSERT_EQ(0, dst_len, "dst_len should be zero after null source string conversion");
  ASSERT_BYTES_EQ(kExpected, actual, sizeof(actual),
                  "dst buffer modified after null source string conversion");
}
// Converts single UTF-16 code units that lie outside the surrogate range and
// checks the 1-, 2- and 3-byte UTF-8 sequences that come back.
TEST(UTF16To8TestCase, SimpleCodepoints) {
  struct Vector {
    uint16_t src;          // Code unit handed to the converter.
    uint8_t expected[3];   // Reference UTF-8 bytes (unused tail is zero).
    size_t expected_len;   // Number of meaningful bytes in |expected|.
  };
  static const Vector kVectors[] = {
      // 1 byte UTF-8 codepoints (U+0000, U+007F)
      {0x0000, {0x00}, 1},
      {0x0001, {0x01}, 1},
      {0x007f, {0x7f}, 1},
      // 2 byte UTF-8 codepoints (U+0080, U+07FF)
      {0x0080, {0xC2, 0x80}, 2},
      {0x0456, {0xD1, 0x96}, 2},
      {0x07FF, {0xDF, 0xBF}, 2},
      // 3 byte UTF-8 codepoints (U+0800, U+FFFF)
      // Note: the (theoretically illegal) unpaired surrogate range
      // (U+D800, U+DFFF) is deliberately left out here; unpaired surrogates
      // have a test of their own.
      {0x0800, {0xE0, 0xA0, 0x80}, 3},
      {0x4567, {0xE4, 0x95, 0xA7}, 3},
      {0xD7FF, {0xED, 0x9F, 0xBF}, 3},
      {0xE000, {0xEE, 0x80, 0x80}, 3},
      {0xE456, {0xEE, 0x91, 0x96}, 3},
      {0xFFFF, {0xEF, 0xBF, 0xBF}, 3},
  };

  uint8_t actual[3];
  for (const Vector& vec : kVectors) {
    char label[64];
    snprintf(label, sizeof(label), "case id [0x%04hx]", vec.src);

    // Poison the output so stale bytes cannot be mistaken for a result.
    ::memset(actual, 0xAB, sizeof(actual));
    size_t out_len = sizeof(actual);

    const zx_status_t status = utf16_to_utf8(&vec.src, 1, actual, &out_len);
    ASSERT_OK(status, "%s", label);
    ASSERT_LE(vec.expected_len, sizeof(vec.expected), "%s", label);
    ASSERT_UTF8_EQ(vec.expected, vec.expected_len, actual, sizeof(actual), out_len, label);
  }
}
// Converts well-formed surrogate pairs and checks the UTF-8 output.
TEST(UTF16To8TestCase, PairedSurrogates) {
  // Every paired surrogate encodes a codepoint in (U+010000, U+10FFFF), which
  // always takes 4 bytes in UTF-8.
  struct Vector {
    uint16_t src[2];      // High surrogate followed by low surrogate.
    uint8_t expected[4];  // Reference UTF-8 bytes.
  };
  static const Vector kVectors[] = {
      {{0xD800, 0xDC00}, {0xF0, 0x90, 0x80, 0x80}},  // U+10000
      {{0xD811, 0xDD67}, {0xF0, 0x94, 0x95, 0xA7}},  // U+14567
      {{0xDA6F, 0xDCDE}, {0xF2, 0xAB, 0xB3, 0x9E}},  // U+ABCDE
      {{0xDBBF, 0xDFFF}, {0xF3, 0xBF, 0xBF, 0xBF}},  // U+FFFFF
      {{0xDBC0, 0xDC00}, {0xF4, 0x80, 0x80, 0x80}},  // U+100000
      {{0xDBD1, 0xDD67}, {0xF4, 0x84, 0x95, 0xA7}},  // U+104567
      {{0xDBFF, 0xDFFF}, {0xF4, 0x8F, 0xBF, 0xBF}},  // U+10FFFF
  };

  uint8_t actual[4];
  for (const Vector& vec : kVectors) {
    char label[64];
    snprintf(label, sizeof(label), "case id [0x%04hx : 0x%04hx]", vec.src[0], vec.src[1]);

    // Poison the output so stale bytes cannot be mistaken for a result.
    ::memset(actual, 0xAB, sizeof(actual));
    size_t out_len = sizeof(actual);

    const zx_status_t status = utf16_to_utf8(vec.src, std::size(vec.src), actual, &out_len);
    ASSERT_OK(status, "%s", label);
    ASSERT_UTF8_EQ(vec.expected, sizeof(vec.expected), actual, sizeof(actual), out_len, label);
  }
}
// Converts lone surrogates twice: once with default flags (the output must be
// the replacement character) and once with
// UTF_CONVERT_FLAG_PRESERVE_UNPAIRED_SURROGATES (the surrogate itself must be
// encoded).
TEST(UTF16To8TestCase, UnpairedSurrogates) {
  struct Vector {
    uint16_t src;         // The lone surrogate.
    uint8_t expected[3];  // Its UTF-8 encoding when preserved.
  };
  static const Vector kVectors[] = {
      // All unpaired surrogates are technically supposed to be illegal, but
      // apparently there are systems out there that use them anyway
      // (Wikipedia claims that Windows allows unpaired surrogates in file
      // names encoded using UTF-16)
      //
      // Unpaired surrogates are 16 bits wide, so they will require a 3-byte
      // UTF-8 encoding.
      {0xD800, {0xED, 0xA0, 0x80}}, {0xD945, {0xED, 0xA5, 0x85}}, {0xDBFF, {0xED, 0xAF, 0xBF}},
      {0xDC00, {0xED, 0xB0, 0x80}}, {0xDD45, {0xED, 0xB5, 0x85}}, {0xDFFF, {0xED, 0xBF, 0xBF}},
  };

  // UTF-8 encoding of the replacement character (U+FFFD).
  uint8_t replace[3] = {0xEF, 0xBF, 0xBD};
  uint8_t actual[3];

  for (const Vector& vec : kVectors) {
    char label[64];
    size_t out_len;
    zx_status_t status;

    // First pass: no preserve flag, so the converter should emit the encoded
    // replacement character in place of the surrogate.
    snprintf(label, sizeof(label), "case id [0x%04hx, replace]", vec.src);
    ::memset(actual, 0xAB, sizeof(actual));
    out_len = sizeof(actual);
    status = utf16_to_utf8(&vec.src, 1, actual, &out_len);
    ASSERT_OK(status, "%s", label);
    ASSERT_UTF8_EQ(replace, sizeof(replace), actual, sizeof(actual), out_len, label);

    // Second pass: ask the converter to keep the unpaired surrogate as-is.
    snprintf(label, sizeof(label), "case id [0x%04hx, preserve]", vec.src);
    ::memset(actual, 0xAB, sizeof(actual));
    out_len = sizeof(actual);
    status = utf16_to_utf8(&vec.src, 1, actual, &out_len,
                           UTF_CONVERT_FLAG_PRESERVE_UNPAIRED_SURROGATES);
    ASSERT_OK(status, "%s", label);
    ASSERT_UTF8_EQ(vec.expected, sizeof(vec.expected), actual, sizeof(actual), out_len, label);
  }
}
// Converts the same four-character string into destination buffers of several
// sizes. In every case the call must succeed and report the full required
// length; only as many bytes as were offered may be written.
TEST(UTF16To8TestCase, BufferLengths) {
  const uint16_t src[] = {'T', 'e', 's', 't'};
  const uint8_t expected[] = {'T', 'e', 's', 't'};
  uint8_t actual[16];
  static_assert(sizeof(expected) <= sizeof(actual),
                "'actual' buffer must be large enough to hold 'expected' result");

  // Destination sizes under test:
  //
  // 1) Exactly what is required.
  // 2) More than what is required.
  // 3) Less than what is required.
  // 4) Zero, paired with a NULL destination buffer.
  static const size_t kProvidedLengths[] = {sizeof(expected), sizeof(actual),
                                            sizeof(expected) >> 1, 0};

  for (const size_t provided : kProvidedLengths) {
    char label[64];
    snprintf(label, sizeof(label), "case id [needed %zu, provided %zu]", sizeof(expected),
             provided);

    // Poison the whole buffer so untouched bytes can be recognised later.
    ::memset(actual, 0xAB, sizeof(actual));

    size_t out_len = provided;
    ASSERT_LE(out_len, sizeof(actual), "%s", label);

    uint8_t* dest = nullptr;
    if (provided != 0) {
      dest = actual;
    }

    const zx_status_t status = utf16_to_utf8(src, std::size(src), dest, &out_len);
    ASSERT_OK(status, "%s", label);
    ASSERT_EQ(sizeof(expected), out_len, "%s", label);

    // Only the bytes that fit in the offered space can be compared.
    const size_t comparable = (provided < out_len) ? provided : out_len;
    ASSERT_BYTES_EQ(expected, actual, comparable, "%s", label);

    // Everything past the offered space must still hold the poison pattern.
    if (provided < sizeof(actual)) {
      uint8_t poison[sizeof(actual)];
      ::memset(poison, 0xAB, sizeof(poison));
      ASSERT_BYTES_EQ(actual + provided, poison, sizeof(actual) - provided, "%s", label);
    }
  }
}
// Converts a BOM-prefixed string, supplied once in host byte order and once
// byte-swapped, under every combination of the endianness and discard-BOM
// flags, and checks the output against a table of expected encodings.
TEST(UTF16To8TestCase, EndiannessAndBom) {
  static const struct {
    uint16_t src[5];
    bool host_order;  // True when |src| is stored in the host's byte order.
  } SOURCES[] = {{{0xFEFF, 'T', 'e', 's', 't'}, true},
                 {{
                      ByteSwap(0xFEFF),
                      ByteSwap('T'),
                      ByteSwap('e'),
                      ByteSwap('s'),
                      ByteSwap('t'),
                  },
                  false}};

  // The EXPECTED table below has static storage duration and stores pointers
  // to these arrays. It is initialized only once, so the arrays must have
  // static storage duration too; pointers to automatic arrays would dangle if
  // this test body were ever entered a second time.
  static constexpr uint8_t bom_removed[] = {'T', 'e', 's', 't'};
  static constexpr uint8_t bom_removed_inverted[] = {0xE5, 0x90, 0x80, 0xE6, 0x94, 0x80,
                                                     0xE7, 0x8C, 0x80, 0xE7, 0x90, 0x80};
  static constexpr uint8_t bom_encoded[] = {0xEF, 0xBB, 0xBF, 'T', 'e', 's', 't'};
  static constexpr uint8_t bom_encoded_inverted[] = {0xEF, 0xBF, 0xBE, 0xE5, 0x90,
                                                     0x80, 0xE6, 0x94, 0x80, 0xE7,
                                                     0x8C, 0x80, 0xE7, 0x90, 0x80};

  // Sized for the longest expected output.
  uint8_t actual[std::size(bom_encoded_inverted)];

#define EXPECT(e) \
  { e, sizeof(e) }
  // One row per flag combination. |host| is the expected output for the
  // host-order source, |inv| the expected output for the byte-swapped source.
  static const struct {
    uint32_t flags;
    struct {
      const uint8_t* exp;
      size_t len;
    } host;
    struct {
      const uint8_t* exp;
      size_t len;
    } inv;
  } EXPECTED[]{
      {0, EXPECT(bom_encoded), EXPECT(bom_encoded)},
      {UTF_CONVERT_FLAG_DISCARD_BOM, EXPECT(bom_removed), EXPECT(bom_removed)},
      {HOST_ENDIAN_FLAG, EXPECT(bom_encoded), EXPECT(bom_encoded_inverted)},
      {HOST_ENDIAN_FLAG | UTF_CONVERT_FLAG_DISCARD_BOM, EXPECT(bom_removed),
       EXPECT(bom_removed_inverted)},
      {INVERT_ENDIAN_FLAG, EXPECT(bom_encoded_inverted), EXPECT(bom_encoded)},
      {INVERT_ENDIAN_FLAG | UTF_CONVERT_FLAG_DISCARD_BOM, EXPECT(bom_removed_inverted),
       EXPECT(bom_removed)},
  };
#undef EXPECT

  for (const auto& s : SOURCES) {
    for (const auto& e : EXPECTED) {
      char case_id[64];
      zx_status_t res;
      size_t enc_len = sizeof(actual);

      ::memset(actual, 0xAB, sizeof(actual));

      // The source byte order is part of the id; without it the two sources
      // would produce identical labels for every flag combination.
      snprintf(case_id, sizeof(case_id), "case id [%s BOM, %s endian, %s source]",
               (e.flags & UTF_CONVERT_FLAG_DISCARD_BOM) ? "discard" : "encode",
               (e.flags & HOST_ENDIAN_FLAG)     ? "host"
               : (e.flags & INVERT_ENDIAN_FLAG) ? "invert"
                                                : "detect",
               s.host_order ? "host-order" : "inverted");

      res = utf16_to_utf8(s.src, std::size(s.src), actual, &enc_len, e.flags);
      ASSERT_OK(res, "%s", case_id);

      if (s.host_order) {
        ASSERT_UTF8_EQ(e.host.exp, e.host.len, actual, sizeof(actual), enc_len, case_id);
      } else {
        ASSERT_UTF8_EQ(e.inv.exp, e.inv.len, actual, sizeof(actual), enc_len, case_id);
      }
    }
  }
}
// Converts single-byte UTF-8 inputs and checks that each yields exactly one
// UTF-16 code unit carrying the same value.
TEST(UTF8To16TestCase, SimpleCodepoints) {
  // Only one-byte code points are currently handled.
  constexpr uint8_t kInputs[] = {0x00, 0x01, 0x7f};

  for (const uint8_t input : kInputs) {
    uint16_t converted[16];
    size_t converted_len = std::size(converted);

    ASSERT_OK(utf8_to_utf16(&input, 1, converted, &converted_len));
    ASSERT_EQ(converted_len, 1);
    ASSERT_EQ(converted[0], static_cast<uint16_t>(input));
  }
}
// Converts the same four-character string into destination buffers of several
// sizes. In every case the call must succeed and report the full required
// length; only as many code units as were offered may be written.
//
// All lengths handed to and returned from utf8_to_utf16 here are counts of
// uint16_t code units; byte counts are used only for memset and for the
// byte-wise comparisons.
TEST(UTF8To16TestCase, BufferLengths) {
  const uint8_t src[] = {'T', 'e', 's', 't'};
  const uint16_t expected[] = {'T', 'e', 's', 't'};
  uint16_t actual[16];

  // Perform a conversion, but test multiple cases.
  //
  // 1) The destination buffer size is exactly what is required.
  // 2) The destination buffer size is more than what is required.
  // 3) The destination buffer size is less than what is required.
  // 4) The destination buffer is NULL and buffer size is 0.
  constexpr size_t DST_LENGTHS[] = {std::size(expected), std::size(actual),
                                    std::size(expected) >> 1, 0};

  for (const auto& d : DST_LENGTHS) {
    char case_id[64];
    size_t encoded_len = d;
    zx_status_t res;

    snprintf(case_id, sizeof(case_id), "case id [needed %zu, provided %zu]", std::size(expected),
             d);
    ::memset(actual, 0xAB, sizeof(actual));

    // |encoded_len| counts code units, so it is bounded by the element count
    // of |actual|, not by its size in bytes.
    ASSERT_LE(encoded_len, std::size(actual), "%s", case_id);

    uint16_t* dest = (d == 0 ? nullptr : actual);
    res = utf8_to_utf16(src, std::size(src), dest, &encoded_len);
    ASSERT_OK(res, "%s", case_id);
    ASSERT_EQ(std::size(expected), encoded_len, "%s", case_id);

    static_assert(sizeof(expected) <= sizeof(actual),
                  "'actual' buffer must be large enough to hold 'expected' result");
    ASSERT_BYTES_EQ(expected, actual, std::min(d, encoded_len) * sizeof(uint16_t), "%s", case_id);

    // Everything past the offered space must still hold the 0xAB fill.
    if (d < std::size(actual)) {
      // Same element count as |actual|, so it covers any tail being compared.
      uint16_t pattern[std::size(actual)];
      ::memset(pattern, 0xAB, sizeof(pattern));
      ASSERT_BYTES_EQ(actual + d, pattern, (std::size(actual) - d) * sizeof(uint16_t), "%s",
                      case_id);
    }
  }
}
} // namespace