blob: 3f1996ea0e8f3ac5653448024e28737a30343056 [file] [log] [blame]
// Copyright 2018 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include <endian.h>
#include <stdio.h>
#include <unittest/unittest.h>
#include <utf_conversion/utf_conversion.h>
#include <fbl/algorithm.h>
// Conversion flags used by the endianness tests below:
//
//   HOST_ENDIAN_FLAG   - the "force endianness" flag which matches the byte
//                        order of the machine the test is compiled for.
//   INVERT_ENDIAN_FLAG - the "force endianness" flag for the opposite order.
//
// Any target which does not report BIG_ENDIAN is treated as little endian.
#if BYTE_ORDER == BIG_ENDIAN
static constexpr uint32_t INVERT_ENDIAN_FLAG = UTF_CONVERT_FLAG_FORCE_LITTLE_ENDIAN;
static constexpr uint32_t HOST_ENDIAN_FLAG = UTF_CONVERT_FLAG_FORCE_BIG_ENDIAN;
#else
static constexpr uint32_t INVERT_ENDIAN_FLAG = UTF_CONVERT_FLAG_FORCE_BIG_ENDIAN;
static constexpr uint32_t HOST_ENDIAN_FLAG = UTF_CONVERT_FLAG_FORCE_LITTLE_ENDIAN;
#endif
// Assert that a UTF-8 conversion produced the expected result.
//
//   expected     - pointer to the expected UTF-8 bytes.
//   expected_len - number of bytes in |expected|.
//   actual       - pointer to the buffer the converter wrote into.
//   actual_bytes - capacity of |actual|, in bytes.
//   enc_len      - encoded length reported by the converter.
//   msg          - message passed through to each underlying assertion.
//
// Three checks are made, in order: the destination capacity is compared
// against |expected_len| (ASSERT_GE), the reported length is compared against
// |expected_len| (ASSERT_EQ), and the first |expected_len| bytes of |expected|
// and |actual| are compared (ASSERT_BYTES_EQ).
//
// Note: |expected_len| and |msg| each appear three times in the expansion, so
// arguments with side effects should not be passed to this macro.
#define ASSERT_UTF8_EQ(expected, expected_len, actual, actual_bytes, enc_len, msg) \
do { \
ASSERT_GE(actual_bytes, expected_len, msg); \
ASSERT_EQ(expected_len, enc_len, msg); \
ASSERT_BYTES_EQ(expected, actual, expected_len, msg); \
} while(false)
// Check how utf16_to_utf8 reacts to invalid argument combinations: each
// rejected call must return ZX_ERR_INVALID_ARGS and leave the caller's output
// variables untouched.  Also checks the one permitted "null destination" case
// (a null buffer together with a zero destination length).
static bool utf16to8_bad_args(void) {
    BEGIN_TEST;

    // Every argument is given a defined value up front.  The rejected calls
    // below are not expected to read the source at all, but if validation ever
    // regressed we want a deterministic test failure, not a read of an
    // indeterminate value.
    uint16_t src = 0xAB;
    uint8_t dst = 0xFE;
    size_t dst_len = 1;
    zx_status_t res;

    // Bad destination buffer with non-zero destination length
    res = utf16_to_utf8(&src, 1, nullptr, &dst_len);
    ASSERT_EQ(ZX_ERR_INVALID_ARGS, res, "null dst should fail with INVALID_ARGS");
    ASSERT_EQ(1, dst_len, "dst_len modified after conversion with invalid args");

    // Bad dest len pointer
    res = utf16_to_utf8(&src, 1, &dst, nullptr);
    ASSERT_EQ(ZX_ERR_INVALID_ARGS, res, "null dst_len should fail with INVALID_ARGS");
    ASSERT_EQ(0xFE, dst, "dst modified after conversion with invalid args");

    // Bad (undefined) flags.  dst_len is still 1 here; the previous checks
    // established that it was never written to.
    res = utf16_to_utf8(&src, 1, &dst, &dst_len, 0x80000000);
    ASSERT_EQ(ZX_ERR_INVALID_ARGS, res, "undefined flags should fail with INVALID_ARGS");
    ASSERT_EQ(1, dst_len, "dst_len modified after conversion with invalid args");
    ASSERT_EQ(0xFE, dst, "dst modified after conversion with invalid args");

    // A null dest buffer is allowed if (and only if) the dst_len is zero.
    // Practical use cases include using the converter to determine the length
    // needed to hold a converted string.
    dst_len = 0;
    res = utf16_to_utf8(&src, 1, nullptr, &dst_len);
    ASSERT_EQ(ZX_OK, res, "null dst with zero dst_len should succeed");
    ASSERT_EQ(2, dst_len, "encoded size of 0xAB should be 2!");

    END_TEST;
}
// Check that converting an "empty" source succeeds, reports an encoded length
// of zero, and leaves the destination buffer untouched.  Two flavors of empty
// are exercised: a valid source pointer with a length of zero, and a null
// source pointer with a non-zero length.
static bool utf16to8_empty_source(void) {
    BEGIN_TEST;

    // The source is never expected to be read in this test, but it is given a
    // defined value so that a misbehaving converter cannot read an
    // indeterminate one.
    const uint16_t src = 0;
    static const uint8_t expected[] = { 0xA1, 0xB2, 0xC3, 0xD4 };
    uint8_t actual[sizeof(expected)];
    size_t dst_len;
    zx_status_t res;

    // Check to make sure that attempting to encode a zero length source results
    // in a length of zero and no changes to the destination buffer.
    memcpy(actual, expected, sizeof(actual));
    dst_len = sizeof(actual);
    res = utf16_to_utf8(&src, 0, actual, &dst_len);
    ASSERT_EQ(ZX_OK, res, "zero length string conversion failed");
    ASSERT_EQ(0, dst_len, "dst_len should be zero after zero length string conversion");
    ASSERT_BYTES_EQ(expected, actual, sizeof(actual),
                    "dst buffer modified after zero length string conversion");

    // Same expectations when the source pointer itself is null, even though the
    // source length claims there is data.  |actual| still holds the pattern
    // copied in above; the previous check verified that it was not modified.
    dst_len = sizeof(actual);
    res = utf16_to_utf8(nullptr, 1, actual, &dst_len);
    ASSERT_EQ(ZX_OK, res, "null source string conversion failed");
    ASSERT_EQ(0, dst_len, "dst_len should be zero after null source string conversion");
    ASSERT_BYTES_EQ(expected, actual, sizeof(actual),
                    "dst buffer modified after null source string conversion");

    END_TEST;
}
static bool utf16to8_simple_codepoints(void) {
BEGIN_TEST;
static const struct {
uint16_t src;
uint8_t expected[3];
size_t expected_len;
} TEST_VECTORS[] = {
// 1 byte UTF-8 codepoints (U+0000, U+007F)
{ 0x0000, { 0x00 }, 1 },
{ 0x0001, { 0x01 }, 1 },
{ 0x007f, { 0x7f }, 1 },
// 2 byte UTF-8 codepoints (U+0080, U+07FF)
{ 0x0080, { 0xC2, 0x80 }, 2 },
{ 0x0456, { 0xD1, 0x96 }, 2 },
{ 0x07FF, { 0xDF, 0xBF }, 2 },
// 3 byte UTF-8 codepoints (U+0800, U+07FF)
// Note: we are skipping the (theoretically illegal) unpaired surrogate
// range (U+D800, U+DFFF) here. There is a separate test for support of
// unpaired surrogates.
{ 0x0800, { 0xE0, 0xA0, 0x80 }, 3 },
{ 0x4567, { 0xE4, 0x95, 0xA7 }, 3 },
{ 0xD7FF, { 0xED, 0x9F, 0xBF }, 3 },
{ 0xE000, { 0xEE, 0x80, 0x80 }, 3 },
{ 0xE456, { 0xEE, 0x91, 0x96 }, 3 },
{ 0xFFFF, { 0xEF, 0xBF, 0xBF }, 3 },
};
uint8_t actual[3];
for (const auto& v : TEST_VECTORS) {
char case_id[64];
size_t encoded_len = sizeof(actual);
zx_status_t res;
snprintf(case_id, sizeof(case_id), "case id [0x%04hx]", v.src);
::memset(actual, 0xAB, sizeof(actual));
res = utf16_to_utf8(&v.src, 1, actual, &encoded_len);
ASSERT_EQ(ZX_OK, res, case_id);
ASSERT_LE(v.expected_len, sizeof(v.expected), case_id);
ASSERT_UTF8_EQ(v.expected, v.expected_len,
actual, sizeof(actual),
encoded_len, case_id);
}
END_TEST;
}
static bool utf16to8_paired_surrogates(void) {
BEGIN_TEST;
// All paired surrogate encodings are going to be 4 byte UTF-8 codepoints (U+010000, U+10FFFF)
static const struct {
uint16_t src[2];
uint8_t expected[4];
} TEST_VECTORS[] = {
{ { 0xD800, 0xDC00 }, { 0xF0, 0x90, 0x80, 0x80 } }, // U+10000
{ { 0xD811, 0xDD67 }, { 0xF0, 0x94, 0x95, 0xA7 } }, // U+14567
{ { 0xDA6F, 0xDCDE }, { 0xF2, 0xAB, 0xB3, 0x9E } }, // U+ABCDE
{ { 0xDBBF, 0xDFFF }, { 0xF3, 0xBF, 0xBF, 0xBF } }, // U+FFFFF
{ { 0xDBC0, 0xDC00 }, { 0xF4, 0x80, 0x80, 0x80 } }, // U+100000
{ { 0xDBD1, 0xDD67 }, { 0xF4, 0x84, 0x95, 0xA7 } }, // U+104567
{ { 0xDBFF, 0xDFFF }, { 0xF4, 0x8F, 0xBF, 0xBF } }, // U+10FFFF
};
uint8_t actual[4];
for (const auto& v : TEST_VECTORS) {
char case_id[64];
size_t encoded_len = sizeof(actual);
zx_status_t res;
snprintf(case_id, sizeof(case_id), "case id [0x%04hx : 0x%04hx]", v.src[0], v.src[1]);
::memset(actual, 0xAB, sizeof(actual));
res = utf16_to_utf8(v.src, fbl::count_of(v.src), actual, &encoded_len);
ASSERT_EQ(ZX_OK, res, case_id);
ASSERT_UTF8_EQ(v.expected, sizeof(v.expected),
actual, sizeof(actual),
encoded_len, case_id);
}
END_TEST;
}
// Convert lone (unpaired) UTF-16 surrogates twice: once with default flags,
// where the replacement character is expected in the output, and once with
// UTF_CONVERT_FLAG_PRESERVE_UNPAIRED_SURROGATES, where the surrogate itself is
// expected to be encoded.
static bool utf16to8_unpaired_surrogates(void) {
    BEGIN_TEST;

    // An unpaired surrogate, along with the 3 bytes it is expected to encode to
    // when it is preserved.
    struct TestVector {
        uint16_t src;
        uint8_t expected[3];
    };

    static const TestVector TEST_VECTORS[] = {
        // All unpaired surrogates are technically supposed to be illegal, but
        // apparently there are systems out there which use them anyway
        // (Wikipedia claims that Windows allows unpaired surrogates in file
        // names encoded using UTF-16)
        //
        // Unpaired surrogates are 16 bits wide, so they will require a 3-byte
        // UTF-8 encoding.
        { 0xD800, { 0xED, 0xA0, 0x80 } },
        { 0xD945, { 0xED, 0xA5, 0x85 } },
        { 0xDBFF, { 0xED, 0xAF, 0xBF } },
        { 0xDC00, { 0xED, 0xB0, 0x80 } },
        { 0xDD45, { 0xED, 0xB5, 0x85 } },
        { 0xDFFF, { 0xED, 0xBF, 0xBF } },
    };

    // UTF-8 encoding of the replacement character (U+FFFD).
    const uint8_t replace[3] = { 0xEF, 0xBF, 0xBD };
    uint8_t actual[3];

    for (const auto& v : TEST_VECTORS) {
        char label[64];
        size_t encoded_len;
        zx_status_t res;

        // Pass 1: encode the unpaired surrogate without asking for it to be
        // preserved.  The output should be the encoded form of the replacement
        // character instead.
        ::memset(actual, 0xAB, sizeof(actual));
        snprintf(label, sizeof(label), "case id [0x%04hx, replace]", v.src);
        encoded_len = sizeof(actual);
        res = utf16_to_utf8(&v.src, 1, actual, &encoded_len);
        ASSERT_EQ(ZX_OK, res, label);
        ASSERT_UTF8_EQ(replace, sizeof(replace), actual, sizeof(actual),
                       encoded_len, label);

        // Pass 2: same source, but this time tell the converter to preserve the
        // unpaired surrogate.
        ::memset(actual, 0xAB, sizeof(actual));
        snprintf(label, sizeof(label), "case id [0x%04hx, preserve]", v.src);
        encoded_len = sizeof(actual);
        res = utf16_to_utf8(&v.src, 1, actual, &encoded_len,
                            UTF_CONVERT_FLAG_PRESERVE_UNPAIRED_SURROGATES);
        ASSERT_EQ(ZX_OK, res, label);
        ASSERT_UTF8_EQ(v.expected, sizeof(v.expected), actual, sizeof(actual),
                       encoded_len, label);
    }

    END_TEST;
}
static bool utf16to8_dst_buffer_lengths(void) {
BEGIN_TEST;
const uint16_t src[] = { 'T', 'e', 's', 't' };
const uint8_t expected[] = { 'T', 'e', 's', 't' };
uint8_t actual[16];
// Perform a conversion, but test three cases.
//
// 1) The destination buffer size is exactly what is required.
// 2) The destination buffer size is more than what is required.
// 3) The destination buffer size is less than what is required.
static const size_t DST_LENGTHS[] = { sizeof(expected), sizeof(actual), sizeof(expected) >> 1 };
for (const auto& d : DST_LENGTHS) {
char case_id[64];
size_t encoded_len = d;
zx_status_t res;
snprintf(case_id, sizeof(case_id), "case id [needed %zu, provided %zu]",
sizeof(expected), d);
::memset(actual, 0xAB, sizeof(actual));
ASSERT_LE(encoded_len, sizeof(actual), case_id);
res = utf16_to_utf8(src, fbl::count_of(src), actual, &encoded_len);
ASSERT_EQ(ZX_OK, res, case_id);
ASSERT_EQ(sizeof(expected), encoded_len, case_id);
static_assert(sizeof(expected) <= sizeof(actual),
"'actual' buffer must be large enough to hold 'expected' result");
ASSERT_BYTES_EQ(expected, actual, d < encoded_len ? d : encoded_len, case_id);
if (d < sizeof(actual)) {
uint8_t pattern[sizeof(actual)];
::memset(pattern, 0xAB, sizeof(pattern));
ASSERT_BYTES_EQ(actual + d, pattern, sizeof(actual) - d, case_id);
}
}
END_TEST;
}
// Convert a BOM-prefixed string, supplied in both host and byte-swapped order,
// under every combination of the "force endianness" and "discard BOM" flags,
// and check the output against the expectation for that source/flag pairing.
static bool utf16to8_endianness_and_bom(void) {
    BEGIN_TEST;

    // The same logical string (BOM followed by "Test"), stored once in host
    // byte order and once with every code unit byte-swapped.
    static const struct {
        uint16_t src[5];
        bool host_order;
    } SOURCES[] = {
        { { 0xFEFF, 'T', 'e', 's', 't' }, true },
        { { __bswap16(0xFEFF),
            __bswap16('T'),
            __bswap16('e'),
            __bswap16('s'),
            __bswap16('t'),
          }, false }
    };

    // Expected outputs.  The "inverted" forms are what results when the code
    // units are interpreted in the wrong byte order (e.g. 'T' == 0x0054 is
    // read as 0x5400, which encodes to E5 90 80).
    const uint8_t bom_removed[] = { 'T', 'e', 's', 't' };
    const uint8_t bom_removed_inverted[] = {
        0xE5, 0x90, 0x80, 0xE6, 0x94, 0x80, 0xE7,
        0x8C, 0x80, 0xE7, 0x90, 0x80 };
    const uint8_t bom_encoded[] = { 0xEF, 0xBB, 0xBF, 'T', 'e', 's', 't' };
    const uint8_t bom_encoded_inverted[] = {
        0xEF, 0xBF, 0xBE, 0xE5, 0x90, 0x80, 0xE6,
        0x94, 0x80, 0xE7, 0x8C, 0x80, 0xE7, 0x90,
        0x80 };

    // Sized to hold the largest of the expected outputs.
    uint8_t actual[fbl::count_of(bom_encoded_inverted)];

    // For each set of flags: the expected output for the host-order source
    // (|host|) and for the byte-swapped source (|inv|).
#define EXPECT(e) { e, sizeof(e) }
    static const struct {
        uint32_t flags;
        struct {
            const uint8_t* exp;
            size_t len;
        } host;
        struct {
            const uint8_t* exp;
            size_t len;
        } inv;
    } EXPECTED[] {
        // No forced endianness: both sources are expected to decode correctly.
        { 0,
          EXPECT(bom_encoded), EXPECT(bom_encoded) },
        { UTF_CONVERT_FLAG_DISCARD_BOM,
          EXPECT(bom_removed), EXPECT(bom_removed) },
        // Forced host endianness: the byte-swapped source is misinterpreted.
        { HOST_ENDIAN_FLAG,
          EXPECT(bom_encoded), EXPECT(bom_encoded_inverted) },
        { HOST_ENDIAN_FLAG | UTF_CONVERT_FLAG_DISCARD_BOM,
          EXPECT(bom_removed), EXPECT(bom_removed_inverted) },
        // Forced inverted endianness: the host-order source is misinterpreted.
        { INVERT_ENDIAN_FLAG,
          EXPECT(bom_encoded_inverted), EXPECT(bom_encoded) },
        { INVERT_ENDIAN_FLAG | UTF_CONVERT_FLAG_DISCARD_BOM,
          EXPECT(bom_removed_inverted), EXPECT(bom_removed) },
    };
#undef EXPECT

    for (const auto& s : SOURCES) {
        for (const auto& e : EXPECTED) {
            char case_id[64];
            zx_status_t res;
            size_t enc_len = sizeof(actual);

            ::memset(actual, 0xAB, sizeof(actual));

            // The case id names the source byte order as well as the flags;
            // each set of flags is run against both sources, and a failure
            // needs to say which of the two it came from.
            snprintf(case_id, sizeof(case_id), "case id [%s BOM, %s endian, %s src]",
                     (e.flags & UTF_CONVERT_FLAG_DISCARD_BOM) ? "discard" : "encode",
                     (e.flags & HOST_ENDIAN_FLAG) ? "host" :
                     (e.flags & INVERT_ENDIAN_FLAG) ? "invert" : "detect",
                     s.host_order ? "host-order" : "inverted");

            res = utf16_to_utf8(s.src, fbl::count_of(s.src), actual, &enc_len, e.flags);
            ASSERT_EQ(ZX_OK, res, case_id);

            if (s.host_order) {
                ASSERT_UTF8_EQ(e.host.exp, e.host.len, actual, sizeof(actual), enc_len, case_id);
            } else {
                ASSERT_UTF8_EQ(e.inv.exp, e.inv.len, actual, sizeof(actual), enc_len, case_id);
            }
        }
    }

    END_TEST;
}
// Register every UTF-16 to UTF-8 conversion test in this file as a member of
// the "utf_conversion_tests" test case.  Tests are listed in the same order as
// they are defined above.
BEGIN_TEST_CASE(utf_conversion_tests)
RUN_TEST(utf16to8_bad_args);
RUN_TEST(utf16to8_empty_source);
RUN_TEST(utf16to8_simple_codepoints);
RUN_TEST(utf16to8_paired_surrogates);
RUN_TEST(utf16to8_unpaired_surrogates);
RUN_TEST(utf16to8_dst_buffer_lengths);
RUN_TEST(utf16to8_endianness_and_bom);
END_TEST_CASE(utf_conversion_tests)