[util][utf] Start a UTF conversion library. Start a basic UTF conversion library. Right now, it only converts from UTF-16 to UTF-8, but it can be extended as needs arise. For now, I plan to use it to convert strings in USB descriptors (which are spec'ed to be UTF16) to UTF8 (which seems to be the current defacto standard for the rest of Fuchsia) Change-Id: Ie2bcccb82ec00680752a2fac4d937332bd52fc95

commit: 28f446e9652948bec5cdf4b707813825270a1994 [log] [tgz]
author: John Grossman <johngro@google.com> Tue May 22 15:45:08 2018 -0700
committer: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org> Mon Jun 11 17:35:54 2018 +0000
tree: f59ff9a3f1301c7d871e18a9e34f22aa7e70d242
parent: 522cbc50104f8171c2c5050cad228c57f76dd173 [diff]
diff --git a/system/ulib/utf_conversion/include/utf_conversion/utf_conversion.h b/system/ulib/utf_conversion/include/utf_conversion/utf_conversion.h
new file mode 100644
index 0000000..de3eb4b
--- /dev/null
+++ b/system/ulib/utf_conversion/include/utf_conversion/utf_conversion.h

@@ -0,0 +1,78 @@
+// Copyright 2018 The Fuchsia Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <zircon/compiler.h>
+#include <zircon/types.h>
+
+#pragma once
+
+__BEGIN_CDECLS
+
+// Flags which control UTF conversion behavior.
+//
+// ++ PRESERVE_UNPAIRED_SURROGATES
+// By default, when unpaired surrogates are encountered in a UTF16 stream, they
+// will be replaced with the codepoint for the Unicode replacement character
+// (U+FFFD).  When the PRESERVE_UNPAIRED_SURROGATES flag is passed, however, the
+// value of the unpaired surrogate will be encoded directly as a codepoint.
+//
+// Note that while the presence of unpaired surrogates is technically a
+// violation of the Unicode UTF16 encoding specification, apparently there are
+// many UTF16 encoded strings in the world today which contain them anyway.
+//
+// This implementation considers the following to be unpaired surrogates.
+// ++ A "high" surrogate [0xD800, 0xDBFF] which is not followed by a "low"
+//    surrogate.
+// ++ A "low" surrogate [0xDC00, 0xDFFF] which is not preceded by a "high"
+//    surrogate.
+//
+// ++ FORCE_LITTLE_ENDIAN
+// ++ FORCE_BIG_ENDIAN
+// By default, the conversion process will look for a byte-order-marker (code
+// unit 0xFEFF) in order to determine the endianness of the UTF16 source
+// string.  If no byte-order-marker is detected, host endianness will be
+// assumed.
+//
+// Users may override this behavior by passing one of the force endian flags.
+// The indicated endianness will be assumed, regardless of whether or not a byte
+// order marker is found.  It is illegal to attempt to force both big and
+// little endian encoding at the same time.  Attempts to do so will result in
+// byte-order-marker detection being applied.
+//
+// ++ DISCARD_BOM
+// By default, a byte order marker detected in a UTF16 encoded string will be
+// encoded in the UTF8 output.  Users may change this behavior and cause the BOM
+// to be discarded instead of encoded by passing the DISCARD_BOM flag.
+#define UTF_CONVERT_FLAG_PRESERVE_UNPAIRED_SURROGATES   ((uint32_t)0x01)  // encode lone surrogates as-is
+#define UTF_CONVERT_FLAG_FORCE_LITTLE_ENDIAN            ((uint32_t)0x02)  // treat the source as little endian
+#define UTF_CONVERT_FLAG_FORCE_BIG_ENDIAN               ((uint32_t)0x04)  // treat the source as big endian
+#define UTF_CONVERT_FLAG_DISCARD_BOM                    ((uint32_t)0x08)  // drop a detected BOM from the output
+
+// Attempt to convert a UTF16 string to UTF8.  The endianness of the source is
+// detected from a byte-order-marker or forced using flags (see above).
+//
+// src     : a pointer to the source string, encoded using UTF16
+// src_len : The number of code units (uint16_t) in the source to process.
+// dst     : a pointer to the buffer which will hold the result of the
+//           conversion.  May be NULL only when *dst_len is zero.
+// dst_len : A pointer to the length of the destination buffer (in bytes).
+//           Afterwards, it holds the number of bytes needed for the complete
+//           UTF8 encoding of the source, even if dst was too small for it.
+// flags   : Flags which control the conversion process.  See above.
+//
+// Returns ZX_ERR_INVALID_ARGS for a NULL dst_len, unknown flags, or a NULL dst
+// with non-zero *dst_len; ZX_OK otherwise, even when dst was too small.
+// Embedded nulls in the source are encoded like any other code unit, and the
+// output is never null terminated.
+#ifdef __cplusplus
+zx_status_t utf16_to_utf8(const uint16_t* src, size_t src_len,
+                          uint8_t* dst, size_t* dst_len,
+                          uint32_t flags = 0);
+#else
+zx_status_t utf16_to_utf8(const uint16_t* src, size_t src_len,
+                          uint8_t* dst, size_t* dst_len,
+                          uint32_t flags);
+#endif
+
+__END_CDECLS

diff --git a/system/ulib/utf_conversion/rules.mk b/system/ulib/utf_conversion/rules.mk
new file mode 100644
index 0000000..7be7fb0
--- /dev/null
+++ b/system/ulib/utf_conversion/rules.mk

@@ -0,0 +1,16 @@
+# Copyright 2018 The Fuchsia Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+LOCAL_DIR := $(GET_LOCAL_DIR)
+
+MODULE := $(LOCAL_DIR)
+
+MODULE_TYPE := userlib
+
+MODULE_SRCS += \
+    $(LOCAL_DIR)/utf_conversion.cpp
+
+MODULE_PACKAGE := static
+
+include make/module.mk

diff --git a/system/ulib/utf_conversion/utf_conversion.cpp b/system/ulib/utf_conversion/utf_conversion.cpp
new file mode 100644
index 0000000..cd73419
--- /dev/null
+++ b/system/ulib/utf_conversion/utf_conversion.cpp

@@ -0,0 +1,199 @@
+// Copyright 2018 The Fuchsia Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <endian.h>
+#include <utf_conversion/utf_conversion.h>
+#include <zircon/assert.h>
+
+namespace {
+
+enum class Endianness {
+    HOST,    // Source code units are used exactly as stored.
+    INVERT,  // Source code units are byte swapped before use.
+};
+
+#if BYTE_ORDER == BIG_ENDIAN  // Big endian host: big endian input needs no swap.
+constexpr Endianness kBigEndian = Endianness::HOST;
+constexpr Endianness kLittleEndian = Endianness::INVERT;
+#else  // Otherwise: little endian input needs no swap.
+constexpr Endianness kBigEndian = Endianness::INVERT;
+constexpr Endianness kLittleEndian = Endianness::HOST;
+#endif
+
+template <Endianness E>
+struct CodeUnit;  // Declared only; the two specializations below supply Read().
+
+template <>
+struct CodeUnit<Endianness::HOST> {
+    static inline uint16_t Read(uint16_t val) { return val; }  // Returned unchanged.
+};
+template <>
+struct CodeUnit<Endianness::INVERT> {
+    static inline uint16_t Read(uint16_t val) { return __bswap16(val); }  // Bytes swapped.
+};
+
+static constexpr bool IsHighSurrogate(uint16_t val)  { return ((val >= 0xD800) && (val <= 0xDBFF)); }
+static constexpr bool IsLowSurrogate(uint16_t val) { return ((val >= 0xDC00) && (val <= 0xDFFF)); }
+constexpr uint32_t kMaxUnicodeCodePoint = 0x10FFFF;  // Largest value the encoder accepts as-is.
+constexpr uint32_t kSupplementaryPlaneStart = 0x10000;  // Added to a combined surrogate pair.
+constexpr uint32_t kUnicodeReplacementChar = 0xFFFD;  // Encoded in place of rejected input.
+
+// If there is space to do so, encode the Unicode code point provided as UTF8.
+// No matter what, return the number of bytes that the encoded code point would
+// take (1 to 4).  Nothing is written unless the entire encoding fits.
+//
+// A code point above U+10FFFF is encoded as the replacement character (U+FFFD).
+inline uint32_t EncodeUtf8CodePoint(uint32_t code_point,
+                                    uint8_t* tgt,     // Output buffer.
+                                    size_t tgt_len,   // Total size of tgt, in bytes.
+                                    size_t offset) {  // Position in tgt at which to encode.
+    // Values beyond the last Unicode code point cannot be encoded; substitute
+    // the Unicode replacement character instead.
+    if (code_point > kMaxUnicodeCodePoint) {
+        code_point = kUnicodeReplacementChar;
+    }
+
+    if (code_point < 0x80) {  // 1 byte: 0xxxxxxx
+        if ((tgt_len > offset) && ((tgt_len - offset) >= 1)) {  // First test keeps the subtraction from wrapping.
+            tgt[offset] = static_cast<uint8_t>(code_point);
+        }
+        return 1;
+    } else
+    if (code_point < 0x800) {  // 2 bytes: 110xxxxx 10xxxxxx
+        if ((tgt_len > offset) && ((tgt_len - offset) >= 2)) {
+            tgt[offset + 0] = static_cast<uint8_t>(0xC0 | (code_point >> 6));
+            tgt[offset + 1] = static_cast<uint8_t>(0x80 | (code_point & 0x3F));
+        }
+        return 2;
+    } else
+    if (code_point < 0x10000) {  // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
+        if ((tgt_len > offset) && ((tgt_len - offset) >= 3)) {
+            tgt[offset + 0] = static_cast<uint8_t>(0xE0 | (code_point >> 12));
+            tgt[offset + 1] = static_cast<uint8_t>(0x80 | ((code_point >> 6) & 0x3F));
+            tgt[offset + 2] = static_cast<uint8_t>(0x80 | (code_point & 0x3F));
+        }
+        return 3;
+    }
+
+    ZX_DEBUG_ASSERT(code_point <= kMaxUnicodeCodePoint);  // Holds because of the substitution above.
+    if ((tgt_len > offset) && ((tgt_len - offset) >= 4)) {  // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+        tgt[offset + 0] = static_cast<uint8_t>(0xF0 | (code_point >> 18));
+        tgt[offset + 1] = static_cast<uint8_t>(0x80 | ((code_point >> 12) & 0x3F));
+        tgt[offset + 2] = static_cast<uint8_t>(0x80 | ((code_point >> 6) & 0x3F));
+        tgt[offset + 3] = static_cast<uint8_t>(0x80 | (code_point & 0x3F));
+    }
+    return 4;
+}
+
+template <Endianness E>  // E selects how each source code unit is read (as-is or byte swapped).
+zx_status_t Utf16ToUtf8(const uint16_t* src, size_t src_len, uint8_t* dst, size_t* dst_len,
+                        uint32_t flags) {
+    bool preserve_unpaired = (flags & UTF_CONVERT_FLAG_PRESERVE_UNPAIRED_SURROGATES);
+    zx_status_t ret = ZX_OK;  // Never reassigned below; this function always returns ZX_OK.
+    size_t rd = 0;  // Index of the next code unit to read from src.
+    size_t wr = 0;  // Bytes of UTF8 needed so far; may grow past *dst_len.
+
+    ZX_DEBUG_ASSERT((src != nullptr) && (dst_len != nullptr));
+    ZX_DEBUG_ASSERT((dst != nullptr) || (*dst_len == 0));
+
+    // Process all of our source characters.  Even if we run out of space in our
+    // destination, we need to compute the space that we would have needed.
+    while (rd < src_len) {
+        uint16_t code_unit = CodeUnit<E>::Read(src[rd++]);
+        uint32_t code_point;
+
+        // If this is a high surrogate, go looking for its low surrogate pair.
+        if (IsHighSurrogate(code_unit)) {
+            uint16_t high = code_unit;
+
+            // Peek at the next code unit, if any, without consuming it, and
+            // then attempt to pair it up with this high surrogate.
+            code_unit = (rd < src_len) ? CodeUnit<E>::Read(src[rd]) : 0;  // 0 is not a low surrogate.
+
+            // If the next code unit we peeked at is a low surrogate, then
+            // combine high and low to form the code point and then encode that.
+            // Otherwise, the high surrogate we have encountered is unpaired and
+            // should either be replaced or preserved, depending on our flags.
+            if (IsLowSurrogate(code_unit)) {
+                constexpr uint32_t SHIFT = 10u;  // Each surrogate carries 10 payload bits.
+                constexpr uint32_t MASK = (1u << SHIFT) - 1;
+                code_point = ((code_unit & MASK) | (static_cast<uint32_t>(high & MASK) << SHIFT))
+                           + kSupplementaryPlaneStart;
+                ++rd;  // Consume the low surrogate we peeked at.
+            } else {
+                code_point = preserve_unpaired ? high : kUnicodeReplacementChar;
+            }
+        } else if (IsLowSurrogate(code_unit) && !preserve_unpaired) {  // Low surrogate with no high before it.
+            code_point = kUnicodeReplacementChar;
+        } else {  // Ordinary code unit, or an unpaired low surrogate being preserved.
+            code_point = code_unit;
+        }
+
+        wr += EncodeUtf8CodePoint(code_point, dst, *dst_len, wr);  // Counts bytes even when nothing is written.
+    }
+
+
+    *dst_len = wr;  // Report the size needed for the whole encoding.
+    return ret;
+}
+
+}  // anon namespace
+
+extern "C" {
+
+zx_status_t utf16_to_utf8(const uint16_t* src, size_t src_len, uint8_t* dst, size_t *dst_len,
+                          uint32_t flags) {
+    // Validate the arguments before anything is read or written.
+    constexpr uint32_t ENDIAN_FLAGS = UTF_CONVERT_FLAG_FORCE_LITTLE_ENDIAN
+                                    | UTF_CONVERT_FLAG_FORCE_BIG_ENDIAN;
+    constexpr uint32_t ALL_FLAGS = UTF_CONVERT_FLAG_DISCARD_BOM |
+                                   UTF_CONVERT_FLAG_PRESERVE_UNPAIRED_SURROGATES |
+                                   ENDIAN_FLAGS;
+    // dst_len *must* be provided, and all flags need to be understood.
+    if ((dst_len == nullptr) || (flags & ~ALL_FLAGS)) {
+        return ZX_ERR_INVALID_ARGS;
+    }
+
+    // dst may only be null if dst_len is zero (eg; a sizing operation)
+    if ((dst == nullptr) && (*dst_len != 0)) {
+        return ZX_ERR_INVALID_ARGS;
+    }
+
+    // A null or zero length source is an empty string: report 0 bytes needed.
+    if (!src || !src_len) {
+        *dst_len = 0;
+        return ZX_OK;
+    }
+
+    // Deal with endian detection.
+    Endianness detected;  // Assigned on every branch of the if/else chain below.
+
+    constexpr uint16_t HOST_BOM = 0xFEFF;
+    constexpr uint16_t INVERT_BOM = 0xFFFE;  // A BOM as it reads when the source is byte swapped.
+    const uint16_t bom = *src;  // First code unit, read without any byte swap.
+    bool bom_detected = (bom == HOST_BOM) || (bom == INVERT_BOM);
+
+    if ((flags & ENDIAN_FLAGS) == UTF_CONVERT_FLAG_FORCE_LITTLE_ENDIAN) {
+        detected = kLittleEndian;
+    } else
+    if ((flags & ENDIAN_FLAGS) == UTF_CONVERT_FLAG_FORCE_BIG_ENDIAN) {
+        detected = kBigEndian;
+    } else {  // Neither or both force flags set: fall back to BOM detection.
+        detected = (bom_detected && (bom == INVERT_BOM)) ? Endianness::INVERT : Endianness::HOST;
+    }
+
+    if (bom_detected && (flags & UTF_CONVERT_FLAG_DISCARD_BOM)) {  // Applies to either BOM value, forced endianness or not.
+        ZX_DEBUG_ASSERT(src_len > 0);
+        ++src;
+        --src_len;
+    }
+
+    if (detected == Endianness::INVERT) {
+        return Utf16ToUtf8<Endianness::INVERT>(src, src_len, dst, dst_len, flags);
+    } else {
+        return Utf16ToUtf8<Endianness::HOST>(src, src_len, dst, dst_len, flags);
+    }
+}
+
+}  // extern "C"

diff --git a/system/utest/utf_conversion/main.cpp b/system/utest/utf_conversion/main.cpp
new file mode 100644
index 0000000..1f6bd83
--- /dev/null
+++ b/system/utest/utf_conversion/main.cpp

@@ -0,0 +1,366 @@
+// Copyright 2018 The Fuchsia Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <endian.h>
+#include <stdio.h>
+#include <unittest/unittest.h>
+#include <utf_conversion/utf_conversion.h>
+
+#if (BYTE_ORDER == BIG_ENDIAN)  // Pick the force flags that mean "host order" and "swapped order".
+static constexpr uint32_t HOST_ENDIAN_FLAG   = UTF_CONVERT_FLAG_FORCE_BIG_ENDIAN;
+static constexpr uint32_t INVERT_ENDIAN_FLAG = UTF_CONVERT_FLAG_FORCE_LITTLE_ENDIAN;
+#else
+static constexpr uint32_t HOST_ENDIAN_FLAG   = UTF_CONVERT_FLAG_FORCE_LITTLE_ENDIAN;
+static constexpr uint32_t INVERT_ENDIAN_FLAG = UTF_CONVERT_FLAG_FORCE_BIG_ENDIAN;
+#endif
+
+#define ASSERT_UTF8_EQ(expected, expected_len, actual, actual_bytes, enc_len, msg)  \
+    do {                                                                            \
+        ASSERT_GE(actual_bytes, expected_len, msg);                                 \
+        ASSERT_EQ(expected_len, enc_len, msg);                                      \
+        ASSERT_BYTES_EQ(expected, actual, expected_len, msg);                       \
+    } while(false)
+
+static bool utf16to8_bad_args(void) {  // Invalid argument combinations must fail and leave outputs alone.
+    BEGIN_TEST;
+
+    uint16_t src;  // Not assigned until the final call below.
+    uint8_t dst = 0xFE;  // Sentinel value; any write to dst would change it.
+    size_t dst_len;
+    zx_status_t res;
+
+    // Null destination buffer with non-zero destination length
+    dst_len = 1;
+    res = utf16_to_utf8(&src, 1, nullptr, &dst_len);
+    ASSERT_EQ(ZX_ERR_INVALID_ARGS, res, "null dst should fail with INVALID_ARGS");
+    ASSERT_EQ(1, dst_len, "dst_len modified after conversion with invalid args");
+
+    // Null dest len pointer
+    res = utf16_to_utf8(&src, 1, &dst, nullptr);
+    ASSERT_EQ(ZX_ERR_INVALID_ARGS, res, "null dst_len should fail with INVALID_ARGS");
+    ASSERT_EQ(0xFE, dst, "dst modified after conversion with invalid args");
+
+    // Bad (undefined) flags
+    res = utf16_to_utf8(&src, 1, &dst, &dst_len, 0x80000000);
+    ASSERT_EQ(ZX_ERR_INVALID_ARGS, res, "undefined flags should fail with INVALID_ARGS");
+    ASSERT_EQ(1, dst_len, "dst_len modified after conversion with invalid args");
+    ASSERT_EQ(0xFE, dst, "dst modified after conversion with invalid args");
+
+    // A null dest buffer is allowed if (and only if) the dst_len is zero.
+    // Practical use cases include using the converter to determine the length
+    // needed to hold a converted string.
+    dst_len = 0;
+    src = 0xAB;  // U+00AB needs a 2 byte UTF-8 encoding.
+    res = utf16_to_utf8(&src, 1, nullptr, &dst_len);
+    ASSERT_EQ(ZX_OK, res, "null dst with zero dst_len should succeed");
+    ASSERT_EQ(2, dst_len, "encoded size of 0xAB should be 2!");
+
+    END_TEST;
+}
+
+static bool utf16to8_empty_source(void) {  // Zero length and null sources both count as empty.
+    BEGIN_TEST;
+
+    uint16_t src;  // Never assigned; only its address is passed, with a length of zero.
+    static const uint8_t expected[] = { 0xA1, 0xB2, 0xC3, 0xD4 };
+    uint8_t actual[sizeof(expected)];
+    size_t dst_len;
+    zx_status_t res;
+
+    // Check to make sure that attempting to encode a zero length source results
+    // in a length of zero and no changes to the destination buffer.
+    memcpy(actual, expected, sizeof(actual));
+    dst_len = sizeof(actual);
+    res = utf16_to_utf8(&src, 0,actual, &dst_len);
+    ASSERT_EQ(ZX_OK, res, "zero length string conversion failed");
+    ASSERT_EQ(0, dst_len, "dst_len should be zero after zero length string conversion");
+    ASSERT_BYTES_EQ(expected, actual, sizeof(actual),
+                    "dst buffer modified after zero length string conversion");
+
+    dst_len = sizeof(actual);  // Same check, now with a null source and a non-zero length.
+    res = utf16_to_utf8(nullptr, 1,actual, &dst_len);
+    ASSERT_EQ(ZX_OK, res, "null source string conversion failed");
+    ASSERT_EQ(0, dst_len, "dst_len should be zero after null source string conversion");
+    ASSERT_BYTES_EQ(expected, actual, sizeof(actual),
+                    "dst buffer modified after null source string conversion");
+
+    END_TEST;
+}
+
+static bool utf16to8_simple_codepoints(void) {  // Single code units outside the surrogate range.
+    BEGIN_TEST;
+
+    static const struct {
+        uint16_t src;
+        uint8_t expected[3];
+        size_t  expected_len;  // Number of leading bytes of expected[] that are meaningful.
+    } TEST_VECTORS[] = {
+        // 1 byte UTF-8 codepoints (U+0000, U+007F)
+        { 0x0000, { 0x00 }, 1 },
+        { 0x0001, { 0x01 }, 1 },
+        { 0x007f, { 0x7f }, 1 },
+
+        // 2 byte UTF-8 codepoints (U+0080, U+07FF)
+        { 0x0080, { 0xC2, 0x80 }, 2 },
+        { 0x0456, { 0xD1, 0x96 }, 2 },
+        { 0x07FF, { 0xDF, 0xBF }, 2 },
+
+        // 3 byte UTF-8 codepoints (U+0800, U+FFFF)
+        // Note: we are skipping the (theoretically illegal) unpaired surrogate
+        // range (U+D800, U+DFFF) here.  There is a separate test for support of
+        // unpaired surrogates.
+        { 0x0800, { 0xE0, 0xA0, 0x80 }, 3 },
+        { 0x4567, { 0xE4, 0x95, 0xA7 }, 3 },
+        { 0xD7FF, { 0xED, 0x9F, 0xBF }, 3 },
+        { 0xE000, { 0xEE, 0x80, 0x80 }, 3 },
+        { 0xE456, { 0xEE, 0x91, 0x96 }, 3 },
+        { 0xFFFF, { 0xEF, 0xBF, 0xBF }, 3 },
+    };
+
+    uint8_t actual[3];
+    for (const auto& v : TEST_VECTORS) {
+        char case_id[64];
+        size_t encoded_len = sizeof(actual);
+        zx_status_t res;
+
+        snprintf(case_id, sizeof(case_id), "case id [0x%04hx]", v.src);
+        ::memset(actual, 0xAB, sizeof(actual));  // Fill pattern, reset for each vector.
+
+        res = utf16_to_utf8(&v.src, 1, actual, &encoded_len);
+        ASSERT_EQ(ZX_OK, res, case_id);
+        ASSERT_LE(v.expected_len, sizeof(v.expected), case_id);  // Guards against a bad test vector.
+        ASSERT_UTF8_EQ(v.expected, v.expected_len,
+                       actual, sizeof(actual),
+                       encoded_len, case_id);
+    }
+
+    END_TEST;
+}
+
+static bool utf16to8_paired_surrogates(void) {  // A high + low surrogate pair becomes one code point.
+    BEGIN_TEST;
+
+    // All paired surrogate encodings are going to be 4 byte UTF-8 codepoints (U+010000, U+10FFFF)
+    static const struct {
+        uint16_t src[2];  // High surrogate followed by low surrogate.
+        uint8_t expected[4];
+    } TEST_VECTORS[] = {
+        { { 0xD800, 0xDC00 }, { 0xF0, 0x90, 0x80, 0x80 } }, // U+10000
+        { { 0xD811, 0xDD67 }, { 0xF0, 0x94, 0x95, 0xA7 } }, // U+14567
+        { { 0xDA6F, 0xDCDE }, { 0xF2, 0xAB, 0xB3, 0x9E } }, // U+ABCDE
+        { { 0xDBBF, 0xDFFF }, { 0xF3, 0xBF, 0xBF, 0xBF } }, // U+FFFFF
+        { { 0xDBC0, 0xDC00 }, { 0xF4, 0x80, 0x80, 0x80 } }, // U+100000
+        { { 0xDBD1, 0xDD67 }, { 0xF4, 0x84, 0x95, 0xA7 } }, // U+104567
+        { { 0xDBFF, 0xDFFF }, { 0xF4, 0x8F, 0xBF, 0xBF } }, // U+10FFFF
+    };
+
+    uint8_t actual[4];
+    for (const auto& v : TEST_VECTORS) {
+        char case_id[64];
+        size_t encoded_len = sizeof(actual);
+        zx_status_t res;
+
+        snprintf(case_id, sizeof(case_id), "case id [0x%04hx : 0x%04hx]", v.src[0], v.src[1]);
+        ::memset(actual, 0xAB, sizeof(actual));  // Fill pattern, reset for each vector.
+
+        res = utf16_to_utf8(v.src, countof(v.src), actual, &encoded_len);
+        ASSERT_EQ(ZX_OK, res, case_id);
+        ASSERT_UTF8_EQ(v.expected, sizeof(v.expected),
+                       actual, sizeof(actual),
+                       encoded_len, case_id);
+    }
+
+    END_TEST;
+}
+
+static bool utf16to8_unpaired_surrogates(void) {  // Lone surrogates: replaced by default, preserved on request.
+    BEGIN_TEST;
+
+    static const struct {
+        uint16_t src;
+        uint8_t expected[3];  // Encoding expected when the surrogate is preserved.
+    } TEST_VECTORS[] = {
+        // All unpaired surrogates are technically supposed to be illegal, but
+        // apparently there are systems out there who use them anyway (Wikipedia
+        // claims that Windows allows unpaired surrogates in file names encoded
+        // using UTF-16)
+        //
+        // Unpaired surrogates are 16 bits wide, so they will require a 3-byte
+        // UTF-8 encoding.
+        { 0xD800, { 0xED, 0xA0, 0x80 } },
+        { 0xD945, { 0xED, 0xA5, 0x85 } },
+        { 0xDBFF, { 0xED, 0xAF, 0xBF } },
+        { 0xDC00, { 0xED, 0xB0, 0x80 } },
+        { 0xDD45, { 0xED, 0xB5, 0x85 } },
+        { 0xDFFF, { 0xED, 0xBF, 0xBF } },
+    };
+    uint8_t replace[3] = { 0xEF, 0xBF, 0xBD };  // UTF-8 encoding of U+FFFD.
+    uint8_t actual[3];
+    for (const auto& v : TEST_VECTORS) {
+        char case_id[64];
+        size_t encoded_len = sizeof(actual);
+        zx_status_t res;
+
+        // Attempt to encode the unpaired surrogate, but do not specify that we
+        // want to preserve it.  We should end up with the encoded form of the
+        // replacement character (U+FFFD) instead.
+        snprintf(case_id, sizeof(case_id), "case id [0x%04hx, replace]", v.src);
+        ::memset(actual, 0xAB, sizeof(actual));
+
+        encoded_len = sizeof(actual);
+        res = utf16_to_utf8(&v.src, 1, actual, &encoded_len);
+        ASSERT_EQ(ZX_OK, res, case_id);
+        ASSERT_UTF8_EQ(replace, sizeof(replace), actual, sizeof(actual),
+                       encoded_len, case_id);
+
+        // Do it again, but this time tell the converter to preserve the
+        // unpaired surrogate instead.
+        snprintf(case_id, sizeof(case_id), "case id [0x%04hx, preserve]", v.src);
+        ::memset(actual, 0xAB, sizeof(actual));
+
+        encoded_len = sizeof(actual);
+        res = utf16_to_utf8(&v.src, 1, actual, &encoded_len,
+                            UTF_CONVERT_FLAG_PRESERVE_UNPAIRED_SURROGATES);
+        ASSERT_EQ(ZX_OK, res, case_id);
+        ASSERT_UTF8_EQ(v.expected, sizeof(v.expected), actual, sizeof(actual),
+                       encoded_len, case_id);
+    }
+
+    END_TEST;
+}
+
+static bool utf16to8_dst_buffer_lengths(void) {  // Exact, oversized and undersized destination buffers.
+    BEGIN_TEST;
+
+    const uint16_t src[] = { 'T', 'e', 's', 't' };
+    const uint8_t expected[] = { 'T', 'e', 's', 't' };
+    uint8_t actual[16];
+
+    // Perform a conversion, but test three cases.
+    //
+    // 1) The destination buffer size is exactly what is required.
+    // 2) The destination buffer size is more than what is required.
+    // 3) The destination buffer size is less than what is required.
+    static const size_t DST_LENGTHS[] = { sizeof(expected), sizeof(actual), sizeof(expected) >> 1 };
+    for (const auto& d : DST_LENGTHS) {
+        char case_id[64];
+        size_t encoded_len = d;  // In: space offered to the converter.  Out: space required.
+        zx_status_t res;
+
+        snprintf(case_id, sizeof(case_id), "case id [needed %zu, provided %zu]",
+                 sizeof(expected), d);
+        ::memset(actual, 0xAB, sizeof(actual));
+
+        ASSERT_LE(encoded_len, sizeof(actual), case_id);
+        res = utf16_to_utf8(src, countof(src), actual, &encoded_len);
+
+        ASSERT_EQ(ZX_OK, res, case_id);  // A destination that is too small is still ZX_OK.
+        ASSERT_EQ(sizeof(expected), encoded_len, case_id);
+        static_assert(sizeof(expected) <= sizeof(actual),
+                      "'actual' buffer must be large enough to hold 'expected' result");
+        ASSERT_BYTES_EQ(expected, actual, d < encoded_len ? d : encoded_len, case_id);  // Compare only what fit.
+
+        if (d < sizeof(actual)) {  // Bytes past the offered length must still hold the fill pattern.
+            uint8_t pattern[sizeof(actual)];
+            ::memset(pattern, 0xAB, sizeof(pattern));
+            ASSERT_BYTES_EQ(actual + d, pattern, sizeof(actual) - d, case_id);
+        }
+    }
+
+    END_TEST;
+}
+
+static bool utf16to8_endianness_and_bom(void) {  // Every flag combination against both byte orders.
+    BEGIN_TEST;
+
+    static const struct {
+        uint16_t src[5];
+        bool host_order;  // True when src is stored in host byte order.
+    } SOURCES[] = {
+        { { 0xFEFF, 'T', 'e', 's', 't' }, true },
+        { { __bswap16(0xFEFF),
+            __bswap16('T'),
+            __bswap16('e'),
+            __bswap16('s'),
+            __bswap16('t'),
+            }, false }
+    };
+
+    const uint8_t bom_removed[] = { 'T', 'e', 's', 't' };
+    const uint8_t bom_removed_inverted[] = {  // U+5400 U+6500 U+7300 U+7400: the text read with the wrong byte order.
+        0xE5, 0x90, 0x80, 0xE6, 0x94, 0x80, 0xE7,
+        0x8C, 0x80, 0xE7, 0x90, 0x80 };
+    const uint8_t bom_encoded[] = { 0xEF, 0xBB, 0xBF, 'T', 'e', 's', 't' };
+    const uint8_t bom_encoded_inverted[] = {  // Same as above, preceded by the encoding of U+FFFE.
+        0xEF, 0xBF, 0xBE, 0xE5, 0x90, 0x80, 0xE6,
+        0x94, 0x80, 0xE7, 0x8C, 0x80, 0xE7, 0x90,
+        0x80 };
+    uint8_t actual[countof(bom_encoded_inverted)];  // Sized for the longest expected output.
+
+#define EXPECT(e) { e, sizeof(e) }
+    static const struct {
+        uint32_t flags;
+        struct {
+            const uint8_t* exp;
+            size_t len;
+        } host;  // Expected output for the host order source.
+        struct {
+            const uint8_t* exp;
+            size_t len;
+        } inv;  // Expected output for the byte swapped source.
+    } EXPECTED[] {
+        { 0,
+          EXPECT(bom_encoded), EXPECT(bom_encoded) },
+        { UTF_CONVERT_FLAG_DISCARD_BOM,
+          EXPECT(bom_removed), EXPECT(bom_removed) },
+        { HOST_ENDIAN_FLAG,
+          EXPECT(bom_encoded), EXPECT(bom_encoded_inverted) },
+        { HOST_ENDIAN_FLAG | UTF_CONVERT_FLAG_DISCARD_BOM,
+          EXPECT(bom_removed), EXPECT(bom_removed_inverted) },
+        { INVERT_ENDIAN_FLAG,
+          EXPECT(bom_encoded_inverted), EXPECT(bom_encoded) },
+        { INVERT_ENDIAN_FLAG | UTF_CONVERT_FLAG_DISCARD_BOM,
+          EXPECT(bom_removed_inverted), EXPECT(bom_removed) },
+    };
+#undef EXPECT
+
+    for (const auto& s : SOURCES) {
+        for (const auto& e : EXPECTED) {
+            char case_id[64];
+            zx_status_t res;
+            size_t enc_len = sizeof(actual);
+
+            ::memset(actual, 0xAB, sizeof(actual));
+            snprintf(case_id, sizeof(case_id), "case id [%s BOM, %s endian]",
+                     (e.flags & UTF_CONVERT_FLAG_DISCARD_BOM) ? "discard" : "encode",
+                     (e.flags & HOST_ENDIAN_FLAG) ? "host" :
+                     (e.flags & INVERT_ENDIAN_FLAG) ? "invert" : "detect");
+
+            res = utf16_to_utf8(s.src, countof(s.src), actual, &enc_len, e.flags);
+            ASSERT_EQ(ZX_OK, res, case_id);
+
+            if (s.host_order) {  // Pick the expectation that matches how this source is stored.
+                ASSERT_UTF8_EQ(e.host.exp, e.host.len, actual, sizeof(actual), enc_len, case_id);
+            } else {
+                ASSERT_UTF8_EQ(e.inv.exp, e.inv.len, actual, sizeof(actual), enc_len, case_id);
+            }
+        }
+    }
+
+    END_TEST;
+}
+
+BEGIN_TEST_CASE(utf_conversion_tests)  // Lists every test function defined above.
+RUN_TEST(utf16to8_bad_args);
+RUN_TEST(utf16to8_empty_source);
+RUN_TEST(utf16to8_simple_codepoints);
+RUN_TEST(utf16to8_paired_surrogates);
+RUN_TEST(utf16to8_unpaired_surrogates);
+RUN_TEST(utf16to8_dst_buffer_lengths);
+RUN_TEST(utf16to8_endianness_and_bom);
+END_TEST_CASE(utf_conversion_tests)
+
+int main(int argc, char** argv) {
+    return unittest_run_all_tests(argc, argv) ? 0 : -1;  // Presumably true means all tests passed - confirm in unittest.h.
+}

diff --git a/system/utest/utf_conversion/rules.mk b/system/utest/utf_conversion/rules.mk
new file mode 100644
index 0000000..55b070e
--- /dev/null
+++ b/system/utest/utf_conversion/rules.mk

@@ -0,0 +1,27 @@
+# Copyright 2018 The Fuchsia Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+LOCAL_DIR := $(GET_LOCAL_DIR)
+
+MODULE := $(LOCAL_DIR)
+
+MODULE_TYPE := usertest
+
+MODULE_SRCS += \
+    $(LOCAL_DIR)/main.cpp
+
+MODULE_NAME := utf-conversion-test
+
+MODULE_STATIC_LIBS := \
+    system/ulib/pretty \
+    system/ulib/utf_conversion \
+    system/ulib/zxcpp \
+
+MODULE_LIBS := \
+    system/ulib/c \
+    system/ulib/fdio \
+    system/ulib/unittest \
+    system/ulib/zircon \
+
+include make/module.mk
commit	28f446e9652948bec5cdf4b707813825270a1994	[log] [tgz]
author	John Grossman <johngro@google.com>	Tue May 22 15:45:08 2018 -0700
committer	CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org>	Mon Jun 11 17:35:54 2018 +0000
tree	f59ff9a3f1301c7d871e18a9e34f22aa7e70d242
parent	522cbc50104f8171c2c5050cad228c57f76dd173 [diff]