| // Copyright 2018 The Fuchsia Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include "utf_conversion.h" |
| |
| #include <lib/stdcompat/bit.h> |
| #include <zircon/assert.h> |
| |
| namespace { |
| |
| enum class Endianness { |
| HOST, |
| INVERT, |
| }; |
| |
| constexpr Endianness kBigEndian = |
| cpp20::endian::native == cpp20::endian::big ? Endianness::HOST : Endianness::INVERT; |
| constexpr Endianness kLittleEndian = |
| cpp20::endian::native == cpp20::endian::little ? Endianness::HOST : Endianness::INVERT; |
| |
| template <Endianness E> |
| struct CodeUnit; |
| |
| template <> |
| struct CodeUnit<Endianness::HOST> { |
| static inline uint16_t Read(uint16_t val) { return val; } |
| }; |
| template <> |
| struct CodeUnit<Endianness::INVERT> { |
| static inline uint16_t Read(uint16_t val) { return static_cast<uint16_t>(val << 8 | val >> 8); } |
| }; |
| |
| static constexpr bool IsHighSurrogate(uint16_t val) { return ((val >= 0xD800) && (val <= 0xDBFF)); } |
| static constexpr bool IsLowSurrogate(uint16_t val) { return ((val >= 0xDC00) && (val <= 0xDFFF)); } |
| constexpr uint32_t kMaxUnicodeCodePoint = 0x10FFFF; |
| constexpr uint32_t kSupplementaryPlaneStart = 0x10000; |
| constexpr uint32_t kUnicodeReplacementChar = 0xFFFD; |
| |
| // If there is space to do so, encode the Unicode code point provided as UTF8. |
| // No matter what, return the number of bytes that the encoded code point would |
| // take. |
| // |
| // If the input is an invalid Unicode codepoint, signal this by returning 0. |
| inline uint32_t EncodeUtf8CodePoint(uint32_t code_point, uint8_t* tgt, size_t tgt_len, |
| size_t offset) { |
| // If this codepoint is illegal (for whatever reason), replace it with the |
| // Unicode replacement character instead. |
| if (code_point > kMaxUnicodeCodePoint) { |
| code_point = kUnicodeReplacementChar; |
| } |
| |
| if (code_point < 0x80) { |
| if ((tgt_len > offset) && ((tgt_len - offset) >= 1)) { |
| tgt[offset] = static_cast<uint8_t>(code_point); |
| } |
| return 1; |
| } else if (code_point < 0x800) { |
| if ((tgt_len > offset) && ((tgt_len - offset) >= 2)) { |
| tgt[offset + 0] = static_cast<uint8_t>(0xC0 | (code_point >> 6)); |
| tgt[offset + 1] = static_cast<uint8_t>(0x80 | (code_point & 0x3F)); |
| } |
| return 2; |
| } else if (code_point < 0x10000) { |
| if ((tgt_len > offset) && ((tgt_len - offset) >= 3)) { |
| tgt[offset + 0] = static_cast<uint8_t>(0xE0 | (code_point >> 12)); |
| tgt[offset + 1] = static_cast<uint8_t>(0x80 | ((code_point >> 6) & 0x3F)); |
| tgt[offset + 2] = static_cast<uint8_t>(0x80 | (code_point & 0x3F)); |
| } |
| return 3; |
| } |
| |
| ZX_DEBUG_ASSERT(code_point <= kMaxUnicodeCodePoint); |
| if ((tgt_len > offset) && ((tgt_len - offset) >= 4)) { |
| tgt[offset + 0] = static_cast<uint8_t>(0xF0 | (code_point >> 18)); |
| tgt[offset + 1] = static_cast<uint8_t>(0x80 | ((code_point >> 12) & 0x3F)); |
| tgt[offset + 2] = static_cast<uint8_t>(0x80 | ((code_point >> 6) & 0x3F)); |
| tgt[offset + 3] = static_cast<uint8_t>(0x80 | (code_point & 0x3F)); |
| } |
| return 4; |
| } |
| |
| template <Endianness E> |
| zx_status_t Utf16ToUtf8(const uint16_t* src, size_t src_len, uint8_t* dst, size_t* dst_len, |
| uint32_t flags) { |
| bool preserve_unpaired = (flags & UTF_CONVERT_FLAG_PRESERVE_UNPAIRED_SURROGATES); |
| zx_status_t ret = ZX_OK; |
| size_t rd = 0; |
| size_t wr = 0; |
| |
| ZX_DEBUG_ASSERT((src != nullptr) && (dst_len != nullptr)); |
| ZX_DEBUG_ASSERT((dst != nullptr) || (*dst_len == 0)); |
| |
| // Process all of our source characters. Even if we run out of space in our |
| // destination, we need to compute the space that we would have needed. |
| while (rd < src_len) { |
| uint16_t code_unit = CodeUnit<E>::Read(src[rd++]); |
| uint32_t code_point; |
| |
| // If this is a high surrogate, go looking for its low surrogate pair. |
| if (IsHighSurrogate(code_unit)) { |
| uint16_t high = code_unit; |
| |
| // Fetch the next code unit, if any, and then attempt to pair it up |
| // with this high surrogate. |
| code_unit = (rd < src_len) ? CodeUnit<E>::Read(src[rd]) : 0; |
| |
| // If the next code unit we peeked at is a low surrogate, then |
| // combine high and low to form the code point and then encode that. |
| // Otherwise, the high surrogate we have encountered is unpaired and |
| // should either be replaced or preserved, depending on our flags. |
| if (IsLowSurrogate(code_unit)) { |
| constexpr uint32_t SHIFT = 10u; |
| constexpr uint32_t MASK = (1u << SHIFT) - 1; |
| code_point = ((code_unit & MASK) | (static_cast<uint32_t>(high & MASK) << SHIFT)) + |
| kSupplementaryPlaneStart; |
| ++rd; |
| } else { |
| code_point = preserve_unpaired ? high : kUnicodeReplacementChar; |
| } |
| } else if (IsLowSurrogate(code_unit) && !preserve_unpaired) { |
| code_point = kUnicodeReplacementChar; |
| } else { |
| code_point = code_unit; |
| } |
| |
| wr += EncodeUtf8CodePoint(code_point, dst, *dst_len, wr); |
| } |
| |
| *dst_len = wr; |
| return ret; |
| } |
| |
| // If there is space to do so, encode the Unicode code point provided as UTF16. |
| // No matter what, return the number of 16-bit characters that the encoded code |
| // point would take. |
| // |
| // If the input is an invalid Unicode codepoint, signal this by returning 0. |
| // Input length is in bytes. Output length is in 16-bit units. |
| uint32_t EncodeUtf16CodePoint(uint32_t code_point, uint16_t* tgt, size_t tgt_len, size_t offset) { |
| if (code_point > kMaxUnicodeCodePoint || tgt_len <= offset) { |
| return 0; |
| } |
| |
| // TODO: this only works with single-byte UTF8 characters. |
| tgt[offset] = static_cast<uint16_t>(code_point); |
| return 1; |
| } |
| |
| } // namespace |
| |
| zx_status_t utf16_to_utf8(const uint16_t* src, size_t src_len, uint8_t* dst, size_t* dst_len, |
| uint32_t flags) { |
| // Sanity check our args. |
| constexpr uint32_t ENDIAN_FLAGS = |
| UTF_CONVERT_FLAG_FORCE_LITTLE_ENDIAN | UTF_CONVERT_FLAG_FORCE_BIG_ENDIAN; |
| constexpr uint32_t ALL_FLAGS = |
| UTF_CONVERT_FLAG_DISCARD_BOM | UTF_CONVERT_FLAG_PRESERVE_UNPAIRED_SURROGATES | ENDIAN_FLAGS; |
| // dst_len *must* be provided, and all flags need to be understood. |
| if ((dst_len == nullptr) || (flags & ~ALL_FLAGS)) { |
| return ZX_ERR_INVALID_ARGS; |
| } |
| |
| // dst may only be null if dst_len is zero (eg; a sizing operation) |
| if ((dst == nullptr) && (*dst_len != 0)) { |
| return ZX_ERR_INVALID_ARGS; |
| } |
| |
| // handle the special case of an empty source string. |
| if (!src || !src_len) { |
| *dst_len = 0; |
| return ZX_OK; |
| } |
| |
| // Deal with endian detection. |
| Endianness detected; |
| |
| constexpr uint16_t HOST_BOM = 0xFEFF; |
| constexpr uint16_t INVERT_BOM = 0xFFFE; |
| const uint16_t bom = *src; |
| bool bom_detected = (bom == HOST_BOM) || (bom == INVERT_BOM); |
| |
| if ((flags & ENDIAN_FLAGS) == UTF_CONVERT_FLAG_FORCE_LITTLE_ENDIAN) { |
| detected = kLittleEndian; |
| } else if ((flags & ENDIAN_FLAGS) == UTF_CONVERT_FLAG_FORCE_BIG_ENDIAN) { |
| detected = kBigEndian; |
| } else { |
| detected = (bom_detected && (bom == INVERT_BOM)) ? Endianness::INVERT : Endianness::HOST; |
| } |
| |
| if (bom_detected && (flags & UTF_CONVERT_FLAG_DISCARD_BOM)) { |
| ZX_DEBUG_ASSERT(src_len > 0); |
| ++src; |
| --src_len; |
| } |
| |
| if (detected == Endianness::INVERT) { |
| return Utf16ToUtf8<Endianness::INVERT>(src, src_len, dst, dst_len, flags); |
| } else { |
| return Utf16ToUtf8<Endianness::HOST>(src, src_len, dst, dst_len, flags); |
| } |
| } |
| |
| zx_status_t utf8_to_utf16(const uint8_t* src, size_t src_len, uint16_t* dst, size_t* dst_len) { |
| zx_status_t ret = ZX_OK; |
| size_t rd = 0; |
| size_t wr = 0; |
| |
| while (rd < src_len) { |
| uint32_t code_point = src[rd++]; |
| |
| // TODO: should correctly process multibyte characters here. |
| wr += EncodeUtf16CodePoint(code_point, dst, *dst_len, wr); |
| } |
| |
| *dst_len = wr; |
| return ret; |
| } |