blob: a4a856c4a2e7a07e357c0608c12864cefc1c494f [file] [log] [blame]
// Copyright 2018 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include <zircon/compiler.h>
#include <zircon/types.h>
#ifndef SRC_LIB_UTF_CONVERSION_UTF_CONVERSION_H_
#define SRC_LIB_UTF_CONVERSION_UTF_CONVERSION_H_
__BEGIN_CDECLS
// Flags which control UTF conversion behavior.
//
// ++ PRESERVE_UNPAIRED_SURROGATES
// By default, when an unpaired surrogates are encountered in a UTF16 stream,
// they will be replaced with the codepoint for the Unicode replacment character
// (U+FFFD). When the PRESERVE_UNPAIRED_SURROGATE flag is passed, however, the
// value of the unpaired surrogate will be encoded directly as a codepoint.
//
// Note that while the presence of unpaired surrogates are technically a
// violation of the Unicode UTF16 encoding specification, apparently there are
// many UTF16 encoded strings in the world today who have chosen to allow this.
//
// This implementation considers the following to be unpaired surrogates.
// ++ A "high" surrogate [0xD800, 0xDBFF] which is not followed by a "low"
// surrogate.
// ++ A "low" surrogate [0xDC00, 0xDFFF] which is not preceded by a "high"
// surrogate.
//
// ++ FORCE_LITTLE_ENDIAN
// ++ FORCE_BIG_ENDIAN
// By default, the conversion process will look for a byte-order-marker (code
// unit 0xFEFF) in order to determine the endianness of the UTF16 source
// string. If no byte-order-marker is detected, host endianness will be
// assumed.
//
// Users may override this behavior by passing one of the force endian flags.
// The indicated endianness will be assumed, regardless of whether or not a byte
// order marker is found, and anything. It is illegal to attempt to force both
// big and little endian encoding at the same time. Attempts to do so will
// result byte-order-marker detection being applied.
//
// ++ DISCARD_BOM
// By default, a byte order marker detected in a UTF16 encoded string will be
// encoded in the UTF8 output. Users may change this behavior and cause the BOM
// to be discarded instead of encoded by passing the DISCARD_BOM flag.
#define UTF_CONVERT_FLAG_PRESERVE_UNPAIRED_SURROGATES ((uint32_t)0x01)
#define UTF_CONVERT_FLAG_FORCE_LITTLE_ENDIAN ((uint32_t)0x02)
#define UTF_CONVERT_FLAG_FORCE_BIG_ENDIAN ((uint32_t)0x04)
#define UTF_CONVERT_FLAG_DISCARD_BOM ((uint32_t)0x08)
// Attempt to convert a UTF16 string to UTF8 using either an explicitly
// specified (utf16le_*, utf16be_*) or an unspecified endianness (utf16_*)
//
// src : a pointer to the source string, encoded using UTF16
// src_len : The number of code units (uint16_t) in the source to process.
// dst : a pointer to the buffer (not null terminated), or NULL to query
// dst_len without actually writing anything.
// dst_len : A pointer to the length of of the destination buffer (in bytes).
// Afterwards, this parameter will be updated to indicate the total
// number of bytes it would take to hold a representation of the UTF8
// string (excluding null terminator), even if there was not enough
// room in the destination buffer to perform a full conversion. No
// error is returned if the buffer is not big enough.
// flags : Flags which control the conversion process. See above.
//
// Note: Embedded nulls within the source will be processed and encoded. *No*
// null termination of the destination buffer will be performed.
#if __cplusplus
zx_status_t utf16_to_utf8(const uint16_t* src, size_t src_len, uint8_t* dst, size_t* dst_len,
uint32_t flags = 0);
#else
zx_status_t utf16_to_utf8(const uint16_t* src, size_t src_len, uint8_t* dst, size_t* dst_len,
uint32_t flags);
#endif
// Convert UTF8 to UTF16. Note that while src_len is in bytes, dst_len is
// in 16-bit units.
zx_status_t utf8_to_utf16(const uint8_t* src, size_t src_len, uint16_t* dst, size_t* dst_len);
__END_CDECLS
#endif // SRC_LIB_UTF_CONVERSION_UTF_CONVERSION_H_