blob: a6f5bd3da9e79956e9ab520a3169a7825dd2d707 [file] [log] [blame]
// Copyright 2019 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "utf_conversion.h"
static bool IsHighSurrogate(uint16_t val) { return ((val >= 0xD800) && (val <= 0xDBFF)); }
static bool IsLowSurrogate(uint16_t val) { return ((val >= 0xDC00) && (val <= 0xDFFF)); }
const uint32_t kMaxUnicodeCodePoint = 0x10FFFF;
const uint32_t kSupplementaryPlaneStart = 0x10000;
const uint32_t kUnicodeReplacementChar = 0xFFFD;
// If there is space to do so, encode the Unicode code point provided as UTF8.
// No matter what, return the number of bytes that the encoded code point would
// take.
//
// If the input is an invalid Unicode codepoint, signal this by returning 0.
uint32_t EncodeUtf8CodePoint(uint32_t code_point, uint8_t* tgt, size_t tgt_len,
size_t offset) {
// If this codepoint is illegal (for whatever reason), replace it with the
// Unicode replacement character instead.
if (code_point > kMaxUnicodeCodePoint) {
code_point = kUnicodeReplacementChar;
}
if (code_point < 0x80) {
if ((tgt_len > offset) && ((tgt_len - offset) >= 1)) {
tgt[offset] = (uint8_t)(code_point);
}
return 1;
} else if (code_point < 0x800) {
if ((tgt_len > offset) && ((tgt_len - offset) >= 2)) {
tgt[offset + 0] = (uint8_t)(0xC0 | (code_point >> 6));
tgt[offset + 1] = (uint8_t)(0x80 | (code_point & 0x3F));
}
return 2;
} else if (code_point < 0x10000) {
if ((tgt_len > offset) && ((tgt_len - offset) >= 3)) {
tgt[offset + 0] = (uint8_t)(0xE0 | (code_point >> 12));
tgt[offset + 1] = (uint8_t)(0x80 | ((code_point >> 6) & 0x3F));
tgt[offset + 2] = (uint8_t)(0x80 | (code_point & 0x3F));
}
return 3;
}
if ((tgt_len > offset) && ((tgt_len - offset) >= 4)) {
tgt[offset + 0] = (uint8_t)(0xF0 | (code_point >> 18));
tgt[offset + 1] = (uint8_t)(0x80 | ((code_point >> 12) & 0x3F));
tgt[offset + 2] = (uint8_t)(0x80 | ((code_point >> 6) & 0x3F));
tgt[offset + 3] = (uint8_t)(0x80 | (code_point & 0x3F));
}
return 4;
}
// If there is space to do so, encode the Unicode code point provided as UTF16.
// No matter what, return the number of 16-bit characters that the encoded code point would
// take.
//
// If the input is an invalid Unicode codepoint, signal this by returning 0.
// Input length is in bytes.
uint32_t EncodeUtf16CodePoint(uint32_t code_point, uint16_t* tgt, size_t tgt_len,
size_t offset) {
// If this codepoint is illegal (for whatever reason), replace it with the
// Unicode replacement character instead.
if (code_point > kMaxUnicodeCodePoint) {
code_point = kUnicodeReplacementChar;
}
// Convert bytes to characters.
tgt_len /= sizeof(uint16_t);
// TODO: this only works with single-byte UTF8 characters.
if ((tgt_len > offset) && ((tgt_len - offset) >= 2)) {
tgt[offset] = (uint16_t)code_point;
}
return 1;
}
zx_status_t Utf16ToUtf8(const uint16_t* src, size_t src_len, uint8_t* dst, size_t* dst_len) {
zx_status_t ret = ZX_OK;
size_t rd = 0;
size_t wr = 0;
// Process all of our source characters. Even if we run out of space in our
// destination, we need to compute the space that we would have needed.
while (rd < src_len) {
uint16_t code_unit = src[rd++];
uint32_t code_point;
// If this is a high surrogate, go looking for its low surrogate pair.
if (IsHighSurrogate(code_unit)) {
uint16_t high = code_unit;
// Fetch the next code unit, if any, and then attempt to pair it up
// with this high surrogate.
code_unit = (rd < src_len) ? src[rd] : 0;
// If the next code unit we peeked at is a low surrogate, then
// combine high and low to form the code point and then encode that.
// Otherwise, the high surrogate we have encountered is unpaired and
// should either be replaced or preserved, depending on our flags.
if (IsLowSurrogate(code_unit)) {
const uint32_t SHIFT = 10u;
const uint32_t MASK = (1u << SHIFT) - 1;
code_point = ((code_unit & MASK) | ((uint32_t)(high & MASK) << SHIFT)) +
kSupplementaryPlaneStart;
++rd;
} else {
code_point = kUnicodeReplacementChar;
}
} else if (IsLowSurrogate(code_unit)) {
code_point = kUnicodeReplacementChar;
} else {
code_point = code_unit;
}
wr += EncodeUtf8CodePoint(code_point, dst, *dst_len, wr);
}
*dst_len = wr;
return ret;
}
zx_status_t Utf8ToUtf16(const uint8_t* src, size_t src_len, uint16_t* dst, size_t* dst_len) {
zx_status_t ret = ZX_OK;
size_t rd = 0;
size_t wr = 0;
// Process all of our source characters. Even if we run out of space in our
// destination, we need to compute the space that we would have needed.
while (rd < src_len) {
uint32_t code_point = src[rd++];
// TODO: should correctly process multibyte characters here.
wr += EncodeUtf16CodePoint(code_point, dst, *dst_len, wr);
}
*dst_len = wr * sizeof(uint16_t);
return ret;
}
zx_status_t utf16_to_utf8(const uint16_t* src, size_t src_len, uint8_t* dst, size_t* dst_len) {
// dst_len *must* be provided.
if (dst_len == NULL) {
return ZX_ERR_INVALID_ARGS;
}
// dst may only be null if dst_len is zero (eg; a sizing operation)
if ((dst == NULL) && (*dst_len != 0)) {
return ZX_ERR_INVALID_ARGS;
}
// handle the special case of an empty source string.
if (!src || !src_len) {
*dst_len = 0;
return ZX_OK;
}
const uint16_t HOST_BOM = 0xFEFF;
const uint16_t INVERT_BOM = 0xFFFE;
const uint16_t bom = *src;
bool bom_detected = (bom == HOST_BOM) || (bom == INVERT_BOM);
if (bom_detected) {
++src;
--src_len;
}
return Utf16ToUtf8(src, src_len, dst, dst_len);
}
zx_status_t utf8_to_utf16(const uint8_t* src, size_t src_len, uint16_t* dst, size_t* dst_len) {
// dst_len *must* be provided.
if (dst_len == NULL) {
return ZX_ERR_INVALID_ARGS;
}
// dst may only be null if dst_len is zero (eg; a sizing operation)
if ((dst == NULL) && (*dst_len != 0)) {
return ZX_ERR_INVALID_ARGS;
}
// handle the special case of an empty source string.
if (!src || !src_len) {
*dst_len = 0;
return ZX_OK;
}
return Utf8ToUtf16(src, src_len, dst, dst_len);
}