blob: 63a36c99b60648479666b0d2a0458735e53fbc17 [file] [log] [blame]
// utf by pietro gagliardi (andlabs) — https://github.com/andlabs/utf/
// 10 november 2016
#include "utf.h"
// this code imitates Go's unicode/utf8 and unicode/utf16
// the biggest difference is that a rune is unsigned instead of signed (because Go guarantees what a right shift on a signed number will do, whereas C does not)
// it is also an imitation so we can license it under looser terms than the Go source
#define badrune 0xFFFD
// encoded must be at most 4 bytes
// TODO clean this code up somehow
size_t utf8EncodeRune(uint32_t rune, char *encoded)
{
uint8_t b;
uint8_t c = 0;
uint8_t d = 0;
uint8_t e = 0;
size_t n;
// not in the valid range for Unicode
if (rune > 0x10FFFF)
rune = badrune;
// surrogate runes cannot be encoded
if (rune >= 0xD800 && rune < 0xE000)
rune = badrune;
if (rune < 0x80) { // ASCII bytes represent themselves
b = (uint8_t) (rune & 0xFF);
n = 1;
goto done;
}
if (rune < 0x800) { // two-byte encoding
c = (uint8_t) (rune & 0x3F);
c |= 0x80;
rune >>= 6;
b = (uint8_t) (rune & 0x1F);
b |= 0xC0;
n = 2;
goto done;
}
if (rune < 0x10000) { // three-byte encoding
d = (uint8_t) (rune & 0x3F);
d |= 0x80;
rune >>= 6;
c = (uint8_t) (rune & 0x3F);
c |= 0x80;
rune >>= 6;
b = (uint8_t) (rune & 0x0F);
b |= 0xE0;
n = 3;
goto done;
}
// otherwise use a four-byte encoding
e = (uint8_t) (rune & 0x3F);
e |= 0x80;
rune >>= 6;
d = (uint8_t) (rune & 0x3F);
d |= 0x80;
rune >>= 6;
c = (uint8_t) (rune & 0x3F);
c |= 0x80;
rune >>= 6;
b = (uint8_t) (rune & 0x07);
b |= 0xF0;
n = 4;
done:
encoded[0] = b;
if (n > 1)
encoded[1] = c;
if (n > 2)
encoded[2] = d;
if (n > 3)
encoded[3] = e;
return n;
}
const char *utf8DecodeRune(const char *s, size_t nElem, uint32_t *rune)
{
uint8_t b, c;
uint8_t lowestAllowed, highestAllowed;
size_t i, expected;
int bad;
b = (uint8_t) (*s);
if (b < 0x80) { // ASCII bytes represent themselves
*rune = b;
s++;
return s;
}
// 0xC0 and 0xC1 cover 2-byte overlong equivalents
// 0xF5 to 0xFD cover values > 0x10FFFF
// 0xFE and 0xFF were never defined (always illegal)
if (b < 0xC2 || b > 0xF4) { // invalid
*rune = badrune;
s++;
return s;
}
// this determines the range of allowed first continuation bytes
lowestAllowed = 0x80;
highestAllowed = 0xBF;
switch (b) {
case 0xE0:
// disallow 3-byte overlong equivalents
lowestAllowed = 0xA0;
break;
case 0xED:
// disallow surrogate characters
highestAllowed = 0x9F;
break;
case 0xF0:
// disallow 4-byte overlong equivalents
lowestAllowed = 0x90;
break;
case 0xF4:
// disallow values > 0x10FFFF
highestAllowed = 0x8F;
break;
}
// and this determines how many continuation bytes are expected
expected = 1;
if (b >= 0xE0)
expected++;
if (b >= 0xF0)
expected++;
if (nElem != 0) { // are there enough bytes?
nElem--;
if (nElem < expected) { // nope
*rune = badrune;
s++;
return s;
}
}
// ensure that everything is correct
// if not, **only** consume the initial byte
bad = 0;
for (i = 0; i < expected; i++) {
c = (uint8_t) (s[1 + i]);
if (c < lowestAllowed || c > highestAllowed) {
bad = 1;
break;
}
// the old lowestAllowed and highestAllowed is only for the first continuation byte
lowestAllowed = 0x80;
highestAllowed = 0xBF;
}
if (bad) {
*rune = badrune;
s++;
return s;
}
// now do the topmost bits
if (b < 0xE0)
*rune = b & 0x1F;
else if (b < 0xF0)
*rune = b & 0x0F;
else
*rune = b & 0x07;
s++; // we can finally move on
// now do the continuation bytes
for (; expected; expected--) {
c = (uint8_t) (*s);
s++;
c &= 0x3F; // strip continuation bits
*rune <<= 6;
*rune |= c;
}
return s;
}
// encoded must have at most 2 elements
size_t utf16EncodeRune(uint32_t rune, uint16_t *encoded)
{
uint16_t low, high;
// not in the valid range for Unicode
if (rune > 0x10FFFF)
rune = badrune;
// surrogate runes cannot be encoded
if (rune >= 0xD800 && rune < 0xE000)
rune = badrune;
if (rune < 0x10000) {
encoded[0] = (uint16_t) rune;
return 1;
}
rune -= 0x10000;
low = (uint16_t) (rune & 0x3FF);
rune >>= 10;
high = (uint16_t) (rune & 0x3FF);
encoded[0] = high | 0xD800;
encoded[1] = low | 0xDC00;
return 2;
}
// TODO see if this can be cleaned up somehow
const uint16_t *utf16DecodeRune(const uint16_t *s, size_t nElem, uint32_t *rune)
{
uint16_t high, low;
if (*s < 0xD800 || *s >= 0xE000) {
// self-representing character
*rune = *s;
s++;
return s;
}
if (*s >= 0xDC00) {
// out-of-order surrogates
*rune = badrune;
s++;
return s;
}
if (nElem == 1) { // not enough elements
*rune = badrune;
s++;
return s;
}
high = *s;
high &= 0x3FF;
if (s[1] < 0xDC00 || s[1] >= 0xE000) {
// bad surrogate pair
*rune = badrune;
s++;
return s;
}
s++;
low = *s;
s++;
low &= 0x3FF;
*rune = high;
*rune <<= 10;
*rune |= low;
*rune += 0x10000;
return s;
}
// TODO find a way to reduce the code in all of these somehow
// TODO find a way to remove u as well
size_t utf8RuneCount(const char *s, size_t nElem)
{
size_t len;
uint32_t rune;
if (nElem != 0) {
const char *t, *u;
len = 0;
t = s;
while (nElem != 0) {
u = utf8DecodeRune(t, nElem, &rune);
len++;
nElem -= u - t;
t = u;
}
return len;
}
len = 0;
while (*s) {
s = utf8DecodeRune(s, nElem, &rune);
len++;
}
return len;
}
size_t utf8UTF16Count(const char *s, size_t nElem)
{
size_t len;
uint32_t rune;
uint16_t encoded[2];
if (nElem != 0) {
const char *t, *u;
len = 0;
t = s;
while (nElem != 0) {
u = utf8DecodeRune(t, nElem, &rune);
len += utf16EncodeRune(rune, encoded);
nElem -= u - t;
t = u;
}
return len;
}
len = 0;
while (*s) {
s = utf8DecodeRune(s, nElem, &rune);
len += utf16EncodeRune(rune, encoded);
}
return len;
}
size_t utf16RuneCount(const uint16_t *s, size_t nElem)
{
size_t len;
uint32_t rune;
if (nElem != 0) {
const uint16_t *t, *u;
len = 0;
t = s;
while (nElem != 0) {
u = utf16DecodeRune(t, nElem, &rune);
len++;
nElem -= u - t;
t = u;
}
return len;
}
len = 0;
while (*s) {
s = utf16DecodeRune(s, nElem, &rune);
len++;
}
return len;
}
size_t utf16UTF8Count(const uint16_t *s, size_t nElem)
{
size_t len;
uint32_t rune;
char encoded[4];
if (nElem != 0) {
const uint16_t *t, *u;
len = 0;
t = s;
while (nElem != 0) {
u = utf16DecodeRune(t, nElem, &rune);
len += utf8EncodeRune(rune, encoded);
nElem -= u - t;
t = u;
}
return len;
}
len = 0;
while (*s) {
s = utf16DecodeRune(s, nElem, &rune);
len += utf8EncodeRune(rune, encoded);
}
return len;
}