// utf by pietro gagliardi (andlabs) — https://github.com/andlabs/utf/
// 10 november 2016
#include "utf.h" | |
// this code imitates Go's unicode/utf8 and unicode/utf16
// the biggest difference is that a rune is unsigned instead of signed (because Go guarantees what a right shift on a signed number will do, whereas C does not)
// it is also an imitation so we can license it under looser terms than the Go source
// the replacement character U+FFFD, substituted for anything that cannot be encoded or decoded
#define badrune 0xFFFD

// utf8EncodeRune() writes the UTF-8 encoding of rune into encoded and
// returns the number of bytes written (1 to 4).
// encoded must have room for at least 4 bytes; no terminating '\0' is written.
// A rune above 0x10FFFF or in the surrogate range 0xD800..0xDFFF cannot be
// encoded; badrune is encoded in its place (3 bytes).
size_t utf8EncodeRune(uint32_t rune, char *encoded)
{
	// not in the valid range for Unicode
	if (rune > 0x10FFFF)
		rune = badrune;
	// surrogate runes cannot be encoded
	if (rune >= 0xD800 && rune < 0xE000)
		rune = badrune;

	// each case emits a lead byte carrying the length marker and the
	// topmost bits, followed by continuation bytes of the form 10xxxxxx
	// holding six bits each
	if (rune < 0x80) {		// ASCII bytes represent themselves
		encoded[0] = (char) (uint8_t) rune;
		return 1;
	}
	if (rune < 0x800) {		// two-byte encoding: 110xxxxx 10xxxxxx
		encoded[0] = (char) (uint8_t) (0xC0 | (rune >> 6));
		encoded[1] = (char) (uint8_t) (0x80 | (rune & 0x3F));
		return 2;
	}
	if (rune < 0x10000) {		// three-byte encoding: 1110xxxx 10xxxxxx 10xxxxxx
		encoded[0] = (char) (uint8_t) (0xE0 | (rune >> 12));
		encoded[1] = (char) (uint8_t) (0x80 | ((rune >> 6) & 0x3F));
		encoded[2] = (char) (uint8_t) (0x80 | (rune & 0x3F));
		return 3;
	}
	// otherwise use a four-byte encoding: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
	encoded[0] = (char) (uint8_t) (0xF0 | (rune >> 18));
	encoded[1] = (char) (uint8_t) (0x80 | ((rune >> 12) & 0x3F));
	encoded[2] = (char) (uint8_t) (0x80 | ((rune >> 6) & 0x3F));
	encoded[3] = (char) (uint8_t) (0x80 | (rune & 0x3F));
	return 4;
}
// utf8DecodeRune() decodes one rune from the UTF-8 data at s, stores it in
// *rune, and returns a pointer to the byte following what was consumed.
// If nElem is nonzero it is the number of bytes available at s, and a
// sequence that would need more bytes than that is rejected.
// If nElem is zero no length check is made; a '\0' byte always fails the
// continuation-byte range test below, so decoding never reads past one.
// On any malformed or truncated sequence, *rune is set to badrune and
// **only** the initial byte is consumed.
const char *utf8DecodeRune(const char *s, size_t nElem, uint32_t *rune)
{
	uint8_t lead, c;
	uint8_t lowestAllowed, highestAllowed;
	size_t i, expected;
	uint32_t r;

	lead = (uint8_t) s[0];
	if (lead < 0x80) {		// ASCII bytes represent themselves
		*rune = lead;
		return s + 1;
	}
	// 0x80 to 0xBF are continuation bytes, which cannot start a sequence
	// 0xC0 and 0xC1 cover 2-byte overlong equivalents
	// 0xF5 to 0xFD cover values > 0x10FFFF
	// 0xFE and 0xFF were never defined (always illegal)
	if (lead < 0xC2 || lead > 0xF4) {
		*rune = badrune;
		return s + 1;
	}

	// this determines the range of allowed first continuation bytes
	lowestAllowed = 0x80;
	highestAllowed = 0xBF;
	switch (lead) {
	case 0xE0:
		// disallow 3-byte overlong equivalents
		lowestAllowed = 0xA0;
		break;
	case 0xED:
		// disallow surrogate characters
		highestAllowed = 0x9F;
		break;
	case 0xF0:
		// disallow 4-byte overlong equivalents
		lowestAllowed = 0x90;
		break;
	case 0xF4:
		// disallow values > 0x10FFFF
		highestAllowed = 0x8F;
		break;
	default:
		break;
	}

	// the lead byte determines both how many continuation bytes are
	// expected and which of its own bits are the topmost bits of the rune
	if (lead < 0xE0) {
		expected = 1;
		r = lead & 0x1F;
	} else if (lead < 0xF0) {
		expected = 2;
		r = lead & 0x0F;
	} else {
		expected = 3;
		r = lead & 0x07;
	}

	// are there enough bytes? (nElem includes the lead byte)
	if (nElem != 0 && nElem - 1 < expected) {
		*rune = badrune;
		return s + 1;
	}

	// validate and accumulate the continuation bytes in one pass;
	// the result is only committed to *rune if all of them are valid
	for (i = 1; i <= expected; i++) {
		c = (uint8_t) s[i];
		if (c < lowestAllowed || c > highestAllowed) {
			*rune = badrune;
			return s + 1;
		}
		r = (r << 6) | (uint32_t) (c & 0x3F);	// strip continuation bits
		// the restricted range is only for the first continuation byte
		lowestAllowed = 0x80;
		highestAllowed = 0xBF;
	}

	*rune = r;
	return s + 1 + expected;
}
// utf16EncodeRune() writes the UTF-16 encoding of rune into encoded and
// returns the number of elements written (1 or 2).
// encoded must have room for at least 2 elements; no terminator is written.
// A rune above 0x10FFFF or in the surrogate range 0xD800..0xDFFF cannot be
// encoded; badrune is encoded in its place (1 element).
size_t utf16EncodeRune(uint32_t rune, uint16_t *encoded)
{
	uint32_t offset;

	// not in the valid range for Unicode, or a surrogate (which cannot be encoded)
	if (rune > 0x10FFFF || (rune >= 0xD800 && rune < 0xE000))
		rune = badrune;

	if (rune < 0x10000) {		// fits in a single element as-is
		encoded[0] = (uint16_t) rune;
		return 1;
	}

	// everything else becomes a surrogate pair: the 20 bits left after
	// subtracting 0x10000 are split into a high half and a low half
	offset = rune - 0x10000;
	encoded[0] = (uint16_t) (0xD800 | ((offset >> 10) & 0x3FF));
	encoded[1] = (uint16_t) (0xDC00 | (offset & 0x3FF));
	return 2;
}
// utf16DecodeRune() decodes one rune from the UTF-16 data at s, stores it
// in *rune, and returns a pointer to the element following what was consumed.
// nElem == 1 means only one element is available at s, so a high surrogate
// there cannot be completed; for any other nElem the second element is read
// when the first is a high surrogate (a 0 element is not a low surrogate,
// so a terminator after a lone high surrogate is rejected, not consumed).
// On a lone or out-of-order surrogate, *rune is set to badrune and only one
// element is consumed.
const uint16_t *utf16DecodeRune(const uint16_t *s, size_t nElem, uint32_t *rune)
{
	uint16_t high, low;

	high = s[0];
	if (high < 0xD800 || high >= 0xE000) {
		// self-representing character
		*rune = high;
		return s + 1;
	}
	// high is now known to be a surrogate; it must be a high surrogate
	// (below 0xDC00) and there must be a second element to pair it with
	if (high >= 0xDC00 || nElem == 1) {
		*rune = badrune;
		return s + 1;
	}
	low = s[1];
	if (low < 0xDC00 || low >= 0xE000) {
		// bad surrogate pair; leave the second element for the next call
		*rune = badrune;
		return s + 1;
	}
	// each surrogate contributes ten bits; widen before shifting so the
	// combined value is computed in 32 bits
	*rune = ((((uint32_t) (high & 0x3FF)) << 10) | (uint32_t) (low & 0x3FF)) + 0x10000;
	return s + 2;
}
// TODO find a way to reduce the code in all of these somehow

// utf8RuneCount() returns the number of runes that utf8DecodeRune() yields
// when run repeatedly over s.
// If nElem is nonzero, exactly nElem bytes are scanned (a '\0' among them
// is decoded and counted like any other byte).
// If nElem is zero, s is scanned up to, but not including, the first '\0'.
// Each byte that utf8DecodeRune() rejects counts as one rune, since it
// consumes a single byte and reports badrune for it.
size_t utf8RuneCount(const char *s, size_t nElem)
{
	size_t len = 0;
	uint32_t rune;

	if (nElem != 0) {
		// s < end inside the loop, so the remaining length passed to
		// the decoder is never 0 (which would disable its bounds check)
		const char *end = s + nElem;

		while (s != end) {
			s = utf8DecodeRune(s, (size_t) (end - s), &rune);
			len++;
		}
		return len;
	}

	while (*s != '\0') {
		s = utf8DecodeRune(s, 0, &rune);
		len++;
	}
	return len;
}
// utf8UTF16Count() returns the number of uint16_t elements that
// utf16EncodeRune() produces for the runes decoded from the UTF-8 data at s;
// no terminating element is included in the count.
// If nElem is nonzero, exactly nElem bytes are scanned.
// If nElem is zero, s is scanned up to, but not including, the first '\0'.
size_t utf8UTF16Count(const char *s, size_t nElem)
{
	size_t len = 0;
	uint32_t rune;
	uint16_t encoded[2];		// scratch space; only the length is wanted

	if (nElem != 0) {
		// s < end inside the loop, so the remaining length passed to
		// the decoder is never 0 (which would disable its bounds check)
		const char *end = s + nElem;

		while (s != end) {
			s = utf8DecodeRune(s, (size_t) (end - s), &rune);
			len += utf16EncodeRune(rune, encoded);
		}
		return len;
	}

	while (*s != '\0') {
		s = utf8DecodeRune(s, 0, &rune);
		len += utf16EncodeRune(rune, encoded);
	}
	return len;
}
// utf16RuneCount() returns the number of runes that utf16DecodeRune()
// yields when run repeatedly over s.
// If nElem is nonzero, exactly nElem elements are scanned (a 0 element
// among them is decoded and counted like any other).
// If nElem is zero, s is scanned up to, but not including, the first 0 element.
// Each surrogate that utf16DecodeRune() rejects counts as one rune, since
// it consumes a single element and reports badrune for it.
size_t utf16RuneCount(const uint16_t *s, size_t nElem)
{
	size_t len = 0;
	uint32_t rune;

	if (nElem != 0) {
		// s < end inside the loop, so the remaining length passed to
		// the decoder is never 0; when it is 1 the decoder will not
		// look past the final element
		const uint16_t *end = s + nElem;

		while (s != end) {
			s = utf16DecodeRune(s, (size_t) (end - s), &rune);
			len++;
		}
		return len;
	}

	while (*s != 0) {
		s = utf16DecodeRune(s, 0, &rune);
		len++;
	}
	return len;
}
// utf16UTF8Count() returns the number of bytes that utf8EncodeRune()
// produces for the runes decoded from the UTF-16 data at s; no terminating
// byte is included in the count.
// If nElem is nonzero, exactly nElem elements are scanned.
// If nElem is zero, s is scanned up to, but not including, the first 0 element.
size_t utf16UTF8Count(const uint16_t *s, size_t nElem)
{
	size_t len = 0;
	uint32_t rune;
	char encoded[4];		// scratch space; only the length is wanted

	if (nElem != 0) {
		// s < end inside the loop, so the remaining length passed to
		// the decoder is never 0; when it is 1 the decoder will not
		// look past the final element
		const uint16_t *end = s + nElem;

		while (s != end) {
			s = utf16DecodeRune(s, (size_t) (end - s), &rune);
			len += utf8EncodeRune(rune, encoded);
		}
		return len;
	}

	while (*s != 0) {
		s = utf16DecodeRune(s, 0, &rune);
		len += utf8EncodeRune(rune, encoded);
	}
	return len;
}