// utf by pietro gagliardi (andlabs) — https://github.com/andlabs/utf/
// 10 november 2016
| #include "utf.h" |
| |
// this code imitates Go's unicode/utf8 and unicode/utf16
// the biggest difference is that a rune is unsigned instead of signed (because Go guarantees what a right shift on a signed number will do, whereas C does not)
// it is also an imitation so that we can license it under looser terms than the Go source
// U+FFFD, the Unicode replacement character; every encoder and decoder in this file substitutes it for input it has to reject
#define badrune 0xFFFD
| |
// utf8EncodeRune() writes the UTF-8 encoding of rune to encoded and returns the number of bytes written (1 to 4)
// encoded must have room for at least 4 bytes; only the returned number of bytes is written and no terminating NUL is added
// runes above 0x10FFFF and surrogates (0xD800 to 0xDFFF) are encoded as badrune instead
size_t utf8EncodeRune(uint32_t rune, char *encoded)
{
	size_t len, pos;
	uint8_t lead;

	// out-of-range values and surrogates have no encoding of their own
	if (rune > 0x10FFFF || (rune >= 0xD800 && rune <= 0xDFFF))
		rune = badrune;

	// choose the sequence length and the marker bits of the leading byte
	if (rune < 0x80) {		// ASCII bytes represent themselves
		len = 1;
		lead = 0x00;
	} else if (rune < 0x800) {
		len = 2;
		lead = 0xC0;
	} else if (rune < 0x10000) {
		len = 3;
		lead = 0xE0;
	} else {
		len = 4;
		lead = 0xF0;
	}

	// continuation bytes are filled in back to front, six payload bits apiece
	for (pos = len - 1; pos > 0; pos--) {
		encoded[pos] = (char) (uint8_t) (0x80 | (rune & 0x3F));
		rune >>= 6;
	}
	// the bits that remain are few enough to sit below the marker bits
	encoded[0] = (char) (uint8_t) (lead | rune);
	return len;
}
| |
// utf8DecodeRune() decodes the UTF-8 sequence starting at s, stores the result in *rune, and returns a pointer just past the bytes it consumed
// nElem is the number of bytes available at s; when it is 0 no length check is made, and a NUL byte found where a continuation byte is expected simply fails validation
// any malformed, overlong, surrogate, out-of-range, or truncated sequence yields badrune and consumes exactly one byte
const char *utf8DecodeRune(const char *s, size_t nElem, uint32_t *rune)
{
	const uint8_t lead = (uint8_t) s[0];
	uint8_t lo = 0x80;
	uint8_t hi = 0xBF;
	size_t trailing, k;
	uint32_t value;

	// ASCII bytes represent themselves
	if (lead < 0x80) {
		*rune = lead;
		return s + 1;
	}
	// below 0xC2: stray continuation bytes and the 2-byte overlong leads 0xC0/0xC1
	// above 0xF4: leads for values > 0x10FFFF, plus the never-defined 0xFE/0xFF
	if (lead < 0xC2 || lead > 0xF4)
		goto invalid;

	// four lead bytes narrow the range of the first continuation byte
	if (lead == 0xE0)
		lo = 0xA0;		// no 3-byte overlong forms
	else if (lead == 0xED)
		hi = 0x9F;		// no surrogates
	else if (lead == 0xF0)
		lo = 0x90;		// no 4-byte overlong forms
	else if (lead == 0xF4)
		hi = 0x8F;		// nothing above 0x10FFFF

	// one continuation byte for 0xC2..0xDF, two for 0xE0..0xEF, three for 0xF0..0xF4
	trailing = 1 + (lead >= 0xE0) + (lead >= 0xF0);
	// with a known length, the continuation bytes must all be present
	if (nElem != 0 && nElem - 1 < trailing)
		goto invalid;

	// each extra continuation byte costs the lead byte one payload bit: 0x1F, 0x0F, 0x07
	value = lead & (0x3F >> trailing);
	for (k = 1; k <= trailing; k++) {
		const uint8_t cont = (uint8_t) s[k];

		// a failure here must leave everything after the lead byte unconsumed
		if (cont < lo || cont > hi)
			goto invalid;
		value = (value << 6) | (cont & 0x3F);
		// the narrowed range only applies to the first continuation byte
		lo = 0x80;
		hi = 0xBF;
	}
	*rune = value;
	return s + 1 + trailing;

invalid:
	*rune = badrune;
	return s + 1;
}
| |
// utf16EncodeRune() writes the UTF-16 encoding of rune to encoded and returns the number of elements written (1 or 2)
// encoded must have room for at least 2 elements; only the returned number of elements is written
// runes above 0x10FFFF and surrogates (0xD800 to 0xDFFF) are encoded as badrune instead
size_t utf16EncodeRune(uint32_t rune, uint16_t *encoded)
{
	uint32_t offset;

	// out-of-range values and surrogates have no encoding of their own
	if (rune > 0x10FFFF || (rune >= 0xD800 && rune <= 0xDFFF))
		rune = badrune;

	if (rune >= 0x10000) {
		// split the 20-bit offset from 0x10000 into two 10-bit halves, upper half first
		offset = rune - 0x10000;
		encoded[0] = (uint16_t) (0xD800 | ((offset >> 10) & 0x3FF));
		encoded[1] = (uint16_t) (0xDC00 | (offset & 0x3FF));
		return 2;
	}

	// everything else fits in a single element as is
	encoded[0] = (uint16_t) rune;
	return 1;
}
| |
// utf16DecodeRune() decodes the UTF-16 sequence starting at s, stores the result in *rune, and returns a pointer just past the elements it consumed
// nElem is the number of elements available at s; a value of 1 prevents the second element from being read, and with any other value (including 0) s[1] is read whenever s[0] is a leading surrogate
// a trailing surrogate that comes first, or a leading surrogate without a trailing surrogate after it, yields badrune and consumes exactly one element
const uint16_t *utf16DecodeRune(const uint16_t *s, size_t nElem, uint32_t *rune)
{
	const uint16_t first = s[0];

	// anything outside the surrogate range represents itself
	if (first < 0xD800 || first > 0xDFFF) {
		*rune = first;
		return s + 1;
	}

	// only a leading surrogate (0xD800 to 0xDBFF) with a following element can begin a pair
	if (first <= 0xDBFF && nElem != 1) {
		const uint16_t second = s[1];

		if (second >= 0xDC00 && second <= 0xDFFF) {
			// each surrogate carries ten bits of the offset from 0x10000
			*rune = 0x10000 + ((((uint32_t) first & 0x3FF) << 10) | (second & 0x3FF));
			return s + 2;
		}
	}

	// out-of-order, unpaired, or cut-off surrogate
	*rune = badrune;
	return s + 1;
}
| |
// TODO find a way to reduce the code in all of these somehow
// TODO find a way to remove u as well
// utf8RuneCount() returns how many runes utf8DecodeRune() produces when run over the whole of s
// nElem is the number of bytes in s; if it is 0, s is instead read up to (and not including) its first NUL byte
// every byte that utf8DecodeRune() rejects is counted as one rune
size_t utf8RuneCount(const char *s, size_t nElem)
{
	// decided once up front: nElem reaching 0 later must end the loop, not switch modes
	const int bounded = (nElem != 0);
	size_t count = 0;
	uint32_t rune;

	while (bounded ? (nElem != 0) : (*s != '\0')) {
		const char *next = utf8DecodeRune(s, nElem, &rune);

		// in NUL-terminated mode nElem stays 0 so the decoder keeps skipping its length check
		if (bounded)
			nElem -= (size_t) (next - s);
		s = next;
		count++;
	}
	return count;
}
| |
// utf8UTF16Count() returns the number of uint16_t elements utf16EncodeRune() writes in total for the runes decoded from the UTF-8 string s
// nElem is the number of bytes in s; if it is 0, s is instead read up to (and not including) its first NUL byte
// no terminating element is included in the count
size_t utf8UTF16Count(const char *s, size_t nElem)
{
	// decided once up front: nElem reaching 0 later must end the loop, not switch modes
	const int bounded = (nElem != 0);
	size_t total = 0;
	uint32_t rune;
	uint16_t scratch[2];	// the encoded output is discarded; only its length matters

	while (bounded ? (nElem != 0) : (*s != '\0')) {
		const char *next = utf8DecodeRune(s, nElem, &rune);

		total += utf16EncodeRune(rune, scratch);
		// in NUL-terminated mode nElem stays 0 so the decoder keeps skipping its length check
		if (bounded)
			nElem -= (size_t) (next - s);
		s = next;
	}
	return total;
}
| |
// utf16RuneCount() returns how many runes utf16DecodeRune() produces when run over the whole of s
// nElem is the number of elements in s; if it is 0, s is instead read up to (and not including) its first zero element
// every element that utf16DecodeRune() rejects is counted as one rune
size_t utf16RuneCount(const uint16_t *s, size_t nElem)
{
	// decided once up front: nElem reaching 0 later must end the loop, not switch modes
	const int bounded = (nElem != 0);
	size_t count = 0;
	uint32_t rune;

	while (bounded ? (nElem != 0) : (*s != 0)) {
		const uint16_t *next = utf16DecodeRune(s, nElem, &rune);

		// in zero-terminated mode nElem stays 0 for every decoder call
		if (bounded)
			nElem -= (size_t) (next - s);
		s = next;
		count++;
	}
	return count;
}
| |
// utf16UTF8Count() returns the number of bytes utf8EncodeRune() writes in total for the runes decoded from the UTF-16 string s
// nElem is the number of elements in s; if it is 0, s is instead read up to (and not including) its first zero element
// no terminating NUL is included in the count
size_t utf16UTF8Count(const uint16_t *s, size_t nElem)
{
	// decided once up front: nElem reaching 0 later must end the loop, not switch modes
	const int bounded = (nElem != 0);
	size_t total = 0;
	uint32_t rune;
	char scratch[4];	// the encoded output is discarded; only its length matters

	while (bounded ? (nElem != 0) : (*s != 0)) {
		const uint16_t *next = utf16DecodeRune(s, nElem, &rune);

		total += utf8EncodeRune(rune, scratch);
		// in zero-terminated mode nElem stays 0 for every decoder call
		if (bounded)
			nElem -= (size_t) (next - s);
		s = next;
	}
	return total;
}