// third_party/NordicSemiconductor/libraries/utf_converter/utf.c - third_party/openthread - Git at Google

 // utf by pietro gagliardi (andlabs) — https://github.com/andlabs/utf/
 // 10 november 2016
 #include "utf.h"

 // this code imitates Go's unicode/utf8 and unicode/utf16
 // the biggest difference is that a rune is unsigned instead of signed (because Go guarantees what a right shift on a signed number will do, whereas C does not)
 // it is also an imitation so we can license it under looser terms than the Go source
 #define badrune 0xFFFD

 // utf8EncodeRune() stores the UTF-8 encoding of rune in encoded and
 // returns how many bytes were stored (1 to 4).
 // A rune above 0x10FFFF, or one in the surrogate range 0xD800-0xDFFF,
 // is encoded as the replacement character (badrune) instead.
 // encoded must have room for at least 4 bytes; only the returned number
 // of bytes is written and no terminator is appended.
 // TODO clean this code up somehow
 size_t utf8EncodeRune(uint32_t rune, char *encoded)
 {
 	uint8_t lead;		// marker bits of the first byte
 	size_t len, k;

 	// neither out-of-range values nor surrogates can be encoded
 	if (rune > 0x10FFFF || (rune >= 0xD800 && rune <= 0xDFFF))
 		rune = badrune;

 	// pick the sequence length and matching lead-byte marker
 	if (rune < 0x80) {
 		len = 1;
 		lead = 0x00;
 	} else if (rune < 0x800) {
 		len = 2;
 		lead = 0xC0;
 	} else if (rune < 0x10000) {
 		len = 3;
 		lead = 0xE0;
 	} else {
 		len = 4;
 		lead = 0xF0;
 	}

 	// continuation bytes carry 6 bits each, least significant group last,
 	// so fill them back to front
 	for (k = len - 1; k > 0; k--) {
 		encoded[k] = (char) (uint8_t) (0x80 | (rune & 0x3F));
 		rune >>= 6;
 	}
 	// whatever remains fits in the payload bits of the lead byte
 	encoded[0] = (char) (uint8_t) (lead | rune);
 	return len;
 }

 // utf8DecodeRune() decodes one rune from the UTF-8 bytes at s, stores it
 // in *rune, and returns a pointer just past the bytes it consumed.
 // nElem is the number of bytes available at s; passing 0 skips the
 // length check (the counting functions do this for NUL-terminated input).
 // Any malformed, overlong, surrogate, out-of-range, or truncated sequence
 // yields badrune and consumes **only** the first byte.
 const char *utf8DecodeRune(const char *s, size_t nElem, uint32_t *rune)
 {
 	const uint8_t lead = (uint8_t) s[0];
 	uint8_t lo = 0x80;		// allowed range of the next continuation byte
 	uint8_t hi = 0xBF;
 	size_t trail, k;
 	uint32_t value;

 	// ASCII bytes represent themselves
 	if (lead < 0x80) {
 		*rune = lead;
 		return s + 1;
 	}
 	// below 0xC2: stray continuation bytes and 2-byte overlong leads
 	// above 0xF4: values > 0x10FFFF and bytes that were never defined
 	if (lead < 0xC2 || lead > 0xF4)
 		goto invalid;

 	// some lead bytes narrow the range of the first continuation byte
 	if (lead == 0xE0)
 		lo = 0xA0;		// no 3-byte overlong equivalents
 	else if (lead == 0xED)
 		hi = 0x9F;		// no surrogates
 	else if (lead == 0xF0)
 		lo = 0x90;		// no 4-byte overlong equivalents
 	else if (lead == 0xF4)
 		hi = 0x8F;		// nothing above 0x10FFFF

 	// the lead byte fixes both the continuation count and the top bits
 	if (lead < 0xE0) {
 		trail = 1;
 		value = lead & 0x1F;
 	} else if (lead < 0xF0) {
 		trail = 2;
 		value = lead & 0x0F;
 	} else {
 		trail = 3;
 		value = lead & 0x07;
 	}

 	// with a known length, make sure the whole sequence is present
 	if (nElem != 0 && nElem - 1 < trail)
 		goto invalid;

 	// validate and accumulate in one pass; stop at the first bad byte
 	for (k = 1; k <= trail; k++) {
 		const uint8_t cont = (uint8_t) s[k];

 		if (cont < lo || cont > hi)
 			goto invalid;
 		value = (value << 6) | (uint32_t) (cont & 0x3F);
 		// the narrowed range only applies to the first continuation byte
 		lo = 0x80;
 		hi = 0xBF;
 	}
 	*rune = value;
 	return s + 1 + trail;

 invalid:
 	*rune = badrune;
 	return s + 1;
 }

 // utf16EncodeRune() stores the UTF-16 encoding of rune in encoded and
 // returns how many 16-bit elements were stored (1 or 2).
 // A rune above 0x10FFFF, or one in the surrogate range 0xD800-0xDFFF,
 // is encoded as the replacement character (badrune) instead.
 // encoded must have room for at least 2 elements.
 size_t utf16EncodeRune(uint32_t rune, uint16_t *encoded)
 {
 	uint32_t offset;

 	// neither out-of-range values nor surrogates can be encoded
 	if (rune > 0x10FFFF || (rune >= 0xD800 && rune <= 0xDFFF))
 		rune = badrune;

 	// anything that fits in 16 bits represents itself
 	if (rune <= 0xFFFF) {
 		encoded[0] = (uint16_t) rune;
 		return 1;
 	}

 	// otherwise split the 20-bit offset across a surrogate pair:
 	// top 10 bits in the high surrogate, bottom 10 in the low one
 	offset = rune - 0x10000;
 	encoded[0] = (uint16_t) (0xD800 | ((offset >> 10) & 0x3FF));
 	encoded[1] = (uint16_t) (0xDC00 | (offset & 0x3FF));
 	return 2;
 }

 // utf16DecodeRune() decodes one rune from the UTF-16 elements at s,
 // stores it in *rune, and returns a pointer just past the elements it
 // consumed (one or two).
 // nElem is the number of elements available at s; only the value 1 is
 // treated as "no second element", so with nElem == 0 the element at
 // s[1] is always examined after a high surrogate.
 // A lone or misordered surrogate yields badrune and consumes one element.
 // TODO see if this can be cleaned up somehow
 const uint16_t *utf16DecodeRune(const uint16_t *s, size_t nElem, uint32_t *rune)
 {
 	const uint16_t first = s[0];
 	uint16_t second;

 	// anything outside the surrogate block represents itself
 	if (first < 0xD800 || first > 0xDFFF) {
 		*rune = first;
 		return s + 1;
 	}

 	// a pair must start with a high surrogate and have a partner available
 	if (first <= 0xDBFF && nElem != 1) {
 		second = s[1];
 		if (second >= 0xDC00 && second <= 0xDFFF) {
 			// 10 bits from each half, rebased past the 16-bit range
 			*rune = (((uint32_t) (first & 0x3FF) << 10) | (uint32_t) (second & 0x3FF)) + 0x10000;
 			return s + 2;
 		}
 	}

 	// low surrogate first, missing partner, or partner not a low surrogate
 	*rune = badrune;
 	return s + 1;
 }

 // TODO find a way to reduce the code in all of the counting functions somehow
 // TODO find a way to remove the second cursor pointer as well
 // utf8RuneCount() returns the number of runes utf8DecodeRune() produces
 // from the UTF-8 data at s. Invalid sequences each count as decoded.
 // If nElem is nonzero, exactly nElem bytes are scanned; if it is zero,
 // s is scanned up to (not including) the first zero byte.
 size_t utf8RuneCount(const char *s, size_t nElem)
 {
 	size_t count = 0;
 	uint32_t ignored;		// decoded value is not needed, only the advance
 	const char *next;

 	if (nElem == 0) {
 		// terminator-delimited input
 		for (; *s; count++)
 			s = utf8DecodeRune(s, 0, &ignored);
 		return count;
 	}

 	// length-delimited input; nElem is known nonzero on entry
 	do {
 		next = utf8DecodeRune(s, nElem, &ignored);
 		count++;
 		nElem -= (size_t) (next - s);
 		s = next;
 	} while (nElem != 0);
 	return count;
 }

 // utf8UTF16Count() returns how many UTF-16 elements are produced when
 // each rune decoded from the UTF-8 data at s is passed through
 // utf16EncodeRune(). Nothing is kept; only the element count is summed.
 // If nElem is nonzero, exactly nElem bytes are scanned; if it is zero,
 // s is scanned up to (not including) the first zero byte.
 size_t utf8UTF16Count(const char *s, size_t nElem)
 {
 	size_t total = 0;
 	uint32_t decoded;
 	uint16_t scratch[2];		// throwaway output for the encoder
 	const char *next;

 	if (nElem == 0) {
 		// terminator-delimited input
 		while (*s) {
 			s = utf8DecodeRune(s, 0, &decoded);
 			total += utf16EncodeRune(decoded, scratch);
 		}
 		return total;
 	}

 	// length-delimited input; nElem is known nonzero on entry
 	do {
 		next = utf8DecodeRune(s, nElem, &decoded);
 		total += utf16EncodeRune(decoded, scratch);
 		nElem -= (size_t) (next - s);
 		s = next;
 	} while (nElem != 0);
 	return total;
 }

 // utf16RuneCount() returns the number of runes utf16DecodeRune()
 // produces from the UTF-16 data at s. Invalid surrogates each count as
 // decoded.
 // If nElem is nonzero, exactly nElem elements are scanned; if it is
 // zero, s is scanned up to (not including) the first zero element.
 size_t utf16RuneCount(const uint16_t *s, size_t nElem)
 {
 	size_t count = 0;
 	uint32_t ignored;		// decoded value is not needed, only the advance
 	const uint16_t *next;

 	if (nElem == 0) {
 		// terminator-delimited input
 		for (; *s; count++)
 			s = utf16DecodeRune(s, 0, &ignored);
 		return count;
 	}

 	// length-delimited input; nElem is known nonzero on entry
 	do {
 		next = utf16DecodeRune(s, nElem, &ignored);
 		count++;
 		nElem -= (size_t) (next - s);
 		s = next;
 	} while (nElem != 0);
 	return count;
 }

 // utf16UTF8Count() returns how many UTF-8 bytes are produced when each
 // rune decoded from the UTF-16 data at s is passed through
 // utf8EncodeRune(). Nothing is kept; only the byte count is summed.
 // If nElem is nonzero, exactly nElem elements are scanned; if it is
 // zero, s is scanned up to (not including) the first zero element.
 size_t utf16UTF8Count(const uint16_t *s, size_t nElem)
 {
 	size_t total = 0;
 	uint32_t decoded;
 	char scratch[4];		// throwaway output for the encoder
 	const uint16_t *next;

 	if (nElem == 0) {
 		// terminator-delimited input
 		while (*s) {
 			s = utf16DecodeRune(s, 0, &decoded);
 			total += utf8EncodeRune(decoded, scratch);
 		}
 		return total;
 	}

 	// length-delimited input; nElem is known nonzero on entry
 	do {
 		next = utf16DecodeRune(s, nElem, &decoded);
 		total += utf8EncodeRune(decoded, scratch);
 		nElem -= (size_t) (next - s);
 		s = next;
 	} while (nElem != 0);
 	return total;
 }
	// utf by pietro gagliardi (andlabs) — https://github.com/andlabs/utf/
	// 10 november 2016
	#include "utf.h"

	// this code imitates Go's unicode/utf8 and unicode/utf16
	// the biggest difference is that a rune is unsigned instead of signed (because Go guarantees what a right shift on a signed number will do, whereas C does not)
	// it is also an imitation so we can license it under looser terms than the Go source
	#define badrune 0xFFFD

	// utf8EncodeRune() writes the UTF-8 encoding of rune to encoded and
	// returns the number of bytes written (1 to 4).
	// Runes above 0x10FFFF and surrogates (0xD800-0xDFFF) are replaced by
	// badrune before encoding.
	// encoded must have room for at least 4 bytes; no terminator is written.
	// TODO clean this code up somehow
	size_t utf8EncodeRune(uint32_t rune, char *encoded)
	{
		uint8_t b;
		uint8_t c = 0;
		uint8_t d = 0;
		uint8_t e = 0;
		size_t n;

		// not in the valid range for Unicode
		if (rune > 0x10FFFF)
			rune = badrune;
		// surrogate runes cannot be encoded
		if (rune >= 0xD800 && rune < 0xE000)
			rune = badrune;

		if (rune < 0x80) {		// ASCII bytes represent themselves
			b = (uint8_t) (rune & 0xFF);
			n = 1;
			goto done;
		}
		if (rune < 0x800) {		// two-byte encoding
			c = (uint8_t) (rune & 0x3F);
			c |= 0x80;
			rune >>= 6;
			b = (uint8_t) (rune & 0x1F);
			b |= 0xC0;
			n = 2;
			goto done;
		}
		if (rune < 0x10000) {	// three-byte encoding
			d = (uint8_t) (rune & 0x3F);
			d |= 0x80;
			rune >>= 6;
			c = (uint8_t) (rune & 0x3F);
			c |= 0x80;
			rune >>= 6;
			b = (uint8_t) (rune & 0x0F);
			b |= 0xE0;
			n = 3;
			goto done;
		}
		// otherwise use a four-byte encoding
		e = (uint8_t) (rune & 0x3F);
		e |= 0x80;
		rune >>= 6;
		d = (uint8_t) (rune & 0x3F);
		d |= 0x80;
		rune >>= 6;
		c = (uint8_t) (rune & 0x3F);
		c |= 0x80;
		rune >>= 6;
		b = (uint8_t) (rune & 0x07);
		b |= 0xF0;
		n = 4;

	done:
		// only the first n bytes of encoded are touched
		encoded[0] = b;
		if (n > 1)
			encoded[1] = c;
		if (n > 2)
			encoded[2] = d;
		if (n > 3)
			encoded[3] = e;
		return n;
	}

	// utf8DecodeRune() decodes one rune from the UTF-8 bytes at s into
	// *rune and returns a pointer just past the bytes consumed.
	// nElem is the number of bytes available at s; 0 skips the length check.
	// Invalid, overlong, surrogate, out-of-range, or truncated sequences
	// produce badrune and consume only the initial byte.
	const char *utf8DecodeRune(const char *s, size_t nElem, uint32_t *rune)
	{
		uint8_t b, c;
		uint8_t lowestAllowed, highestAllowed;
		size_t i, expected;
		int bad;

		b = (uint8_t) (*s);
		if (b < 0x80) {		// ASCII bytes represent themselves
			*rune = b;
			s++;
			return s;
		}
		// 0xC0 and 0xC1 cover 2-byte overlong equivalents
		// 0xF5 to 0xFD cover values > 0x10FFFF
		// 0xFE and 0xFF were never defined (always illegal)
		if (b < 0xC2 || b > 0xF4) {		// invalid
			*rune = badrune;
			s++;
			return s;
		}

		// this determines the range of allowed first continuation bytes
		lowestAllowed = 0x80;
		highestAllowed = 0xBF;
		switch (b) {
		case 0xE0:
			// disallow 3-byte overlong equivalents
			lowestAllowed = 0xA0;
			break;
		case 0xED:
			// disallow surrogate characters
			highestAllowed = 0x9F;
			break;
		case 0xF0:
			// disallow 4-byte overlong equivalents
			lowestAllowed = 0x90;
			break;
		case 0xF4:
			// disallow values > 0x10FFFF
			highestAllowed = 0x8F;
			break;
		}

		// and this determines how many continuation bytes are expected
		expected = 1;
		if (b >= 0xE0)
			expected++;
		if (b >= 0xF0)
			expected++;
		if (nElem != 0) {				// are there enough bytes?
			nElem--;
			if (nElem < expected) {	// nope
				*rune = badrune;
				s++;
				return s;
			}
		}

		// ensure that everything is correct
		// if not, **only** consume the initial byte
		bad = 0;
		for (i = 0; i < expected; i++) {
			c = (uint8_t) (s[1 + i]);
			if (c < lowestAllowed || c > highestAllowed) {
				bad = 1;
				break;
			}
			// the old lowestAllowed and highestAllowed is only for the first continuation byte
			lowestAllowed = 0x80;
			highestAllowed = 0xBF;
		}
		if (bad) {
			*rune = badrune;
			s++;
			return s;
		}

		// now do the topmost bits
		if (b < 0xE0)
			*rune = b & 0x1F;
		else if (b < 0xF0)
			*rune = b & 0x0F;
		else
			*rune = b & 0x07;
		s++;		// we can finally move on

		// now do the continuation bytes
		for (; expected; expected--) {
			c = (uint8_t) (*s);
			s++;
			c &= 0x3F;		// strip continuation bits
			*rune <<= 6;
			*rune |= c;
		}

		return s;
	}

	// utf16EncodeRune() writes the UTF-16 encoding of rune to encoded and
	// returns the number of 16-bit elements written (1 or 2).
	// Runes above 0x10FFFF and surrogates (0xD800-0xDFFF) are replaced by
	// badrune before encoding.
	// encoded must have room for at least 2 elements.
	size_t utf16EncodeRune(uint32_t rune, uint16_t *encoded)
	{
		uint16_t low, high;

		// not in the valid range for Unicode
		if (rune > 0x10FFFF)
			rune = badrune;
		// surrogate runes cannot be encoded
		if (rune >= 0xD800 && rune < 0xE000)
			rune = badrune;

		if (rune < 0x10000) {
			encoded[0] = (uint16_t) rune;
			return 1;
		}

		// split the 20-bit offset into two 10-bit halves
		rune -= 0x10000;
		low = (uint16_t) (rune & 0x3FF);
		rune >>= 10;
		high = (uint16_t) (rune & 0x3FF);
		encoded[0] = high | 0xD800;
		encoded[1] = low | 0xDC00;
		return 2;
	}

	// utf16DecodeRune() decodes one rune from the UTF-16 elements at s into
	// *rune and returns a pointer just past the elements consumed.
	// nElem is the number of elements available at s; only nElem == 1 is
	// treated as "no second element available".
	// A lone or misordered surrogate produces badrune and consumes one element.
	// TODO see if this can be cleaned up somehow
	const uint16_t *utf16DecodeRune(const uint16_t *s, size_t nElem, uint32_t *rune)
	{
		uint16_t high, low;

		if (*s < 0xD800 || *s >= 0xE000) {
			// self-representing character
			*rune = *s;
			s++;
			return s;
		}
		if (*s >= 0xDC00) {
			// out-of-order surrogates
			*rune = badrune;
			s++;
			return s;
		}
		if (nElem == 1) {		// not enough elements
			*rune = badrune;
			s++;
			return s;
		}
		high = *s;
		high &= 0x3FF;
		if (s[1] < 0xDC00 || s[1] >= 0xE000) {
			// bad surrogate pair
			*rune = badrune;
			s++;
			return s;
		}
		s++;
		low = *s;
		s++;
		low &= 0x3FF;
		// recombine the two 10-bit halves and rebase past the 16-bit range
		*rune = high;
		*rune <<= 10;
		*rune |= low;
		*rune += 0x10000;
		return s;
	}

	// TODO find a way to reduce the code in all of these somehow
	// TODO find a way to remove u as well
	// utf8RuneCount() returns the number of runes utf8DecodeRune() yields
	// from the UTF-8 data at s.
	// A nonzero nElem is the number of bytes to scan; with nElem == 0 the
	// scan stops at the first zero byte.
	size_t utf8RuneCount(const char *s, size_t nElem)
	{
		size_t len;
		uint32_t rune;

		if (nElem != 0) {
			const char *t, *u;

			len = 0;
			t = s;
			while (nElem != 0) {
				u = utf8DecodeRune(t, nElem, &rune);
				len++;
				// shrink the remaining length by however much was consumed
				nElem -= u - t;
				t = u;
			}
			return len;
		}
		len = 0;
		while (*s) {
			s = utf8DecodeRune(s, nElem, &rune);
			len++;
		}
		return len;
	}

	// utf8UTF16Count() returns the number of UTF-16 elements that
	// utf16EncodeRune() produces for the runes decoded from the UTF-8 data
	// at s; the encoded output itself is discarded.
	// A nonzero nElem is the number of bytes to scan; with nElem == 0 the
	// scan stops at the first zero byte.
	size_t utf8UTF16Count(const char *s, size_t nElem)
	{
		size_t len;
		uint32_t rune;
		uint16_t encoded[2];

		if (nElem != 0) {
			const char *t, *u;

			len = 0;
			t = s;
			while (nElem != 0) {
				u = utf8DecodeRune(t, nElem, &rune);
				len += utf16EncodeRune(rune, encoded);
				// shrink the remaining length by however much was consumed
				nElem -= u - t;
				t = u;
			}
			return len;
		}
		len = 0;
		while (*s) {
			s = utf8DecodeRune(s, nElem, &rune);
			len += utf16EncodeRune(rune, encoded);
		}
		return len;
	}

	// utf16RuneCount() returns the number of runes utf16DecodeRune() yields
	// from the UTF-16 data at s.
	// A nonzero nElem is the number of elements to scan; with nElem == 0
	// the scan stops at the first zero element.
	size_t utf16RuneCount(const uint16_t *s, size_t nElem)
	{
		size_t len;
		uint32_t rune;

		if (nElem != 0) {
			const uint16_t *t, *u;

			len = 0;
			t = s;
			while (nElem != 0) {
				u = utf16DecodeRune(t, nElem, &rune);
				len++;
				// shrink the remaining length by however much was consumed
				nElem -= u - t;
				t = u;
			}
			return len;
		}
		len = 0;
		while (*s) {
			s = utf16DecodeRune(s, nElem, &rune);
			len++;
		}
		return len;
	}

	// utf16UTF8Count() returns the number of UTF-8 bytes that
	// utf8EncodeRune() produces for the runes decoded from the UTF-16 data
	// at s; the encoded output itself is discarded.
	// A nonzero nElem is the number of elements to scan; with nElem == 0
	// the scan stops at the first zero element.
	size_t utf16UTF8Count(const uint16_t *s, size_t nElem)
	{
		size_t len;
		uint32_t rune;
		char encoded[4];

		if (nElem != 0) {
			const uint16_t *t, *u;

			len = 0;
			t = s;
			while (nElem != 0) {
				u = utf16DecodeRune(t, nElem, &rune);
				len += utf8EncodeRune(rune, encoded);
				// shrink the remaining length by however much was consumed
				nElem -= u - t;
				t = u;
			}
			return len;
		}
		len = 0;
		while (*s) {
			s = utf16DecodeRune(s, nElem, &rune);
			len += utf8EncodeRune(rune, encoded);
		}
		return len;
	}