Add support for non-ASCII character references and encode them as UTF-8

commit: 961af5224b84b6a40f204e737d7bf301deed4503 [log] [tgz]
author: Yorhel <git@yorhel.nl> Tue Sep 24 12:31:23 2013 +0200
committer: Yorhel <git@yorhel.nl> Tue Sep 24 12:31:23 2013 +0200
tree: ada259374ad10866e03886bf42bee37e885c96b6
parent: 80e73e201e68a09b399bf192d97c332df59ad980 [diff]
diff --git a/test/content01.out b/test/content01.out
index 572a916..c512665 100644
--- a/test/content01.out
+++ b/test/content01.out

@@ -9,7 +9,7 @@
 data \x0a
 elemstart refs
 content
-data  ! !
+data  ! !é☃🐱
 elemend
 data \x0aCDATA!\x0a[[CD<a/> <!-- no comment -->&amp;<?NotaPI?>&notaref;\x0a]x]]y]]]z]]\x0a
 elemend

diff --git a/test/content01.xml b/test/content01.xml
index 4689c13..be98c69 100644
--- a/test/content01.xml
+++ b/test/content01.xml

@@ -1,7 +1,7 @@
 <a>ZOMFG! Element	content!
 
 <entities>&amp;&lt;&gt;&apos;&quot;</entities>
-<refs>&#x20;&#33;&#x0020;&#0033;</refs>
+<refs>&#x20;&#33;&#x0020;&#0033;&#xe9;&#x2603;&#x1F431;</refs>
 <![CDATA[CDATA!]]>
 <![CDATA[[[CD<a/> <!-- no comment -->&amp;<?NotaPI?>&notaref;
 ]x]]y]]]z]]]]>

diff --git a/yxml.c b/yxml.c
index c451816..56efd35 100644
--- a/yxml.c
+++ b/yxml.c

@@ -121,6 +121,28 @@
 }
 
 
+/* Encodes ch as 1-4 UTF-8 bytes via yxml_setchar() and appends a '\0'; dest
+ * needs room for 5 bytes. ch is not range-checked here (<= 0x10FFFF assumed). */
+static void yxml_setutf8(char *dest, unsigned ch) {
+	if(ch <= 0x007F) /* 1 byte: 0xxxxxxx */
+		yxml_setchar(dest++, ch);
+	else if(ch <= 0x07FF) { /* 2 bytes: 110xxxxx 10xxxxxx */
+		yxml_setchar(dest++, 0xC0 | (ch>>6));
+		yxml_setchar(dest++, 0x80 | (ch & 0x3F));
+	} else if(ch <= 0xFFFF) { /* 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx */
+		yxml_setchar(dest++, 0xE0 | (ch>>12));
+		yxml_setchar(dest++, 0x80 | ((ch>>6) & 0x3F));
+		yxml_setchar(dest++, 0x80 | (ch & 0x3F));
+	} else { /* 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
+		yxml_setchar(dest++, 0xF0 | (ch>>18));
+		yxml_setchar(dest++, 0x80 | ((ch>>12) & 0x3F));
+		yxml_setchar(dest++, 0x80 | ((ch>>6) & 0x3F));
+		yxml_setchar(dest++, 0x80 | (ch & 0x3F));
+	}
+	*dest = 0; /* terminator, stored directly rather than via yxml_setchar() */
+}
+
+
 static inline int yxml_dataset(yxml_t *x, unsigned ch) {
 	yxml_setchar(x->data, ch);
 	x->data[1] = 0;
@@ -282,11 +304,11 @@
 			ch = '"';
 	}
 
-	/* XXX: The API does not allow returning more than one byte at a time, so
-	 * CharRefs only work for ASCII at the moment. This is kind of stupid. */
-	if(!ch || ch > 127)
+	/* Not an XML 1.1 Char: 0, U+D800..U+DFFF, U+FFFE, U+FFFF, > U+10FFFF */
+	if(!ch || ch > 0x10FFFF || ch == 0xFFFE || ch == 0xFFFF || (ch >= 0xD800 && ch <= 0xDFFF))
 		return YXML_EREF;
-	return yxml_dataset(x, ch);
+	yxml_setutf8(x->data, ch);
+	return YXML_DATA;
 }
 
 

diff --git a/yxml.c.in b/yxml.c.in
index fe30728..fa0bdeb 100644
--- a/yxml.c.in
+++ b/yxml.c.in

@@ -57,6 +57,28 @@
 }
 
 
+/* Encodes ch as 1-4 UTF-8 bytes via yxml_setchar() and appends a '\0'; dest
+ * needs room for 5 bytes. ch is not range-checked here (<= 0x10FFFF assumed). */
+static void yxml_setutf8(char *dest, unsigned ch) {
+	if(ch <= 0x007F) /* 1 byte: 0xxxxxxx */
+		yxml_setchar(dest++, ch);
+	else if(ch <= 0x07FF) { /* 2 bytes: 110xxxxx 10xxxxxx */
+		yxml_setchar(dest++, 0xC0 | (ch>>6));
+		yxml_setchar(dest++, 0x80 | (ch & 0x3F));
+	} else if(ch <= 0xFFFF) { /* 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx */
+		yxml_setchar(dest++, 0xE0 | (ch>>12));
+		yxml_setchar(dest++, 0x80 | ((ch>>6) & 0x3F));
+		yxml_setchar(dest++, 0x80 | (ch & 0x3F));
+	} else { /* 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
+		yxml_setchar(dest++, 0xF0 | (ch>>18));
+		yxml_setchar(dest++, 0x80 | ((ch>>12) & 0x3F));
+		yxml_setchar(dest++, 0x80 | ((ch>>6) & 0x3F));
+		yxml_setchar(dest++, 0x80 | (ch & 0x3F));
+	}
+	*dest = 0; /* terminator, stored directly rather than via yxml_setchar() */
+}
+
+
 static inline int yxml_dataset(yxml_t *x, unsigned ch) {
 	yxml_setchar(x->data, ch);
 	x->data[1] = 0;
@@ -218,11 +240,11 @@
 			ch = '"';
 	}
 
-	/* XXX: The API does not allow returning more than one byte at a time, so
-	 * CharRefs only work for ASCII at the moment. This is kind of stupid. */
-	if(!ch || ch > 127)
+	/* Not an XML 1.1 Char: 0, U+D800..U+DFFF, U+FFFE, U+FFFF, > U+10FFFF */
+	if(!ch || ch > 0x10FFFF || ch == 0xFFFE || ch == 0xFFFF || (ch >= 0xD800 && ch <= 0xDFFF))
 		return YXML_EREF;
-	return yxml_dataset(x, ch);
+	yxml_setutf8(x->data, ch);
+	return YXML_DATA;
 }
 
 

diff --git a/yxml.h b/yxml.h
index ec22db5..c090265 100644
--- a/yxml.h
+++ b/yxml.h

@@ -77,10 +77,12 @@
 	/* The last read character(s) of an attribute value, element data, or
 	 * processing instruction. Changed after YXML_DATA and only valid until the
 	 * next yxml_parse() call. Usually, this string only consists of a single
-	 * character, but multiple characters are returned in the following cases:
+	 * byte, but multiple bytes are returned in the following cases:
 	 * - "<?SomePI ?x ?>": The two characters "?x"
 	 * - "<![CDATA[ ]x ]]>": The two characters "]x"
 	 * - "<![CDATA[ ]]x ]]>": The three characters "]]x"
+	 * - "&#N;" and "&#xN;", where the value of N is greater than 127: the
+	 *   referenced Unicode character is encoded as multiple UTF-8 bytes.
 	 */
 	char data[8];
commit	961af5224b84b6a40f204e737d7bf301deed4503	[log] [tgz]
author	Yorhel <git@yorhel.nl>	Tue Sep 24 12:31:23 2013 +0200
committer	Yorhel <git@yorhel.nl>	Tue Sep 24 12:31:23 2013 +0200
tree	ada259374ad10866e03886bf42bee37e885c96b6
parent	80e73e201e68a09b399bf192d97c332df59ad980 [diff]