Add support for non-ASCII character references and encode them as UTF-8
diff --git a/test/content01.out b/test/content01.out
index 572a916..c512665 100644
--- a/test/content01.out
+++ b/test/content01.out
@@ -9,7 +9,7 @@
data \x0a
elemstart refs
content
-data ! !
+data ! !é☃🐱
elemend
data \x0aCDATA!\x0a[[CD<a/> <!-- no comment -->&<?NotaPI?>&notaref;\x0a]x]]y]]]z]]\x0a
elemend
diff --git a/test/content01.xml b/test/content01.xml
index 4689c13..be98c69 100644
--- a/test/content01.xml
+++ b/test/content01.xml
@@ -1,7 +1,7 @@
<a>ZOMFG! Element content!
<entities>&amp;&lt;&gt;&apos;&quot;</entities>
-<refs> ! !</refs>
+<refs> ! !é☃🐱</refs>
<![CDATA[CDATA!]]>
<![CDATA[[[CD<a/> <!-- no comment -->&<?NotaPI?>&notaref;
]x]]y]]]z]]]]>
diff --git a/yxml.c b/yxml.c
index c451816..56efd35 100644
--- a/yxml.c
+++ b/yxml.c
@@ -121,6 +121,28 @@
}
+/* Encode the Unicode code point ch as UTF-8 at dest and terminate the result
+ * with a '\0'. Works like yxml_setchar(), except that between one and four
+ * bytes are written before the terminator, so dest must have room for at
+ * least 5 bytes. The caller is expected to pass a valid code point. */
+static void yxml_setutf8(char *dest, unsigned ch) {
+	/* Lead-byte marker bits, indexed by the number of continuation bytes. */
+	static const unsigned char lead[] = { 0x00, 0xC0, 0xE0, 0xF0 };
+	int trail;
+	if(ch <= 0x007F)
+		trail = 0;
+	else if(ch <= 0x07FF)
+		trail = 1;
+	else
+		trail = ch <= 0xFFFF ? 2 : 3;
+	yxml_setchar(dest++, lead[trail] | (ch >> (6*trail)));
+	/* Each continuation byte is 10xxxxxx, most significant group first. */
+	while(trail-- > 0)
+		yxml_setchar(dest++, 0x80 | ((ch >> (6*trail)) & 0x3F));
+	*dest = 0;
+}
+
+
static inline int yxml_dataset(yxml_t *x, unsigned ch) {
yxml_setchar(x->data, ch);
x->data[1] = 0;
@@ -282,11 +304,11 @@
ch = '"';
}
- /* XXX: The API does not allow returning more than one byte at a time, so
- * CharRefs only work for ASCII at the moment. This is kind of stupid. */
- if(!ch || ch > 127)
+	/* Not an XML 1.1 Char: 0, surrogates (D800-DFFF), FFFE, FFFF, > 10FFFF */
+	if(!ch || ch > 0x10FFFF || ch == 0xFFFE || ch == 0xFFFF || (ch >= 0xD800 && ch <= 0xDFFF))
return YXML_EREF;
- return yxml_dataset(x, ch);
+ yxml_setutf8(x->data, ch);
+ return YXML_DATA;
}
diff --git a/yxml.c.in b/yxml.c.in
index fe30728..fa0bdeb 100644
--- a/yxml.c.in
+++ b/yxml.c.in
@@ -57,6 +57,28 @@
}
+/* Encode the Unicode code point ch as UTF-8 at dest and terminate the result
+ * with a '\0'. Works like yxml_setchar(), except that between one and four
+ * bytes are written before the terminator, so dest must have room for at
+ * least 5 bytes. The caller is expected to pass a valid code point. */
+static void yxml_setutf8(char *dest, unsigned ch) {
+	/* Lead-byte marker bits, indexed by the number of continuation bytes. */
+	static const unsigned char lead[] = { 0x00, 0xC0, 0xE0, 0xF0 };
+	int trail;
+	if(ch <= 0x007F)
+		trail = 0;
+	else if(ch <= 0x07FF)
+		trail = 1;
+	else
+		trail = ch <= 0xFFFF ? 2 : 3;
+	yxml_setchar(dest++, lead[trail] | (ch >> (6*trail)));
+	/* Each continuation byte is 10xxxxxx, most significant group first. */
+	while(trail-- > 0)
+		yxml_setchar(dest++, 0x80 | ((ch >> (6*trail)) & 0x3F));
+	*dest = 0;
+}
+
+
static inline int yxml_dataset(yxml_t *x, unsigned ch) {
yxml_setchar(x->data, ch);
x->data[1] = 0;
@@ -218,11 +240,11 @@
ch = '"';
}
- /* XXX: The API does not allow returning more than one byte at a time, so
- * CharRefs only work for ASCII at the moment. This is kind of stupid. */
- if(!ch || ch > 127)
+	/* Not an XML 1.1 Char: 0, surrogates (D800-DFFF), FFFE, FFFF, > 10FFFF */
+	if(!ch || ch > 0x10FFFF || ch == 0xFFFE || ch == 0xFFFF || (ch >= 0xD800 && ch <= 0xDFFF))
return YXML_EREF;
- return yxml_dataset(x, ch);
+ yxml_setutf8(x->data, ch);
+ return YXML_DATA;
}
diff --git a/yxml.h b/yxml.h
index ec22db5..c090265 100644
--- a/yxml.h
+++ b/yxml.h
@@ -77,10 +77,12 @@
/* The last read character(s) of an attribute value, element data, or
* processing instruction. Changed after YXML_DATA and only valid until the
* next yxml_parse() call. Usually, this string only consists of a single
- * character, but multiple characters are returned in the following cases:
+ * byte, but multiple bytes are returned in the following cases:
* - "<?SomePI ?x ?>": The two characters "?x"
* - "<![CDATA[ ]x ]]>": The two characters "]x"
* - "<![CDATA[ ]]x ]]>": The three characters "]]x"
+ *   - "&#N;" and "&#xN;", where the numeric value of N is greater than 127:
+ *     The referenced Unicode character is encoded as 2 to 4 UTF-8 bytes.
*/
char data[8];