Improve parsing of doctype declarations
This code should handle all declarations that don't use a conditional
section anywhere.
diff --git a/yxml-states b/yxml-states
index 7e599d5..0134a51 100644
--- a/yxml-states
+++ b/yxml-states
@@ -134,12 +134,21 @@
# Doctype, starting from '<!DOCTYPE', returns to misc1
-# TODO: This is a hack, all we do is read until we find a '>', not
-# validating its content. This hack fails if the DTD contains a '>'
-# character, which is very possible. Unfortunately, just figuring out where a
-# DTD ends already requires a rather elaborate parser. :-(
-# <DOCTYPE " " ' ' [ " " ' ' <? ?> <!-- --> < " " ' ' > ]>
-dt0 '>' misc1; Char dt0
# XXX: The state machine below only attempts to figure out where the doctype
# declaration ends; its contents are not actually parsed or validated.
+# Basically, it allows the following nesting of tags/quotes/PIs/comments:
+#
# <!DOCTYPE ".." '..' <?PI ..?> <!--..--> <!.. ".." '..'> >
+#
+# Only the last '>' is correctly recognized as the end of the declaration.
+# Any other '>' found to end a tag/PI/comment, or found within quotes,
+# comments or a PI, is ignored.
+# TODO: This still fails on conditional sections, which may nest.
+dt0 '>' misc1; '\''|'"' $quote @dt0 dt1; '<' dt2; Char dt0
+dt1 $quote @; Char dt1
+dt2 '?' @dt0 pi0; '!' dt3
+dt3 '-' @dt0 comment1; Char dt4
+dt4 '\''|'"' $quote @dt4 dt1; '>' dt0; Char dt4
# End tag, after '</', returns to misc2
diff --git a/yxml.c b/yxml.c
index 8322b74..cbdada9 100644
--- a/yxml.c
+++ b/yxml.c
@@ -42,6 +42,10 @@
YXMLS_comment4,
YXMLS_comment5,
YXMLS_dt0,
+ YXMLS_dt1,
+ YXMLS_dt2,
+ YXMLS_dt3,
+ YXMLS_dt4,
YXMLS_elem0,
YXMLS_elem1,
YXMLS_elem2,
@@ -441,6 +445,60 @@
x->state = YXMLS_misc1;
return YXML_OK;
}
+ if(ch == (unsigned char)'\'' || ch == (unsigned char)'"') {
+ x->state = YXMLS_dt1;
+ x->quote = ch;
+ x->nextstate = YXMLS_dt0;
+ return YXML_OK;
+ }
+ if(ch == (unsigned char)'<') {
+ x->state = YXMLS_dt2;
+ return YXML_OK;
+ }
+ if(yxml_isChar(ch))
+ return YXML_OK;
+ break;
+ case YXMLS_dt1:
+ if(x->quote == ch) {
+ x->state = x->nextstate;
+ return YXML_OK;
+ }
+ if(yxml_isChar(ch))
+ return YXML_OK;
+ break;
+ case YXMLS_dt2:
+ if(ch == (unsigned char)'?') {
+ x->state = YXMLS_pi0;
+ x->nextstate = YXMLS_dt0;
+ return YXML_OK;
+ }
+ if(ch == (unsigned char)'!') {
+ x->state = YXMLS_dt3;
+ return YXML_OK;
+ }
+ break;
+ case YXMLS_dt3:
+ if(ch == (unsigned char)'-') {
+ x->state = YXMLS_comment1;
+ x->nextstate = YXMLS_dt0;
+ return YXML_OK;
+ }
+ if(yxml_isChar(ch)) {
+ x->state = YXMLS_dt4;
+ return YXML_OK;
+ }
+ break;
+ case YXMLS_dt4:
+ if(ch == (unsigned char)'\'' || ch == (unsigned char)'"') {
+ x->state = YXMLS_dt1;
+ x->quote = ch;
+ x->nextstate = YXMLS_dt4;
+ return YXML_OK;
+ }
+ if(ch == (unsigned char)'>') {
+ x->state = YXMLS_dt0;
+ return YXML_OK;
+ }
if(yxml_isChar(ch))
return YXML_OK;
break;