Add 'misc3' state to handle 'Misc' data after the root element is closed
Previously, the misc2 state was entered after the root element had been
closed, which still allowed character content and new open tags
to be parsed. The latter was already detected with the 'afterelem' trick
added in 6bc21882f (now removed again), but that commit did not disallow
character content. This also removes the YXML_EMULROOT error code; such
errors are now reported as YXML_ESYN. Note that, as a consequence, the
numeric value of YXML_EEOF changes from -7 to -6.
This commit adds ~600 bytes and improves performance for one benchmark
and worsens performance for the other. Neither difference is very
significant, however.
diff --git a/yxml-states b/yxml-states
index a197736..7e599d5 100644
--- a/yxml-states
+++ b/yxml-states
@@ -54,22 +54,27 @@
# 0 = before XMLDecl, (prolog)
# 1 = before first element, (prolog/misc)
# 2 = inside element (content)
+# 3 = after root element (misc)
# And naming:
# misc = Nothing special seen yet
# le = Seen '<'
# lee = Seen '<!'
# leq = Seen '<?'
+# The 'misc3' state is entered automatically when the root element has been
+# closed; yxml_selfclose() does this.
misc0 SP misc0; '<' le0
misc1 SP misc1; '<' le1
misc2 '<' le2; '&' refstart misc2a; Char setdata misc2
misc2a Ref ref misc2a; '\x3b' refend misc2
+misc3 SP misc3; '<' le3
le0 '!' lee1; '?' leq0; NameStart elemstart elem0
le1 '!' lee1; '?' @misc1 pi0; NameStart elemstart elem0
le2 '!' lee2; '?' @misc2 pi0; '/' etag0; NameStart elemstart elem0
+le3 '!' @misc3 comment0; '?' @misc3 pi0
-lee1 '-' @misc1 comment0; 'D' "OCTYPE" dt0
-lee2 '-' @misc2 comment0; '[' "CDATA[" cd0
+lee1 '-' @misc1 comment1; 'D' "OCTYPE" dt0
+lee2 '-' @misc2 comment1; '[' "CDATA[" cd0
leq0 'x' "ml" xmldecl0; NameStart pi1
@@ -104,12 +109,13 @@
std3 $quote xmldecl6
-# Comment, after '<!-', returns to @
+# Comment, after '<!', returns to @
comment0 '-' comment1
-comment1 CommentStart comment2
-comment2 '-' comment3; Char comment2
-comment3 '-' comment4; Char comment2
-comment4 '>' @
+comment1 '-' comment2
+comment2 CommentStart comment3
+comment3 '-' comment4; Char comment3
+comment4 '-' comment5; Char comment3
+comment5 '>' @
# PI, starting from '<?', returns to @
diff --git a/yxml.c b/yxml.c
index 901b314..cff89df 100644
--- a/yxml.c
+++ b/yxml.c
@@ -40,6 +40,7 @@
YXMLS_comment2,
YXMLS_comment3,
YXMLS_comment4,
+ YXMLS_comment5,
YXMLS_dt0,
YXMLS_elem0,
YXMLS_elem1,
@@ -56,6 +57,7 @@
YXMLS_le0,
YXMLS_le1,
YXMLS_le2,
+ YXMLS_le3,
YXMLS_lee1,
YXMLS_lee2,
YXMLS_leq0,
@@ -63,6 +65,7 @@
YXMLS_misc1,
YXMLS_misc2,
YXMLS_misc2a,
+ YXMLS_misc3,
YXMLS_pi0,
YXMLS_pi1,
YXMLS_pi2,
@@ -152,7 +155,7 @@
static inline int yxml_elemstart(yxml_t *x, unsigned ch) {
- return x->afterelem ? YXML_EMULROOT : yxml_pushstack(x, &x->elem, ch);
+ return yxml_pushstack(x, &x->elem, ch);
}
@@ -177,7 +180,7 @@
return YXML_ELEMEND;
}
x->elem = (char *)x->stack;
- x->afterelem = 1;
+ x->state = YXMLS_misc3;
return YXML_ELEMEND;
}
@@ -398,30 +401,36 @@
}
break;
case YXMLS_comment1:
- if(yxml_isCommentStart(ch)) {
+ if(ch == (unsigned char)'-') {
x->state = YXMLS_comment2;
return YXML_OK;
}
break;
case YXMLS_comment2:
- if(ch == (unsigned char)'-') {
+ if(yxml_isCommentStart(ch)) {
x->state = YXMLS_comment3;
return YXML_OK;
}
- if(yxml_isChar(ch))
- return YXML_OK;
break;
case YXMLS_comment3:
if(ch == (unsigned char)'-') {
x->state = YXMLS_comment4;
return YXML_OK;
}
+ if(yxml_isChar(ch))
+ return YXML_OK;
+ break;
+ case YXMLS_comment4:
+ if(ch == (unsigned char)'-') {
+ x->state = YXMLS_comment5;
+ return YXML_OK;
+ }
if(yxml_isChar(ch)) {
- x->state = YXMLS_comment2;
+ x->state = YXMLS_comment3;
return YXML_OK;
}
break;
- case YXMLS_comment4:
+ case YXMLS_comment5:
if(ch == (unsigned char)'>') {
x->state = x->nextstate;
return YXML_OK;
@@ -608,9 +617,21 @@
return yxml_elemstart(x, ch);
}
break;
+ case YXMLS_le3:
+ if(ch == (unsigned char)'!') {
+ x->state = YXMLS_comment0;
+ x->nextstate = YXMLS_misc3;
+ return YXML_OK;
+ }
+ if(ch == (unsigned char)'?') {
+ x->state = YXMLS_pi0;
+ x->nextstate = YXMLS_misc3;
+ return YXML_OK;
+ }
+ break;
case YXMLS_lee1:
if(ch == (unsigned char)'-') {
- x->state = YXMLS_comment0;
+ x->state = YXMLS_comment1;
x->nextstate = YXMLS_misc1;
return YXML_OK;
}
@@ -623,7 +644,7 @@
break;
case YXMLS_lee2:
if(ch == (unsigned char)'-') {
- x->state = YXMLS_comment0;
+ x->state = YXMLS_comment1;
x->nextstate = YXMLS_misc2;
return YXML_OK;
}
@@ -682,6 +703,14 @@
return yxml_refend(x, ch);
}
break;
+ case YXMLS_misc3:
+ if(yxml_isSP(ch))
+ return YXML_OK;
+ if(ch == (unsigned char)'<') {
+ x->state = YXMLS_le3;
+ return YXML_OK;
+ }
+ break;
case YXMLS_pi0:
if(yxml_isNameStart(ch)) {
x->state = YXMLS_pi1;
@@ -872,7 +901,7 @@
yxml_ret_t yxml_eof(yxml_t *x) {
- if(!x->afterelem || x->state != YXMLS_misc2)
+ if(x->state != YXMLS_misc3)
return YXML_EEOF;
return YXML_OK;
}
diff --git a/yxml.c.in b/yxml.c.in
index 3871742..a4a7dc5 100644
--- a/yxml.c.in
+++ b/yxml.c.in
@@ -95,7 +95,7 @@
static inline int yxml_elemstart(yxml_t *x, unsigned ch) {
- return x->afterelem ? YXML_EMULROOT : yxml_pushstack(x, &x->elem, ch);
+ return yxml_pushstack(x, &x->elem, ch);
}
@@ -120,7 +120,7 @@
return YXML_ELEMEND;
}
x->elem = (char *)x->stack;
- x->afterelem = 1;
+ x->state = YXMLS_misc3;
return YXML_ELEMEND;
}
@@ -268,7 +268,7 @@
yxml_ret_t yxml_eof(yxml_t *x) {
- if(!x->afterelem || x->state != YXMLS_misc2)
+ if(x->state != YXMLS_misc3)
return YXML_EEOF;
return YXML_OK;
}
diff --git a/yxml.h b/yxml.h
index a43ca81..f011dcf 100644
--- a/yxml.h
+++ b/yxml.h
@@ -25,8 +25,7 @@
typedef enum {
- YXML_EEOF = -7, /* Unexpected EOF */
- YXML_EMULROOT = -6, /* Document contains more than a single root element */
+ YXML_EEOF = -6, /* Unexpected EOF */
YXML_EREF = -5, /* Invalid character or entity reference (&whatever;) */
YXML_ECLOSE = -4, /* Close tag does not match open tag (<Tag> .. </OtherTag>) */
YXML_ESTACK = -3, /* Stack overflow (too deeply nested tags or too long element/attribute name) */
@@ -99,7 +98,6 @@
int nextstate; /* Used for '@' state remembering and for the "string" consuming state */
unsigned ignore;
unsigned char *string;
- char afterelem;
} yxml_t;