Heap-based buffer overread in htmlCurrentChar For https://bugzilla.gnome.org/show_bug.cgi?id=758606 * parserInternals.c: (xmlNextChar): Add an test to catch other issues on ctxt->input corruption proactively. For non-UTF-8 charsets, xmlNextChar() failed to check for the end of the input buffer and would continuing reading. Fix this by pulling out the check for the end of the input buffer into common code, and return if we reach the end of the input buffer prematurely. * result/HTML/758606.html: Added. * result/HTML/758606.html.err: Added. * result/HTML/758606.html.sax: Added. * result/HTML/758606_2.html: Added. * result/HTML/758606_2.html.err: Added. * result/HTML/758606_2.html.sax: Added. * test/HTML/758606.html: Added test case. * test/HTML/758606_2.html: Added test case.

commit: 0bcd05c5cd83dec3406c8f68b769b1d610c72f76 [log] [tgz]
author: Pranjal Jumde <pjumde@apple.com> Tue Mar 01 15:18:04 2016 -0800
committer: Daniel Veillard <veillard@redhat.com> Mon May 23 15:01:07 2016 +0800
tree: d9b6339d37e1de4eac646331a69fb73d4e53a8b7
parent: 00906759053986b8079985644172085f74331f83 [diff]
diff --git a/parserInternals.c b/parserInternals.c
index 8c79678..bfc778a 100644
--- a/parserInternals.c
+++ b/parserInternals.c

@@ -55,6 +55,10 @@
 #include <libxml/globals.h>
 #include <libxml/chvalid.h>
 
+#define CUR(ctxt) ctxt->input->cur
+#define END(ctxt) ctxt->input->end
+#define VALID_CTXT(ctxt) (CUR(ctxt) <= END(ctxt))
+
 #include "buf.h"
 #include "enc.h"
 
@@ -422,103 +426,105 @@
         (ctxt->input == NULL))
         return;
 
-    if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
-        if ((*ctxt->input->cur == 0) &&
-            (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0) &&
-            (ctxt->instate != XML_PARSER_COMMENT)) {
-            /*
-             * If we are at the end of the current entity and
-             * the context allows it, we pop consumed entities
-             * automatically.
-             * the auto closing should be blocked in other cases
-             */
+    if (!(VALID_CTXT(ctxt))) {
+        xmlErrInternal(ctxt, "Parser input data memory error\n", NULL);
+	ctxt->errNo = XML_ERR_INTERNAL_ERROR;
+        xmlStopParser(ctxt);
+	return;
+    }
+
+    if ((*ctxt->input->cur == 0) &&
+        (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
+        if ((ctxt->instate != XML_PARSER_COMMENT))
             xmlPopInput(ctxt);
-        } else {
-            const unsigned char *cur;
-            unsigned char c;
+        return;
+    }
 
-            /*
-             *   2.11 End-of-Line Handling
-             *   the literal two-character sequence "#xD#xA" or a standalone
-             *   literal #xD, an XML processor must pass to the application
-             *   the single character #xA.
-             */
-            if (*(ctxt->input->cur) == '\n') {
-                ctxt->input->line++; ctxt->input->col = 1;
-            } else
-                ctxt->input->col++;
+    if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
+        const unsigned char *cur;
+        unsigned char c;
 
-            /*
-             * We are supposed to handle UTF8, check it's valid
-             * From rfc2044: encoding of the Unicode values on UTF-8:
-             *
-             * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
-             * 0000 0000-0000 007F   0xxxxxxx
-             * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
-             * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
-             *
-             * Check for the 0x110000 limit too
-             */
-            cur = ctxt->input->cur;
+        /*
+         *   2.11 End-of-Line Handling
+         *   the literal two-character sequence "#xD#xA" or a standalone
+         *   literal #xD, an XML processor must pass to the application
+         *   the single character #xA.
+         */
+        if (*(ctxt->input->cur) == '\n') {
+            ctxt->input->line++; ctxt->input->col = 1;
+        } else
+            ctxt->input->col++;
 
-            c = *cur;
-            if (c & 0x80) {
-	        if (c == 0xC0)
-		    goto encoding_error;
-                if (cur[1] == 0) {
+        /*
+         * We are supposed to handle UTF8, check it's valid
+         * From rfc2044: encoding of the Unicode values on UTF-8:
+         *
+         * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
+         * 0000 0000-0000 007F   0xxxxxxx
+         * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
+         * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
+         *
+         * Check for the 0x110000 limit too
+         */
+        cur = ctxt->input->cur;
+
+        c = *cur;
+        if (c & 0x80) {
+        if (c == 0xC0)
+	    goto encoding_error;
+            if (cur[1] == 0) {
+                xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
+                cur = ctxt->input->cur;
+            }
+            if ((cur[1] & 0xc0) != 0x80)
+                goto encoding_error;
+            if ((c & 0xe0) == 0xe0) {
+                unsigned int val;
+
+                if (cur[2] == 0) {
                     xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
                     cur = ctxt->input->cur;
                 }
-                if ((cur[1] & 0xc0) != 0x80)
+                if ((cur[2] & 0xc0) != 0x80)
                     goto encoding_error;
-                if ((c & 0xe0) == 0xe0) {
-                    unsigned int val;
-
-                    if (cur[2] == 0) {
+                if ((c & 0xf0) == 0xf0) {
+                    if (cur[3] == 0) {
                         xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
                         cur = ctxt->input->cur;
                     }
-                    if ((cur[2] & 0xc0) != 0x80)
+                    if (((c & 0xf8) != 0xf0) ||
+                        ((cur[3] & 0xc0) != 0x80))
                         goto encoding_error;
-                    if ((c & 0xf0) == 0xf0) {
-                        if (cur[3] == 0) {
-                            xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
-                            cur = ctxt->input->cur;
-                        }
-                        if (((c & 0xf8) != 0xf0) ||
-                            ((cur[3] & 0xc0) != 0x80))
-                            goto encoding_error;
-                        /* 4-byte code */
-                        ctxt->input->cur += 4;
-                        val = (cur[0] & 0x7) << 18;
-                        val |= (cur[1] & 0x3f) << 12;
-                        val |= (cur[2] & 0x3f) << 6;
-                        val |= cur[3] & 0x3f;
-                    } else {
-                        /* 3-byte code */
-                        ctxt->input->cur += 3;
-                        val = (cur[0] & 0xf) << 12;
-                        val |= (cur[1] & 0x3f) << 6;
-                        val |= cur[2] & 0x3f;
-                    }
-                    if (((val > 0xd7ff) && (val < 0xe000)) ||
-                        ((val > 0xfffd) && (val < 0x10000)) ||
-                        (val >= 0x110000)) {
-			xmlErrEncodingInt(ctxt, XML_ERR_INVALID_CHAR,
-					  "Char 0x%X out of allowed range\n",
-					  val);
-                    }
-                } else
-                    /* 2-byte code */
-                    ctxt->input->cur += 2;
+                    /* 4-byte code */
+                    ctxt->input->cur += 4;
+                    val = (cur[0] & 0x7) << 18;
+                    val |= (cur[1] & 0x3f) << 12;
+                    val |= (cur[2] & 0x3f) << 6;
+                    val |= cur[3] & 0x3f;
+                } else {
+                    /* 3-byte code */
+                    ctxt->input->cur += 3;
+                    val = (cur[0] & 0xf) << 12;
+                    val |= (cur[1] & 0x3f) << 6;
+                    val |= cur[2] & 0x3f;
+                }
+                if (((val > 0xd7ff) && (val < 0xe000)) ||
+                    ((val > 0xfffd) && (val < 0x10000)) ||
+                    (val >= 0x110000)) {
+		xmlErrEncodingInt(ctxt, XML_ERR_INVALID_CHAR,
+				  "Char 0x%X out of allowed range\n",
+				  val);
+                }
             } else
-                /* 1-byte code */
-                ctxt->input->cur++;
+                /* 2-byte code */
+                ctxt->input->cur += 2;
+        } else
+            /* 1-byte code */
+            ctxt->input->cur++;
 
-            ctxt->nbChars++;
-            if (*ctxt->input->cur == 0)
-                xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
-        }
+        ctxt->nbChars++;
+        if (*ctxt->input->cur == 0)
+            xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
     } else {
         /*
          * Assume it's a fixed length encoding (1) with

diff --git a/result/HTML/758606.html b/result/HTML/758606.html
new file mode 100644
index 0000000..4f21f62
--- /dev/null
+++ b/result/HTML/758606.html

@@ -0,0 +1,2 @@
+<!DOCTYPE >
+

diff --git a/result/HTML/758606.html.err b/result/HTML/758606.html.err
new file mode 100644
index 0000000..060433a
--- /dev/null
+++ b/result/HTML/758606.html.err

@@ -0,0 +1,16 @@
+./test/HTML/758606.html:1: HTML parser error : Comment not terminated 
+<!--
+<!--<!doctype
+    ^
+./test/HTML/758606.html:1: HTML parser error : Invalid char in CDATA 0xC
+<!--<!doctype
+    ^
+./test/HTML/758606.html:1: HTML parser error : Misplaced DOCTYPE declaration
+<!--<!doctype
+     ^
+./test/HTML/758606.html:2: HTML parser error : htmlParseDocTypeDecl : no DOCTYPE name !
+
+^
+./test/HTML/758606.html:2: HTML parser error : DOCTYPE improperly terminated
+
+^

diff --git a/result/HTML/758606.html.sax b/result/HTML/758606.html.sax
new file mode 100644
index 0000000..d44a5cf
--- /dev/null
+++ b/result/HTML/758606.html.sax

@@ -0,0 +1,10 @@
+SAX.setDocumentLocator()
+SAX.startDocument()
+SAX.error: Comment not terminated 
+<!--
+SAX.error: Invalid char in CDATA 0xC
+SAX.error: Misplaced DOCTYPE declaration
+SAX.error: htmlParseDocTypeDecl : no DOCTYPE name !
+SAX.error: DOCTYPE improperly terminated
+SAX.internalSubset((null), , )
+SAX.endDocument()

diff --git a/result/HTML/758606_2.html b/result/HTML/758606_2.html
new file mode 100644
index 0000000..273816a
--- /dev/null
+++ b/result/HTML/758606_2.html

@@ -0,0 +1,2 @@
+<!DOCTYPE >
+<html><body><p>&#145;</p></body></html>

diff --git a/result/HTML/758606_2.html.err b/result/HTML/758606_2.html.err
new file mode 100644
index 0000000..4be039f
--- /dev/null
+++ b/result/HTML/758606_2.html.err

@@ -0,0 +1,16 @@
+./test/HTML/758606_2.html:1: HTML parser error : Comment not terminated 
+<!--
+<!--<!dOctYPE
+    ^
+./test/HTML/758606_2.html:1: HTML parser error : Invalid char in CDATA 0xC
+<!--<!dOctYPE
+    ^
+./test/HTML/758606_2.html:1: HTML parser error : Misplaced DOCTYPE declaration
+<!dOctYPE
+  ^
+./test/HTML/758606_2.html:2: HTML parser error : htmlParseDocTypeDecl : no DOCTYPE name !
+
+^
+./test/HTML/758606_2.html:2: HTML parser error : DOCTYPE improperly terminated
+
+^

diff --git a/result/HTML/758606_2.html.sax b/result/HTML/758606_2.html.sax
new file mode 100644
index 0000000..80ff3d7
--- /dev/null
+++ b/result/HTML/758606_2.html.sax

@@ -0,0 +1,17 @@
+SAX.setDocumentLocator()
+SAX.startDocument()
+SAX.error: Comment not terminated 
+<!--
+SAX.error: Invalid char in CDATA 0xC
+SAX.startElement(html)
+SAX.startElement(body)
+SAX.startElement(p)
+SAX.characters(&#145;, 2)
+SAX.error: Misplaced DOCTYPE declaration
+SAX.error: htmlParseDocTypeDecl : no DOCTYPE name !
+SAX.error: DOCTYPE improperly terminated
+SAX.internalSubset((null), , )
+SAX.endElement(p)
+SAX.endElement(body)
+SAX.endElement(html)
+SAX.endDocument()

diff --git a/test/HTML/758606.html b/test/HTML/758606.html
new file mode 100644
index 0000000..01a013c
--- /dev/null
+++ b/test/HTML/758606.html

@@ -0,0 +1 @@
+<!--<!doctype

diff --git a/test/HTML/758606_2.html b/test/HTML/758606_2.html
new file mode 100644
index 0000000..daa185b
--- /dev/null
+++ b/test/HTML/758606_2.html

@@ -0,0 +1 @@
+<!--<!dOctYPE
commit	0bcd05c5cd83dec3406c8f68b769b1d610c72f76	[log] [tgz]
author	Pranjal Jumde <pjumde@apple.com>	Tue Mar 01 15:18:04 2016 -0800
committer	Daniel Veillard <veillard@redhat.com>	Mon May 23 15:01:07 2016 +0800
tree	d9b6339d37e1de4eac646331a69fb73d4e53a8b7
parent	00906759053986b8079985644172085f74331f83 [diff]