html: Chunk text data in push parser. Follow the logic of the XML push parser and chunk large text nodes.
diff --git a/HTMLparser.c b/HTMLparser.c index 92ccd43..211256b 100644 --- a/HTMLparser.c +++ b/HTMLparser.c
@@ -3113,6 +3113,7 @@ case '<': if (mode == 0) { done = 1; + complete = 1; goto next_chunk; } if (mode == DATA_PLAINTEXT) @@ -5146,16 +5147,16 @@ } else { ctxt->instate = XML_PARSER_CONTENT; /* - * check that the text sequence is complete - * before handing out the data to the parser - * to avoid problems with erroneous end of - * data detection. + * We follow the logic of the XML push parser */ - if ((!terminate) && - (htmlParseLookupString(ctxt, 0, "<", 1, 0) < 0)) - return; + if (avail < HTML_PARSER_BIG_BUFFER_SIZE) { + if ((!terminate) && + (htmlParseLookupString(ctxt, 0, "<", 1, 0) < 0)) + return; + } ctxt->checkIndex = 0; - htmlParseCharData(ctxt, /* partial */ 0); + if (htmlParseCharData(ctxt, !terminate) == 0) + return; } break;
diff --git a/runtest.c b/runtest.c index f6bb967..f1d7a75 100644 --- a/runtest.c +++ b/runtest.c
@@ -2256,9 +2256,13 @@ if ((options & XML_PARSE_HTML) && (ctxt->endCheckState)) { max = strlen((const char *) ctxt->name) + 2; + } else if (c == '&') { + max = (options & XML_PARSE_HTML) ? 32 : 1; + } else if (c == '<') { + max = 1; } else { /* 3 bytes for partial UTF-8 */ - max = ((c == '<') || (c == '&')) ? 1 : 3; + max = 3; } } else if (ctxt->instate == XML_PARSER_CDATA_SECTION) { /* 2 bytes for terminator, 3 bytes for UTF-8 */