HTMLtree.c - third_party/libxml2 - Git at Google

 /*
  * HTMLtree.c : implementation of access function for an HTML tree.
  *
  * See Copyright for the status of this software.
  *
  * Author: Daniel Veillard
  */


 #define IN_LIBXML
 #include "libxml.h"
 #ifdef LIBXML_HTML_ENABLED

 #include <string.h> /* for memset() only ! */
 #include <ctype.h>
 #include <stdlib.h>

 #include <libxml/xmlmemory.h>
 #include <libxml/HTMLparser.h>
 #include <libxml/HTMLtree.h>
 #include <libxml/entities.h>
 #include <libxml/xmlerror.h>
 #include <libxml/parserInternals.h>
 #include <libxml/uri.h>

 #include "private/buf.h"
 #include "private/html.h"
 #include "private/error.h"
 #include "private/html.h"
 #include "private/io.h"
 #include "private/save.h"
 #include "private/tree.h"

 /************************************************************************
  *									*
  *		Getting/Setting encoding meta tags			*
  *									*
  ************************************************************************/

 /*
  * Describes an encoding declaration found in a `meta` element: the
  * attribute that carries it, the attribute's text, and where the
  * encoding name sits inside that text.
  */
 typedef struct {
     xmlAttrPtr attr; /* charset or content */
     /* text of `attr`; "" if it has no single, non-NULL text child */
     const xmlChar *attrValue;
     /* start/end of the encoding name and total size of attrValue */
     htmlMetaEncodingOffsets off;
 } htmlMetaEncoding;

 /**
  * Find the first element child whose name matches `name`, ignoring
  * case.
  *
  * @param parent  the node whose children are searched
  * @param name  the element name to look for
  * @returns the first matching child or NULL if there is none.
  */
 static htmlNodePtr
 htmlFindFirstChild(htmlNodePtr parent, const char *name) {
     htmlNodePtr node = parent->children;

     while (node != NULL) {
         if ((node->type == XML_ELEMENT_NODE) &&
             (xmlStrcasecmp(node->name, BAD_CAST name) == 0))
             break;
         node = node->next;
     }

     /* Either the match or NULL after running off the sibling list. */
     return(node);
 }

 /**
  * Locate the `head` element, i.e. the first `head` child of the
  * first `html` child of the document.
  *
  * @param doc  the document (may be NULL)
  * @returns the head element or NULL if it can't be found.
  */
 static htmlNodePtr
 htmlFindHead(htmlDocPtr doc) {
     htmlNodePtr root = NULL;

     if (doc != NULL)
         root = htmlFindFirstChild((htmlNodePtr) doc, "html");

     if (root != NULL)
         return(htmlFindFirstChild(root, "head"));

     return(NULL);
 }

 /**
  * Scan a Content-Type style value for a `charset=` parameter and
  * report where the encoding name is located.
  *
  * The keyword is matched case-insensitively and may be surrounded
  * by HTML whitespace. The encoding name is either quoted (single or
  * double quotes, surrounding whitespace excluded) or runs up to the
  * next semicolon, whitespace or end of string.
  *
  * @param val  the zero-terminated attribute value
  * @param off  OUT: offsets of the encoding name and the total length
  *             of `val`, only written on success
  * @returns 1 if a charset parameter was found, 0 otherwise.
  */
 int
 htmlParseContentType(const xmlChar *val, htmlMetaEncodingOffsets *off) {
     const xmlChar *cur = val;

     for (;;) {
         size_t encStart, encEnd;

         /* Advance to the next 'c'/'C', giving up at the terminator. */
         for (; (*cur != 'c') && (*cur != 'C'); cur++) {
             if (*cur == 0)
                 return(0);
         }
         cur++;

         /* The rest of the keyword must follow immediately. */
         if (xmlStrncasecmp(cur, BAD_CAST "harset", 6) != 0)
             continue;
         cur += 6;

         while (IS_WS_HTML(*cur))
             cur++;
         if (*cur != '=')
             continue;
         cur++;

         while (IS_WS_HTML(*cur))
             cur++;
         if (*cur == 0)
             return(0);

         if ((*cur != '"') && (*cur != '\'')) {
             /* Unquoted: ends at ';', whitespace or end of string. */
             encStart = cur - val;
             while ((*cur != 0) && (*cur != ';') && (!IS_WS_HTML(*cur)))
                 cur++;
             encEnd = cur - val;
         } else {
             const int delim = *cur;

             cur++;
             while (IS_WS_HTML(*cur))
                 cur++;

             encStart = cur - val;
             encEnd = encStart;

             /*
              * Track the position after the last non-whitespace
              * character so trailing whitespace is excluded. An
              * unterminated quote is treated as "not found".
              */
             for (; *cur != delim; cur++) {
                 if (*cur == 0)
                     return(0);
                 if (!IS_WS_HTML(*cur))
                     encEnd = (cur - val) + 1;
             }
         }

         off->start = encStart;
         off->end = encEnd;
         /* Consumed prefix plus whatever is left gives the full length. */
         off->size = (cur - val) + strlen((char *) cur);

         return(1);
     }

     return(0);
 }

 /**
  * Find the attribute of a `meta` element that carries an encoding
  * declaration. Only attributes without a namespace are considered.
  *
  * A `charset` attribute is returned as soon as it is seen.
  * Otherwise the `content` attribute is returned, but only if an
  * `http-equiv` attribute with the single text value "Content-Type"
  * is present as well. All names and the value are compared
  * case-insensitively.
  *
  * @param elem  the node to inspect
  * @param outIsContentType  OUT: 0 for `charset`, 1 for `content`;
  *                          not written if NULL is returned
  * @returns the attribute or NULL if there is no declaration.
  */
 static xmlAttrPtr
 htmlFindMetaEncodingAttr(htmlNodePtr elem, int *outIsContentType) {
     xmlAttrPtr cur;
     xmlAttrPtr content = NULL;
     int sawContentType = 0;

     if (xmlStrcasecmp(elem->name, BAD_CAST "meta") != 0)
         return(NULL);

     for (cur = elem->properties; cur != NULL; cur = cur->next) {
         if (cur->ns != NULL)
             continue;

         if (xmlStrcasecmp(cur->name, BAD_CAST "charset") == 0) {
             *outIsContentType = 0;
             return(cur);
         }

         if (xmlStrcasecmp(cur->name, BAD_CAST "content") == 0) {
             content = cur;
         } else if (xmlStrcasecmp(cur->name, BAD_CAST "http-equiv") == 0) {
             htmlNodePtr text = cur->children;

             if ((text != NULL) &&
                 (text->type == XML_TEXT_NODE) &&
                 (text->next == NULL) &&
                 (xmlStrcasecmp(text->content,
                                BAD_CAST "Content-Type") == 0))
                 sawContentType = 1;
         }
     }

     /* `content` only counts together with http-equiv="Content-Type". */
     if ((!sawContentType) || (content == NULL))
         return(NULL);

     *outIsContentType = 1;
     return(content);
 }

 /**
  * Check whether a node is a `meta` element declaring a character
  * encoding and locate the encoding name inside the attribute value.
  *
  * For a `charset` attribute the encoding is the whole value with
  * leading and trailing HTML whitespace trimmed. For a `content`
  * attribute the value is handed to htmlParseContentType.
  *
  * If the attribute doesn't consist of a single text node with
  * content, its value is treated as the empty string.
  *
  * @param elem  the node to inspect
  * @param menc  OUT: filled in only if 1 is returned
  * @returns 1 if an encoding declaration was found, 0 otherwise.
  */
 static int
 htmlParseMetaEncoding(htmlNodePtr elem, htmlMetaEncoding *menc) {
     xmlAttrPtr attr;
     const xmlChar *val = NULL;
     int isContentType;

     if ((elem->type != XML_ELEMENT_NODE) ||
         (xmlStrcasecmp(elem->name, BAD_CAST "meta") != 0))
         return(0);

     attr = htmlFindMetaEncodingAttr(elem, &isContentType);
     if (attr == NULL)
         return(0);

     if ((attr->children != NULL) &&
         (attr->children->type == XML_TEXT_NODE) &&
         (attr->children->next == NULL) &&
         (attr->children->content != NULL))
         val = attr->children->content;
     else
         val = BAD_CAST "";

     if (!isContentType) {
         size_t size = strlen((char *) val);
         size_t start = 0;
         size_t end = size;

         while ((start < size) && (IS_WS_HTML(val[start])))
             start += 1;

         /*
          * Stop at `start`, not at 0. For a whitespace-only value the
          * leading trim already consumed everything; trimming down to
          * 0 would yield end < start, making the prefix [0, start) and
          * the suffix [end, size) overlap so that the whitespace is
          * emitted twice when the value is rewritten.
          */
         while ((end > start) && (IS_WS_HTML(val[end-1])))
             end -= 1;

         menc->attr = attr;
         menc->attrValue = val;
         menc->off.start = start;
         menc->off.end = end;
         menc->off.size = size;

         return(1);
     } else {
         /* menc->off is only written if a charset parameter is found. */
         if (htmlParseContentType(val, &menc->off)) {
             menc->attr = attr;
             menc->attrValue = val;

             return(1);
         }
     }

     return(0);
 }

 /**
  * Build a copy of a meta attribute value in which the encoding name
  * is replaced with `encoding`. The text before and after the old
  * encoding name is kept.
  *
  * @param menc  the parsed encoding declaration
  * @param encoding  the new encoding name; "HTML" (any case) is
  *                  replaced with "ASCII"
  * @returns a newly allocated string obtained from xmlMalloc, or
  * NULL if the allocation failed.
  */
 static xmlChar *
 htmlUpdateMetaEncoding(htmlMetaEncoding *menc, const char *encoding) {
     xmlChar *result;
     xmlChar *dst;
     size_t oldLen, encLen, total;

     /*
      * The pseudo "HTML" encoding only produces ASCII.
      */
     if (xmlStrcasecmp(BAD_CAST encoding, BAD_CAST "HTML") == 0)
         encoding = "ASCII";

     encLen = strlen((char *) encoding);
     oldLen = menc->off.end - menc->off.start;
     total = menc->off.size - oldLen + encLen;

     result = xmlMalloc(total + 1);
     if (result == NULL)
         return(NULL);

     /* prefix, new encoding name, suffix */
     dst = result;
     memcpy(dst, menc->attrValue, menc->off.start);
     dst += menc->off.start;

     memcpy(dst, encoding, encLen);
     dst += encLen;

     memcpy(dst, menc->attrValue + menc->off.end,
            menc->off.size - menc->off.end);

     result[total] = 0;
     return(result);
 }

 /**
  * Look up an encoding declaration in the meta tags.
  *
  * The returned string points into attribute content and can contain
  * trailing garbage. It should be copied before modifying or freeing
  * nodes.
  *
  * @param doc  the document
  * @returns the encoding or NULL if not found.
  */
 const xmlChar *
 htmlGetMetaEncoding(xmlDoc *doc) {
     htmlNodePtr cur;
     htmlNodePtr head = htmlFindHead(doc);

     if (head == NULL)
         return(NULL);

     cur = head->children;
     while (cur != NULL) {
         htmlMetaEncoding found;

         /*
          * With a `const xmlChar *` return type, only a suffix of
          * the attribute value can be handed out. For http-equiv
          * meta tags this means that data following the charset is
          * included, although that's probably rare in practice.
          */
         if (htmlParseMetaEncoding(cur, &found))
             return(found.attrValue + found.off.start);

         cur = cur->next;
     }

     return(NULL);
 }

 /**
  * Creates or updates a meta tag with an encoding declaration.
  *
  * Every child of the head element that declares an encoding is
  * rewritten. If there is none, a new `<meta charset="...">` element
  * is inserted as first child of the head.
  *
  * NOTE: This will not change the document content encoding.
  *
  * @param doc  the document
  * @param encoding  the encoding string
  * @returns 0 in case of success, 1 if no head element was found or
  * arguments are invalid and -1 if memory allocation failed.
  */
 int
 htmlSetMetaEncoding(xmlDoc *doc, const xmlChar *encoding) {
     htmlNodePtr head, node, meta;
     int updated = 0;

     if (encoding == NULL)
         return(1);

     head = htmlFindHead(doc);
     if (head == NULL)
         return(1);

     for (node = head->children; node != NULL; node = node->next) {
         htmlMetaEncoding menc;
         xmlChar *value;
         int res;

         if (!htmlParseMetaEncoding(node, &menc))
             continue;

         updated = 1;

         /*
          * The new value must be built before the old content is
          * dropped since menc.attrValue points into it.
          */
         value = htmlUpdateMetaEncoding(&menc, (char *) encoding);
         if (value == NULL)
             return(-1);

         xmlNodeSetContent((xmlNodePtr) menc.attr, NULL);
         res = xmlNodeAddContent((xmlNodePtr) menc.attr, value);
         xmlFree(value);

         if (res < 0)
             return(-1);
     }

     if (updated)
         return(0);

     /* No declaration found, create one. */
     meta = xmlNewDocNode(head->doc, NULL, BAD_CAST "meta", NULL);
     if (meta == NULL)
         return(-1);

     if (xmlNewProp(meta, BAD_CAST "charset", encoding) == NULL) {
         xmlFreeNode(meta);
         return(-1);
     }

     if (head->children != NULL)
         xmlAddPrevSibling(head->children, meta);
     else
         xmlAddChild(head, meta);

     return(0);
 }

 /**
  * Determine if a given attribute is a boolean attribute. This
  * doesn't handle HTML5.
  *
  * The name is compared case-insensitively.
  *
  * @deprecated Internal function, don't use.
  *
  * @param name  the name of the attribute to check
  * @returns false if the attribute is not boolean, true otherwise.
  */
 int
 htmlIsBooleanAttr(const xmlChar *name)
 {
     /*
      * These are the HTML attributes which will be output
      * in minimized form, i.e. `<option selected="selected">` will be
      * output as `<option selected>`, as per XSLT 1.0 16.2 "HTML Output
      * Method".
      *
      * Additional attributes from HTML5 (not implemented yet):
      *
      * "allowfullscreen", "alpha", "async", "autofocus", "autoplay",
      * "controls", "default", "formnovalidate", "inert", "itemscope",
      * "loop", "muted", "nomodule", "novalidate", "open", "playsinline",
      * "required", "reversed", "shadowrootdelegatesfocus",
      * "shadowrootclonable", "shadowrootserializable",
      * "shadowrootcustomelementregistry", "truespeed"
      */
     static const char *const booleanAttrs[] = {
         "checked", "compact", "declare", "defer", "disabled", "ismap",
         "multiple", "nohref", "noresize", "noshade", "nowrap", "readonly",
         "selected"
     };
     size_t i;
     int first;

     if (name == NULL)
         return(0);

     /* Setting bit 0x20 folds ASCII upper case letters to lower case. */
     first = name[0] | 0x20;

     for (i = 0; i < sizeof(booleanAttrs) / sizeof(booleanAttrs[0]); i++) {
         const char *candidate = booleanAttrs[i];

         /*
          * Only look at the remainder if the first letter matches,
          * which also guarantees that `name` isn't empty.
          */
         if ((candidate[0] == first) &&
             (xmlStrcasecmp(name + 1, BAD_CAST (candidate + 1)) == 0))
             return(1);
     }

     return(0);
 }

 #ifdef LIBXML_OUTPUT_ENABLED
 /************************************************************************
  *									*
  *		Dumping HTML tree content to a simple buffer		*
  *									*
  ************************************************************************/

 /**
  * Open an output encoding handler, falling back to the "HTML"
  * encoding if none is specified.
  *
  * @param encoding  the encoding name (optional)
  * @param out  OUT: passed through to xmlOpenCharEncodingHandler
  * @returns the result of xmlOpenCharEncodingHandler.
  */
 static xmlParserErrors
 htmlFindOutputEncoder(const char *encoding, xmlCharEncodingHandler **out) {
     const char *name = (encoding != NULL) ? encoding : "HTML";

     return(xmlOpenCharEncodingHandler(name, /* output */ 1, out));
 }

 /**
  * Serialize an HTML node to an xmlBuf.
  *
  * A temporary output buffer without encoder and I/O callbacks is
  * wrapped around `buf`, so the serialized data is appended to `buf`
  * directly.
  *
  * @param buf  the xmlBuf output
  * @param doc  the document (unused)
  * @param cur  the current node
  * @param format  whether formatting newlines should be added
  * @returns the number of bytes written or (size_t) -1 in case of error
  */
 static size_t
 htmlBufNodeDumpFormat(xmlBufPtr buf, xmlDocPtr doc ATTRIBUTE_UNUSED,
                       xmlNodePtr cur, int format) {
     const size_t failure = (size_t) -1;
     xmlOutputBufferPtr out;
     size_t before;
     size_t written;

     if ((cur == NULL) || (buf == NULL))
         return(failure);

     out = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer));
     if (out == NULL)
         return(failure);

     memset(out, 0, sizeof(xmlOutputBuffer));
     out->buffer = buf;
     out->encoder = NULL;
     out->writecallback = NULL;
     out->closecallback = NULL;
     out->context = NULL;
     out->written = 0;

     /* The buffer may already hold data, so measure the difference. */
     before = xmlBufUse(buf);
     htmlNodeDumpInternal(out, cur, NULL, format);
     written = (out->error) ? failure : (xmlBufUse(buf) - before);

     /* Only the wrapper is released, `buf` still belongs to the caller. */
     xmlFree(out);
     return(written);
 }

 /**
  * Serialize an HTML node to an xmlBuffer with formatting enabled.
  * Always uses UTF-8.
  *
  * @param buf  the HTML buffer output
  * @param doc  the document
  * @param cur  the current node
  * @returns the number of bytes written (capped at INT_MAX) or -1 in
  * case of error
  */
 int
 htmlNodeDump(xmlBuffer *buf, xmlDoc *doc, xmlNode *cur) {
     xmlBufPtr wrapped;
     size_t written;
     int restored;

     if ((buf == NULL) || (cur == NULL))
         return(-1);

     xmlInitParser();

     wrapped = xmlBufFromBuffer(buf);
     if (wrapped == NULL)
         return(-1);

     written = htmlBufNodeDumpFormat(wrapped, doc, cur, 1);

     /* Called unconditionally, even if serialization failed. */
     restored = xmlBufBackToBuffer(wrapped, buf);

     if (written == (size_t) -1)
         return(-1);
     if (restored < 0)
         return(-1);

     if (written > INT_MAX)
         return(INT_MAX);
     return((int) written);
 }

 /**
  * Serialize an HTML node to a FILE.
  *
  * If encoding is NULL, ASCII with HTML 4.0 named character entities
  * will be used. This is inefficient compared to UTF-8 and might be
  * changed in a future version.
  *
  * @param out  the FILE pointer
  * @param doc  the document (unused)
  * @param cur  the current node
  * @param encoding  the document encoding (optional)
  * @param format  whether formatting newlines should be added
  * @returns the number of bytes written or -1 in case of failure.
  */
 int
 htmlNodeDumpFileFormat(FILE *out, xmlDoc *doc ATTRIBUTE_UNUSED,
                        xmlNode *cur, const char *encoding, int format) {
     xmlCharEncodingHandlerPtr encoder;
     xmlOutputBufferPtr output;

     xmlInitParser();

     if (htmlFindOutputEncoder(encoding, &encoder) != XML_ERR_OK)
         return(-1);

     /*
      * Wrap the FILE in an output buffer. The encoder is closed
      * here if the buffer could not be created.
      */
     output = xmlOutputBufferCreateFile(out, encoder);
     if (output == NULL) {
         xmlCharEncCloseFunc(encoder);
         return(-1);
     }

     htmlNodeDumpInternal(output, cur, NULL, format);

     /* The result of closing the buffer is the return value. */
     return(xmlOutputBufferClose(output));
 }

 /**
  * Same as #htmlNodeDumpFileFormat with `format` set to 1 which is
  * typically undesired, and without an explicit encoding. Use of
  * this function is DISCOURAGED in favor of #htmlNodeDumpFileFormat.
  *
  * The result of the serialization is not reported.
  *
  * @param out  the FILE pointer
  * @param doc  the document
  * @param cur  the current node
  */
 void
 htmlNodeDumpFile(FILE *out, xmlDoc *doc, xmlNode *cur) {
     const int format = 1;

     (void) htmlNodeDumpFileFormat(out, doc, cur, NULL, format);
 }

 /**
  * Serialize an HTML document to memory, also returning the size of
  * the result. It's up to the caller to free the memory.
  *
  * Uses the encoding of the document. If the document has no
  * encoding, ASCII with HTML 4.0 named character entities will
  * be used. This is inefficient compared to UTF-8 and might be
  * changed in a future version.
  *
  * On any failure, including a result that doesn't fit into an
  * `int` or a failed allocation of the copy, `*mem` is NULL and
  * `*size` is 0.
  *
  * @param cur  the document
  * @param mem  OUT: the memory pointer
  * @param size  OUT: the memory length
  * @param format  whether formatting newlines should be added
  */
 void
 htmlDocDumpMemoryFormat(xmlDoc *cur, xmlChar**mem, int *size, int format) {
     xmlOutputBufferPtr buf;
     xmlCharEncodingHandlerPtr handler = NULL;

     xmlInitParser();

     if ((mem == NULL) || (size == NULL))
         return;
     *mem = NULL;
     *size = 0;
     if (cur == NULL)
         return;

     if (htmlFindOutputEncoder((char *) cur->encoding, &handler) != XML_ERR_OK)
         return;
     buf = xmlAllocOutputBuffer(handler);
     if (buf == NULL) {
         xmlCharEncCloseFunc(handler);
         return;
     }

     htmlDocContentDumpFormatOutput(buf, cur, NULL, format);

     xmlOutputBufferFlush(buf);

     if (!buf->error) {
         const xmlChar *content;
         size_t len;

         /* Use the conversion buffer if there is one. */
         if (buf->conv != NULL) {
             len = xmlBufUse(buf->conv);
             content = xmlBufContent(buf->conv);
         } else {
             len = xmlBufUse(buf->buffer);
             content = xmlBufContent(buf->buffer);
         }

         /*
          * The length is reported as int, so larger results can't be
          * represented. The outputs are only set once the copy
          * succeeded to avoid reporting a size without data.
          */
         if (len <= (size_t) INT_MAX) {
             xmlChar *copy = xmlStrndup(content, (int) len);

             if (copy != NULL) {
                 *mem = copy;
                 *size = (int) len;
             }
         }
     }

     xmlOutputBufferClose(buf);
 }

 /**
  * Same as #htmlDocDumpMemoryFormat with `format` set to 1 which
  * is typically undesired. Also see the warnings there. Use of
  * this function is DISCOURAGED in favor of
  * #htmlDocContentDumpFormatOutput.
  *
  * @param cur  the document
  * @param mem  OUT: the memory pointer
  * @param size  OUT: the memory length
  */
 void
 htmlDocDumpMemory(xmlDoc *cur, xmlChar**mem, int *size) {
     const int format = 1;

     htmlDocDumpMemoryFormat(cur, mem, size, format);
 }


 /************************************************************************
  *									*
  *		Dumping HTML tree content to an I/O output buffer	*
  *									*
  ************************************************************************/

 /**
  * Serialize the HTML document's DTD, if any.
  *
  * Writes a PUBLIC identifier (optionally followed by the system
  * identifier) if the DTD has an external ID. Otherwise, a SYSTEM
  * identifier is written unless it is "about:legacy-compat".
  *
  * Ignores `encoding` and uses the encoding of the output buffer.
  *
  * @param buf  the HTML buffer output
  * @param doc  the document
  * @param encoding  the encoding string (unused)
  */
 static void
 htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
                   const char *encoding ATTRIBUTE_UNUSED) {
     xmlDtdPtr dtd = doc->intSubset;

     if (dtd == NULL)
         return;

     xmlOutputBufferWrite(buf, 10, "<!DOCTYPE ");
     xmlOutputBufferWriteString(buf, (const char *)dtd->name);

     if (dtd->ExternalID == NULL) {
         if ((dtd->SystemID != NULL) &&
             (xmlStrcmp(dtd->SystemID,
                        BAD_CAST "about:legacy-compat") != 0)) {
             xmlOutputBufferWrite(buf, 8, " SYSTEM ");
             xmlOutputBufferWriteQuotedString(buf, dtd->SystemID);
         }
     } else {
         xmlOutputBufferWrite(buf, 8, " PUBLIC ");
         xmlOutputBufferWriteQuotedString(buf, dtd->ExternalID);
         if (dtd->SystemID != NULL) {
             xmlOutputBufferWrite(buf, 1, " ");
             xmlOutputBufferWriteQuotedString(buf, dtd->SystemID);
         }
     }

     xmlOutputBufferWrite(buf, 2, ">\n");
 }

 /**
  * Serialize the value of a URI attribute.
  *
  * See appendix "B.2.1 Non-ASCII characters in URI attribute
  * values" in the HTML 4.01 spec. This is also recommended
  * by the HTML output method of the XSLT 1.0 spec.
  *
  * Bytes outside of the range 0x21 to 0x7E, which includes space
  * and control chars, are percent-escaped. Double quotes and
  * ampersands are written as `&quot;` and `&amp;`. Leading
  * whitespace is copied unchanged.
  *
  * @param buf  the HTML buffer output
  * @param content  the zero-terminated attribute content
  */
 static void
 htmlSerializeUri(xmlOutputBufferPtr buf, const xmlChar *content) {
     static const char hexDigits[] = "0123456789ABCDEF";
     const xmlChar *cur = content;
     const xmlChar *pending; /* start of the not yet written run */

     /* Skip over initial whitespace */
     while (IS_WS_HTML(*cur))
         cur++;
     if (cur > content)
         xmlOutputBufferWrite(buf, cur - content, (char *) content);
     pending = cur;

     for (;;) {
         char percent[3];
         const char *esc;
         int escLen;
         int ch = *cur;

         /* Collect a run of bytes which can be written verbatim. */
         while ((ch > 0x20) && (ch < 0x7F) && (ch != '"') && (ch != '&')) {
             cur++;
             ch = *cur;
         }

         if (cur > pending)
             xmlOutputBufferWrite(buf, cur - pending, (char *) pending);

         if (ch == 0)
             return;

         if (ch == '"') {
             esc = "&quot;";
             escLen = 6;
         } else if (ch == '&') {
             esc = "&amp;";
             escLen = 5;
         } else {
             /* Space, control char or non-ASCII byte */
             percent[0] = '%';
             percent[1] = hexDigits[(ch >> 4) & 0x0F];
             percent[2] = hexDigits[ch & 0x0F];
             esc = percent;
             escLen = 3;
         }

         xmlOutputBufferWrite(buf, escLen, esc);

         /* Continue after the escaped byte. */
         cur++;
         pending = cur;
     }
 }

 /**
  * Serialize an HTML attribute, preceded by a space.
  *
  * Attributes without children and boolean attributes are written
  * in minimized form, that is, without a value.
  *
  * @param buf  the HTML buffer output
  * @param cur  the attribute pointer
  */
 static void
 htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlAttrPtr cur) {
     xmlNodePtr node;
     int escapeAsUri = 0;

     xmlOutputBufferWrite(buf, 1, " ");

     if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
         xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
         xmlOutputBufferWrite(buf, 1, ":");
     }
     xmlOutputBufferWriteString(buf, (const char *)cur->name);

     /*
      * The HTML5 spec requires to always serialize empty attribute
      * values as `=""`. We should probably align with HTML5 at some
      * point.
      */
     if ((cur->children == NULL) || (htmlIsBooleanAttr(cur->name)))
         return;

     xmlOutputBufferWrite(buf, 2, "=\"");

     /*
      * Special handling of URIs doesn't conform to HTML5 and
      * should probably be removed at some point.
      *
      * It applies to href, action and src on any element, and to
      * name on `a` elements, as long as neither the attribute nor
      * its parent have a namespace.
      */
     if ((cur->ns == NULL) && (cur->parent != NULL) &&
         (cur->parent->ns == NULL)) {
         if ((xmlStrcasecmp(cur->name, BAD_CAST "href") == 0) ||
             (xmlStrcasecmp(cur->name, BAD_CAST "action") == 0) ||
             (xmlStrcasecmp(cur->name, BAD_CAST "src") == 0)) {
             escapeAsUri = 1;
         } else if ((xmlStrcasecmp(cur->name, BAD_CAST "name") == 0) &&
                    (xmlStrcasecmp(cur->parent->name, BAD_CAST "a") == 0)) {
             escapeAsUri = 1;
         }
     }

     for (node = cur->children; node != NULL; node = node->next) {
         switch (node->type) {
         case XML_TEXT_NODE:
             if (node->content == NULL)
                 break;
             if (escapeAsUri)
                 htmlSerializeUri(buf, node->content);
             else
                 xmlSerializeText(buf, node->content, SIZE_MAX,
                                  XML_ESCAPE_HTML | XML_ESCAPE_ATTR);
             break;

         case XML_ENTITY_REF_NODE:
             /* TODO: We should probably expand entity refs */
             xmlOutputBufferWrite(buf, 1, "&");
             xmlOutputBufferWriteString(buf, (char *) node->name);
             xmlOutputBufferWrite(buf, 1, ";");
             break;

         default:
             /* Other child node types are not serialized. */
             break;
         }
     }

     xmlOutputBufferWrite(buf, 1, "\"");
 }

 /**
  * Serialize an HTML node to an output buffer.
  *
  * If `encoding` is specified, it is used to create or update meta
  * tags containing the character encoding.
  *
  * @param buf  the HTML buffer output
  * @param cur  the current node
  * @param encoding  the encoding string (optional)
  * @param format  whether formatting newlines should be added
  */
 void
 htmlNodeDumpInternal(xmlOutputBuffer *buf, xmlNode *cur,
                      const char *encoding, int format) {
     xmlNodePtr root, parent, metaHead = NULL;
     xmlAttrPtr attr;
     const htmlElemDesc * info;
     int isRaw = 0;

     xmlInitParser();

     if ((cur == NULL) || (buf == NULL)) {
 	return;
     }

     root = cur;
     parent = cur->parent;
     while (1) {
         switch (cur->type) {
         case XML_HTML_DOCUMENT_NODE:
         case XML_DOCUMENT_NODE:
             if (((xmlDocPtr) cur)->intSubset != NULL) {
                 htmlDtdDumpOutput(buf, (xmlDocPtr) cur, NULL);
             }
             if (cur->children != NULL) {
                 /* Always validate cur->parent when descending. */
                 if (cur->parent == parent) {
                     parent = cur;
                     cur = cur->children;
                     continue;
                 }
             } else {
                 xmlOutputBufferWrite(buf, 1, "\n");
             }
             break;

         case XML_ELEMENT_NODE: {
             htmlMetaEncoding menc;
             int isMeta = 0;
             int addMeta = 0;

             /*
              * Some users like lxml are known to pass nodes with a corrupted
              * tree structure. Fall back to a recursive call to handle this
              * case.
              */
             if ((cur->parent != parent) && (cur->children != NULL)) {
                 htmlNodeDumpInternal(buf, cur, encoding, format);
                 break;
             }

             /*
              * Get specific HTML info for that node.
              */
             if (cur->ns == NULL)
                 info = htmlTagLookup(cur->name);
             else
                 info = NULL;

             if (encoding != NULL) {
                 isMeta = htmlParseMetaEncoding(cur, &menc);

                 /*
                  * Don't add meta tag for "HTML" encoding.
                  */
                 if ((xmlStrcasecmp(BAD_CAST encoding,
                                    BAD_CAST "HTML") != 0) &&
                     (xmlStrcasecmp(cur->name, BAD_CAST "head") == 0) &&
                     (parent != NULL) &&
                     (xmlStrcasecmp(parent->name, BAD_CAST "html") == 0) &&
                     (parent->parent != NULL) &&
                     (parent->parent->parent == NULL) &&
                     (metaHead == NULL)) {
                     xmlNodePtr n;

                     metaHead = cur;
                     addMeta = 1;

                     for (n = cur->children; n != NULL; n = n->next) {
                         int unused;

                         if (htmlFindMetaEncodingAttr(n, &unused) != NULL) {
                             metaHead = NULL;
                             addMeta = 0;
                             break;
                         }
                     }
                 }
             }

             xmlOutputBufferWrite(buf, 1, "<");
             if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
                 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
                 xmlOutputBufferWrite(buf, 1, ":");
             }
             xmlOutputBufferWriteString(buf, (const char *)cur->name);
             if (cur->nsDef)
                 xmlNsListDumpOutput(buf, cur->nsDef);
             attr = cur->properties;
             while (attr != NULL) {
                 if ((!isMeta) || (attr != menc.attr)) {
                     htmlAttrDumpOutput(buf, attr);
                 } else {
                     xmlOutputBufferWrite(buf, 1, " ");
                     xmlOutputBufferWriteString(buf, (char *) attr->name);

                     xmlOutputBufferWrite(buf, 2, "=\"");
                     xmlSerializeText(buf, menc.attrValue, menc.off.start,
                                      XML_ESCAPE_HTML | XML_ESCAPE_ATTR);
                     xmlSerializeText(buf, BAD_CAST encoding, SIZE_MAX,
                                      XML_ESCAPE_HTML | XML_ESCAPE_ATTR);
                     xmlSerializeText(buf, menc.attrValue + menc.off.end,
                                      menc.off.size - menc.off.end,
                                      XML_ESCAPE_HTML | XML_ESCAPE_ATTR);
                     xmlOutputBufferWrite(buf, 1, "\"");
                 }
                 attr = attr->next;
             }

             if ((info != NULL) && (info->empty)) {
                 xmlOutputBufferWrite(buf, 1, ">");
             } else if (cur->children == NULL) {
                 if (addMeta) {
                     xmlOutputBufferWrite(buf, 16, "><meta charset=\"");
                     xmlSerializeText(buf, BAD_CAST encoding, SIZE_MAX,
                                      XML_ESCAPE_HTML | XML_ESCAPE_ATTR);
                     xmlOutputBufferWrite(buf, 4, "\"></");
                 } else {
                     xmlOutputBufferWrite(buf, 3, "></");
                 }
                 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
                     xmlOutputBufferWriteString(buf,
                             (const char *)cur->ns->prefix);
                     xmlOutputBufferWrite(buf, 1, ":");
                 }
                 xmlOutputBufferWriteString(buf, (const char *)cur->name);
                 xmlOutputBufferWrite(buf, 1, ">");
             } else {
                 xmlOutputBufferWrite(buf, 1, ">");
                 if ((format) &&
                     ((addMeta) ||
                      ((info != NULL) && (!info->isinline) &&
                       (cur->children->type != HTML_TEXT_NODE) &&
                       (cur->children->type != HTML_ENTITY_REF_NODE) &&
                       (cur->children != cur->last) &&
                       (cur->name != NULL) &&
                       (cur->name[0] != 'p')))) /* p, pre, param */
                     xmlOutputBufferWrite(buf, 1, "\n");
                 if (addMeta) {
                     xmlOutputBufferWrite(buf, 15, "<meta charset=\"");
                     xmlSerializeText(buf, BAD_CAST encoding, SIZE_MAX,
                                      XML_ESCAPE_HTML | XML_ESCAPE_ATTR);
                     xmlOutputBufferWrite(buf, 2, "\">");
                     if ((format) &&
                         (cur->children->type != HTML_TEXT_NODE) &&
                         (cur->children->type != HTML_ENTITY_REF_NODE))
                         xmlOutputBufferWrite(buf, 1, "\n");
                 }

                 if ((info != NULL) && (info->dataMode >= DATA_RAWTEXT))
                     isRaw = 1;

                 parent = cur;
                 cur = cur->children;
                 continue;
             }

             if ((format) && (cur->next != NULL) &&
                 (info != NULL) && (!info->isinline)) {
                 if ((cur->next->type != HTML_TEXT_NODE) &&
                     (cur->next->type != HTML_ENTITY_REF_NODE) &&
                     (parent != NULL) &&
                     (parent->name != NULL) &&
                     (parent->name[0] != 'p')) /* p, pre, param */
                     xmlOutputBufferWrite(buf, 1, "\n");
             }

             break;
         }

         case XML_ATTRIBUTE_NODE:
             htmlAttrDumpOutput(buf, (xmlAttrPtr) cur);
             break;

         case HTML_TEXT_NODE:
             if (cur->content == NULL)
                 break;
             if ((cur->name == (const xmlChar *)xmlStringTextNoenc) ||
                 (isRaw)) {
                 xmlOutputBufferWriteString(buf, (const char *)cur->content);
             } else {
                 xmlSerializeText(buf, cur->content, SIZE_MAX, XML_ESCAPE_HTML);
             }
             break;

         case HTML_COMMENT_NODE:
             if (cur->content != NULL) {
                 xmlOutputBufferWrite(buf, 4, "<!--");
                 xmlOutputBufferWriteString(buf, (const char *)cur->content);
                 xmlOutputBufferWrite(buf, 3, "-->");
             }
             break;

         case HTML_PI_NODE:
             if (cur->name != NULL) {
                 xmlOutputBufferWrite(buf, 2, "<?");
                 xmlOutputBufferWriteString(buf, (const char *)cur->name);
                 if (cur->content != NULL) {
                     xmlOutputBufferWrite(buf, 1, " ");
                     xmlOutputBufferWriteString(buf,
                             (const char *)cur->content);
                 }
                 xmlOutputBufferWrite(buf, 1, ">");
             }
             break;

         case HTML_ENTITY_REF_NODE:
             xmlOutputBufferWrite(buf, 1, "&");
             xmlOutputBufferWriteString(buf, (const char *)cur->name);
             xmlOutputBufferWrite(buf, 1, ";");
             break;

         case HTML_PRESERVE_NODE:
             if (cur->content != NULL) {
                 xmlOutputBufferWriteString(buf, (const char *)cur->content);
             }
             break;

         default:
             break;
         }

         while (1) {
             if (cur == root)
                 return;
             if (cur->next != NULL) {
                 cur = cur->next;
                 break;
             }

             isRaw = 0;

             cur = parent;
             /* cur->parent was validated when descending. */
             parent = cur->parent;

             if ((cur->type == XML_HTML_DOCUMENT_NODE) ||
                 (cur->type == XML_DOCUMENT_NODE)) {
                 xmlOutputBufferWrite(buf, 1, "\n");
             } else {
                 if ((format) && (cur->ns == NULL))
                     info = htmlTagLookup(cur->name);
                 else
                     info = NULL;

                 if ((format) && (info != NULL) && (!info->isinline) &&
                     (cur->last->type != HTML_TEXT_NODE) &&
                     (cur->last->type != HTML_ENTITY_REF_NODE) &&
                     ((cur->children != cur->last) || (cur == metaHead)) &&
                     (cur->name != NULL) &&
                     (cur->name[0] != 'p')) /* p, pre, param */
                     xmlOutputBufferWrite(buf, 1, "\n");

                 xmlOutputBufferWrite(buf, 2, "</");
                 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
                     xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
                     xmlOutputBufferWrite(buf, 1, ":");
                 }
                 xmlOutputBufferWriteString(buf, (const char *)cur->name);
                 xmlOutputBufferWrite(buf, 1, ">");

                 if ((format) && (info != NULL) && (!info->isinline) &&
                     (cur->next != NULL)) {
                     if ((cur->next->type != HTML_TEXT_NODE) &&
                         (cur->next->type != HTML_ENTITY_REF_NODE) &&
                         (parent != NULL) &&
                         (parent->name != NULL) &&
                         (parent->name[0] != 'p')) /* p, pre, param */
                         xmlOutputBufferWrite(buf, 1, "\n");
                 }

                 if (cur == metaHead)
                     metaHead = NULL;
             }
         }
     }
 }

 /**
  * Serialize an HTML node to an output buffer.
  *
  * Thin wrapper: forwards `buf`, `cur` and `format` to
  * htmlNodeDumpInternal and passes NULL as its third argument.
  *
  * @param buf  the HTML buffer output
  * @param doc  the document (unused)
  * @param cur  the current node
  * @param encoding  the encoding string (unused)
  * @param format  whether formatting newlines should be added
  */
 void
 htmlNodeDumpFormatOutput(xmlOutputBuffer *buf,
                          xmlDoc *doc ATTRIBUTE_UNUSED, xmlNode *cur,
                          const char *encoding ATTRIBUTE_UNUSED, int format) {
     htmlNodeDumpInternal(buf, cur, NULL, format);
 }

 /**
  * Serialize an HTML node to an output buffer with formatting
  * always enabled.
  *
  * Same as #htmlNodeDumpFormatOutput with `format` set to 1, which is
  * typically undesired. Use of this function is DISCOURAGED in favor
  * of #htmlNodeDumpFormatOutput.
  *
  * @param buf  the HTML buffer output
  * @param doc  the document (unused)
  * @param cur  the current node
  * @param encoding  the encoding string (unused)
  */
 void
 htmlNodeDumpOutput(xmlOutputBuffer *buf, xmlDoc *doc ATTRIBUTE_UNUSED,
                    xmlNode *cur, const char *encoding ATTRIBUTE_UNUSED) {
     /* The trailing 1 is the hard-coded `format` flag. */
     htmlNodeDumpInternal(buf, cur, NULL, 1);
 }

 /**
  * Serialize an HTML document to an output buffer.
  *
  * Thin wrapper: the document is cast to a node pointer and handed
  * to htmlNodeDumpInternal, with NULL as its third argument.
  *
  * @param buf  the HTML buffer output
  * @param cur  the document
  * @param encoding  the encoding string (unused)
  * @param format  whether formatting newlines should be added
  */
 void
 htmlDocContentDumpFormatOutput(xmlOutputBuffer *buf, xmlDoc *cur,
 	                       const char *encoding ATTRIBUTE_UNUSED,
                                int format) {
     htmlNodeDumpInternal(buf, (xmlNodePtr) cur, NULL, format);
 }

 /**
  * Serialize an HTML document to an output buffer with formatting
  * always enabled.
  *
  * Same as #htmlDocContentDumpFormatOutput with `format` set to 1,
  * which is typically undesired. Use of this function is DISCOURAGED
  * in favor of #htmlDocContentDumpFormatOutput.
  *
  * @param buf  the HTML buffer output
  * @param cur  the document
  * @param encoding  the encoding string (unused)
  */
 void
 htmlDocContentDumpOutput(xmlOutputBuffer *buf, xmlDoc *cur,
 	                 const char *encoding ATTRIBUTE_UNUSED) {
     /* The trailing 1 is the hard-coded `format` flag. */
     htmlNodeDumpInternal(buf, (xmlNodePtr) cur, NULL, 1);
 }

 /************************************************************************
  *									*
  *		Saving functions front-ends				*
  *									*
  ************************************************************************/

 /**
  * Write an HTML document to an already opened `FILE` stream.
  *
  * The output encoder is selected from the document's own encoding.
  * If the document has no encoding, ASCII with HTML 4.0 named
  * character entities will be used. This is inefficient compared to
  * UTF-8 and might be changed in a future version.
  *
  * "Formatting" is enabled unconditionally, which is typically
  * undesired.
  *
  * Use of this function is DISCOURAGED in favor of
  * #htmlNodeDumpFileFormat.
  *
  * @param f  the FILE*
  * @param cur  the document
  * @returns the number of bytes written or -1 in case of failure.
  */
 int
 htmlDocDump(FILE *f, xmlDoc *cur) {
     xmlCharEncodingHandlerPtr encoder = NULL;
     xmlOutputBufferPtr out;

     xmlInitParser();

     if ((f == NULL) || (cur == NULL))
         return(-1);

     /* Pick the output encoder matching the document's encoding. */
     if (htmlFindOutputEncoder((char *) cur->encoding, &encoder) != XML_ERR_OK)
         return(-1);

     out = xmlOutputBufferCreateFile(f, encoder);
     if (out == NULL) {
         /* No buffer was created, so the encoder is released here. */
         xmlCharEncCloseFunc(encoder);
         return(-1);
     }

     htmlDocContentDumpOutput(out, cur, NULL);

     /* The result of closing the buffer is the function's result. */
     return(xmlOutputBufferClose(out));
 }

 /**
  * Serialize an HTML document to a file.
  *
  * Same as #htmlSaveFileFormat with `encoding` set to NULL and
  * `format` set to 1, which is typically undesired.
  *
  * Use of this function is DISCOURAGED in favor of
  * #htmlSaveFileFormat.
  *
  * @param filename  the filename (or URL)
  * @param cur  the document
  * @returns the value returned by #htmlSaveFileFormat, documented as
  * the number of bytes written or -1 in case of failure.
  */
 int
 htmlSaveFile(const char *filename, xmlDoc *cur) {
     /* NULL: no explicit encoding; 1: formatting enabled. */
     return(htmlSaveFileFormat(filename, cur, NULL, 1));
 }

 /**
  * Serialize an HTML document to a file using a given encoding.
  *
  * If `filename` is `"-"`, stdout is used. This is potentially
  * insecure and might be changed in a future version.
  *
  * If encoding is NULL, ASCII with HTML 4.0 named character entities
  * will be used. This is inefficient compared to UTF-8 and might be
  * changed in a future version.
  *
  * Sets or updates meta tags containing the character encoding.
  * NOTE(review): the `encoding` argument forwarded to
  * #htmlDocContentDumpFormatOutput is marked unused there, so confirm
  * the meta tag handling against the serializer itself.
  *
  * Every failure path returns -1: NULL arguments, no usable output
  * encoder, or failure to open the output.
  *
  * @param filename  the filename
  * @param cur  the document
  * @param encoding  the document encoding (optional)
  * @param format  whether formatting newlines should be added
  * @returns the number of bytes written or -1 in case of failure.
  */
 int
 htmlSaveFileFormat(const char *filename, xmlDoc *cur,
 	           const char *encoding, int format) {
     xmlOutputBufferPtr buf;
     xmlCharEncodingHandlerPtr handler = NULL;

     if ((cur == NULL) || (filename == NULL))
         return(-1);

     xmlInitParser();

     if (htmlFindOutputEncoder(encoding, &handler) != XML_ERR_OK)
         return(-1);

     /*
      * Open an output buffer that writes to the target file.
      */
     buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
     if (buf == NULL) {
         /* No buffer was created, so the encoder is released here. */
         xmlCharEncCloseFunc(handler);
         /*
          * Report the failure as -1, as documented. Returning 0 would
          * be missed by callers that test for a negative result.
          */
         return(-1);
     }

     htmlDocContentDumpFormatOutput(buf, cur, encoding, format);

     /* The result of closing the buffer is the function's result. */
     return(xmlOutputBufferClose(buf));
 }

 /**
  * Serialize an HTML document to a file using a given encoding.
  *
  * Same as #htmlSaveFileFormat with `format` set to 1, which is
  * typically undesired. Also see the warnings there. Use of this
  * function is DISCOURAGED in favor of #htmlSaveFileFormat.
  *
  * @param filename  the filename
  * @param cur  the document
  * @param encoding  the document encoding
  * @returns the value returned by #htmlSaveFileFormat, documented as
  * the number of bytes written or -1 in case of failure.
  */
 int
 htmlSaveFileEnc(const char *filename, xmlDoc *cur, const char *encoding) {
     /* The trailing 1 is the hard-coded `format` flag. */
     return(htmlSaveFileFormat(filename, cur, encoding, 1));
 }

 #endif /* LIBXML_OUTPUT_ENABLED */

 #endif /* LIBXML_HTML_ENABLED */