* Summary: interface for an HTML 4.0 non-verifying parser | |
* Description: this module implements an HTML 4.0 non-verifying parser | |
* with an API compatible with that of the XML parser. It should | |
* be able to parse "real world" HTML, even if severely | |
* broken from a specification point of view. | |
* | |
* Copy: See Copyright for the status of this software. | |
* | |
* Author: Patrick Monnerat <pm@datasphere.ch>, DATASPHERE S.A. | |
/if not defined(HTML_PARSER_H__) | |
/define HTML_PARSER_H__ | |
/include "libxmlrpg/xmlversion" | |
/if defined(LIBXML_HTML_ENABLED) | |
/include "libxmlrpg/xmlTypesC" | |
/include "libxmlrpg/parser" | |
* Most of the back-end structures from XML and HTML are shared. | |
* htmlParserCtxtPtr has the same type as xmlParserCtxtPtr (like). | |
* It is based on ######typedef######, so it presumably serves as | |
* a type template only -- TODO confirm where that pointer lives. | |
d htmlParserCtxtPtr... | |
d s based(######typedef######) | |
d like(xmlParserCtxtPtr) | |
* htmlParserCtxt has the layout of xmlParserCtxt (likeds) and is | |
* addressed through the htmlParserCtxtPtr basing pointer. | |
d htmlParserCtxt ds based(htmlParserCtxtPtr) | |
d likeds(xmlParserCtxt) | |
* htmlParserNodeInfoPtr has the same type as xmlParserNodeInfoPtr | |
* (like); based on ######typedef######, presumably as a type | |
* template only. | |
d htmlParserNodeInfoPtr... | |
d s based(######typedef######) | |
d like(xmlParserNodeInfoPtr) | |
* htmlParserNodeInfo has the layout of xmlParserNodeInfo (likeds) | |
* and is addressed through htmlParserNodeInfoPtr. | |
d htmlParserNodeInfo... | |
d ds based(htmlParserNodeInfoPtr) | |
d likeds(xmlParserNodeInfo) | |
* htmlSAXHandlerPtr has the same type as xmlSAXHandlerPtr (like); | |
* based on ######typedef######, presumably as a type template | |
* only. | |
d htmlSAXHandlerPtr... | |
d s based(######typedef######) | |
d like(xmlSAXHandlerPtr) | |
* htmlSAXHandler has the layout of xmlSAXHandler (likeds) and is | |
* addressed through htmlSAXHandlerPtr. | |
d htmlSAXHandler ds based(htmlSAXHandlerPtr) | |
d likeds(xmlSAXHandler) | |
* htmlParserInputPtr has the same type as xmlParserInputPtr | |
* (like); based on ######typedef######, presumably as a type | |
* template only. | |
d htmlParserInputPtr... | |
d s based(######typedef######) | |
d like(xmlParserInputPtr) | |
* htmlParserInput has the layout of xmlParserInput (likeds) and | |
* is addressed through htmlParserInputPtr. | |
d htmlParserInput... | |
d ds based(htmlParserInputPtr) | |
d likeds(xmlParserInput) | |
* htmlDocPtr has the same type as xmlDocPtr (like); based on | |
* ######typedef######, presumably as a type template only. | |
d htmlDocPtr s based(######typedef######) | |
d like(xmlDocPtr) | |
* htmlNodePtr has the same type as xmlNodePtr (like); based on | |
* ######typedef######, presumably as a type template only. | |
d htmlNodePtr s based(######typedef######) | |
d like(xmlNodePtr) | |
* Internal description of an HTML element, representing HTML 4.01 | |
* and XHTML 1.0 (which share the same structure). | |
* htmlElemDescPtr: pointer declaration based on | |
* ######typedef######; it is the basing pointer of htmlElemDesc. | |
d htmlElemDescPtr... | |
d s * based(######typedef######) | |
* htmlElemDesc: data structure addressed through htmlElemDescPtr. | |
* "qualified" means subfields are referenced with the structure | |
* name (htmlElemDesc.name, ...); "align" requests alignment of | |
* the subfields. The C type of each pointer subfield is noted in | |
* the right-hand comment. | |
d htmlElemDesc ds based(htmlElemDescPtr) | |
d align qualified | |
d name * const char * | |
d startTag like(xmlCchar) Start tag implied ? | |
d endTag like(xmlCchar) End tag implied ? | |
d saveEndTag like(xmlCchar) Save end tag ? | |
d empty like(xmlCchar) Empty element ? | |
d depr like(xmlCchar) Deprecated element ? | |
d dtd like(xmlCchar) Loose DTD/Frameset | |
d isinline like(xmlCchar) Block 0/inline elem? | |
d desc * const char * | |
* | |
* New fields encapsulating HTML structure | |
* | |
* Bugs: | |
* This is a very limited representation. It fails to tell us when | |
* an element *requires* subelements (we only have whether they're | |
* allowed or not), and it doesn't tell us where CDATA and PCDATA | |
* are allowed. Some element relationships are not fully represented: | |
* these are flagged with the word MODIFIER. | |
* | |
* The remaining subfields are all pointers; those commented | |
* "const char * *" point to pointers to strings. | |
d subelts * const char * * | |
d defaultsubelt * const char * | |
d attrs_opt * const char * * | |
d attrs_depr * const char * * | |
d attrs_req * const char * * | |
* Internal description of an HTML entity. | |
* htmlEntityDescPtr: pointer declaration based on | |
* ######typedef######; it is the basing pointer of | |
* htmlEntityDesc. | |
d htmlEntityDescPtr... | |
d s * based(######typedef######) | |
* htmlEntityDesc: qualified, aligned data structure addressed | |
* through htmlEntityDescPtr. It holds a value typed like xmlCuint | |
* and two string pointers (name, desc). | |
d htmlEntityDesc... | |
d ds based(htmlEntityDescPtr) | |
d align qualified | |
d value like(xmlCuint) | |
d name * const char * | |
d desc * const char * | |
* There are only a few public functions. | |
* Prototype bound to the external procedure 'htmlTagLookup'. | |
* Returns a value typed like htmlElemDescPtr. | |
* tag: string pointer passed by value; options(*string) also | |
* lets the caller pass a character expression, handed over | |
* as a null-terminated string. | |
d htmlTagLookup pr extproc('htmlTagLookup') | |
d like(htmlElemDescPtr) const | |
d tag * value options(*string) const xmlChar * | |
* Prototype bound to the external procedure 'htmlEntityLookup'. | |
* Returns a value typed like htmlEntityDescPtr. | |
* name: string pointer passed by value; options(*string) also | |
* accepts a character expression (null-terminated copy). | |
d htmlEntityLookup... | |
d pr extproc('htmlEntityLookup') | |
d like(htmlEntityDescPtr) const | |
d name * value options(*string) const xmlChar * | |
* Prototype bound to the external procedure | |
* 'htmlEntityValueLookup'. Returns a value typed like | |
* htmlEntityDescPtr. | |
* value: passed by value, typed like xmlCuint. | |
d htmlEntityValueLookup... | |
d pr extproc('htmlEntityValueLookup') | |
d like(htmlEntityDescPtr) const | |
d value value like(xmlCuint) | |
* Prototype bound to the external procedure 'htmlIsAutoClosed'. | |
* Returns a value typed like xmlCint. | |
* doc: passed by value, typed like htmlDocPtr. | |
* elem: passed by value, typed like htmlNodePtr. | |
d htmlIsAutoClosed... | |
d pr extproc('htmlIsAutoClosed') | |
d like(xmlCint) | |
d doc value like(htmlDocPtr) | |
d elem value like(htmlNodePtr) | |
* Prototype bound to the external procedure 'htmlAutoCloseTag'. | |
* Returns a value typed like xmlCint. | |
* doc: passed by value, typed like htmlDocPtr. | |
* name: string pointer passed by value; options(*string) also | |
* accepts a character expression (null-terminated copy). | |
* elem: passed by value, typed like htmlNodePtr. | |
d htmlAutoCloseTag... | |
d pr extproc('htmlAutoCloseTag') | |
d like(xmlCint) | |
d doc value like(htmlDocPtr) | |
d name * value options(*string) const xmlChar * | |
d elem value like(htmlNodePtr) | |
* Prototype bound to the external procedure | |
* 'htmlParseEntityRef'. Returns a value typed like | |
* htmlEntityDescPtr. | |
* ctxt: passed by value, typed like htmlParserCtxtPtr. | |
* str: pointer passed by reference (no "value" keyword), i.e. | |
* the callee receives the address of the caller's pointer. | |
* NOTE(review): presumably an output argument -- confirm. | |
d htmlParseEntityRef... | |
d pr extproc('htmlParseEntityRef') | |
d like(htmlEntityDescPtr) const | |
d ctxt value like(htmlParserCtxtPtr) | |
d str * const xmlChar *(*) | |
* Prototype bound to the external procedure 'htmlParseCharRef'. | |
* Returns a value typed like xmlCint. | |
* ctxt: passed by value, typed like htmlParserCtxtPtr. | |
d htmlParseCharRef... | |
d pr extproc('htmlParseCharRef') | |
d like(xmlCint) | |
d ctxt value like(htmlParserCtxtPtr) | |
* Prototype bound to the external procedure 'htmlParseElement'. | |
* No return type is declared, so the procedure returns nothing. | |
* ctxt: passed by value, typed like htmlParserCtxtPtr. | |
d htmlParseElement... | |
d pr extproc('htmlParseElement') | |
d ctxt value like(htmlParserCtxtPtr) | |
* Prototype bound to the external procedure 'htmlNewParserCtxt'. | |
* Takes no parameters and returns a value typed like | |
* htmlParserCtxtPtr. | |
d htmlNewParserCtxt... | |
d pr extproc('htmlNewParserCtxt') | |
d like(htmlParserCtxtPtr) | |
* Prototype bound to the external procedure | |
* 'htmlCreateMemoryParserCtxt'. Returns a value typed like | |
* htmlParserCtxtPtr. | |
* buffer: pointer passed by value; options(*string) also | |
* accepts a character expression (null-terminated | |
* copy). | |
* size: passed by value, typed like xmlCint. | |
d htmlCreateMemoryParserCtxt... | |
d pr extproc('htmlCreateMemoryParserCtxt') | |
d like(htmlParserCtxtPtr) | |
d buffer * value options(*string) const char * | |
d size value like(xmlCint) | |
* Prototype bound to the external procedure 'htmlParseDocument'. | |
* Returns a value typed like xmlCint. | |
* ctxt: passed by value, typed like htmlParserCtxtPtr. | |
d htmlParseDocument... | |
d pr extproc('htmlParseDocument') | |
d like(xmlCint) | |
d ctxt value like(htmlParserCtxtPtr) | |
* Prototype bound to the external procedure 'htmlSAXParseDoc'. | |
* Returns a value typed like htmlDocPtr. | |
* cur, encoding: string pointers passed by value; | |
* options(*string) also accepts character expressions. | |
* sax: passed by value, typed like htmlSAXHandlerPtr. | |
* userData: untyped pointer passed by value (void * on the C | |
* side, per the right-hand comment). | |
d htmlSAXParseDoc... | |
d pr extproc('htmlSAXParseDoc') | |
d like(htmlDocPtr) | |
d cur * value options(*string) xmlChar * | |
d encoding * value options(*string) const char * | |
d sax value like(htmlSAXHandlerPtr) | |
d userData * value void * | |
* Prototype bound to the external procedure 'htmlParseDoc'. | |
* Returns a value typed like htmlDocPtr. | |
* cur, encoding: string pointers passed by value; | |
* options(*string) also accepts character expressions. | |
d htmlParseDoc pr extproc('htmlParseDoc') | |
d like(htmlDocPtr) | |
d cur * value options(*string) xmlChar * | |
d encoding * value options(*string) const char * | |
* Prototype bound to the external procedure 'htmlSAXParseFile'. | |
* Returns a value typed like htmlDocPtr. | |
* filename, encoding: string pointers passed by value; | |
* options(*string) also accepts character expressions. | |
* sax: passed by value, typed like htmlSAXHandlerPtr. | |
* userData: untyped pointer passed by value (void * on the C | |
* side, per the right-hand comment). | |
d htmlSAXParseFile... | |
d pr extproc('htmlSAXParseFile') | |
d like(htmlDocPtr) | |
d filename * value options(*string) const char * | |
d encoding * value options(*string) const char * | |
d sax value like(htmlSAXHandlerPtr) | |
d userData * value void * | |
* Prototype bound to the external procedure 'htmlParseFile'. | |
* Returns a value typed like htmlDocPtr. | |
* filename, encoding: string pointers passed by value; | |
* options(*string) also accepts character expressions. | |
d htmlParseFile pr extproc('htmlParseFile') | |
d like(htmlDocPtr) | |
d filename * value options(*string) const char * | |
d encoding * value options(*string) const char * | |
* Prototype bound to the external procedure 'UTF8ToHtml'. | |
* Returns a value typed like xmlCint. | |
* out: character buffer passed by reference; | |
* options(*varsize) lets the caller pass a field | |
* whose length differs from the declared 65535. | |
* in: string pointer passed by value; options(*string) | |
* also accepts a character expression. | |
* outlen, inlen: passed by reference (no "value" keyword). | |
* NOTE(review): presumably updated by the callee | |
* -- confirm against the C documentation. | |
d UTF8ToHtml pr extproc('UTF8ToHtml') | |
d like(xmlCint) | |
d out 65535 options(*varsize) unsigned char [] | |
d outlen like(xmlCint) | |
d in * value options(*string) const unsigned char* | |
d inlen like(xmlCint) | |
* Prototype bound to the external procedure | |
* 'htmlEncodeEntities'. Returns a value typed like xmlCint. | |
* out: character buffer passed by reference; | |
* options(*varsize) lets the caller pass a field | |
* whose length differs from the declared 65535. | |
* in: string pointer passed by value; options(*string) | |
* also accepts a character expression. | |
* outlen, inlen: passed by reference (no "value" keyword). | |
* NOTE(review): presumably updated by the callee | |
* -- confirm against the C documentation. | |
* quoteChar: passed by value, typed like xmlCint. | |
d htmlEncodeEntities... | |
d pr extproc('htmlEncodeEntities') | |
d like(xmlCint) | |
d out 65535 options(*varsize) unsigned char [] | |
d outlen like(xmlCint) | |
d in * value options(*string) const unsigned char* | |
d inlen like(xmlCint) | |
d quoteChar value like(xmlCint) | |
* Prototype bound to the external procedure | |
* 'htmlIsScriptAttribute'. Returns a value typed like xmlCint. | |
* name: string pointer passed by value; options(*string) also | |
* accepts a character expression (null-terminated copy). | |
d htmlIsScriptAttribute... | |
d pr extproc('htmlIsScriptAttribute') | |
d like(xmlCint) | |
d name * value options(*string) const xmlChar * | |
* Prototype bound to the external procedure | |
* 'htmlHandleOmittedElem'. Returns a value typed like xmlCint. | |
* val: passed by value, typed like xmlCint. | |
d htmlHandleOmittedElem... | |
d pr extproc('htmlHandleOmittedElem') | |
d like(xmlCint) | |
d val value like(xmlCint) | |
/if defined(LIBXML_PUSH_ENABLED) | |
* Interfaces for the Push mode. | |
* Prototype bound to the external procedure | |
* 'htmlCreatePushParserCtxt'; only compiled when | |
* LIBXML_PUSH_ENABLED is defined. Returns a value typed like | |
* htmlParserCtxtPtr. | |
* sax: passed by value, typed like htmlSAXHandlerPtr. | |
* user_data: untyped pointer passed by value (void *). | |
* chunk, filename: string pointers passed by value; | |
* options(*string) also accepts character | |
* expressions. | |
* size: passed by value, typed like xmlCint. | |
* enc: passed by value, typed like xmlCharEncoding. | |
d htmlCreatePushParserCtxt... | |
d pr extproc('htmlCreatePushParserCtxt') | |
d like(htmlParserCtxtPtr) | |
d sax value like(htmlSAXHandlerPtr) | |
d user_data * value void * | |
d chunk * value options(*string) const char * | |
d size value like(xmlCint) | |
d filename * value options(*string) const char * | |
d enc value like(xmlCharEncoding) | |
* Prototype bound to the external procedure 'htmlParseChunk'; | |
* only compiled when LIBXML_PUSH_ENABLED is defined. Returns a | |
* value typed like xmlCint. | |
* ctxt: passed by value, typed like htmlParserCtxtPtr. | |
* chunk: string pointer passed by value; options(*string) | |
* also accepts a character expression. | |
* size: passed by value, typed like xmlCint. | |
* terminate: passed by value, typed like xmlCint. | |
d htmlParseChunk pr extproc('htmlParseChunk') | |
d like(xmlCint) | |
d ctxt value like(htmlParserCtxtPtr) | |
d chunk * value options(*string) const char * | |
d size value like(xmlCint) | |
d terminate value like(xmlCint) | |
/endif LIBXML_PUSH_ENABLED | |
* Prototype bound to the external procedure | |
* 'htmlFreeParserCtxt'. No return type is declared, so the | |
* procedure returns nothing. | |
* ctxt: passed by value, typed like htmlParserCtxtPtr. | |
d htmlFreeParserCtxt... | |
d pr extproc('htmlFreeParserCtxt') | |
d ctxt value like(htmlParserCtxtPtr) | |
* New set of simpler/more flexible APIs | |
* htmlParserOption: | |
* | |
* This is the set of HTML parser options that can be passed down | |
* to the htmlReadDoc() and similar calls. | |
* htmlParserOption: declared like(xmlCenum) and based on | |
* ######typedef######, presumably as a type template only. | |
d htmlParserOption... | |
d s based(######typedef######) | |
d like(xmlCenum) | |
* Named constants for the individual options. Each hexadecimal | |
* literal has exactly one bit set and no two constants share a | |
* bit. NOTE(review): presumably they are meant to be combined | |
* into the "options" parameters declared in this file -- confirm. | |
d HTML_PARSE_RECOVER... Relaxed parsing | |
d c X'00000001' | |
d HTML_PARSE_NODEFDTD... No default doctype | |
d c X'00000004' | |
d HTML_PARSE_NOERROR... No error reports | |
d c X'00000020' | |
d HTML_PARSE_NOWARNING... No warning reports | |
d c X'00000040' | |
d HTML_PARSE_PEDANTIC... Pedantic err reports | |
d c X'00000080' | |
d HTML_PARSE_NOBLANKS... Remove blank nodes | |
d c X'00000100' | |
d HTML_PARSE_NONET... Forbid net access | |
d c X'00000800' | |
d HTML_PARSE_NOIMPLIED... No implied html/body | |
d c X'00002000' | |
d HTML_PARSE_COMPACT... compact small txtnod | |
d c X'00010000' | |
d HTML_PARSE_IGNORE_ENC... Ignore encoding hint | |
d c X'00200000' | |
* Prototype bound to the external procedure 'htmlCtxtReset'. | |
* No return type is declared, so the procedure returns nothing. | |
* ctxt: passed by value, typed like htmlParserCtxtPtr. | |
d htmlCtxtReset pr extproc('htmlCtxtReset') | |
d ctxt value like(htmlParserCtxtPtr) | |
* Prototype bound to the external procedure | |
* 'htmlCtxtUseOptions'. Returns a value typed like xmlCint. | |
* ctxt: passed by value, typed like htmlParserCtxtPtr. | |
* options: passed by value, typed like xmlCint. | |
d htmlCtxtUseOptions... | |
d pr extproc('htmlCtxtUseOptions') | |
d like(xmlCint) | |
d ctxt value like(htmlParserCtxtPtr) | |
d options value like(xmlCint) | |
* Prototype bound to the external procedure 'htmlReadDoc'. | |
* Returns a value typed like htmlDocPtr. | |
* cur, URL, encoding: string pointers passed by value; | |
* options(*string) also accepts character | |
* expressions. | |
* options: passed by value, typed like xmlCint. | |
d htmlReadDoc pr extproc('htmlReadDoc') | |
d like(htmlDocPtr) | |
d cur * value options(*string) const xmlChar * | |
d URL * value options(*string) const char * | |
d encoding * value options(*string) const char * | |
d options value like(xmlCint) | |
* Prototype bound to the external procedure 'htmlReadFile'. | |
* Returns a value typed like htmlDocPtr. | |
* URL, encoding: string pointers passed by value; | |
* options(*string) also accepts character | |
* expressions. | |
* options: passed by value, typed like xmlCint. | |
d htmlReadFile pr extproc('htmlReadFile') | |
d like(htmlDocPtr) | |
d URL * value options(*string) const char * | |
d encoding * value options(*string) const char * | |
d options value like(xmlCint) | |
* Prototype bound to the external procedure 'htmlReadMemory'. | |
* Returns a value typed like htmlDocPtr. | |
* buffer, URL, encoding: string pointers passed by value; | |
* options(*string) also accepts | |
* character expressions. | |
* size, options: passed by value, typed like xmlCint. | |
d htmlReadMemory pr extproc('htmlReadMemory') | |
d like(htmlDocPtr) | |
d buffer * value options(*string) const char * | |
d size value like(xmlCint) | |
d URL * value options(*string) const char * | |
d encoding * value options(*string) const char * | |
d options value like(xmlCint) | |
* Prototype bound to the external procedure 'htmlReadFd'. | |
* Returns a value typed like htmlDocPtr. | |
* fd, options: passed by value, typed like xmlCint. | |
* URL, encoding: string pointers passed by value; | |
* options(*string) also accepts character | |
* expressions. | |
d htmlReadFd pr extproc('htmlReadFd') | |
d like(htmlDocPtr) | |
d fd value like(xmlCint) | |
d URL * value options(*string) const char * | |
d encoding * value options(*string) const char * | |
d options value like(xmlCint) | |
* Prototype bound to the external procedure 'htmlReadIO'. | |
* Returns a value typed like htmlDocPtr. | |
* ioread: passed by value, typed like | |
* xmlInputReadCallback. | |
* ioclose: passed by value, typed like | |
* xmlInputCloseCallback. | |
* ioctx: untyped pointer passed by value (void *). | |
* URL, encoding: string pointers passed by value; | |
* options(*string) also accepts character | |
* expressions. | |
* options: passed by value, typed like xmlCint. | |
d htmlReadIO pr extproc('htmlReadIO') | |
d like(htmlDocPtr) | |
d ioread value like(xmlInputReadCallback) | |
d ioclose value like(xmlInputCloseCallback) | |
d ioctx * value void * | |
d URL * value options(*string) const char * | |
d encoding * value options(*string) const char * | |
d options value like(xmlCint) | |
* Prototype bound to the external procedure 'htmlCtxtReadDoc'. | |
* Returns a value typed like htmlDocPtr. | |
* ctxt: passed by value, typed like | |
* xmlParserCtxtPtr (htmlParserCtxtPtr is | |
* itself declared like that type in this | |
* file). | |
* cur, URL, encoding: string pointers passed by value; | |
* options(*string) also accepts character | |
* expressions. | |
* options: passed by value, typed like xmlCint. | |
d htmlCtxtReadDoc... | |
d pr extproc('htmlCtxtReadDoc') | |
d like(htmlDocPtr) | |
d ctxt value like(xmlParserCtxtPtr) | |
d cur * value options(*string) const xmlChar * | |
d URL * value options(*string) const char * | |
d encoding * value options(*string) const char * | |
d options value like(xmlCint) | |
* Prototype bound to the external procedure 'htmlCtxtReadFile'. | |
* Returns a value typed like htmlDocPtr. | |
* ctxt: passed by value, typed like | |
* xmlParserCtxtPtr (htmlParserCtxtPtr is | |
* itself declared like that type in this | |
* file). | |
* filename, encoding: string pointers passed by value; | |
* options(*string) also accepts character | |
* expressions. | |
* options: passed by value, typed like xmlCint. | |
d htmlCtxtReadFile... | |
d pr extproc('htmlCtxtReadFile') | |
d like(htmlDocPtr) | |
d ctxt value like(xmlParserCtxtPtr) | |
d filename * value options(*string) const char * | |
d encoding * value options(*string) const char * | |
d options value like(xmlCint) | |
* Prototype bound to the external procedure | |
* 'htmlCtxtReadMemory'. Returns a value typed like htmlDocPtr. | |
* ctxt: passed by value, typed like | |
* xmlParserCtxtPtr (htmlParserCtxtPtr | |
* is itself declared like that type in | |
* this file). | |
* buffer, URL, encoding: string pointers passed by value; | |
* options(*string) also accepts | |
* character expressions. | |
* size, options: passed by value, typed like xmlCint. | |
d htmlCtxtReadMemory... | |
d pr extproc('htmlCtxtReadMemory') | |
d like(htmlDocPtr) | |
d ctxt value like(xmlParserCtxtPtr) | |
d buffer * value options(*string) const char * | |
d size value like(xmlCint) | |
d URL * value options(*string) const char * | |
d encoding * value options(*string) const char * | |
d options value like(xmlCint) | |
* Prototype bound to the external procedure 'htmlCtxtReadFd'. | |
* Returns a value typed like htmlDocPtr. | |
* ctxt: passed by value, typed like xmlParserCtxtPtr | |
* (htmlParserCtxtPtr is itself declared like | |
* that type in this file). | |
* fd, options: passed by value, typed like xmlCint. | |
* URL, encoding: string pointers passed by value; | |
* options(*string) also accepts character | |
* expressions. | |
d htmlCtxtReadFd pr extproc('htmlCtxtReadFd') | |
d like(htmlDocPtr) | |
d ctxt value like(xmlParserCtxtPtr) | |
d fd value like(xmlCint) | |
d URL * value options(*string) const char * | |
d encoding * value options(*string) const char * | |
d options value like(xmlCint) | |
* Prototype bound to the external procedure 'htmlCtxtReadIO'. | |
* Returns a value typed like htmlDocPtr. | |
* ctxt: passed by value, typed like xmlParserCtxtPtr | |
* (htmlParserCtxtPtr is itself declared like | |
* that type in this file). | |
* ioread: passed by value, typed like | |
* xmlInputReadCallback. | |
* ioclose: passed by value, typed like | |
* xmlInputCloseCallback. | |
* ioctx: untyped pointer passed by value (void *). | |
* URL, encoding: string pointers passed by value; | |
* options(*string) also accepts character | |
* expressions. | |
* options: passed by value, typed like xmlCint. | |
d htmlCtxtReadIO pr extproc('htmlCtxtReadIO') | |
d like(htmlDocPtr) | |
d ctxt value like(xmlParserCtxtPtr) | |
d ioread value like(xmlInputReadCallback) | |
d ioclose value like(xmlInputCloseCallback) | |
d ioctx * value void * | |
d URL * value options(*string) const char * | |
d encoding * value options(*string) const char * | |
d options value like(xmlCint) | |
* Further knowledge of HTML structure | |
* htmlStatus: declared like(xmlCenum) and based on | |
* ######typedef######, presumably as a type template only. | |
d htmlStatus s based(######typedef######) | |
d like(xmlCenum) | |
* Named constants for the status values. HTML_REQUIRED (X'000C') | |
* is X'0008' combined with X'0004', so it includes the | |
* HTML_VALID bit, as its right-hand comment notes. | |
d HTML_NA c X'0000' No check at all | |
d HTML_INVALID c X'0001' | |
d HTML_DEPRECATED... | |
d c X'0002' | |
d HTML_VALID c X'0004' | |
d HTML_REQUIRED c X'000C' HTML_VALID ored-in | |
* Using htmlElemDesc rather than name here, to emphasise the fact | |
* that otherwise there's a lookup overhead | |
* Prototype bound to the external procedure 'htmlAttrAllowed'. | |
* Returns a value typed like htmlStatus. The parameters carry | |
* positional placeholder names only. | |
* #param1: passed by value, typed like htmlElemDescPtr. | |
* #param2: string pointer passed by value; options(*string) | |
* also accepts a character expression. | |
* #param3: passed by value, typed like xmlCint. | |
d htmlAttrAllowed... | |
d pr extproc('htmlAttrAllowed') | |
d like(htmlStatus) | |
d #param1 value like(htmlElemDescPtr) const | |
d #param2 * value options(*string) const xmlChar * | |
d #param3 value like(xmlCint) | |
* Prototype bound to the external procedure | |
* 'htmlElementAllowedHere'. Returns a value typed like xmlCint. | |
* The parameters carry positional placeholder names only. | |
* #param1: passed by value, typed like htmlElemDescPtr. | |
* #param2: string pointer passed by value; options(*string) | |
* also accepts a character expression. | |
d htmlElementAllowedHere... | |
d pr extproc('htmlElementAllowedHere') | |
d like(xmlCint) | |
d #param1 value like(htmlElemDescPtr) const | |
d #param2 * value options(*string) const xmlChar * | |
* Prototype bound to the external procedure | |
* 'htmlElementStatusHere'. Returns a value typed like | |
* htmlStatus. The parameters carry positional placeholder names | |
* only. | |
* #param1, #param2: passed by value, typed like | |
* htmlElemDescPtr. | |
d htmlElementStatusHere... | |
d pr extproc('htmlElementStatusHere') | |
d like(htmlStatus) | |
d #param1 value like(htmlElemDescPtr) const | |
d #param2 value like(htmlElemDescPtr) const | |
* Prototype bound to the external procedure 'htmlNodeStatus'. | |
* Returns a value typed like htmlStatus. The parameters carry | |
* positional placeholder names only. | |
* #param1: passed by value, typed like htmlNodePtr. | |
* #param2: passed by value, typed like xmlCint. | |
d htmlNodeStatus pr extproc('htmlNodeStatus') | |
d like(htmlStatus) | |
d #param1 value like(htmlNodePtr) | |
d #param2 value like(xmlCint) | |
* C macros implemented as procedures for ILE/RPG support. | |
* The RPG name htmlDefaultSubelement is bound to the external | |
* procedure '__htmlDefaultSubelement' (note the two leading | |
* underscores). Returns a pointer (const char * on the C side, | |
* per the right-hand comment). | |
* elt: pointer passed by value (const htmlElemDesc *). | |
d htmlDefaultSubelement... | |
d pr * extproc('__htmlDefaultSubelement') const char * | |
d elt * value const htmlElemDesc * | |
* The RPG name htmlElementAllowedHereDesc is bound to the | |
* external procedure '__htmlElementAllowedHereDesc' (note the | |
* two leading underscores; the extproc keyword is continued over | |
* two lines). Returns a value typed like xmlCint. | |
* parent, elt: pointers passed by value | |
* (const htmlElemDesc *). | |
d htmlElementAllowedHereDesc... | |
d pr extproc( | |
d '__htmlElementAllowedHereDesc') | |
d like(xmlCint) | |
d parent * value const htmlElemDesc * | |
d elt * value const htmlElemDesc * | |
* The RPG name htmlRequiredAttrs is bound to the external | |
* procedure '__htmlRequiredAttrs' (note the two leading | |
* underscores). Returns a pointer (const char * * on the C side, | |
* per the right-hand comment). | |
* elt: pointer passed by value (const htmlElemDesc *). | |
d htmlRequiredAttrs... | |
d pr * extproc('__htmlRequiredAttrs') const char * * | |
d elt * value const htmlElemDesc * | |
/endif LIBXML_HTML_ENABLED | |
/endif HTML_PARSER_H__ |