2 * HTMLparser.h : interface for an HTML 4.0 non-verifying parser
4 * See Copyright for the status of this software.
9 #ifndef __HTML_PARSER_H__
10 #define __HTML_PARSER_H__
11 #include <libxml/parser.h>
18 * Most of the back-end structures from XML and HTML are shared.
20 typedef xmlParserCtxt htmlParserCtxt;        /* HTML parser context: alias of the XML parser context type */
21 typedef xmlParserCtxtPtr htmlParserCtxtPtr;  /* alias of the XML parser-context pointer type */
22 typedef xmlParserNodeInfo htmlParserNodeInfo; /* alias of the XML parser node-info type */
23 typedef xmlSAXHandler htmlSAXHandler;        /* alias of the XML SAX handler type */
24 typedef xmlSAXHandlerPtr htmlSAXHandlerPtr;  /* alias of the XML SAX-handler pointer type */
25 typedef xmlParserInput htmlParserInput;      /* alias of the XML parser input type */
26 typedef xmlParserInputPtr htmlParserInputPtr; /* alias of the XML parser-input pointer type */
27 typedef xmlDocPtr htmlDocPtr;                /* HTML documents use the XML document pointer type */
28 typedef xmlNodePtr htmlNodePtr;              /* HTML nodes use the XML node pointer type */
31 * Internal description of an HTML element, representing HTML 4.01
32 * and XHTML 1.0 (which share the same structure).
34 typedef struct _htmlElemDesc htmlElemDesc;
35 typedef htmlElemDesc *htmlElemDescPtr;
36 struct _htmlElemDesc {
37 const char *name; /* The tag name */
38 char startTag; /* Whether the start tag can be implied */
39 char endTag; /* Whether the end tag can be implied */
40 char saveEndTag; /* Whether the end tag should be saved */
41 char empty; /* Is this an empty element ? */
42 char depr; /* Is this a deprecated element ? */
43 char dtd; /* 1: only in Loose DTD, 2: only Frameset one */
44 char isinline; /* is this a block 0 or inline 1 element */
45 const char *desc; /* the description */
48 * New fields encapsulating HTML structure
51 * This is a very limited representation. It fails to tell us when
52 * an element *requires* subelements (we only have whether they're
53 * allowed or not), and it doesn't tell us where CDATA and PCDATA
54 * are allowed. Some element relationships are not fully represented:
55 * these are flagged with the word MODIFIER
57 const char** subelts; /* allowed sub-elements of this element */
58 const char* defaultsubelt; /* subelement for suggested auto-repair
59 if necessary or NULL */
60 const char** attrs_opt; /* Optional Attributes */
61 const char** attrs_depr; /* Additional deprecated attributes */
62 const char** attrs_req; /* Required attributes */
66 * Internal description of an HTML entity.
68 typedef struct _htmlEntityDesc htmlEntityDesc;
69 typedef htmlEntityDesc *htmlEntityDescPtr;
70 struct _htmlEntityDesc {
71 unsigned int value; /* the UNICODE value for the character */
72 const char *name; /* The entity name */
73 const char *desc; /* the description */
77 * There are only a few public functions.
79 const htmlElemDesc * htmlTagLookup (const xmlChar *tag); /* takes a tag name, yields a pointer to a read-only element descriptor; NOTE(review): result for an unknown tag is not visible here -- presumably NULL, confirm */
80 const htmlEntityDesc * htmlEntityLookup(const xmlChar *name); /* takes an entity name, yields a pointer to a read-only entity descriptor */
81 const htmlEntityDesc * htmlEntityValueLookup(unsigned int value); /* same descriptor type, selected by an unsigned value; htmlEntityDesc.value is documented as the UNICODE value of the character */
83 int htmlIsAutoClosed(htmlDocPtr doc,
85 int htmlAutoCloseTag(htmlDocPtr doc,
88 const htmlEntityDesc * htmlParseEntityRef(htmlParserCtxtPtr ctxt,
90 int htmlParseCharRef(htmlParserCtxtPtr ctxt);
91 void htmlParseElement(htmlParserCtxtPtr ctxt);
93 int htmlParseDocument(htmlParserCtxtPtr ctxt);
94 htmlDocPtr htmlSAXParseDoc (xmlChar *cur,
96 htmlSAXHandlerPtr sax,
98 htmlDocPtr htmlParseDoc (xmlChar *cur,
99 const char *encoding);
100 htmlDocPtr htmlSAXParseFile(const char *filename,
101 const char *encoding,
102 htmlSAXHandlerPtr sax,
104 htmlDocPtr htmlParseFile (const char *filename,
105 const char *encoding);
106 int UTF8ToHtml (unsigned char *out,
108 const unsigned char *in,
110 int htmlEncodeEntities(unsigned char *out,
112 const unsigned char *in,
113 int *inlen, int quoteChar);
114 int htmlIsScriptAttribute(const xmlChar *name);
115 int htmlHandleOmittedElem(int val);
118 * Interfaces for the Push mode.
120 void htmlFreeParserCtxt (htmlParserCtxtPtr ctxt);
121 htmlParserCtxtPtr htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax,
125 const char *filename,
126 xmlCharEncoding enc);
127 int htmlParseChunk (htmlParserCtxtPtr ctxt,
132 /* NRK/Jan2003: further knowledge of HTML structure
135 HTML_NA = 0 , /* something we don't check at all */
137 HTML_DEPRECATED = 0x2 ,
139 HTML_REQUIRED = 0xc /* VALID bit set so ( & HTML_VALID ) is TRUE */
142 /* Using htmlElemDesc rather than name here, to emphasise the fact
143 that otherwise there's a lookup overhead
145 htmlStatus htmlAttrAllowed(const htmlElemDesc*, const xmlChar*, int) ; /* yields an htmlStatus for (element descriptor, string, int); NOTE(review): parameters are unnamed -- presumably the string is an attribute name, confirm against the implementation */
146 int htmlElementAllowedHere(const htmlElemDesc*, const xmlChar*) ; /* htmlElementAllowedHereDesc passes the parent descriptor first and the candidate element's name second */
147 htmlStatus htmlElementStatusHere(const htmlElemDesc*, const htmlElemDesc*) ; /* yields an htmlStatus from two descriptors; NOTE(review): presumably (parent, element) order -- confirm */
148 htmlStatus htmlNodeStatus(const htmlNodePtr, int) ; /* yields an htmlStatus for a node; NOTE(review): meaning of the int argument is not visible in this header */
/**
 * htmlDefaultSubelement:
 * @elt: pointer to an HTML element descriptor (htmlElemDesc)
 *
 * Evaluates to the descriptor's defaultsubelt field: the sub-element
 * suggested for auto-repair, or NULL.
 * Both the argument and the whole expansion are parenthesized so that
 * pointer expressions such as (table + i) can be passed safely.
 */
#define htmlDefaultSubelement(elt) ((elt)->defaultsubelt)
/**
 * htmlElementAllowedHereDesc:
 * @parent: descriptor of the parent element
 * @elt: descriptor of the candidate child element
 *
 * Convenience wrapper around htmlElementAllowedHere() that takes the
 * child's tag name from its descriptor instead of a separate string.
 */
#define htmlElementAllowedHereDesc(parent,elt) \
	htmlElementAllowedHere((parent), (elt)->name)
/**
 * htmlRequiredAttrs:
 * @elt: pointer to an HTML element descriptor (htmlElemDesc)
 *
 * Evaluates to the descriptor's attrs_req field (its required attributes).
 */
#define htmlRequiredAttrs(elt) ((elt)->attrs_req)
159 #endif /* __HTML_PARSER_H__ */