2 * HTMLparser.h : interface for an HTML 4.0 non-verifying parser
4 * See Copyright for the status of this software.
9 #ifndef __HTML_PARSER_H__
10 #define __HTML_PARSER_H__
11 #include <libxml/parser.h>
18 * Most of the back-end structures from XML and HTML are shared.
20 typedef xmlParserCtxt htmlParserCtxt;
21 typedef xmlParserCtxtPtr htmlParserCtxtPtr;
22 typedef xmlParserNodeInfo htmlParserNodeInfo;
23 typedef xmlSAXHandler htmlSAXHandler;
24 typedef xmlSAXHandlerPtr htmlSAXHandlerPtr;
25 typedef xmlParserInput htmlParserInput;
26 typedef xmlParserInputPtr htmlParserInputPtr;
27 typedef xmlDocPtr htmlDocPtr;
28 typedef xmlNodePtr htmlNodePtr;
31 * Internal description of an HTML element, representing HTML 4.01
32 * and XHTML 1.0 (which share the same structure).
/* Names for struct _htmlElemDesc: the struct type and a pointer to it. */
typedef struct _htmlElemDesc  htmlElemDesc;
typedef htmlElemDesc         *htmlElemDescPtr;
36 struct _htmlElemDesc {
37 const char *name; /* The tag name */
38 char startTag; /* Whether the start tag can be implied */
39 char endTag; /* Whether the end tag can be implied */
40 char saveEndTag; /* Whether the end tag should be saved */
41 char empty; /* Is this an empty element ? */
42 char depr; /* Is this a deprecated element ? */
43 char dtd; /* 1: only in Loose DTD, 2: only Frameset one */
44 char isinline; /* is this a block 0 or inline 1 element */
45 const char *desc; /* the description */
48 * New fields encapsulating HTML structure
51 * This is a very limited representation. It fails to tell us when
52 * an element *requires* subelements (we only have whether they're
53 * allowed or not), and it doesn't tell us where CDATA and PCDATA
54 * are allowed. Some element relationships are not fully represented:
55 * these are flagged with the word MODIFIER
57 const char** subelts; /* allowed sub-elements of this element */
58 const char* defaultsubelt; /* subelement for suggested auto-repair
59 if necessary or NULL */
60 const char** attrs_opt; /* Optional Attributes */
61 const char** attrs_depr; /* Additional deprecated attributes */
62 const char** attrs_req; /* Required attributes */
66 * Internal description of an HTML entity.
/* Names for struct _htmlEntityDesc: the struct type and a pointer to it. */
typedef struct _htmlEntityDesc  htmlEntityDesc;
typedef htmlEntityDesc         *htmlEntityDescPtr;
70 struct _htmlEntityDesc {
71 unsigned int value; /* the UNICODE value for the character */
72 const char *name; /* The entity name */
73 const char *desc; /* the description */
77 * There are only a few public functions.
79 const htmlElemDesc * htmlTagLookup (const xmlChar *tag);
80 const htmlEntityDesc * htmlEntityLookup(const xmlChar *name);
81 const htmlEntityDesc * htmlEntityValueLookup(unsigned int value);
83 int htmlIsAutoClosed(htmlDocPtr doc,
85 int htmlAutoCloseTag(htmlDocPtr doc,
88 const htmlEntityDesc * htmlParseEntityRef(htmlParserCtxtPtr ctxt,
90 int htmlParseCharRef(htmlParserCtxtPtr ctxt);
91 void htmlParseElement(htmlParserCtxtPtr ctxt);
93 htmlParserCtxtPtr htmlCreateMemoryParserCtxt(const char *buffer,
96 int htmlParseDocument(htmlParserCtxtPtr ctxt);
97 htmlDocPtr htmlSAXParseDoc (xmlChar *cur,
99 htmlSAXHandlerPtr sax,
101 htmlDocPtr htmlParseDoc (xmlChar *cur,
102 const char *encoding);
103 htmlDocPtr htmlSAXParseFile(const char *filename,
104 const char *encoding,
105 htmlSAXHandlerPtr sax,
107 htmlDocPtr htmlParseFile (const char *filename,
108 const char *encoding);
109 int UTF8ToHtml (unsigned char *out,
111 const unsigned char *in,
113 int htmlEncodeEntities(unsigned char *out,
115 const unsigned char *in,
116 int *inlen, int quoteChar);
117 int htmlIsScriptAttribute(const xmlChar *name);
118 int htmlHandleOmittedElem(int val);
121 * Interfaces for the Push mode.
123 void htmlFreeParserCtxt (htmlParserCtxtPtr ctxt);
124 htmlParserCtxtPtr htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax,
128 const char *filename,
129 xmlCharEncoding enc);
130 int htmlParseChunk (htmlParserCtxtPtr ctxt,
135 /* NRK/Jan2003: further knowledge of HTML structure
138 HTML_NA = 0 , /* something we don't check at all */
140 HTML_DEPRECATED = 0x2 ,
142 HTML_REQUIRED = 0xc /* VALID bit set so ( & HTML_VALID ) is TRUE */
145 /* Using htmlElemDesc rather than name here, to emphasise the fact
146 that otherwise there's a lookup overhead
148 htmlStatus htmlAttrAllowed(const htmlElemDesc*, const xmlChar*, int) ;
149 int htmlElementAllowedHere(const htmlElemDesc*, const xmlChar*) ;
150 htmlStatus htmlElementStatusHere(const htmlElemDesc*, const htmlElemDesc*) ;
151 htmlStatus htmlNodeStatus(const htmlNodePtr, int) ;
/*
 * Convenience accessors over an element descriptor pointer.
 *
 *  - htmlDefaultSubelement(elt):            reads elt->defaultsubelt.
 *  - htmlElementAllowedHereDesc(parent,elt): forwards to
 *        htmlElementAllowedHere(), passing the child's name field.
 *  - htmlRequiredAttrs(elt):                reads elt->attrs_req.
 *
 * Every macro argument is parenthesized so that expression arguments such
 * as "table + i" bind correctly, and the field accessors are wrapped as a
 * whole so they behave as a single primary expression at the call site.
 * Each argument is evaluated exactly once.
 */
#define htmlDefaultSubelement(elt) ((elt)->defaultsubelt)
#define htmlElementAllowedHereDesc(parent,elt) \
	htmlElementAllowedHere((parent), (elt)->name)
#define htmlRequiredAttrs(elt) ((elt)->attrs_req)
162 #endif /* __HTML_PARSER_H__ */