2 * testHTML.c : a small tester program for HTML input.
4 * See Copyright for the status of this software.
11 #ifdef LIBXML_HTML_ENABLED
17 #ifdef HAVE_SYS_TYPES_H
18 #include <sys/types.h>
20 #ifdef HAVE_SYS_STAT_H
33 #include <libxml/xmlmemory.h>
34 #include <libxml/HTMLparser.h>
35 #include <libxml/HTMLtree.h>
36 #include <libxml/debugXML.h>
37 #include <libxml/xmlerror.h>
38 #include <libxml/globals.h>
40 #ifdef LIBXML_DEBUG_ENABLED
45 static int repeat = 0;
48 static char *encoding = NULL;
50 xmlSAXHandler emptySAXHandlerStruct = {
51 NULL, /* internalSubset */
52 NULL, /* isStandalone */
53 NULL, /* hasInternalSubset */
54 NULL, /* hasExternalSubset */
55 NULL, /* resolveEntity */
57 NULL, /* entityDecl */
58 NULL, /* notationDecl */
59 NULL, /* attributeDecl */
60 NULL, /* elementDecl */
61 NULL, /* unparsedEntityDecl */
62 NULL, /* setDocumentLocator */
63 NULL, /* startDocument */
64 NULL, /* endDocument */
65 NULL, /* startElement */
66 NULL, /* endElement */
68 NULL, /* characters */
69 NULL, /* ignorableWhitespace */
70 NULL, /* processingInstruction */
72 NULL, /* xmlParserWarning */
73 NULL, /* xmlParserError */
74 NULL, /* xmlParserError */
75 NULL, /* getParameterEntity */
76 NULL, /* cdataBlock */
77 NULL, /* externalSubset */
81 xmlSAXHandlerPtr emptySAXHandler = &emptySAXHandlerStruct;
82 extern xmlSAXHandlerPtr debugSAXHandler;
84 /************************************************************************
88 ************************************************************************/
92 * @ctxt: An XML parser context
94 * Is this document tagged standalone ?
99 isStandaloneDebug(void *ctx ATTRIBUTE_UNUSED)
101 fprintf(stdout, "SAX.isStandalone()\n");
106 * hasInternalSubsetDebug:
107 * @ctxt: An XML parser context
109 * Does this document have an internal subset?
114 hasInternalSubsetDebug(void *ctx ATTRIBUTE_UNUSED)
116 fprintf(stdout, "SAX.hasInternalSubset()\n");
121 * hasExternalSubsetDebug:
122 * @ctxt: An XML parser context
124 * Does this document have an external subset?
129 hasExternalSubsetDebug(void *ctx ATTRIBUTE_UNUSED)
131 fprintf(stdout, "SAX.hasExternalSubset()\n");
136 * internalSubsetDebug:
137 * @ctxt: An XML parser context
139 * Prints the internal subset declaration: its name, ExternalID and SystemID.
142 internalSubsetDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *name,
143 const xmlChar *ExternalID, const xmlChar *SystemID)
145 fprintf(stdout, "SAX.internalSubset(%s,", name);
146 if (ExternalID == NULL)
147 fprintf(stdout, " ,");
149 fprintf(stdout, " %s,", ExternalID);
150 if (SystemID == NULL)
151 fprintf(stdout, " )\n");
153 fprintf(stdout, " %s)\n", SystemID);
157 * resolveEntityDebug:
158 * @ctxt: An XML parser context
159 * @publicId: The public ID of the entity
160 * @systemId: The system ID of the entity
162 * Special entity resolver, better left to the parser, it has
163 * more context than the application layer.
164 * The default behaviour is to NOT resolve the entities, in that case
165 * the ENTITY_REF nodes are built in the structure (and the parameter
168 * Returns the xmlParserInputPtr if inlined or NULL for DOM behaviour.
170 static xmlParserInputPtr
171 resolveEntityDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *publicId, const xmlChar *systemId)
173 /* xmlParserCtxtPtr ctxt = (xmlParserCtxtPtr) ctx; */
176 fprintf(stdout, "SAX.resolveEntity(");
177 if (publicId != NULL)
178 fprintf(stdout, "%s", (char *)publicId);
180 fprintf(stdout, " ");
181 if (systemId != NULL)
182 fprintf(stdout, ", %s)\n", (char *)systemId);
184 fprintf(stdout, ", )\n");
186 if (systemId != NULL) {
187 return(xmlNewInputFromFile(ctxt, (char *) systemId));
195 * @ctxt: An XML parser context
196 * @name: The entity name
198 * Get an entity by name
200 * Returns the xmlParserInputPtr if inlined or NULL for DOM behaviour.
203 getEntityDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *name)
205 fprintf(stdout, "SAX.getEntity(%s)\n", name);
210 * getParameterEntityDebug:
211 * @ctxt: An XML parser context
212 * @name: The entity name
214 * Get a parameter entity by name
216 * Returns the xmlParserInputPtr
219 getParameterEntityDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *name)
221 fprintf(stdout, "SAX.getParameterEntity(%s)\n", name);
228 * @ctxt: An XML parser context
229 * @name: the entity name
230 * @type: the entity type
231 * @publicId: The public ID of the entity
232 * @systemId: The system ID of the entity
233 * @content: the entity value (without processing).
235 * An entity definition has been parsed
238 entityDeclDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *name, int type,
239 const xmlChar *publicId, const xmlChar *systemId, xmlChar *content)
241 fprintf(stdout, "SAX.entityDecl(%s, %d, %s, %s, %s)\n",
242 name, type, publicId, systemId, content);
246 * attributeDeclDebug:
247 * @ctxt: An XML parser context
248 * @name: the attribute name
249 * @type: the attribute type
251 * An attribute definition has been parsed
254 attributeDeclDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *elem, const xmlChar *name,
255 int type, int def, const xmlChar *defaultValue,
256 xmlEnumerationPtr tree ATTRIBUTE_UNUSED)
258 fprintf(stdout, "SAX.attributeDecl(%s, %s, %d, %d, %s, ...)\n",
259 elem, name, type, def, defaultValue);
264 * @ctxt: An XML parser context
265 * @name: the element name
266 * @type: the element type
267 * @content: the element value (without processing).
269 * An element definition has been parsed
272 elementDeclDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *name, int type,
273 xmlElementContentPtr content ATTRIBUTE_UNUSED)
275 fprintf(stdout, "SAX.elementDecl(%s, %d, ...)\n",
281 * @ctxt: An XML parser context
282 * @name: The name of the notation
283 * @publicId: The public ID of the entity
284 * @systemId: The system ID of the entity
286 * What to do when a notation declaration has been parsed.
289 notationDeclDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *name,
290 const xmlChar *publicId, const xmlChar *systemId)
292 fprintf(stdout, "SAX.notationDecl(%s, %s, %s)\n",
293 (char *) name, (char *) publicId, (char *) systemId);
297 * unparsedEntityDeclDebug:
298 * @ctxt: An XML parser context
299 * @name: The name of the entity
300 * @publicId: The public ID of the entity
301 * @systemId: The system ID of the entity
302 * @notationName: the name of the notation
304 * What to do when an unparsed entity declaration is parsed
307 unparsedEntityDeclDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *name,
308 const xmlChar *publicId, const xmlChar *systemId,
309 const xmlChar *notationName)
311 fprintf(stdout, "SAX.unparsedEntityDecl(%s, %s, %s, %s)\n",
312 (char *) name, (char *) publicId, (char *) systemId,
313 (char *) notationName);
317 * setDocumentLocatorDebug:
318 * @ctxt: An XML parser context
319 * @loc: A SAX Locator
321 * Receive the document locator at startup, actually xmlDefaultSAXLocator
322 * Everything is available on the context, so this is useless in our case.
325 setDocumentLocatorDebug(void *ctx ATTRIBUTE_UNUSED, xmlSAXLocatorPtr loc ATTRIBUTE_UNUSED)
327 fprintf(stdout, "SAX.setDocumentLocator()\n");
331 * startDocumentDebug:
332 * @ctxt: An XML parser context
334 * called when the document starts being processed.
337 startDocumentDebug(void *ctx ATTRIBUTE_UNUSED)
339 fprintf(stdout, "SAX.startDocument()\n");
344 * @ctxt: An XML parser context
346 * called when the document end has been detected.
349 endDocumentDebug(void *ctx ATTRIBUTE_UNUSED)
351 fprintf(stdout, "SAX.endDocument()\n");
356 * @ctxt: An XML parser context
357 * @name: The element name
359 * called when an opening tag has been processed.
362 startElementDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *name, const xmlChar **atts)
366 fprintf(stdout, "SAX.startElement(%s", (char *) name);
368 for (i = 0;(atts[i] != NULL);i++) {
369 fprintf(stdout, ", %s", atts[i++]);
370 if (atts[i] != NULL) {
371 unsigned char output[40];
372 const unsigned char *att = atts[i];
374 fprintf(stdout, "='");
375 while ((attlen = strlen((char*)att)) > 0) {
376 outlen = sizeof output - 1;
377 htmlEncodeEntities(output, &outlen, att, &attlen, '\'');
379 fprintf(stdout, "%s", (char *) output);
382 fprintf(stdout, "'");
386 fprintf(stdout, ")\n");
391 * @ctxt: An XML parser context
392 * @name: The element name
394 * called when the end of an element has been detected.
397 endElementDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *name)
399 fprintf(stdout, "SAX.endElement(%s)\n", (char *) name);
404 * @ctxt: An XML parser context
405 * @ch: a xmlChar string
406 * @len: the number of xmlChar
408 * receiving some chars from the parser.
409 * Question: how much at a time ???
412 charactersDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *ch, int len)
414 unsigned char output[40];
415 int inlen = len, outlen = 30;
417 htmlEncodeEntities(output, &outlen, ch, &inlen, 0);
420 fprintf(stdout, "SAX.characters(%s, %d)\n", output, len);
425 * @ctxt: An XML parser context
426 * @ch: a xmlChar string
427 * @len: the number of xmlChar
429 * receiving some cdata chars from the parser.
430 * Question: how much at a time ???
433 cdataDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *ch, int len)
435 unsigned char output[40];
436 int inlen = len, outlen = 30;
438 htmlEncodeEntities(output, &outlen, ch, &inlen, 0);
441 fprintf(stdout, "SAX.cdata(%s, %d)\n", output, len);
446 * @ctxt: An XML parser context
447 * @name: The entity name
449 * called when an entity reference is detected.
452 referenceDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *name)
454 fprintf(stdout, "SAX.reference(%s)\n", name);
458 * ignorableWhitespaceDebug:
459 * @ctxt: An XML parser context
460 * @ch: a xmlChar string
461 * @start: the first char in the string
462 * @len: the number of xmlChar
464 * receiving some ignorable whitespaces from the parser.
465 * Question: how much at a time ???
468 ignorableWhitespaceDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *ch, int len)
473 for (i = 0;(i<len) && (i < 30);i++)
477 fprintf(stdout, "SAX.ignorableWhitespace(%s, %d)\n", output, len);
481 * processingInstructionDebug:
482 * @ctxt: An XML parser context
483 * @target: the target name
484 * @data: the PI data
485 * @len: the number of xmlChar
487 * A processing instruction has been parsed.
490 processingInstructionDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *target,
493 fprintf(stdout, "SAX.processingInstruction(%s, %s)\n",
494 (char *) target, (char *) data);
499 * @ctxt: An XML parser context
500 * @value: the comment content
502 * A comment has been parsed.
505 commentDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *value)
507 fprintf(stdout, "SAX.comment(%s)\n", value);
512 * @ctxt: An XML parser context
513 * @msg: the message to display/transmit
514 * @...: extra parameters for the message display
516 * Display and format a warning message, gives file, line, position and
520 warningDebug(void *ctx ATTRIBUTE_UNUSED, const char *msg, ...)
525 fprintf(stdout, "SAX.warning: ");
526 vfprintf(stdout, msg, args);
532 * @ctxt: An XML parser context
533 * @msg: the message to display/transmit
534 * @...: extra parameters for the message display
536 * Display and format an error message, gives file, line, position and
540 errorDebug(void *ctx ATTRIBUTE_UNUSED, const char *msg, ...)
545 fprintf(stdout, "SAX.error: ");
546 vfprintf(stdout, msg, args);
552 * @ctxt: An XML parser context
553 * @msg: the message to display/transmit
554 * @...: extra parameters for the message display
556 * Display and format a fatalError message, gives file, line, position and
560 fatalErrorDebug(void *ctx ATTRIBUTE_UNUSED, const char *msg, ...)
565 fprintf(stdout, "SAX.fatalError: ");
566 vfprintf(stdout, msg, args);
570 xmlSAXHandler debugSAXHandlerStruct = {
573 hasInternalSubsetDebug,
574 hasExternalSubsetDebug,
581 unparsedEntityDeclDebug,
582 setDocumentLocatorDebug,
589 ignorableWhitespaceDebug,
590 processingInstructionDebug,
595 getParameterEntityDebug,
601 xmlSAXHandlerPtr debugSAXHandler = &debugSAXHandlerStruct;
602 /************************************************************************
606 ************************************************************************/
609 parseSAXFile(char *filename) {
610 htmlDocPtr doc = NULL;
613 * Empty callbacks for checking
618 f = fopen(filename, "r");
622 htmlParserCtxtPtr ctxt;
626 res = fread(chars, 1, 4, f);
628 ctxt = htmlCreatePushParserCtxt(emptySAXHandler, NULL,
629 chars, res, filename, XML_CHAR_ENCODING_NONE);
630 while ((res = fread(chars, 1, size, f)) > 0) {
631 htmlParseChunk(ctxt, chars, res, 0);
633 htmlParseChunk(ctxt, chars, 0, 1);
635 htmlFreeParserCtxt(ctxt);
638 fprintf(stdout, "htmlSAXParseFile returned non-NULL\n");
644 f = fopen(filename, "r");
648 htmlParserCtxtPtr ctxt;
652 res = fread(chars, 1, 4, f);
654 ctxt = htmlCreatePushParserCtxt(debugSAXHandler, NULL,
655 chars, res, filename, XML_CHAR_ENCODING_NONE);
656 while ((res = fread(chars, 1, size, f)) > 0) {
657 htmlParseChunk(ctxt, chars, res, 0);
659 htmlParseChunk(ctxt, chars, 0, 1);
661 htmlFreeParserCtxt(ctxt);
664 fprintf(stdout, "htmlSAXParseFile returned non-NULL\n");
671 doc = htmlSAXParseFile(filename, NULL, emptySAXHandler, NULL);
673 fprintf(stdout, "htmlSAXParseFile returned non-NULL\n");
681 doc = htmlSAXParseFile(filename, NULL, debugSAXHandler, NULL);
683 fprintf(stdout, "htmlSAXParseFile returned non-NULL\n");
691 parseAndPrintFile(char *filename) {
692 htmlDocPtr doc = NULL, tmp;
695 * build an HTML tree from a file;
700 f = fopen(filename, "r");
704 htmlParserCtxtPtr ctxt;
708 res = fread(chars, 1, 4, f);
710 ctxt = htmlCreatePushParserCtxt(NULL, NULL,
711 chars, res, filename, XML_CHAR_ENCODING_NONE);
712 while ((res = fread(chars, 1, size, f)) > 0) {
713 htmlParseChunk(ctxt, chars, res, 0);
715 htmlParseChunk(ctxt, chars, 0, 1);
717 htmlFreeParserCtxt(ctxt);
722 doc = htmlParseFile(filename, NULL);
725 xmlGenericError(xmlGenericErrorContext,
726 "Could not parse %s\n", filename);
730 * test intermediate copy if needed.
734 doc = xmlCopyDoc(doc, 1);
742 #ifdef LIBXML_DEBUG_ENABLED
745 htmlSaveFileEnc("-", doc, encoding);
747 htmlDocDump(stdout, doc);
749 xmlDebugDumpDocument(stdout, doc);
752 htmlSaveFileEnc("-", doc, encoding);
754 htmlDocDump(stdout, doc);
764 int main(int argc, char **argv) {
768 for (i = 1; i < argc ; i++) {
769 #ifdef LIBXML_DEBUG_ENABLED
770 if ((!strcmp(argv[i], "-debug")) || (!strcmp(argv[i], "--debug")))
774 if ((!strcmp(argv[i], "-copy")) || (!strcmp(argv[i], "--copy")))
776 else if ((!strcmp(argv[i], "-push")) || (!strcmp(argv[i], "--push")))
778 else if ((!strcmp(argv[i], "-sax")) || (!strcmp(argv[i], "--sax")))
780 else if ((!strcmp(argv[i], "-noout")) || (!strcmp(argv[i], "--noout")))
782 else if ((!strcmp(argv[i], "-repeat")) ||
783 (!strcmp(argv[i], "--repeat")))
785 else if ((!strcmp(argv[i], "-encode")) ||
786 (!strcmp(argv[i], "--encode"))) {
791 for (i = 1; i < argc ; i++) {
792 if ((!strcmp(argv[i], "-encode")) ||
793 (!strcmp(argv[i], "--encode"))) {
797 if (argv[i][0] != '-') {
799 for (count = 0;count < 100 * repeat;count++) {
801 parseSAXFile(argv[i]);
803 parseAndPrintFile(argv[i]);
807 parseSAXFile(argv[i]);
809 parseAndPrintFile(argv[i]);
815 printf("Usage : %s [--debug] [--copy] [--copy] HTMLfiles ...\n",
817 printf("\tParse the HTML files and output the result of the parsing\n");
818 #ifdef LIBXML_DEBUG_ENABLED
819 printf("\t--debug : dump a debug tree of the in-memory document\n");
821 printf("\t--copy : used to test the internal copy implementation\n");
822 printf("\t--sax : debug the sequence of SAX callbacks\n");
823 printf("\t--repeat : parse the file 100 times, for timing\n");
824 printf("\t--noout : do not print the result\n");
825 printf("\t--push : use the push mode parser\n");
826 printf("\t--encode encoding : output in the given encoding\n");
833 #else /* !LIBXML_HTML_ENABLED */
835 int main(int argc, char **argv) {
836 printf("%s : HTML support not compiled in\n", argv[0]);