2 * testHTML.c : a small tester program for HTML input.
4 * See Copyright for the status of this software.
11 #ifdef LIBXML_HTML_ENABLED
17 #ifdef HAVE_SYS_TYPES_H
18 #include <sys/types.h>
20 #ifdef HAVE_SYS_STAT_H
33 #include <libxml/xmlmemory.h>
34 #include <libxml/HTMLparser.h>
35 #include <libxml/HTMLtree.h>
36 #include <libxml/debugXML.h>
37 #include <libxml/xmlerror.h>
38 #include <libxml/globals.h>
40 #ifdef LIBXML_DEBUG_ENABLED
45 static int repeat
= 0;
48 static char *encoding
= NULL
;
50 xmlSAXHandler emptySAXHandlerStruct
= {
51 NULL
, /* internalSubset */
52 NULL
, /* isStandalone */
53 NULL
, /* hasInternalSubset */
54 NULL
, /* hasExternalSubset */
55 NULL
, /* resolveEntity */
57 NULL
, /* entityDecl */
58 NULL
, /* notationDecl */
59 NULL
, /* attributeDecl */
60 NULL
, /* elementDecl */
61 NULL
, /* unparsedEntityDecl */
62 NULL
, /* setDocumentLocator */
63 NULL
, /* startDocument */
64 NULL
, /* endDocument */
65 NULL
, /* startElement */
66 NULL
, /* endElement */
68 NULL
, /* characters */
69 NULL
, /* ignorableWhitespace */
70 NULL
, /* processingInstruction */
72 NULL
, /* xmlParserWarning */
73 NULL
, /* xmlParserError */
74 NULL
, /* fatalError */
75 NULL
, /* getParameterEntity */
76 NULL
, /* cdataBlock */
77 NULL
, /* externalSubset */
81 xmlSAXHandlerPtr emptySAXHandler
= &emptySAXHandlerStruct
;
82 extern xmlSAXHandlerPtr debugSAXHandler
;
84 /************************************************************************
88 ************************************************************************/
92 * @ctxt: An XML parser context
94 * Is this document tagged standalone ?
99 isStandaloneDebug(void *ctx ATTRIBUTE_UNUSED
)
101 fprintf(stdout
, "SAX.isStandalone()\n");
106 * hasInternalSubsetDebug:
107 * @ctxt: An XML parser context
109 * Does this document have an internal subset?
114 hasInternalSubsetDebug(void *ctx ATTRIBUTE_UNUSED
)
116 fprintf(stdout
, "SAX.hasInternalSubset()\n");
121 * hasExternalSubsetDebug:
122 * @ctxt: An XML parser context
124 * Does this document have an external subset?
129 hasExternalSubsetDebug(void *ctx ATTRIBUTE_UNUSED
)
131 fprintf(stdout
, "SAX.hasExternalSubset()\n");
136 * internalSubsetDebug:
137 * @ctxt: An XML parser context
139 * Report the document's internal subset (name, ExternalID, SystemID)
142 internalSubsetDebug(void *ctx ATTRIBUTE_UNUSED
, const xmlChar
*name
,
143 const xmlChar
*ExternalID
, const xmlChar
*SystemID
)
145 fprintf(stdout
, "SAX.internalSubset(%s,", name
);
146 if (ExternalID
== NULL
)
147 fprintf(stdout
, " ,");
149 fprintf(stdout
, " %s,", ExternalID
);
150 if (SystemID
== NULL
)
151 fprintf(stdout
, " )\n");
153 fprintf(stdout
, " %s)\n", SystemID
);
157 * resolveEntityDebug:
158 * @ctxt: An XML parser context
159 * @publicId: The public ID of the entity
160 * @systemId: The system ID of the entity
162 * Special entity resolver, better left to the parser, it has
163 * more context than the application layer.
164 * The default behaviour is to NOT resolve the entities, in that case
165 * the ENTITY_REF nodes are built in the structure (and the parameter
168 * Returns the xmlParserInputPtr if inlined or NULL for DOM behaviour.
170 static xmlParserInputPtr
171 resolveEntityDebug(void *ctx ATTRIBUTE_UNUSED
, const xmlChar
*publicId
, const xmlChar
*systemId
)
173 /* xmlParserCtxtPtr ctxt = (xmlParserCtxtPtr) ctx; */
176 fprintf(stdout
, "SAX.resolveEntity(");
177 if (publicId
!= NULL
)
178 fprintf(stdout
, "%s", (char *)publicId
);
180 fprintf(stdout
, " ");
181 if (systemId
!= NULL
)
182 fprintf(stdout
, ", %s)\n", (char *)systemId
);
184 fprintf(stdout
, ", )\n");
186 if (systemId != NULL) {
187 return(xmlNewInputFromFile(ctxt, (char *) systemId));
195 * @ctxt: An XML parser context
196 * @name: The entity name
198 * Get an entity by name
200 * Returns the xmlParserInputPtr if inlined or NULL for DOM behaviour.
203 getEntityDebug(void *ctx ATTRIBUTE_UNUSED
, const xmlChar
*name
)
205 fprintf(stdout
, "SAX.getEntity(%s)\n", name
);
210 * getParameterEntityDebug:
211 * @ctxt: An XML parser context
212 * @name: The entity name
214 * Get a parameter entity by name
216 * Returns the xmlParserInputPtr
219 getParameterEntityDebug(void *ctx ATTRIBUTE_UNUSED
, const xmlChar
*name
)
221 fprintf(stdout
, "SAX.getParameterEntity(%s)\n", name
);
228 * @ctxt: An XML parser context
229 * @name: the entity name
230 * @type: the entity type
231 * @publicId: The public ID of the entity
232 * @systemId: The system ID of the entity
233 * @content: the entity value (without processing).
235 * An entity definition has been parsed
238 entityDeclDebug(void *ctx ATTRIBUTE_UNUSED
, const xmlChar
*name
, int type
,
239 const xmlChar
*publicId
, const xmlChar
*systemId
, xmlChar
*content
)
241 fprintf(stdout
, "SAX.entityDecl(%s, %d, %s, %s, %s)\n",
242 name
, type
, publicId
, systemId
, content
);
246 * attributeDeclDebug:
247 * @ctxt: An XML parser context
248 * @name: the attribute name
249 * @type: the attribute type
251 * An attribute definition has been parsed
254 attributeDeclDebug(void *ctx ATTRIBUTE_UNUSED
, const xmlChar
*elem
, const xmlChar
*name
,
255 int type
, int def
, const xmlChar
*defaultValue
,
256 xmlEnumerationPtr tree ATTRIBUTE_UNUSED
)
258 fprintf(stdout
, "SAX.attributeDecl(%s, %s, %d, %d, %s, ...)\n",
259 elem
, name
, type
, def
, defaultValue
);
264 * @ctxt: An XML parser context
265 * @name: the element name
266 * @type: the element type
267 * @content: the element value (without processing).
269 * An element definition has been parsed
272 elementDeclDebug(void *ctx ATTRIBUTE_UNUSED
, const xmlChar
*name
, int type
,
273 xmlElementContentPtr content ATTRIBUTE_UNUSED
)
275 fprintf(stdout
, "SAX.elementDecl(%s, %d, ...)\n",
281 * @ctxt: An XML parser context
282 * @name: The name of the notation
283 * @publicId: The public ID of the entity
284 * @systemId: The system ID of the entity
286 * What to do when a notation declaration has been parsed.
289 notationDeclDebug(void *ctx ATTRIBUTE_UNUSED
, const xmlChar
*name
,
290 const xmlChar
*publicId
, const xmlChar
*systemId
)
292 fprintf(stdout
, "SAX.notationDecl(%s, %s, %s)\n",
293 (char *) name
, (char *) publicId
, (char *) systemId
);
297 * unparsedEntityDeclDebug:
298 * @ctxt: An XML parser context
299 * @name: The name of the entity
300 * @publicId: The public ID of the entity
301 * @systemId: The system ID of the entity
302 * @notationName: the name of the notation
304 * What to do when an unparsed entity declaration is parsed
307 unparsedEntityDeclDebug(void *ctx ATTRIBUTE_UNUSED
, const xmlChar
*name
,
308 const xmlChar
*publicId
, const xmlChar
*systemId
,
309 const xmlChar
*notationName
)
311 fprintf(stdout
, "SAX.unparsedEntityDecl(%s, %s, %s, %s)\n",
312 (char *) name
, (char *) publicId
, (char *) systemId
,
313 (char *) notationName
);
317 * setDocumentLocatorDebug:
318 * @ctxt: An XML parser context
319 * @loc: A SAX Locator
321 * Receive the document locator at startup, actually xmlDefaultSAXLocator
322 * Everything is available on the context, so this is useless in our case.
325 setDocumentLocatorDebug(void *ctx ATTRIBUTE_UNUSED
, xmlSAXLocatorPtr loc ATTRIBUTE_UNUSED
)
327 fprintf(stdout
, "SAX.setDocumentLocator()\n");
331 * startDocumentDebug:
332 * @ctxt: An XML parser context
334 * called when the document starts being processed.
337 startDocumentDebug(void *ctx ATTRIBUTE_UNUSED
)
339 fprintf(stdout
, "SAX.startDocument()\n");
344 * @ctxt: An XML parser context
346 * called when the document end has been detected.
349 endDocumentDebug(void *ctx ATTRIBUTE_UNUSED
)
351 fprintf(stdout
, "SAX.endDocument()\n");
356 * @ctxt: An XML parser context
357 * @name: The element name
359 * called when an opening tag has been processed.
362 startElementDebug(void *ctx ATTRIBUTE_UNUSED
, const xmlChar
*name
, const xmlChar
**atts
)
366 fprintf(stdout
, "SAX.startElement(%s", (char *) name
);
368 for (i
= 0;(atts
[i
] != NULL
);i
++) {
369 fprintf(stdout
, ", %s", atts
[i
++]);
370 if (atts
[i
] != NULL
) {
371 unsigned char output
[40];
372 const unsigned char *att
= atts
[i
];
374 fprintf(stdout
, "='");
375 while ((attlen
= strlen((char*)att
)) > 0) {
376 outlen
= sizeof output
- 1;
377 htmlEncodeEntities(output
, &outlen
, att
, &attlen
, '\'');
378 fprintf(stdout
, "%.*s", outlen
, output
);
381 fprintf(stdout
, "'");
385 fprintf(stdout
, ")\n");
390 * @ctxt: An XML parser context
391 * @name: The element name
393 * called when the end of an element has been detected.
396 endElementDebug(void *ctx ATTRIBUTE_UNUSED
, const xmlChar
*name
)
398 fprintf(stdout
, "SAX.endElement(%s)\n", (char *) name
);
403 * @ctxt: An XML parser context
404 * @ch: a xmlChar string
405 * @len: the number of xmlChar
407 * receiving some chars from the parser.
408 * Question: how much at a time ???
411 charactersDebug(void *ctx ATTRIBUTE_UNUSED
, const xmlChar
*ch
, int len
)
413 unsigned char output
[40];
414 int inlen
= len
, outlen
= 30;
416 htmlEncodeEntities(output
, &outlen
, ch
, &inlen
, 0);
419 fprintf(stdout
, "SAX.characters(%s, %d)\n", output
, len
);
424 * @ctxt: An XML parser context
425 * @ch: a xmlChar string
426 * @len: the number of xmlChar
428 * receiving some cdata chars from the parser.
429 * Question: how much at a time ???
432 cdataDebug(void *ctx ATTRIBUTE_UNUSED
, const xmlChar
*ch
, int len
)
434 unsigned char output
[40];
435 int inlen
= len
, outlen
= 30;
437 htmlEncodeEntities(output
, &outlen
, ch
, &inlen
, 0);
440 fprintf(stdout
, "SAX.cdata(%s, %d)\n", output
, len
);
445 * @ctxt: An XML parser context
446 * @name: The entity name
448 * called when an entity reference is detected.
451 referenceDebug(void *ctx ATTRIBUTE_UNUSED
, const xmlChar
*name
)
453 fprintf(stdout
, "SAX.reference(%s)\n", name
);
457 * ignorableWhitespaceDebug:
458 * @ctxt: An XML parser context
459 * @ch: a xmlChar string
460 * @start: the first char in the string
461 * @len: the number of xmlChar
463 * receiving some ignorable whitespace from the parser.
464 * Question: how much at a time ???
467 ignorableWhitespaceDebug(void *ctx ATTRIBUTE_UNUSED
, const xmlChar
*ch
, int len
)
472 for (i
= 0;(i
<len
) && (i
< 30);i
++)
476 fprintf(stdout
, "SAX.ignorableWhitespace(%s, %d)\n", output
, len
);
480 * processingInstructionDebug:
481 * @ctxt: An XML parser context
482 * @target: the target name
483 * @data: the PI data
484 * @len: the number of xmlChar
486 * A processing instruction has been parsed.
489 processingInstructionDebug(void *ctx ATTRIBUTE_UNUSED
, const xmlChar
*target
,
492 fprintf(stdout
, "SAX.processingInstruction(%s, %s)\n",
493 (char *) target
, (char *) data
);
498 * @ctxt: An XML parser context
499 * @value: the comment content
501 * A comment has been parsed.
504 commentDebug(void *ctx ATTRIBUTE_UNUSED
, const xmlChar
*value
)
506 fprintf(stdout
, "SAX.comment(%s)\n", value
);
511 * @ctxt: An XML parser context
512 * @msg: the message to display/transmit
513 * @...: extra parameters for the message display
515 * Display and format a warning message, giving file, line, position and
519 warningDebug(void *ctx ATTRIBUTE_UNUSED
, const char *msg
, ...)
524 fprintf(stdout
, "SAX.warning: ");
525 vfprintf(stdout
, msg
, args
);
531 * @ctxt: An XML parser context
532 * @msg: the message to display/transmit
533 * @...: extra parameters for the message display
535 * Display and format an error message, giving file, line, position and
539 errorDebug(void *ctx ATTRIBUTE_UNUSED
, const char *msg
, ...)
544 fprintf(stdout
, "SAX.error: ");
545 vfprintf(stdout
, msg
, args
);
551 * @ctxt: An XML parser context
552 * @msg: the message to display/transmit
553 * @...: extra parameters for the message display
555 * Display and format a fatalError message, giving file, line, position and
559 fatalErrorDebug(void *ctx ATTRIBUTE_UNUSED
, const char *msg
, ...)
564 fprintf(stdout
, "SAX.fatalError: ");
565 vfprintf(stdout
, msg
, args
);
569 xmlSAXHandler debugSAXHandlerStruct
= {
572 hasInternalSubsetDebug
,
573 hasExternalSubsetDebug
,
580 unparsedEntityDeclDebug
,
581 setDocumentLocatorDebug
,
588 ignorableWhitespaceDebug
,
589 processingInstructionDebug
,
594 getParameterEntityDebug
,
600 xmlSAXHandlerPtr debugSAXHandler
= &debugSAXHandlerStruct
;
601 /************************************************************************
605 ************************************************************************/
608 parseSAXFile(char *filename
) {
609 htmlDocPtr doc
= NULL
;
612 * Empty callbacks for checking
617 f
= fopen(filename
, "r");
621 htmlParserCtxtPtr ctxt
;
625 res
= fread(chars
, 1, 4, f
);
627 ctxt
= htmlCreatePushParserCtxt(emptySAXHandler
, NULL
,
628 chars
, res
, filename
, 0);
629 while ((res
= fread(chars
, 1, size
, f
)) > 0) {
630 htmlParseChunk(ctxt
, chars
, res
, 0);
632 htmlParseChunk(ctxt
, chars
, 0, 1);
634 htmlFreeParserCtxt(ctxt
);
637 fprintf(stdout
, "htmlSAXParseFile returned non-NULL\n");
643 f
= fopen(filename
, "r");
647 htmlParserCtxtPtr ctxt
;
651 res
= fread(chars
, 1, 4, f
);
653 ctxt
= htmlCreatePushParserCtxt(debugSAXHandler
, NULL
,
654 chars
, res
, filename
, 0);
655 while ((res
= fread(chars
, 1, size
, f
)) > 0) {
656 htmlParseChunk(ctxt
, chars
, res
, 0);
658 htmlParseChunk(ctxt
, chars
, 0, 1);
660 htmlFreeParserCtxt(ctxt
);
663 fprintf(stdout
, "htmlSAXParseFile returned non-NULL\n");
670 doc
= htmlSAXParseFile(filename
, NULL
, emptySAXHandler
, NULL
);
672 fprintf(stdout
, "htmlSAXParseFile returned non-NULL\n");
680 doc
= htmlSAXParseFile(filename
, NULL
, debugSAXHandler
, NULL
);
682 fprintf(stdout
, "htmlSAXParseFile returned non-NULL\n");
690 parseAndPrintFile(char *filename
) {
691 htmlDocPtr doc
= NULL
, tmp
;
694 * build an HTML tree from a file;
699 f
= fopen(filename
, "r");
703 htmlParserCtxtPtr ctxt
;
707 res
= fread(chars
, 1, 4, f
);
709 ctxt
= htmlCreatePushParserCtxt(NULL
, NULL
,
710 chars
, res
, filename
, 0);
711 while ((res
= fread(chars
, 1, size
, f
)) > 0) {
712 htmlParseChunk(ctxt
, chars
, res
, 0);
714 htmlParseChunk(ctxt
, chars
, 0, 1);
716 htmlFreeParserCtxt(ctxt
);
721 doc
= htmlParseFile(filename
, NULL
);
724 xmlGenericError(xmlGenericErrorContext
,
725 "Could not parse %s\n", filename
);
729 * test intermediate copy if needed.
733 doc
= xmlCopyDoc(doc
, 1);
741 #ifdef LIBXML_DEBUG_ENABLED
744 htmlSaveFileEnc("-", doc
, encoding
);
746 htmlDocDump(stdout
, doc
);
748 xmlDebugDumpDocument(stdout
, doc
);
751 htmlSaveFileEnc("-", doc
, encoding
);
753 htmlDocDump(stdout
, doc
);
763 int main(int argc
, char **argv
) {
767 for (i
= 1; i
< argc
; i
++) {
768 #ifdef LIBXML_DEBUG_ENABLED
769 if ((!strcmp(argv
[i
], "-debug")) || (!strcmp(argv
[i
], "--debug")))
773 if ((!strcmp(argv
[i
], "-copy")) || (!strcmp(argv
[i
], "--copy")))
775 else if ((!strcmp(argv
[i
], "-push")) || (!strcmp(argv
[i
], "--push")))
777 else if ((!strcmp(argv
[i
], "-sax")) || (!strcmp(argv
[i
], "--sax")))
779 else if ((!strcmp(argv
[i
], "-noout")) || (!strcmp(argv
[i
], "--noout")))
781 else if ((!strcmp(argv
[i
], "-repeat")) ||
782 (!strcmp(argv
[i
], "--repeat")))
784 else if ((!strcmp(argv
[i
], "-encode")) ||
785 (!strcmp(argv
[i
], "--encode"))) {
790 for (i
= 1; i
< argc
; i
++) {
791 if ((!strcmp(argv
[i
], "-encode")) ||
792 (!strcmp(argv
[i
], "--encode"))) {
796 if (argv
[i
][0] != '-') {
798 for (count
= 0;count
< 100 * repeat
;count
++) {
800 parseSAXFile(argv
[i
]);
802 parseAndPrintFile(argv
[i
]);
806 parseSAXFile(argv
[i
]);
808 parseAndPrintFile(argv
[i
]);
814 printf("Usage : %s [--debug] [--copy] [--copy] HTMLfiles ...\n",
816 printf("\tParse the HTML files and output the result of the parsing\n");
817 #ifdef LIBXML_DEBUG_ENABLED
818 printf("\t--debug : dump a debug tree of the in-memory document\n");
820 printf("\t--copy : used to test the internal copy implementation\n");
821 printf("\t--sax : debug the sequence of SAX callbacks\n");
822 printf("\t--repeat : parse the file 100 times, for timing\n");
823 printf("\t--noout : do not print the result\n");
824 printf("\t--push : use the push mode parser\n");
825 printf("\t--encode encoding : output in the given encoding\n");
832 #else /* !LIBXML_HTML_ENABLED */
834 int main(int argc
, char **argv
) {
835 printf("%s : HTML support not compiled in\n", argv
[0]);