2 * HTMLparser.c : an HTML 4.0 non-verifying parser
4 * See Copyright for the status of this software.
11 #ifdef LIBXML_HTML_ENABLED
20 #ifdef HAVE_SYS_STAT_H
33 #include <libxml/xmlmemory.h>
34 #include <libxml/tree.h>
35 #include <libxml/parser.h>
36 #include <libxml/parserInternals.h>
37 #include <libxml/xmlerror.h>
38 #include <libxml/HTMLparser.h>
39 #include <libxml/HTMLtree.h>
40 #include <libxml/entities.h>
41 #include <libxml/encoding.h>
42 #include <libxml/valid.h>
43 #include <libxml/xmlIO.h>
44 #include <libxml/globals.h>
/* Name-length and buffer-size constants for the HTML parser. */
#define HTML_MAX_NAMELEN 1000
#define HTML_PARSER_BIG_BUFFER_SIZE 1000
#define HTML_PARSER_BUFFER_SIZE 100

/* #define DEBUG_PUSH */

/*
 * Flag tested by htmlCheckImplied() and htmlCheckParagraph() before they
 * insert implied elements.  Initialised to 1.
 */
static int htmlOmittedDefaultValue = 1;
55 xmlChar
* htmlDecodeEntities(htmlParserCtxtPtr ctxt
, int len
,
56 xmlChar end
, xmlChar end2
, xmlChar end3
);
57 static void htmlParseComment(htmlParserCtxtPtr ctxt
);
59 /************************************************************************
61 * Parser stacks related functions and macros *
63 ************************************************************************/
66 * Generic function for accessing stacks in the Parser Context
69 #define PUSH_AND_POP(scope, type, name) \
70 scope int html##name##Push(htmlParserCtxtPtr ctxt, type value) { \
71 if (ctxt->name##Nr >= ctxt->name##Max) { \
72 ctxt->name##Max *= 2; \
73 ctxt->name##Tab = (type *) xmlRealloc(ctxt->name##Tab, \
74 ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \
75 if (ctxt->name##Tab == NULL) { \
76 xmlGenericError(xmlGenericErrorContext, \
77 "realloc failed !\n"); \
81 ctxt->name##Tab[ctxt->name##Nr] = value; \
83 return(ctxt->name##Nr++); \
85 scope type html##name##Pop(htmlParserCtxtPtr ctxt) { \
87 if (ctxt->name##Nr <= 0) return(0); \
89 if (ctxt->name##Nr < 0) return(0); \
90 if (ctxt->name##Nr > 0) \
91 ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \
94 ret = ctxt->name##Tab[ctxt->name##Nr]; \
95 ctxt->name##Tab[ctxt->name##Nr] = 0; \
99 /* PUSH_AND_POP(static, xmlNodePtr, node) */
100 PUSH_AND_POP(static, xmlChar
*, name
)
103 * Macros for accessing the content. Those should be used only by the parser,
106 * Dirty macros, i.e. one needs to make assumptions about the context to use them
108 * CUR_PTR return the current pointer to the xmlChar to be parsed.
109 * CUR returns the current xmlChar value, i.e. a 8 bit value if compiled
110 * in ISO-Latin or UTF-8, and the current 16 bit value if compiled
111 * in UNICODE mode. This should be used internally by the parser
112 * only to compare to ASCII values otherwise it would break when
113 * running with UTF-8 encoding.
114 * NXT(n) returns the n'th next xmlChar. Same as CUR it should be used only
115 * to compare on ASCII based substring.
116 * UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR
117 * it should be used only to compare on ASCII based substring.
118 * SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
119 * strings within the parser.
121 * Clean macros, not dependent on an ASCII context, expect UTF-8 encoding
123 * CURRENT Returns the current char value, with the full decoding of
124 * UTF-8 if we are using this mode. It returns an int.
125 * NEXT Skip to the next character, this does the proper decoding
126 * in UTF-8 mode. It also pops up unfinished entities on the fly.
127 * COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
/*
 * Accessors over the parser input.  Every macro expects a variable named
 * `ctxt` to be in scope.  Expansions are fully parenthesized so that they
 * compose safely inside larger expressions; NXT() and CUR_PTR still expand
 * to lvalues.  SKIP() evaluates its argument twice, so pass it only
 * side-effect-free expressions.
 */
#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) (ctxt->nbChars += (val), ctxt->input->cur += (val))

#define NXT(val) (ctxt->input->cur[(val)])

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR (ctxt->input->cur)

#define SHRINK xmlParserInputShrink(ctxt->input)

#define GROW xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT (xmlNextChar(ctxt), ctxt->nbChars++)

/* RAW yields -1 while a token is pending, otherwise the current byte. */
#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
159 #define NEXTL(l) do { \
160 if (*(ctxt->input->cur) == '\n') { \
161 ctxt->input->line++; ctxt->input->col = 1; \
162 } else ctxt->input->col++; \
163 ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++; \
168 if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \
169 if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
/*
 * CUR_CHAR(l)       current character via htmlCurrentChar(); its length is
 *                   stored in l.
 * CUR_SCHAR(s, l)   same through xmlStringCurrentChar() on the string s.
 * COPY_BUF(l,b,i,v) append the character v, whose encoded length is l, to
 *                   buffer b at index i and advance i: a plain byte store
 *                   when l is 1, xmlCopyChar() otherwise.
 *
 * Arguments are parenthesized and COPY_BUF is wrapped in do { } while (0)
 * so it behaves as a single statement; invoke it with a trailing ';'.
 */
#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

#define COPY_BUF(l,b,i,v)                                               \
    do {                                                                \
        if ((l) == 1) (b)[(i)++] = (xmlChar) (v);                       \
        else (i) += xmlCopyChar((l), &(b)[(i)], (v));                   \
    } while (0)
181 * @ctxt: the HTML parser context
182 * @len: pointer to the length of the char read
184 * The current char value, if using UTF-8 this may actually span multiple
185 * bytes in the input buffer. Implement the end of line normalization:
186 * 2.11 End-of-Line Handling
187 * If the encoding is unspecified, in the case we find an ISO-Latin-1
188 * char, then the encoding converter is plugged in automatically.
190 * Returns the current char value and its length
194 htmlCurrentChar(xmlParserCtxtPtr ctxt
, int *len
) {
195 if (ctxt
->instate
== XML_PARSER_EOF
)
198 if (ctxt
->token
!= 0) {
202 if (ctxt
->charset
== XML_CHAR_ENCODING_UTF8
) {
204 * We are supposed to handle UTF8, check it's valid
205 * From rfc2044: encoding of the Unicode values on UTF-8:
207 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
208 * 0000 0000-0000 007F 0xxxxxxx
209 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
210 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
212 * Check for the 0x110000 limit too
214 const unsigned char *cur
= ctxt
->input
->cur
;
221 xmlParserInputGrow(ctxt
->input
, INPUT_CHUNK
);
222 if ((cur
[1] & 0xc0) != 0x80)
224 if ((c
& 0xe0) == 0xe0) {
227 xmlParserInputGrow(ctxt
->input
, INPUT_CHUNK
);
228 if ((cur
[2] & 0xc0) != 0x80)
230 if ((c
& 0xf0) == 0xf0) {
232 xmlParserInputGrow(ctxt
->input
, INPUT_CHUNK
);
233 if (((c
& 0xf8) != 0xf0) ||
234 ((cur
[3] & 0xc0) != 0x80))
238 val
= (cur
[0] & 0x7) << 18;
239 val
|= (cur
[1] & 0x3f) << 12;
240 val
|= (cur
[2] & 0x3f) << 6;
241 val
|= cur
[3] & 0x3f;
245 val
= (cur
[0] & 0xf) << 12;
246 val
|= (cur
[1] & 0x3f) << 6;
247 val
|= cur
[2] & 0x3f;
252 val
= (cur
[0] & 0x1f) << 6;
253 val
|= cur
[1] & 0x3f;
256 ctxt
->errNo
= XML_ERR_INVALID_ENCODING
;
257 if ((ctxt
->sax
!= NULL
) &&
258 (ctxt
->sax
->error
!= NULL
))
259 ctxt
->sax
->error(ctxt
->userData
,
260 "Char 0x%X out of allowed range\n", val
);
261 ctxt
->wellFormed
= 0;
262 if (ctxt
->recovery
== 0) ctxt
->disableSAX
= 1;
268 return((int) *ctxt
->input
->cur
);
272 * Assume it's a fixed length encoding (1) with
273 * a compatible encoding for the ASCII set, since
274 * XML constructs only use < 128 chars
277 if ((int) *ctxt
->input
->cur
< 0x80)
278 return((int) *ctxt
->input
->cur
);
281 * Humm this is bad, do an automatic flow conversion
283 xmlSwitchEncoding(ctxt
, XML_CHAR_ENCODING_8859_1
);
284 ctxt
->charset
= XML_CHAR_ENCODING_UTF8
;
285 return(xmlCurrentChar(ctxt
, len
));
289 * If we detect a UTF8 error, that probably means that the
290 * input encoding didn't get properly advertised in the
291 * declaration header. Report the error and switch the encoding
292 * to ISO-Latin-1 (if you don't like this policy, just declare the
295 ctxt
->errNo
= XML_ERR_INVALID_ENCODING
;
296 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->error
!= NULL
)) {
297 ctxt
->sax
->error(ctxt
->userData
,
298 "Input is not proper UTF-8, indicate encoding !\n");
299 ctxt
->sax
->error(ctxt
->userData
, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
300 ctxt
->input
->cur
[0], ctxt
->input
->cur
[1],
301 ctxt
->input
->cur
[2], ctxt
->input
->cur
[3]);
304 ctxt
->charset
= XML_CHAR_ENCODING_8859_1
;
306 return((int) *ctxt
->input
->cur
);
310 * htmlSkipBlankChars:
311 * @ctxt: the HTML parser context
313 * Skip all blank characters found at that point in the input stream.
315 * Returns the number of space chars skipped
319 htmlSkipBlankChars(xmlParserCtxtPtr ctxt
) {
322 while (IS_BLANK(*(ctxt
->input
->cur
))) {
323 if ((*ctxt
->input
->cur
== 0) &&
324 (xmlParserInputGrow(ctxt
->input
, INPUT_CHUNK
) <= 0)) {
327 if (*(ctxt
->input
->cur
) == '\n') {
328 ctxt
->input
->line
++; ctxt
->input
->col
= 1;
329 } else ctxt
->input
->col
++;
332 if (*ctxt
->input
->cur
== 0)
333 xmlParserInputGrow(ctxt
->input
, INPUT_CHUNK
);
342 /************************************************************************
344 * The list of HTML elements and their properties *
346 ************************************************************************/
349 * Start Tag: 1 means the start tag can be omitted
350 * End Tag: 1 means the end tag can be omitted
351 * 2 means it's forbidden (empty elements)
352 * 3 means the tag is stylistic and should be closed easily
353 * Depr: this element is deprecated
354 * DTD: 1 means that this element is valid only in the Loose DTD
355 * 2 means that this element is valid only in the Frameset DTD
357 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
359 static const htmlElemDesc
360 html40ElementTable
[] = {
361 { "a", 0, 0, 0, 0, 0, 0, 1, "anchor " },
362 { "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form" },
363 { "acronym", 0, 0, 0, 0, 0, 0, 1, "" },
364 { "address", 0, 0, 0, 0, 0, 0, 0, "information on author " },
365 { "applet", 0, 0, 0, 0, 1, 1, 2, "java applet " },
366 { "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area " },
367 { "b", 0, 3, 0, 0, 0, 0, 1, "bold text style" },
368 { "base", 0, 2, 2, 1, 0, 0, 0, "document base uri " },
369 { "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " },
370 { "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride " },
371 { "big", 0, 3, 0, 0, 0, 0, 1, "large text style" },
372 { "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation " },
373 { "body", 1, 1, 0, 0, 0, 0, 0, "document body " },
374 { "br", 0, 2, 2, 1, 0, 0, 1, "forced line break " },
375 { "button", 0, 0, 0, 0, 0, 0, 2, "push button " },
376 { "caption", 0, 0, 0, 0, 0, 0, 0, "table caption " },
377 { "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center " },
378 { "cite", 0, 0, 0, 0, 0, 0, 1, "citation" },
379 { "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment" },
380 { "col", 0, 2, 2, 1, 0, 0, 0, "table column " },
381 { "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group " },
382 { "dd", 0, 1, 0, 0, 0, 0, 0, "definition description " },
383 { "del", 0, 0, 0, 0, 0, 0, 2, "deleted text " },
384 { "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition" },
385 { "dir", 0, 0, 0, 0, 1, 1, 0, "directory list" },
386 { "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container"},
387 { "dl", 0, 0, 0, 0, 0, 0, 0, "definition list " },
388 { "dt", 0, 1, 0, 0, 0, 0, 0, "definition term " },
389 { "em", 0, 3, 0, 0, 0, 0, 1, "emphasis" },
390 { "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group " },
391 { "font", 0, 3, 0, 0, 1, 1, 1, "local change to font " },
392 { "form", 0, 0, 0, 0, 0, 0, 0, "interactive form " },
393 { "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " },
394 { "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" },
395 { "h1", 0, 0, 0, 0, 0, 0, 0, "heading " },
396 { "h2", 0, 0, 0, 0, 0, 0, 0, "heading " },
397 { "h3", 0, 0, 0, 0, 0, 0, 0, "heading " },
398 { "h4", 0, 0, 0, 0, 0, 0, 0, "heading " },
399 { "h5", 0, 0, 0, 0, 0, 0, 0, "heading " },
400 { "h6", 0, 0, 0, 0, 0, 0, 0, "heading " },
401 { "head", 1, 1, 0, 0, 0, 0, 0, "document head " },
402 { "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " },
403 { "html", 1, 1, 0, 0, 0, 0, 0, "document root element " },
404 { "i", 0, 3, 0, 0, 0, 0, 1, "italic text style" },
405 { "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow " },
406 { "img", 0, 2, 2, 1, 0, 0, 1, "embedded image " },
407 { "input", 0, 2, 2, 1, 0, 0, 1, "form control " },
408 { "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text" },
409 { "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt " },
410 { "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user" },
411 { "label", 0, 0, 0, 0, 0, 0, 1, "form field label text " },
412 { "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend " },
413 { "li", 0, 1, 1, 0, 0, 0, 0, "list item " },
414 { "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link " },
415 { "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map " },
416 { "menu", 0, 0, 0, 0, 1, 1, 0, "menu list " },
417 { "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation " },
418 { "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering " },
419 { "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
420 { "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object " },
421 { "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list " },
422 { "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group " },
423 { "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " },
424 { "p", 0, 1, 0, 0, 0, 0, 0, "paragraph " },
425 { "param", 0, 2, 2, 1, 0, 0, 0, "named property value " },
426 { "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text " },
427 { "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation " },
428 { "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style" },
429 { "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc." },
430 { "script", 0, 0, 0, 0, 0, 0, 2, "script statements " },
431 { "select", 0, 0, 0, 0, 0, 0, 1, "option selector " },
432 { "small", 0, 3, 0, 0, 0, 0, 1, "small text style" },
433 { "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container " },
434 { "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text" },
435 { "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis" },
436 { "style", 0, 0, 0, 0, 0, 0, 0, "style info " },
437 { "sub", 0, 3, 0, 0, 0, 0, 1, "subscript" },
438 { "sup", 0, 3, 0, 0, 0, 0, 1, "superscript " },
439 { "table", 0, 0, 0, 0, 0, 0, 0, " " },
440 { "tbody", 1, 0, 0, 0, 0, 0, 0, "table body " },
441 { "td", 0, 0, 0, 0, 0, 0, 0, "table data cell" },
442 { "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field " },
443 { "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer " },
444 { "th", 0, 1, 0, 0, 0, 0, 0, "table header cell" },
445 { "thead", 0, 1, 0, 0, 0, 0, 0, "table header " },
446 { "title", 0, 0, 0, 0, 0, 0, 0, "document title " },
447 { "tr", 0, 0, 0, 0, 0, 0, 0, "table row " },
448 { "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style" },
449 { "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style" },
450 { "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list " },
451 { "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument" },
455 * start tags that imply the end of current element
457 static const char *htmlStartClose
[] = {
458 "form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
459 "dl", "ul", "ol", "menu", "dir", "address", "pre",
460 "listing", "xmp", "head", NULL
,
463 "body", "head", "style", "link", "title", "p", NULL
,
464 "li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
465 "pre", "listing", "xmp", "head", "li", NULL
,
466 "hr", "p", "head", NULL
,
467 "h1", "p", "head", NULL
,
468 "h2", "p", "head", NULL
,
469 "h3", "p", "head", NULL
,
470 "h4", "p", "head", NULL
,
471 "h5", "p", "head", NULL
,
472 "h6", "p", "head", NULL
,
473 "dir", "p", "head", NULL
,
474 "address", "p", "head", "ul", NULL
,
475 "pre", "p", "head", "ul", NULL
,
476 "listing", "p", "head", NULL
,
477 "xmp", "p", "head", NULL
,
478 "blockquote", "p", "head", NULL
,
479 "dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
481 "dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
483 "dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
485 "ul", "p", "head", "ol", "menu", "dir", "address", "pre",
486 "listing", "xmp", NULL
,
487 "ol", "p", "head", "ul", NULL
,
488 "menu", "p", "head", "ul", NULL
,
489 "p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL
,
490 "div", "p", "head", NULL
,
491 "noscript", "p", "head", NULL
,
492 "center", "font", "b", "i", "p", "head", NULL
,
494 "caption", "p", NULL
,
495 "colgroup", "caption", "colgroup", "col", "p", NULL
,
496 "col", "caption", "col", "p", NULL
,
497 "table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
498 "listing", "xmp", "a", NULL
,
499 "th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL
,
500 "td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL
,
501 "tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL
,
502 "thead", "caption", "col", "colgroup", NULL
,
503 "tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
505 "tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
506 "tfoot", "tbody", "p", NULL
,
507 "optgroup", "option", NULL
,
508 "option", "option", NULL
,
509 "fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
510 "pre", "listing", "xmp", "a", NULL
,
515 * The list of HTML elements which are supposed not to have
516 * CDATA content and where a p element will be implied
518 * TODO: extend that list by reading the HTML SGML DTD on
521 static const char *htmlNoContentElements
[] = {
529 * The list of HTML attributes which are of content %Script;
530 * NOTE: when adding ones, check htmlIsScriptAttribute() since
531 * it assumes the name starts with 'on'
533 static const char *htmlScriptAttributes
[] = {
555 * This table is used by the htmlparser to know what to do with
556 * broken html pages. By assigning different priorities to different
557 * elements the parser can decide how to handle extra endtags.
558 * Endtags are only allowed to close elements with lower or equal
567 static const elementPriority htmlEndPriority
[] = {
579 {NULL
, 100} /* Default priority */
/*
 * Lookup index over htmlStartClose: htmlInitAutoClose() stores in each slot
 * a pointer to the first string of one NULL-terminated group, and
 * htmlCheckAutoClose() scans the slots until it meets a NULL one.
 * htmlStartCloseIndexinitialized is set to 1 once the index has been built.
 */
static const char **htmlStartCloseIndex[100];
static int htmlStartCloseIndexinitialized = 0;
585 /************************************************************************
587 * functions to handle HTML specific data *
589 ************************************************************************/
594 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
595 * This is not reentrant. Call xmlInitParser() once before processing in
596 * case of use in multithreaded programs.
599 htmlInitAutoClose(void) {
602 if (htmlStartCloseIndexinitialized
) return;
604 for (indx
= 0;indx
< 100;indx
++) htmlStartCloseIndex
[indx
] = NULL
;
606 while ((htmlStartClose
[i
] != NULL
) && (indx
< 100 - 1)) {
607 htmlStartCloseIndex
[indx
++] = &htmlStartClose
[i
];
608 while (htmlStartClose
[i
] != NULL
) i
++;
611 htmlStartCloseIndexinitialized
= 1;
616 * @tag: The tag name in lowercase
618 * Lookup the HTML tag in the ElementTable
620 * Returns the related htmlElemDescPtr or NULL if not found.
623 htmlTagLookup(const xmlChar
*tag
) {
626 for (i
= 0; i
< (sizeof(html40ElementTable
) /
627 sizeof(html40ElementTable
[0]));i
++) {
628 if (!xmlStrcasecmp(tag
, BAD_CAST html40ElementTable
[i
].name
))
629 return((const htmlElemDescPtr
) (const htmlElemDescPtr
) (const htmlElemDescPtr
) (const htmlElemDescPtr
) (const htmlElemDescPtr
) (const htmlElemDescPtr
) (const htmlElemDescPtr
) (const htmlElemDescPtr
) (const htmlElemDescPtr
) &html40ElementTable
[i
]);
635 * htmlGetEndPriority:
636 * @name: The name of the element to look up the priority for.
638 * Return value: The "endtag" priority.
641 htmlGetEndPriority (const xmlChar
*name
) {
644 while ((htmlEndPriority
[i
].name
!= NULL
) &&
645 (!xmlStrEqual((const xmlChar
*)htmlEndPriority
[i
].name
, name
)))
648 return(htmlEndPriority
[i
].priority
);
652 * htmlCheckAutoClose:
653 * @newtag: The new tag name
654 * @oldtag: The old tag name
656 * Checks whether the new tag is one of the registered valid tags for
658 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
660 * Returns 0 if no, 1 if yes.
663 htmlCheckAutoClose(const xmlChar
*newtag
, const xmlChar
*oldtag
) {
665 const char **closed
= NULL
;
667 if (htmlStartCloseIndexinitialized
== 0) htmlInitAutoClose();
669 /* inefficient, but not a big deal */
670 for (indx
= 0; indx
< 100;indx
++) {
671 closed
= htmlStartCloseIndex
[indx
];
672 if (closed
== NULL
) return(0);
673 if (xmlStrEqual(BAD_CAST
*closed
, newtag
)) break;
676 i
= closed
- htmlStartClose
;
678 while (htmlStartClose
[i
] != NULL
) {
679 if (xmlStrEqual(BAD_CAST htmlStartClose
[i
], oldtag
)) {
688 * htmlAutoCloseOnClose:
689 * @ctxt: an HTML parser context
690 * @newtag: The new tag name
691 * @force: force the tag closure
693 * The HTML DTD allows an ending tag to implicitly close other tags.
696 htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt
, const xmlChar
*newtag
) {
697 const htmlElemDesc
* info
;
702 xmlGenericError(xmlGenericErrorContext
,"Close of %s stack: %d elements\n", newtag
, ctxt
->nameNr
);
703 for (i
= 0;i
< ctxt
->nameNr
;i
++)
704 xmlGenericError(xmlGenericErrorContext
,"%d : %s\n", i
, ctxt
->nameTab
[i
]);
707 priority
= htmlGetEndPriority (newtag
);
709 for (i
= (ctxt
->nameNr
- 1);i
>= 0;i
--) {
711 if (xmlStrEqual(newtag
, ctxt
->nameTab
[i
])) break;
713 * A misplaced endtag can only close elements with lower
714 * or equal priority, so if we find an element with higher
715 * priority before we find an element with
716 * matching name, we just ignore this endtag
718 if (htmlGetEndPriority (ctxt
->nameTab
[i
]) > priority
) return;
722 while (!xmlStrEqual(newtag
, ctxt
->name
)) {
723 info
= htmlTagLookup(ctxt
->name
);
724 if ((info
== NULL
) || (info
->endTag
== 1)) {
726 xmlGenericError(xmlGenericErrorContext
,"htmlAutoCloseOnClose: %s closes %s\n", newtag
, ctxt
->name
);
728 } else if (info
->endTag
== 3) {
730 xmlGenericError(xmlGenericErrorContext
,"End of tag %s: expecting %s\n", newtag
, ctxt
->name
);
733 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->error
!= NULL
))
734 ctxt
->sax
->error(ctxt
->userData
,
735 "Opening and ending tag mismatch: %s and %s\n",
737 ctxt
->wellFormed
= 0;
739 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->endElement
!= NULL
))
740 ctxt
->sax
->endElement(ctxt
->userData
, ctxt
->name
);
741 oldname
= htmlnamePop(ctxt
);
742 if (oldname
!= NULL
) {
744 xmlGenericError(xmlGenericErrorContext
,"htmlAutoCloseOnClose: popped %s\n", oldname
);
752 * htmlAutoCloseOnEnd:
753 * @ctxt: an HTML parser context
755 * Close all remaining tags at the end of the stream
758 htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt
) {
762 if (ctxt
->nameNr
== 0)
765 xmlGenericError(xmlGenericErrorContext
,"Close of stack: %d elements\n", ctxt
->nameNr
);
768 for (i
= (ctxt
->nameNr
- 1);i
>= 0;i
--) {
770 xmlGenericError(xmlGenericErrorContext
,"%d : %s\n", i
, ctxt
->nameTab
[i
]);
772 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->endElement
!= NULL
))
773 ctxt
->sax
->endElement(ctxt
->userData
, ctxt
->name
);
774 oldname
= htmlnamePop(ctxt
);
775 if (oldname
!= NULL
) {
777 xmlGenericError(xmlGenericErrorContext
,"htmlAutoCloseOnEnd: popped %s\n", oldname
);
786 * @ctxt: an HTML parser context
787 * @newtag: The new tag name or NULL
789 * The HTML DTD allows a tag to implicitly close other tags.
790 * The list is kept in htmlStartClose array. This function is
791 * called when a new tag has been detected and generates the
792 * appropriate closes if possible/needed.
793 * If newtag is NULL this means we are at the end of the resource
794 * and we should check
797 htmlAutoClose(htmlParserCtxtPtr ctxt
, const xmlChar
*newtag
) {
799 while ((newtag
!= NULL
) && (ctxt
->name
!= NULL
) &&
800 (htmlCheckAutoClose(newtag
, ctxt
->name
))) {
802 xmlGenericError(xmlGenericErrorContext
,"htmlAutoClose: %s closes %s\n", newtag
, ctxt
->name
);
804 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->endElement
!= NULL
))
805 ctxt
->sax
->endElement(ctxt
->userData
, ctxt
->name
);
806 oldname
= htmlnamePop(ctxt
);
807 if (oldname
!= NULL
) {
809 xmlGenericError(xmlGenericErrorContext
,"htmlAutoClose: popped %s\n", oldname
);
814 if (newtag
== NULL
) {
815 htmlAutoCloseOnEnd(ctxt
);
818 while ((newtag
== NULL
) && (ctxt
->name
!= NULL
) &&
819 ((xmlStrEqual(ctxt
->name
, BAD_CAST
"head")) ||
820 (xmlStrEqual(ctxt
->name
, BAD_CAST
"body")) ||
821 (xmlStrEqual(ctxt
->name
, BAD_CAST
"html")))) {
823 xmlGenericError(xmlGenericErrorContext
,"htmlAutoClose: EOF closes %s\n", ctxt
->name
);
825 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->endElement
!= NULL
))
826 ctxt
->sax
->endElement(ctxt
->userData
, ctxt
->name
);
827 oldname
= htmlnamePop(ctxt
);
828 if (oldname
!= NULL
) {
830 xmlGenericError(xmlGenericErrorContext
,"htmlAutoClose: popped %s\n", oldname
);
840 * @doc: the HTML document
841 * @name: The tag name
842 * @elem: the HTML element
844 * The HTML DTD allows a tag to implicitly close other tags.
845 * The list is kept in htmlStartClose array. This function checks
846 * if the element or one of its children would autoclose the
849 * Returns 1 if autoclose, 0 otherwise
852 htmlAutoCloseTag(htmlDocPtr doc
, const xmlChar
*name
, htmlNodePtr elem
) {
855 if (elem
== NULL
) return(1);
856 if (xmlStrEqual(name
, elem
->name
)) return(0);
857 if (htmlCheckAutoClose(elem
->name
, name
)) return(1);
858 child
= elem
->children
;
859 while (child
!= NULL
) {
860 if (htmlAutoCloseTag(doc
, name
, child
)) return(1);
868 * @doc: the HTML document
869 * @elem: the HTML element
871 * The HTML DTD allows a tag to implicitly close other tags.
872 * The list is kept in htmlStartClose array. This function checks
873 * if a tag is autoclosed by one of its children
875 * Returns 1 if autoclosed, 0 otherwise
878 htmlIsAutoClosed(htmlDocPtr doc
, htmlNodePtr elem
) {
881 if (elem
== NULL
) return(1);
882 child
= elem
->children
;
883 while (child
!= NULL
) {
884 if (htmlAutoCloseTag(doc
, elem
->name
, child
)) return(1);
892 * @ctxt: an HTML parser context
893 * @newtag: The new tag name
895 * The HTML DTD allows a tag to exist only implicitly
896 * called when a new tag has been detected and generates the
897 * appropriate implicit tags if missing
900 htmlCheckImplied(htmlParserCtxtPtr ctxt
, const xmlChar
*newtag
) {
901 if (!htmlOmittedDefaultValue
)
903 if (xmlStrEqual(newtag
, BAD_CAST
"html"))
905 if (ctxt
->nameNr
<= 0) {
907 xmlGenericError(xmlGenericErrorContext
,"Implied element html: pushed html\n");
909 htmlnamePush(ctxt
, xmlStrdup(BAD_CAST
"html"));
910 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->startElement
!= NULL
))
911 ctxt
->sax
->startElement(ctxt
->userData
, BAD_CAST
"html", NULL
);
913 if ((xmlStrEqual(newtag
, BAD_CAST
"body")) || (xmlStrEqual(newtag
, BAD_CAST
"head")))
915 if ((ctxt
->nameNr
<= 1) &&
916 ((xmlStrEqual(newtag
, BAD_CAST
"script")) ||
917 (xmlStrEqual(newtag
, BAD_CAST
"style")) ||
918 (xmlStrEqual(newtag
, BAD_CAST
"meta")) ||
919 (xmlStrEqual(newtag
, BAD_CAST
"link")) ||
920 (xmlStrEqual(newtag
, BAD_CAST
"title")) ||
921 (xmlStrEqual(newtag
, BAD_CAST
"base")))) {
923 * dropped OBJECT ... if you put it first BODY will be
927 xmlGenericError(xmlGenericErrorContext
,"Implied element head: pushed head\n");
929 htmlnamePush(ctxt
, xmlStrdup(BAD_CAST
"head"));
930 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->startElement
!= NULL
))
931 ctxt
->sax
->startElement(ctxt
->userData
, BAD_CAST
"head", NULL
);
932 } else if ((!xmlStrEqual(newtag
, BAD_CAST
"noframes")) &&
933 (!xmlStrEqual(newtag
, BAD_CAST
"frame")) &&
934 (!xmlStrEqual(newtag
, BAD_CAST
"frameset"))) {
936 for (i
= 0;i
< ctxt
->nameNr
;i
++) {
937 if (xmlStrEqual(ctxt
->nameTab
[i
], BAD_CAST
"body")) {
940 if (xmlStrEqual(ctxt
->nameTab
[i
], BAD_CAST
"head")) {
946 xmlGenericError(xmlGenericErrorContext
,"Implied element body: pushed body\n");
948 htmlnamePush(ctxt
, xmlStrdup(BAD_CAST
"body"));
949 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->startElement
!= NULL
))
950 ctxt
->sax
->startElement(ctxt
->userData
, BAD_CAST
"body", NULL
);
956 * @ctxt: an HTML parser context
958 * Check whether a p element needs to be implied before inserting
959 * characters in the current element.
961 * Returns 1 if a paragraph has been inserted, 0 if not and -1
966 htmlCheckParagraph(htmlParserCtxtPtr ctxt
) {
974 htmlAutoClose(ctxt
, BAD_CAST
"p");
975 htmlCheckImplied(ctxt
, BAD_CAST
"p");
976 htmlnamePush(ctxt
, xmlStrdup(BAD_CAST
"p"));
977 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->startElement
!= NULL
))
978 ctxt
->sax
->startElement(ctxt
->userData
, BAD_CAST
"p", NULL
);
981 if (!htmlOmittedDefaultValue
)
983 for (i
= 0; htmlNoContentElements
[i
] != NULL
; i
++) {
984 if (xmlStrEqual(tag
, BAD_CAST htmlNoContentElements
[i
])) {
986 xmlGenericError(xmlGenericErrorContext
,"Implied element paragraph\n");
988 htmlAutoClose(ctxt
, BAD_CAST
"p");
989 htmlCheckImplied(ctxt
, BAD_CAST
"p");
990 htmlnamePush(ctxt
, xmlStrdup(BAD_CAST
"p"));
991 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->startElement
!= NULL
))
992 ctxt
->sax
->startElement(ctxt
->userData
, BAD_CAST
"p", NULL
);
1000 * htmlIsScriptAttribute:
1001 * @name: an attribute name
1003 * Check if an attribute is of content type Script
1005 * Returns 1 if the attribute is a script, 0 otherwise
1008 htmlIsScriptAttribute(const xmlChar
*name
) {
1014 * all script attributes start with 'on'
1016 if ((name
[0] != 'o') || (name
[1] != 'n'))
1019 i
< sizeof(htmlScriptAttributes
)/sizeof(htmlScriptAttributes
[0]);
1021 if (xmlStrEqual(name
, (const xmlChar
*) htmlScriptAttributes
[i
]))
1027 /************************************************************************
1029 * The list of HTML predefined entities *
1031 ************************************************************************/
1034 static const htmlEntityDesc html40EntitiesTable
[] = {
1036 * the 4 absolute ones, plus apostrophe.
1038 { 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1039 { 38, "amp", "ampersand, U+0026 ISOnum" },
1040 { 39, "apos", "single quote" },
1041 { 60, "lt", "less-than sign, U+003C ISOnum" },
1042 { 62, "gt", "greater-than sign, U+003E ISOnum" },
1045 * A bunch still in the 128-255 range
1046 * Replacing them depend really on the charset used.
1048 { 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1049 { 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1050 { 162, "cent", "cent sign, U+00A2 ISOnum" },
1051 { 163, "pound","pound sign, U+00A3 ISOnum" },
1052 { 164, "curren","currency sign, U+00A4 ISOnum" },
1053 { 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1054 { 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1055 { 167, "sect", "section sign, U+00A7 ISOnum" },
1056 { 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1057 { 169, "copy", "copyright sign, U+00A9 ISOnum" },
1058 { 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1059 { 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1060 { 172, "not", "not sign, U+00AC ISOnum" },
1061 { 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1062 { 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1063 { 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1064 { 176, "deg", "degree sign, U+00B0 ISOnum" },
1065 { 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1066 { 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1067 { 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1068 { 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1069 { 181, "micro","micro sign, U+00B5 ISOnum" },
1070 { 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1071 { 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1072 { 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1073 { 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1074 { 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1075 { 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1076 { 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1077 { 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1078 { 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1079 { 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1080 { 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1081 { 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1082 { 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1083 { 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1084 { 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1085 { 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1086 { 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1087 { 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1088 { 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1089 { 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1090 { 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1091 { 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1092 { 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1093 { 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1094 { 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1095 { 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1096 { 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1097 { 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1098 { 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1099 { 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1100 { 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1101 { 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1102 { 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1103 { 215, "times","multiplication sign, U+00D7 ISOnum" },
1104 { 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1105 { 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1106 { 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1107 { 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1108 { 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1109 { 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1110 { 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1111 { 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1112 { 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1113 { 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1114 { 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1115 { 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1116 { 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1117 { 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1118 { 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1119 { 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1120 { 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1121 { 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1122 { 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1123 { 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1124 { 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1125 { 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1126 { 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1127 { 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1128 { 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1129 { 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1130 { 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1131 { 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1132 { 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1133 { 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1134 { 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1135 { 247, "divide","division sign, U+00F7 ISOnum" },
1136 { 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1137 { 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1138 { 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1139 { 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1140 { 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1141 { 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1142 { 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1143 { 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1145 { 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1146 { 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1147 { 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1148 { 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1149 { 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1152 * Anything below should really be kept as entities references
1154 { 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1156 { 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1157 { 732, "tilde","small tilde, U+02DC ISOdia" },
1159 { 913, "Alpha","greek capital letter alpha, U+0391" },
1160 { 914, "Beta", "greek capital letter beta, U+0392" },
1161 { 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1162 { 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1163 { 917, "Epsilon","greek capital letter epsilon, U+0395" },
1164 { 918, "Zeta", "greek capital letter zeta, U+0396" },
1165 { 919, "Eta", "greek capital letter eta, U+0397" },
1166 { 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1167 { 921, "Iota", "greek capital letter iota, U+0399" },
1168 { 922, "Kappa","greek capital letter kappa, U+039A" },
1169 { 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
1170 { 924, "Mu", "greek capital letter mu, U+039C" },
1171 { 925, "Nu", "greek capital letter nu, U+039D" },
1172 { 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1173 { 927, "Omicron","greek capital letter omicron, U+039F" },
1174 { 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1175 { 929, "Rho", "greek capital letter rho, U+03A1" },
1176 { 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1177 { 932, "Tau", "greek capital letter tau, U+03A4" },
1178 { 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1179 { 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1180 { 935, "Chi", "greek capital letter chi, U+03A7" },
1181 { 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1182 { 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1184 { 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1185 { 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1186 { 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1187 { 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1188 { 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1189 { 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1190 { 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1191 { 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1192 { 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1193 { 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1194 { 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1195 { 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1196 { 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1197 { 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1198 { 959, "omicron","greek small letter omicron, U+03BF NEW" },
1199 { 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1200 { 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1201 { 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1202 { 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1203 { 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1204 { 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1205 { 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1206 { 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1207 { 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1208 { 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1209 { 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1210 { 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1211 { 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1213 { 8194, "ensp", "en space, U+2002 ISOpub" },
1214 { 8195, "emsp", "em space, U+2003 ISOpub" },
1215 { 8201, "thinsp","thin space, U+2009 ISOpub" },
1216 { 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1217 { 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1218 { 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1219 { 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1220 { 8211, "ndash","en dash, U+2013 ISOpub" },
1221 { 8212, "mdash","em dash, U+2014 ISOpub" },
1222 { 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1223 { 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1224 { 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1225 { 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1226 { 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1227 { 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1228 { 8224, "dagger","dagger, U+2020 ISOpub" },
1229 { 8225, "Dagger","double dagger, U+2021 ISOpub" },
1231 { 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1232 { 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1234 { 8240, "permil","per mille sign, U+2030 ISOtech" },
1236 { 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1237 { 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1239 { 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1240 { 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1242 { 8254, "oline","overline = spacing overscore, U+203E NEW" },
1243 { 8260, "frasl","fraction slash, U+2044 NEW" },
1245 { 8364, "euro", "euro sign, U+20AC NEW" },
1247 { 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1248 { 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1249 { 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1250 { 8482, "trade","trade mark sign, U+2122 ISOnum" },
1251 { 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1252 { 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1253 { 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1254 { 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1255 { 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1256 { 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1257 { 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1258 { 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1259 { 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1260 { 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1261 { 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1262 { 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1264 { 8704, "forall","for all, U+2200 ISOtech" },
1265 { 8706, "part", "partial differential, U+2202 ISOtech" },
1266 { 8707, "exist","there exists, U+2203 ISOtech" },
1267 { 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1268 { 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1269 { 8712, "isin", "element of, U+2208 ISOtech" },
1270 { 8713, "notin","not an element of, U+2209 ISOtech" },
1271 { 8715, "ni", "contains as member, U+220B ISOtech" },
1272 { 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
1273 { 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
1274 { 8722, "minus","minus sign, U+2212 ISOtech" },
1275 { 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1276 { 8730, "radic","square root = radical sign, U+221A ISOtech" },
1277 { 8733, "prop", "proportional to, U+221D ISOtech" },
1278 { 8734, "infin","infinity, U+221E ISOtech" },
1279 { 8736, "ang", "angle, U+2220 ISOamso" },
1280 { 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1281 { 8744, "or", "logical or = vee, U+2228 ISOtech" },
1282 { 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1283 { 8746, "cup", "union = cup, U+222A ISOtech" },
1284 { 8747, "int", "integral, U+222B ISOtech" },
1285 { 8756, "there4","therefore, U+2234 ISOtech" },
1286 { 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1287 { 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1288 { 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1289 { 8800, "ne", "not equal to, U+2260 ISOtech" },
1290 { 8801, "equiv","identical to, U+2261 ISOtech" },
1291 { 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1292 { 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1293 { 8834, "sub", "subset of, U+2282 ISOtech" },
1294 { 8835, "sup", "superset of, U+2283 ISOtech" },
1295 { 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1296 { 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1297 { 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1298 { 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1299 { 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1300 { 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1301 { 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1302 { 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1303 { 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1304 { 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1305 { 8971, "rfloor","right floor, U+230B ISOamsc" },
1306 { 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1307 { 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1308 { 9674, "loz", "lozenge, U+25CA ISOpub" },
1310 { 9824, "spades","black spade suit, U+2660 ISOpub" },
1311 { 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1312 { 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1313 { 9830, "diams","black diamond suit, U+2666 ISOpub" },
1317 /************************************************************************
1319 * Commodity functions to handle entities *
1321 ************************************************************************/
/*
 * Macro used to grow the current buffer.
 *
 * Doubles <buffer>_size and reallocates <buffer> to that size.  It relies
 * on a companion variable named <buffer>_size being in scope and may only
 * be expanded inside a function returning a pointer: when the reallocation
 * fails it reports the error and makes the enclosing function return NULL.
 *
 * The result of xmlRealloc() is kept in a temporary first, so that on
 * failure the previous allocation can still be released instead of being
 * leaked by overwriting the only pointer to it.
 */
#define growBuffer(buffer) {						\
    xmlChar *buffer##_grown;						\
    buffer##_size *= 2;							\
    buffer##_grown = (xmlChar *) xmlRealloc(buffer,			\
                              buffer##_size * sizeof(xmlChar));	\
    if (buffer##_grown == NULL) {					\
	xmlGenericError(xmlGenericErrorContext, "realloc failed\n");	\
	xmlFree(buffer);						\
	buffer = NULL;							\
	return(NULL);							\
    }									\
    buffer = buffer##_grown;						\
}
1337 * @name: the entity name
1339 * Lookup the given entity in EntitiesTable
1341 * TODO: the linear scan is really ugly, an hash table is really needed.
1343 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1345 const htmlEntityDesc
*
1346 htmlEntityLookup(const xmlChar
*name
) {
1349 for (i
= 0;i
< (sizeof(html40EntitiesTable
)/
1350 sizeof(html40EntitiesTable
[0]));i
++) {
1351 if (xmlStrEqual(name
, BAD_CAST html40EntitiesTable
[i
].name
)) {
1353 xmlGenericError(xmlGenericErrorContext
,"Found entity %s\n", name
);
1355 return((const htmlEntityDescPtr
) &html40EntitiesTable
[i
]);
1362 * htmlEntityValueLookup:
1363 * @value: the entity's unicode value
1365 * Lookup the given entity in EntitiesTable
1367 * TODO: the linear scan is really ugly, an hash table is really needed.
1369 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1371 const htmlEntityDesc
*
1372 htmlEntityValueLookup(unsigned int value
) {
1375 unsigned int lv
= 0;
1378 for (i
= 0;i
< (sizeof(html40EntitiesTable
)/
1379 sizeof(html40EntitiesTable
[0]));i
++) {
1380 if (html40EntitiesTable
[i
].value
>= value
) {
1381 if (html40EntitiesTable
[i
].value
> value
)
1384 xmlGenericError(xmlGenericErrorContext
,"Found entity %s\n", html40EntitiesTable
[i
].name
);
1386 return((const htmlEntityDescPtr
) &html40EntitiesTable
[i
]);
1389 if (lv
> html40EntitiesTable
[i
].value
) {
1390 xmlGenericError(xmlGenericErrorContext
,
1391 "html40EntitiesTable[] is not sorted (%d > %d)!\n",
1392 lv
, html40EntitiesTable
[i
].value
);
1394 lv
= html40EntitiesTable
[i
].value
;
1402 * @out: a pointer to an array of bytes to store the result
1403 * @outlen: the length of @out
1404 * @in: a pointer to an array of UTF-8 chars
1405 * @inlen: the length of @in
1407 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1408 * plus HTML entities block of chars out.
1410 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1411 * The value of @inlen after return is the number of octets consumed
1412 * as the return value is positive, else unpredictable.
1413 * The value of @outlen after return is the number of octets consumed.
1416 UTF8ToHtml(unsigned char* out
, int *outlen
,
1417 const unsigned char* in
, int *inlen
) {
1418 const unsigned char* processed
= in
;
1419 const unsigned char* outend
;
1420 const unsigned char* outstart
= out
;
1421 const unsigned char* instart
= in
;
1422 const unsigned char* inend
;
1428 * initialization nothing to do
1434 inend
= in
+ (*inlen
);
1435 outend
= out
+ (*outlen
);
1436 while (in
< inend
) {
1438 if (d
< 0x80) { c
= d
; trailing
= 0; }
1439 else if (d
< 0xC0) {
1440 /* trailing byte in leading position */
1441 *outlen
= out
- outstart
;
1442 *inlen
= processed
- instart
;
1444 } else if (d
< 0xE0) { c
= d
& 0x1F; trailing
= 1; }
1445 else if (d
< 0xF0) { c
= d
& 0x0F; trailing
= 2; }
1446 else if (d
< 0xF8) { c
= d
& 0x07; trailing
= 3; }
1448 /* no chance for this in Ascii */
1449 *outlen
= out
- outstart
;
1450 *inlen
= processed
- instart
;
1454 if (inend
- in
< trailing
) {
1458 for ( ; trailing
; trailing
--) {
1459 if ((in
>= inend
) || (((d
= *in
++) & 0xC0) != 0x80))
1465 /* assertion: c is a single UTF-4 value */
1467 if (out
+ 1 >= outend
)
1472 const htmlEntityDesc
* ent
;
1475 * Try to lookup a predefined HTML entity for it
1478 ent
= htmlEntityValueLookup(c
);
1480 /* no chance for this in Ascii */
1481 *outlen
= out
- outstart
;
1482 *inlen
= processed
- instart
;
1485 len
= strlen(ent
->name
);
1486 if (out
+ 2 + len
>= outend
)
1489 memcpy(out
, ent
->name
, len
);
1495 *outlen
= out
- outstart
;
1496 *inlen
= processed
- instart
;
1501 * htmlEncodeEntities:
1502 * @out: a pointer to an array of bytes to store the result
1503 * @outlen: the length of @out
1504 * @in: a pointer to an array of UTF-8 chars
1505 * @inlen: the length of @in
1506 * @quoteChar: the quote character to escape (' or ") or zero.
1508 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1509 * plus HTML entities block of chars out.
1511 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1512 * The value of @inlen after return is the number of octets consumed
1513 * as the return value is positive, else unpredictable.
1514 * The value of @outlen after return is the number of octets consumed.
1517 htmlEncodeEntities(unsigned char* out
, int *outlen
,
1518 const unsigned char* in
, int *inlen
, int quoteChar
) {
1519 const unsigned char* processed
= in
;
1520 const unsigned char* outend
= out
+ (*outlen
);
1521 const unsigned char* outstart
= out
;
1522 const unsigned char* instart
= in
;
1523 const unsigned char* inend
= in
+ (*inlen
);
1527 while (in
< inend
) {
1529 if (d
< 0x80) { c
= d
; trailing
= 0; }
1530 else if (d
< 0xC0) {
1531 /* trailing byte in leading position */
1532 *outlen
= out
- outstart
;
1533 *inlen
= processed
- instart
;
1535 } else if (d
< 0xE0) { c
= d
& 0x1F; trailing
= 1; }
1536 else if (d
< 0xF0) { c
= d
& 0x0F; trailing
= 2; }
1537 else if (d
< 0xF8) { c
= d
& 0x07; trailing
= 3; }
1539 /* no chance for this in Ascii */
1540 *outlen
= out
- outstart
;
1541 *inlen
= processed
- instart
;
1545 if (inend
- in
< trailing
)
1548 while (trailing
--) {
1549 if (((d
= *in
++) & 0xC0) != 0x80) {
1550 *outlen
= out
- outstart
;
1551 *inlen
= processed
- instart
;
1558 /* assertion: c is a single UTF-4 value */
1559 if ((c
< 0x80) && (c
!= (unsigned int) quoteChar
) &&
1560 (c
!= '&') && (c
!= '<') && (c
!= '>')) {
1565 const htmlEntityDesc
* ent
;
1571 * Try to lookup a predefined HTML entity for it
1573 ent
= htmlEntityValueLookup(c
);
1575 snprintf(nbuf
, sizeof(nbuf
), "#%u", c
);
1581 if (out
+ 2 + len
> outend
)
1584 memcpy(out
, cp
, len
);
1590 *outlen
= out
- outstart
;
1591 *inlen
= processed
- instart
;
1596 * htmlDecodeEntities:
1597 * @ctxt: the parser context
1598 * @len: the len to decode (in bytes !), -1 for no size limit
1599 * @end: an end marker xmlChar, 0 if none
1600 * @end2: an end marker xmlChar, 0 if none
1601 * @end3: an end marker xmlChar, 0 if none
1603 * Substitute the HTML entities by their value
1607 * Returns A newly allocated string with the substitution done. The caller
1608 * must deallocate it !
1611 htmlDecodeEntities(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED
, int len ATTRIBUTE_UNUSED
,
1612 xmlChar end ATTRIBUTE_UNUSED
, xmlChar end2 ATTRIBUTE_UNUSED
, xmlChar end3 ATTRIBUTE_UNUSED
) {
1613 static int deprecated
= 0;
1615 xmlGenericError(xmlGenericErrorContext
,
1616 "htmlDecodeEntities() deprecated function reached\n");
1622 /************************************************************************
1624 * Commodity functions to handle streams *
1626 ************************************************************************/
1629 * htmlNewInputStream:
1630 * @ctxt: an HTML parser context
1632 * Create a new input stream structure
1633 * Returns the new input stream or NULL
1635 static htmlParserInputPtr
1636 htmlNewInputStream(htmlParserCtxtPtr ctxt
) {
1637 htmlParserInputPtr input
;
1639 input
= (xmlParserInputPtr
) xmlMalloc(sizeof(htmlParserInput
));
1640 if (input
== NULL
) {
1641 ctxt
->errNo
= XML_ERR_NO_MEMORY
;
1642 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->error
!= NULL
))
1643 ctxt
->sax
->error(ctxt
->userData
,
1644 "malloc: couldn't allocate a new input stream\n");
1647 memset(input
, 0, sizeof(htmlParserInput
));
1648 input
->filename
= NULL
;
1649 input
->directory
= NULL
;
1657 input
->version
= NULL
;
1658 input
->consumed
= 0;
1664 /************************************************************************
1666 * Commodity functions, cleanup needed ? *
1668 ************************************************************************/
/*
 * all tags allowing pc data from the html 4.01 loose dtd
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array but I don't want to risk any
 *       binary incompatibility
 *
 * The table is read-only, so both the strings and the pointer slots are
 * declared const.
 */
static const char *const allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
    "blockquote", "body", "button", "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
    "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
    "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
    "small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
};
1686 * @ctxt: an HTML parser context
1688 * @len: the size of @str
1690 * Is this a sequence of blank chars that one can ignore ?
1692 * Returns 1 if ignorable 0 otherwise.
1695 static int areBlanks(htmlParserCtxtPtr ctxt
, const xmlChar
*str
, int len
) {
1698 xmlNodePtr lastChild
;
1700 for (j
= 0;j
< len
;j
++)
1701 if (!(IS_BLANK(str
[j
]))) return(0);
1703 if (CUR
== 0) return(1);
1704 if (CUR
!= '<') return(0);
1705 if (ctxt
->name
== NULL
)
1707 if (xmlStrEqual(ctxt
->name
, BAD_CAST
"html"))
1709 if (xmlStrEqual(ctxt
->name
, BAD_CAST
"head"))
1711 if (xmlStrEqual(ctxt
->name
, BAD_CAST
"body"))
1713 if (ctxt
->node
== NULL
) return(0);
1714 lastChild
= xmlGetLastChild(ctxt
->node
);
1715 if (lastChild
== NULL
) {
1716 if ((ctxt
->node
->type
!= XML_ELEMENT_NODE
) &&
1717 (ctxt
->node
->content
!= NULL
)) return(0);
1718 /* keep ws in constructs like ...<b> </b>...
1719 for all tags "b" allowing PCDATA */
1720 for ( i
= 0; i
< sizeof(allowPCData
)/sizeof(allowPCData
[0]); i
++ ) {
1721 if ( xmlStrEqual(ctxt
->name
, BAD_CAST allowPCData
[i
]) ) {
1725 } else if (xmlNodeIsText(lastChild
)) {
1728 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
1729 for all tags "p" allowing PCDATA */
1730 for ( i
= 0; i
< sizeof(allowPCData
)/sizeof(allowPCData
[0]); i
++ ) {
1731 if ( xmlStrEqual(lastChild
->name
, BAD_CAST allowPCData
[i
]) ) {
1741 * @URI: URI for the dtd, or NULL
1742 * @ExternalID: the external ID of the DTD, or NULL
1744 * Creates a new HTML document without a DTD node if @URI and @ExternalID
1747 * Returns a new document, do not initialize the DTD if not provided
1750 htmlNewDocNoDtD(const xmlChar
*URI
, const xmlChar
*ExternalID
) {
1754 * Allocate a new document and fill the fields.
1756 cur
= (xmlDocPtr
) xmlMalloc(sizeof(xmlDoc
));
1758 xmlGenericError(xmlGenericErrorContext
,
1759 "htmlNewDocNoDtD : malloc failed\n");
1762 memset(cur
, 0, sizeof(xmlDoc
));
1764 cur
->type
= XML_HTML_DOCUMENT_NODE
;
1765 cur
->version
= NULL
;
1766 cur
->intSubset
= NULL
;
1769 cur
->children
= NULL
;
1770 cur
->extSubset
= NULL
;
1772 cur
->encoding
= NULL
;
1773 cur
->standalone
= 1;
1774 cur
->compression
= 0;
1777 cur
->_private
= NULL
;
1778 if ((ExternalID
!= NULL
) ||
1780 xmlCreateIntSubset(cur
, BAD_CAST
"HTML", ExternalID
, URI
);
1786 * @URI: URI for the dtd, or NULL
1787 * @ExternalID: the external ID of the DTD, or NULL
1789 * Creates a new HTML document
1791 * Returns a new document
1794 htmlNewDoc(const xmlChar
*URI
, const xmlChar
*ExternalID
) {
1795 if ((URI
== NULL
) && (ExternalID
== NULL
))
1796 return(htmlNewDocNoDtD(
1797 BAD_CAST
"http://www.w3.org/TR/REC-html40/loose.dtd",
1798 BAD_CAST
"-//W3C//DTD HTML 4.0 Transitional//EN"));
1800 return(htmlNewDocNoDtD(URI
, ExternalID
));
1804 /************************************************************************
1806 * The parser itself *
1807 * Relates to http://www.w3.org/TR/html40 *
1809 ************************************************************************/
1811 /************************************************************************
1813 * The parser itself *
1815 ************************************************************************/
1818 * htmlParseHTMLName:
1819 * @ctxt: an HTML parser context
1821 * parse an HTML tag or attribute name, note that we convert it to lowercase
1822 * since HTML names are not case-sensitive.
1824 * Returns the Tag Name parsed or NULL
1828 htmlParseHTMLName(htmlParserCtxtPtr ctxt
) {
1829 xmlChar
*ret
= NULL
;
1831 xmlChar loc
[HTML_PARSER_BUFFER_SIZE
];
1833 if (!IS_LETTER(CUR
) && (CUR
!= '_') &&
1834 (CUR
!= ':')) return(NULL
);
1836 while ((i
< HTML_PARSER_BUFFER_SIZE
) &&
1837 ((IS_LETTER(CUR
)) || (IS_DIGIT(CUR
)) ||
1838 (CUR
== ':') || (CUR
== '-') || (CUR
== '_'))) {
1839 if ((CUR
>= 'A') && (CUR
<= 'Z')) loc
[i
] = CUR
+ 0x20;
1846 ret
= xmlStrndup(loc
, i
);
1853 * @ctxt: an HTML parser context
1855 * parse an HTML name, this routine is case sensitive.
1857 * Returns the Name parsed or NULL
1861 htmlParseName(htmlParserCtxtPtr ctxt
) {
1862 xmlChar buf
[HTML_MAX_NAMELEN
];
1866 if (!IS_LETTER(CUR
) && (CUR
!= '_')) {
1870 while ((IS_LETTER(CUR
)) || (IS_DIGIT(CUR
)) ||
1871 (CUR
== '.') || (CUR
== '-') ||
1872 (CUR
== '_') || (CUR
== ':') ||
1873 (IS_COMBINING(CUR
)) ||
1874 (IS_EXTENDER(CUR
))) {
1877 if (len
>= HTML_MAX_NAMELEN
) {
1878 xmlGenericError(xmlGenericErrorContext
,
1879 "htmlParseName: reached HTML_MAX_NAMELEN limit\n");
1880 while ((IS_LETTER(CUR
)) || (IS_DIGIT(CUR
)) ||
1881 (CUR
== '.') || (CUR
== '-') ||
1882 (CUR
== '_') || (CUR
== ':') ||
1883 (IS_COMBINING(CUR
)) ||
1889 return(xmlStrndup(buf
, len
));
1893 * htmlParseHTMLAttribute:
1894 * @ctxt: an HTML parser context
1895 * @stop: a char stop value
1897 * parse an HTML attribute value till the stop (quote), if
1898 * stop is 0 then it stops at the first space
1900 * Returns the attribute parsed or NULL
1904 htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt
, const xmlChar stop
) {
1905 xmlChar
*buffer
= NULL
;
1906 int buffer_size
= 0;
1907 xmlChar
*out
= NULL
;
1908 xmlChar
*name
= NULL
;
1910 xmlChar
*cur
= NULL
;
1911 const htmlEntityDesc
* ent
;
1914 * allocate a translation buffer.
1916 buffer_size
= HTML_PARSER_BUFFER_SIZE
;
1917 buffer
= (xmlChar
*) xmlMalloc(buffer_size
* sizeof(xmlChar
));
1918 if (buffer
== NULL
) {
1919 xmlGenericError(xmlGenericErrorContext
,
1920 "htmlParseHTMLAttribute: malloc failed\n");
1926 * Ok loop until we reach one of the ending chars
1928 while ((CUR
!= 0) && (CUR
!= stop
)) {
1929 if ((stop
== 0) && (CUR
== '>')) break;
1930 if ((stop
== 0) && (IS_BLANK(CUR
))) break;
1932 if (NXT(1) == '#') {
1936 c
= htmlParseCharRef(ctxt
);
1938 { *out
++ = c
; bits
= -6; }
1940 { *out
++ =((c
>> 6) & 0x1F) | 0xC0; bits
= 0; }
1941 else if (c
< 0x10000)
1942 { *out
++ =((c
>> 12) & 0x0F) | 0xE0; bits
= 6; }
1944 { *out
++ =((c
>> 18) & 0x07) | 0xF0; bits
= 12; }
1946 for ( ; bits
>= 0; bits
-= 6) {
1947 *out
++ = ((c
>> bits
) & 0x3F) | 0x80;
1950 if (out
- buffer
> buffer_size
- 100) {
1951 int indx
= out
- buffer
;
1954 out
= &buffer
[indx
];
1957 ent
= htmlParseEntityRef(ctxt
, &name
);
1960 if (out
- buffer
> buffer_size
- 100) {
1961 int indx
= out
- buffer
;
1964 out
= &buffer
[indx
];
1966 } else if (ent
== NULL
) {
1970 if (out
- buffer
> buffer_size
- 100) {
1971 int indx
= out
- buffer
;
1974 out
= &buffer
[indx
];
1983 if (out
- buffer
> buffer_size
- 100) {
1984 int indx
= out
- buffer
;
1987 out
= &buffer
[indx
];
1989 c
= (xmlChar
)ent
->value
;
1991 { *out
++ = c
; bits
= -6; }
1993 { *out
++ =((c
>> 6) & 0x1F) | 0xC0; bits
= 0; }
1994 else if (c
< 0x10000)
1995 { *out
++ =((c
>> 12) & 0x0F) | 0xE0; bits
= 6; }
1997 { *out
++ =((c
>> 18) & 0x07) | 0xF0; bits
= 12; }
1999 for ( ; bits
>= 0; bits
-= 6) {
2000 *out
++ = ((c
>> bits
) & 0x3F) | 0x80;
2009 if (out
- buffer
> buffer_size
- 100) {
2010 int indx
= out
- buffer
;
2013 out
= &buffer
[indx
];
2017 { *out
++ = c
; bits
= -6; }
2019 { *out
++ =((c
>> 6) & 0x1F) | 0xC0; bits
= 0; }
2020 else if (c
< 0x10000)
2021 { *out
++ =((c
>> 12) & 0x0F) | 0xE0; bits
= 6; }
2023 { *out
++ =((c
>> 18) & 0x07) | 0xF0; bits
= 12; }
2025 for ( ; bits
>= 0; bits
-= 6) {
2026 *out
++ = ((c
>> bits
) & 0x3F) | 0x80;
2036 * htmlParseEntityRef:
2037 * @ctxt: an HTML parser context
2038 * @str: location to store the entity name
2040 * parse an HTML ENTITY references
2042 * [68] EntityRef ::= '&' Name ';'
2044 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2045 * if non-NULL *str will have to be freed by the caller.
2047 const htmlEntityDesc
*
2048 htmlParseEntityRef(htmlParserCtxtPtr ctxt
, xmlChar
**str
) {
2050 const htmlEntityDesc
* ent
= NULL
;
2055 name
= htmlParseName(ctxt
);
2057 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->error
!= NULL
))
2058 ctxt
->sax
->error(ctxt
->userData
, "htmlParseEntityRef: no name\n");
2059 ctxt
->wellFormed
= 0;
2066 * Lookup the entity in the table.
2068 ent
= htmlEntityLookup(name
);
2069 if (ent
!= NULL
) /* OK that's ugly !!! */
2072 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->error
!= NULL
))
2073 ctxt
->sax
->error(ctxt
->userData
,
2074 "htmlParseEntityRef: expecting ';'\n");
2083 * htmlParseAttValue:
2084 * @ctxt: an HTML parser context
2086 * parse a value for an attribute
2087 * Note: the parser won't do substitution of entities here, this
2088 * will be handled later in xmlStringGetNodeList, unless it was
2089 * asked for ctxt->replaceEntities != 0
2091 * Returns the AttValue parsed or NULL.
2095 htmlParseAttValue(htmlParserCtxtPtr ctxt
) {
2096 xmlChar
*ret
= NULL
;
2100 ret
= htmlParseHTMLAttribute(ctxt
, '"');
2102 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->error
!= NULL
))
2103 ctxt
->sax
->error(ctxt
->userData
, "AttValue: ' expected\n");
2104 ctxt
->wellFormed
= 0;
2107 } else if (CUR
== '\'') {
2109 ret
= htmlParseHTMLAttribute(ctxt
, '\'');
2111 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->error
!= NULL
))
2112 ctxt
->sax
->error(ctxt
->userData
, "AttValue: ' expected\n");
2113 ctxt
->wellFormed
= 0;
2118 * That's an HTMLism, the attribute value may not be quoted
2120 ret
= htmlParseHTMLAttribute(ctxt
, 0);
2122 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->error
!= NULL
))
2123 ctxt
->sax
->error(ctxt
->userData
, "AttValue: no value found\n");
2124 ctxt
->wellFormed
= 0;
2131 * htmlParseSystemLiteral:
2132 * @ctxt: an HTML parser context
2134 * parse an HTML Literal
2136 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2138 * Returns the SystemLiteral parsed or NULL
2142 htmlParseSystemLiteral(htmlParserCtxtPtr ctxt
) {
2144 xmlChar
*ret
= NULL
;
2149 while ((IS_CHAR(CUR
)) && (CUR
!= '"'))
2151 if (!IS_CHAR(CUR
)) {
2152 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->error
!= NULL
))
2153 ctxt
->sax
->error(ctxt
->userData
, "Unfinished SystemLiteral\n");
2154 ctxt
->wellFormed
= 0;
2156 ret
= xmlStrndup(q
, CUR_PTR
- q
);
2159 } else if (CUR
== '\'') {
2162 while ((IS_CHAR(CUR
)) && (CUR
!= '\''))
2164 if (!IS_CHAR(CUR
)) {
2165 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->error
!= NULL
))
2166 ctxt
->sax
->error(ctxt
->userData
, "Unfinished SystemLiteral\n");
2167 ctxt
->wellFormed
= 0;
2169 ret
= xmlStrndup(q
, CUR_PTR
- q
);
2173 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->error
!= NULL
))
2174 ctxt
->sax
->error(ctxt
->userData
,
2175 "SystemLiteral \" or ' expected\n");
2176 ctxt
->wellFormed
= 0;
2183 * htmlParsePubidLiteral:
2184 * @ctxt: an HTML parser context
2186 * parse an HTML public literal
2188 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2190 * Returns the PubidLiteral parsed or NULL.
2194 htmlParsePubidLiteral(htmlParserCtxtPtr ctxt
) {
2196 xmlChar
*ret
= NULL
;
2198 * Name ::= (Letter | '_') (NameChar)*
2203 while (IS_PUBIDCHAR(CUR
)) NEXT
;
2205 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->error
!= NULL
))
2206 ctxt
->sax
->error(ctxt
->userData
, "Unfinished PubidLiteral\n");
2207 ctxt
->wellFormed
= 0;
2209 ret
= xmlStrndup(q
, CUR_PTR
- q
);
2212 } else if (CUR
== '\'') {
2215 while ((IS_LETTER(CUR
)) && (CUR
!= '\''))
2217 if (!IS_LETTER(CUR
)) {
2218 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->error
!= NULL
))
2219 ctxt
->sax
->error(ctxt
->userData
, "Unfinished PubidLiteral\n");
2220 ctxt
->wellFormed
= 0;
2222 ret
= xmlStrndup(q
, CUR_PTR
- q
);
2226 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->error
!= NULL
))
2227 ctxt
->sax
->error(ctxt
->userData
, "SystemLiteral \" or ' expected\n");
2228 ctxt
->wellFormed
= 0;
2236 * @ctxt: an HTML parser context
2238 * parse the content of an HTML SCRIPT or STYLE element
2239 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2240 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2241 * http://www.w3.org/TR/html4/types.html#type-script
2242 * http://www.w3.org/TR/html4/types.html#h-6.15
2243 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2245 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2246 * element and the value of intrinsic event attributes. User agents must
2247 * not evaluate script data as HTML markup but instead must pass it on as
2248 * data to a script engine.
2250 * - The content is passed like CDATA
2251 * - the attributes for style and scripting "onXXX" are also described
2252 * as CDATA but SGML allows entities references in attributes so their
2253 * processing is identical as other attributes
2256 htmlParseScript(htmlParserCtxtPtr ctxt
) {
2257 xmlChar buf
[HTML_PARSER_BIG_BUFFER_SIZE
+ 1];
2263 while (IS_CHAR(cur
)) {
2264 if ((cur
== '<') && (NXT(1) == '!') && (NXT(2) == '-') &&
2266 if ((nbchar
!= 0) && (ctxt
->sax
!= NULL
) && (!ctxt
->disableSAX
)) {
2267 if (ctxt
->sax
->cdataBlock
!= NULL
) {
2269 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2271 ctxt
->sax
->cdataBlock(ctxt
->userData
, buf
, nbchar
);
2275 htmlParseComment(ctxt
);
2278 } else if ((cur
== '<') && (NXT(1) == '/')) {
2280 * One should break here, the specification is clear:
2281 * Authors should therefore escape "</" within the content.
2282 * Escape mechanisms are specific to each scripting or
2283 * style sheet language.
2285 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2286 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2289 buf
[nbchar
++] = cur
;
2290 if (nbchar
>= HTML_PARSER_BIG_BUFFER_SIZE
) {
2291 if (ctxt
->sax
->cdataBlock
!= NULL
) {
2293 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2295 ctxt
->sax
->cdataBlock(ctxt
->userData
, buf
, nbchar
);
2302 if (!(IS_CHAR(cur
))) {
2303 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->error
!= NULL
))
2304 ctxt
->sax
->error(ctxt
->userData
,
2305 "Invalid char in CDATA 0x%X\n", cur
);
2306 ctxt
->wellFormed
= 0;
2310 if ((nbchar
!= 0) && (ctxt
->sax
!= NULL
) && (!ctxt
->disableSAX
)) {
2311 if (ctxt
->sax
->cdataBlock
!= NULL
) {
2313 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2315 ctxt
->sax
->cdataBlock(ctxt
->userData
, buf
, nbchar
);
2322 * htmlParseCharData:
2323 * @ctxt: an HTML parser context
2325 * parse a CharData section.
2326 * if we are within a CDATA section ']]>' marks an end of section.
2328 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2332 htmlParseCharData(htmlParserCtxtPtr ctxt
) {
2333 xmlChar buf
[HTML_PARSER_BIG_BUFFER_SIZE
+ 5];
2339 while (((cur
!= '<') || (ctxt
->token
== '<')) &&
2340 ((cur
!= '&') || (ctxt
->token
== '&')) &&
2342 COPY_BUF(l
,buf
,nbchar
,cur
);
2343 if (nbchar
>= HTML_PARSER_BIG_BUFFER_SIZE
) {
2345 * Ok the segment is to be consumed as chars.
2347 if ((ctxt
->sax
!= NULL
) && (!ctxt
->disableSAX
)) {
2348 if (areBlanks(ctxt
, buf
, nbchar
)) {
2349 if (ctxt
->sax
->ignorableWhitespace
!= NULL
)
2350 ctxt
->sax
->ignorableWhitespace(ctxt
->userData
,
2353 htmlCheckParagraph(ctxt
);
2354 if (ctxt
->sax
->characters
!= NULL
)
2355 ctxt
->sax
->characters(ctxt
->userData
, buf
, nbchar
);
2365 * Ok the segment is to be consumed as chars.
2367 if ((ctxt
->sax
!= NULL
) && (!ctxt
->disableSAX
)) {
2368 if (areBlanks(ctxt
, buf
, nbchar
)) {
2369 if (ctxt
->sax
->ignorableWhitespace
!= NULL
)
2370 ctxt
->sax
->ignorableWhitespace(ctxt
->userData
, buf
, nbchar
);
2372 htmlCheckParagraph(ctxt
);
2373 if (ctxt
->sax
->characters
!= NULL
)
2374 ctxt
->sax
->characters(ctxt
->userData
, buf
, nbchar
);
2382 ctxt
->instate
= XML_PARSER_EOF
;
2387 * htmlParseExternalID:
2388 * @ctxt: an HTML parser context
2389 * @publicID: a xmlChar** receiving PubidLiteral
2391 * Parse an External ID or a Public ID
2393 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2394 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2396 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2398 * Returns the SystemLiteral; in the second
2399 * case publicID receives the PubidLiteral. If strict is off
2400 * it is possible to return NULL and have publicID set.
2404 htmlParseExternalID(htmlParserCtxtPtr ctxt
, xmlChar
**publicID
) {
2405 xmlChar
*URI
= NULL
;
2407 if ((UPPER
== 'S') && (UPP(1) == 'Y') &&
2408 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2409 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2411 if (!IS_BLANK(CUR
)) {
2412 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->error
!= NULL
))
2413 ctxt
->sax
->error(ctxt
->userData
,
2414 "Space required after 'SYSTEM'\n");
2415 ctxt
->wellFormed
= 0;
2418 URI
= htmlParseSystemLiteral(ctxt
);
2420 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->error
!= NULL
))
2421 ctxt
->sax
->error(ctxt
->userData
,
2422 "htmlParseExternalID: SYSTEM, no URI\n");
2423 ctxt
->wellFormed
= 0;
2425 } else if ((UPPER
== 'P') && (UPP(1) == 'U') &&
2426 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2427 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2429 if (!IS_BLANK(CUR
)) {
2430 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->error
!= NULL
))
2431 ctxt
->sax
->error(ctxt
->userData
,
2432 "Space required after 'PUBLIC'\n");
2433 ctxt
->wellFormed
= 0;
2436 *publicID
= htmlParsePubidLiteral(ctxt
);
2437 if (*publicID
== NULL
) {
2438 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->error
!= NULL
))
2439 ctxt
->sax
->error(ctxt
->userData
,
2440 "htmlParseExternalID: PUBLIC, no Public Identifier\n");
2441 ctxt
->wellFormed
= 0;
2444 if ((CUR
== '"') || (CUR
== '\'')) {
2445 URI
= htmlParseSystemLiteral(ctxt
);
2453 * @ctxt: an HTML parser context
2455 * Parse an XML (SGML) comment <!-- .... -->
2457 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2460 htmlParseComment(htmlParserCtxtPtr ctxt
) {
2461 xmlChar
*buf
= NULL
;
2463 int size
= HTML_PARSER_BUFFER_SIZE
;
2467 xmlParserInputState state
;
2470 * Check that there is a comment right here.
2472 if ((RAW
!= '<') || (NXT(1) != '!') ||
2473 (NXT(2) != '-') || (NXT(3) != '-')) return;
2475 state
= ctxt
->instate
;
2476 ctxt
->instate
= XML_PARSER_COMMENT
;
2479 buf
= (xmlChar
*) xmlMalloc(size
* sizeof(xmlChar
));
2481 xmlGenericError(xmlGenericErrorContext
,
2482 "malloc of %d byte failed\n", size
);
2483 ctxt
->instate
= state
;
2492 while (IS_CHAR(cur
) &&
2494 (r
!= '-') || (q
!= '-'))) {
2495 if (len
+ 5 >= size
) {
2497 buf
= (xmlChar
*) xmlRealloc(buf
, size
* sizeof(xmlChar
));
2499 xmlGenericError(xmlGenericErrorContext
,
2500 "realloc of %d byte failed\n", size
);
2501 ctxt
->instate
= state
;
2505 COPY_BUF(ql
,buf
,len
,q
);
2519 if (!IS_CHAR(cur
)) {
2520 ctxt
->errNo
= XML_ERR_COMMENT_NOT_FINISHED
;
2521 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->error
!= NULL
))
2522 ctxt
->sax
->error(ctxt
->userData
,
2523 "Comment not terminated \n<!--%.50s\n", buf
);
2524 ctxt
->wellFormed
= 0;
2528 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->comment
!= NULL
) &&
2529 (!ctxt
->disableSAX
))
2530 ctxt
->sax
->comment(ctxt
->userData
, buf
);
2533 ctxt
->instate
= state
;
2538 * @ctxt: an HTML parser context
2540 * parse Reference declarations
2542 * [66] CharRef ::= '&#' [0-9]+ ';' |
2543 * '&#x' [0-9a-fA-F]+ ';'
2545 * Returns the value parsed (as an int)
2548 htmlParseCharRef(htmlParserCtxtPtr ctxt
) {
2551 if ((CUR
== '&') && (NXT(1) == '#') &&
2554 while (CUR
!= ';') {
2555 if ((CUR
>= '0') && (CUR
<= '9'))
2556 val
= val
* 16 + (CUR
- '0');
2557 else if ((CUR
>= 'a') && (CUR
<= 'f'))
2558 val
= val
* 16 + (CUR
- 'a') + 10;
2559 else if ((CUR
>= 'A') && (CUR
<= 'F'))
2560 val
= val
* 16 + (CUR
- 'A') + 10;
2562 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->error
!= NULL
))
2563 ctxt
->sax
->error(ctxt
->userData
,
2564 "htmlParseCharRef: invalid hexadecimal value\n");
2565 ctxt
->wellFormed
= 0;
2572 } else if ((CUR
== '&') && (NXT(1) == '#')) {
2574 while (CUR
!= ';') {
2575 if ((CUR
>= '0') && (CUR
<= '9'))
2576 val
= val
* 10 + (CUR
- '0');
2578 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->error
!= NULL
))
2579 ctxt
->sax
->error(ctxt
->userData
,
2580 "htmlParseCharRef: invalid decimal value\n");
2581 ctxt
->wellFormed
= 0;
2589 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->error
!= NULL
))
2590 ctxt
->sax
->error(ctxt
->userData
, "htmlParseCharRef: invalid value\n");
2591 ctxt
->wellFormed
= 0;
2594 * Check the value IS_CHAR ...
2599 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->error
!= NULL
))
2600 ctxt
->sax
->error(ctxt
->userData
, "htmlParseCharRef: invalid xmlChar value %d\n",
2602 ctxt
->wellFormed
= 0;
2609 * htmlParseDocTypeDecl :
2610 * @ctxt: an HTML parser context
2612 * parse a DOCTYPE declaration
2614 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
2615 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
2619 htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt
) {
2621 xmlChar
*ExternalID
= NULL
;
2622 xmlChar
*URI
= NULL
;
2625 * We know that '<!DOCTYPE' has been detected.
2632 * Parse the DOCTYPE name.
2634 name
= htmlParseName(ctxt
);
2636 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->error
!= NULL
))
2637 ctxt
->sax
->error(ctxt
->userData
, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
2638 ctxt
->wellFormed
= 0;
2641 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
2647 * Check for SystemID and ExternalID
2649 URI
= htmlParseExternalID(ctxt
, &ExternalID
);
2653 * We should be at the end of the DOCTYPE declaration.
2656 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->error
!= NULL
))
2657 ctxt
->sax
->error(ctxt
->userData
, "DOCTYPE improperly terminated\n");
2658 ctxt
->wellFormed
= 0;
2659 /* We shouldn't try to resynchronize ... */
2664 * Create or update the document according to the DOCTYPE
2666 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->internalSubset
!= NULL
) &&
2667 (!ctxt
->disableSAX
))
2668 ctxt
->sax
->internalSubset(ctxt
->userData
, name
, ExternalID
, URI
);
2671 * Cleanup, since we don't use all those identifiers
2673 if (URI
!= NULL
) xmlFree(URI
);
2674 if (ExternalID
!= NULL
) xmlFree(ExternalID
);
2675 if (name
!= NULL
) xmlFree(name
);
2679 * htmlParseAttribute:
2680 * @ctxt: an HTML parser context
2681 * @value: a xmlChar ** used to store the value of the attribute
2683 * parse an attribute
2685 * [41] Attribute ::= Name Eq AttValue
2687 * [25] Eq ::= S? '=' S?
2691 * [NS 11] Attribute ::= QName Eq AttValue
2693 * Also the case QName == xmlns:??? is handled independently as a namespace
2696 * Returns the attribute name, and the value in *value.
2700 htmlParseAttribute(htmlParserCtxtPtr ctxt
, xmlChar
**value
) {
2701 xmlChar
*name
, *val
= NULL
;
2704 name
= htmlParseHTMLName(ctxt
);
2706 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->error
!= NULL
))
2707 ctxt
->sax
->error(ctxt
->userData
, "error parsing attribute name\n");
2708 ctxt
->wellFormed
= 0;
2719 val
= htmlParseAttValue(ctxt
);
2722 * TODO : some attributes must have values, some may not
2723 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2724 ctxt->sax->warning(ctxt->userData,
2725 "No value for attribute %s\n", name); */
2733 * htmlCheckEncoding:
2734 * @ctxt: an HTML parser context
2735 * @attvalue: the attribute value
2737 * Checks an http-equiv attribute from a Meta tag to detect
2739 * If a new encoding is detected the parser is switched to decode
2743 htmlCheckEncoding(htmlParserCtxtPtr ctxt
, const xmlChar
*attvalue
) {
2744 const xmlChar
*encoding
;
2746 if ((ctxt
== NULL
) || (attvalue
== NULL
))
2749 /* do not change encoding */
2750 if (ctxt
->input
->encoding
!= NULL
)
2753 encoding
= xmlStrcasestr(attvalue
, BAD_CAST
"charset=");
2754 if (encoding
!= NULL
) {
2757 encoding
= xmlStrcasestr(attvalue
, BAD_CAST
"charset =");
2758 if (encoding
!= NULL
)
2761 if (encoding
!= NULL
) {
2762 xmlCharEncoding enc
;
2763 xmlCharEncodingHandlerPtr handler
;
2765 while ((*encoding
== ' ') || (*encoding
== '\t')) encoding
++;
2767 if (ctxt
->input
->encoding
!= NULL
)
2768 xmlFree((xmlChar
*) ctxt
->input
->encoding
);
2769 ctxt
->input
->encoding
= xmlStrdup(encoding
);
2771 enc
= xmlParseCharEncoding((const char *) encoding
);
2773 * registered set of known encodings
2775 if (enc
!= XML_CHAR_ENCODING_ERROR
) {
2776 xmlSwitchEncoding(ctxt
, enc
);
2777 ctxt
->charset
= XML_CHAR_ENCODING_UTF8
;
2780 * fallback for unknown encodings
2782 handler
= xmlFindCharEncodingHandler((const char *) encoding
);
2783 if (handler
!= NULL
) {
2784 xmlSwitchToEncoding(ctxt
, handler
);
2785 ctxt
->charset
= XML_CHAR_ENCODING_UTF8
;
2787 ctxt
->errNo
= XML_ERR_UNSUPPORTED_ENCODING
;
2791 if ((ctxt
->input
->buf
!= NULL
) &&
2792 (ctxt
->input
->buf
->encoder
!= NULL
) &&
2793 (ctxt
->input
->buf
->raw
!= NULL
) &&
2794 (ctxt
->input
->buf
->buffer
!= NULL
)) {
2799 * convert as much as possible to the parser reading buffer.
2801 processed
= ctxt
->input
->cur
- ctxt
->input
->base
;
2802 xmlBufferShrink(ctxt
->input
->buf
->buffer
, processed
);
2803 nbchars
= xmlCharEncInFunc(ctxt
->input
->buf
->encoder
,
2804 ctxt
->input
->buf
->buffer
,
2805 ctxt
->input
->buf
->raw
);
2807 ctxt
->errNo
= XML_ERR_INVALID_ENCODING
;
2808 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->error
!= NULL
))
2809 ctxt
->sax
->error(ctxt
->userData
,
2810 "htmlCheckEncoding: encoder error\n");
2813 ctxt
->input
->cur
= ctxt
->input
->buf
->buffer
->content
;
2820 * @ctxt: an HTML parser context
2821 * @atts: the attributes values
2823 * Checks the attributes from a Meta tag
2826 htmlCheckMeta(htmlParserCtxtPtr ctxt
, const xmlChar
**atts
) {
2828 const xmlChar
*att
, *value
;
2830 const xmlChar
*content
= NULL
;
2832 if ((ctxt
== NULL
) || (atts
== NULL
))
2837 while (att
!= NULL
) {
2839 if ((value
!= NULL
) && (!xmlStrcasecmp(att
, BAD_CAST
"http-equiv"))
2840 && (!xmlStrcasecmp(value
, BAD_CAST
"Content-Type")))
2842 else if ((value
!= NULL
) && (!xmlStrcasecmp(att
, BAD_CAST
"content")))
2846 if ((http
) && (content
!= NULL
))
2847 htmlCheckEncoding(ctxt
, content
);
2852 * htmlParseStartTag:
2853 * @ctxt: an HTML parser context
2855 * parse a start of tag either for rule element or
2856 * EmptyElement. In both cases we don't parse the tag closing chars.
2858 * [40] STag ::= '<' Name (S Attribute)* S? '>'
2860 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
2864 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
2866 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
2871 htmlParseStartTag(htmlParserCtxtPtr ctxt
) {
2875 const xmlChar
**atts
= NULL
;
2881 if (CUR
!= '<') return;
2885 name
= htmlParseHTMLName(ctxt
);
2887 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->error
!= NULL
))
2888 ctxt
->sax
->error(ctxt
->userData
,
2889 "htmlParseStartTag: invalid element name\n");
2890 ctxt
->wellFormed
= 0;
2891 /* Dump the bogus tag like browsers do */
2892 while ((IS_CHAR(CUR
)) && (CUR
!= '>'))
2896 if (xmlStrEqual(name
, BAD_CAST
"meta"))
2900 * Check for auto-closure of HTML elements.
2902 htmlAutoClose(ctxt
, name
);
2905 * Check for implied HTML elements.
2907 htmlCheckImplied(ctxt
, name
);
2910 * Avoid html at any level > 0, head at any level != 1
2911 * or any attempt to recurse body
2913 if ((ctxt
->nameNr
> 0) && (xmlStrEqual(name
, BAD_CAST
"html"))) {
2914 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->error
!= NULL
))
2915 ctxt
->sax
->error(ctxt
->userData
,
2916 "htmlParseStartTag: misplaced <html> tag\n");
2917 ctxt
->wellFormed
= 0;
2921 if ((ctxt
->nameNr
!= 1) &&
2922 (xmlStrEqual(name
, BAD_CAST
"head"))) {
2923 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->error
!= NULL
))
2924 ctxt
->sax
->error(ctxt
->userData
,
2925 "htmlParseStartTag: misplaced <head> tag\n");
2926 ctxt
->wellFormed
= 0;
2930 if (xmlStrEqual(name
, BAD_CAST
"body")) {
2932 for (indx
= 0;indx
< ctxt
->nameNr
;indx
++) {
2933 if (xmlStrEqual(ctxt
->nameTab
[indx
], BAD_CAST
"body")) {
2934 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->error
!= NULL
))
2935 ctxt
->sax
->error(ctxt
->userData
,
2936 "htmlParseStartTag: misplaced <body> tag\n");
2937 ctxt
->wellFormed
= 0;
2945 * Now parse the attributes, it ends up with the ending
2950 while ((IS_CHAR(CUR
)) &&
2952 ((CUR
!= '/') || (NXT(1) != '>'))) {
2953 long cons
= ctxt
->nbChars
;
2956 attname
= htmlParseAttribute(ctxt
, &attvalue
);
2957 if (attname
!= NULL
) {
2960 * Well formedness requires at most one declaration of an attribute
2962 for (i
= 0; i
< nbatts
;i
+= 2) {
2963 if (xmlStrEqual(atts
[i
], attname
)) {
2964 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->error
!= NULL
))
2965 ctxt
->sax
->error(ctxt
->userData
,
2966 "Attribute %s redefined\n",
2968 ctxt
->wellFormed
= 0;
2970 if (attvalue
!= NULL
)
2977 * Add the pair to atts
2981 atts
= (const xmlChar
**) xmlMalloc(maxatts
* sizeof(xmlChar
*));
2983 xmlGenericError(xmlGenericErrorContext
,
2984 "malloc of %ld byte failed\n",
2985 maxatts
* (long)sizeof(xmlChar
*));
2986 if (name
!= NULL
) xmlFree(name
);
2989 } else if (nbatts
+ 4 > maxatts
) {
2991 atts
= (const xmlChar
**) xmlRealloc((void *) atts
,
2992 maxatts
* sizeof(xmlChar
*));
2994 xmlGenericError(xmlGenericErrorContext
,
2995 "realloc of %ld byte failed\n",
2996 maxatts
* (long)sizeof(xmlChar
*));
2997 if (name
!= NULL
) xmlFree(name
);
3001 atts
[nbatts
++] = attname
;
3002 atts
[nbatts
++] = attvalue
;
3003 atts
[nbatts
] = NULL
;
3004 atts
[nbatts
+ 1] = NULL
;
3007 /* Dump the bogus attribute string up to the next blank or
3008 * the end of the tag. */
3009 while ((IS_CHAR(CUR
)) && !(IS_BLANK(CUR
)) && (CUR
!= '>')
3010 && ((CUR
!= '/') || (NXT(1) != '>')))
3016 if (cons
== ctxt
->nbChars
) {
3017 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->error
!= NULL
))
3018 ctxt
->sax
->error(ctxt
->userData
,
3019 "htmlParseStartTag: problem parsing attributes\n");
3020 ctxt
->wellFormed
= 0;
3026 * Handle specific association to the META tag
3029 htmlCheckMeta(ctxt
, atts
);
3032 * SAX: Start of Element !
3034 htmlnamePush(ctxt
, xmlStrdup(name
));
3036 xmlGenericError(xmlGenericErrorContext
,"Start of element %s: pushed %s\n", name
, ctxt
->name
);
3038 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->startElement
!= NULL
))
3039 ctxt
->sax
->startElement(ctxt
->userData
, name
, atts
);
3042 for (i
= 0;i
< nbatts
;i
++) {
3043 if (atts
[i
] != NULL
)
3044 xmlFree((xmlChar
*) atts
[i
]);
3046 xmlFree((void *) atts
);
3048 if (name
!= NULL
) xmlFree(name
);
3053 * @ctxt: an HTML parser context
3055 * parse an end of tag
3057 * [42] ETag ::= '</' Name S? '>'
3061 * [NS 9] ETag ::= '</' QName S? '>'
3063 * Returns 1 if the current level should be closed.
3067 htmlParseEndTag(htmlParserCtxtPtr ctxt
) {
3072 if ((CUR
!= '<') || (NXT(1) != '/')) {
3073 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->error
!= NULL
))
3074 ctxt
->sax
->error(ctxt
->userData
, "htmlParseEndTag: '</' not found\n");
3075 ctxt
->wellFormed
= 0;
3080 name
= htmlParseHTMLName(ctxt
);
3081 if (name
== NULL
) return(0);
3084 * We should definitely be at the ending "S? '>'" part
3087 if ((!IS_CHAR(CUR
)) || (CUR
!= '>')) {
3088 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->error
!= NULL
))
3089 ctxt
->sax
->error(ctxt
->userData
, "End tag : expected '>'\n");
3090 ctxt
->wellFormed
= 0;
3095 * If the name read is not one of the elements in the parsing stack
3096 * then return, it's just an error.
3098 for (i
= (ctxt
->nameNr
- 1);i
>= 0;i
--) {
3099 if (xmlStrEqual(name
, ctxt
->nameTab
[i
])) break;
3102 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->error
!= NULL
))
3103 ctxt
->sax
->error(ctxt
->userData
,
3104 "Unexpected end tag : %s\n", name
);
3106 ctxt
->wellFormed
= 0;
3112 * Check for auto-closure of HTML elements.
3115 htmlAutoCloseOnClose(ctxt
, name
);
3118 * Well formedness constraints, opening and closing must match.
3119 * With the exception that the autoclose may have popped stuff out
3122 if (!xmlStrEqual(name
, ctxt
->name
)) {
3124 xmlGenericError(xmlGenericErrorContext
,"End of tag %s: expecting %s\n", name
, ctxt
->name
);
3126 if ((ctxt
->name
!= NULL
) &&
3127 (!xmlStrEqual(ctxt
->name
, name
))) {
3128 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->error
!= NULL
))
3129 ctxt
->sax
->error(ctxt
->userData
,
3130 "Opening and ending tag mismatch: %s and %s\n",
3132 ctxt
->wellFormed
= 0;
3139 oldname
= ctxt
->name
;
3140 if ((oldname
!= NULL
) && (xmlStrEqual(oldname
, name
))) {
3141 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->endElement
!= NULL
))
3142 ctxt
->sax
->endElement(ctxt
->userData
, name
);
3143 oldname
= htmlnamePop(ctxt
);
3144 if (oldname
!= NULL
) {
3146 xmlGenericError(xmlGenericErrorContext
,"End of tag %s: popping out %s\n", name
, oldname
);
3151 xmlGenericError(xmlGenericErrorContext
,"End of tag %s: stack empty !!!\n", name
);
3167 * htmlParseReference:
3168 * @ctxt: an HTML parser context
3170 * parse and handle entity references in content,
3171 * this will end up in a call to characters() since this is either a
3172 * CharRef, or a predefined entity.
3175 htmlParseReference(htmlParserCtxtPtr ctxt
) {
3176 const htmlEntityDesc
* ent
;
3179 if (CUR
!= '&') return;
3181 if (NXT(1) == '#') {
3185 c
= htmlParseCharRef(ctxt
);
3189 if (c
< 0x80) { out
[i
++]= c
; bits
= -6; }
3190 else if (c
< 0x800) { out
[i
++]=((c
>> 6) & 0x1F) | 0xC0; bits
= 0; }
3191 else if (c
< 0x10000) { out
[i
++]=((c
>> 12) & 0x0F) | 0xE0; bits
= 6; }
3192 else { out
[i
++]=((c
>> 18) & 0x07) | 0xF0; bits
= 12; }
3194 for ( ; bits
>= 0; bits
-= 6) {
3195 out
[i
++]= ((c
>> bits
) & 0x3F) | 0x80;
3199 htmlCheckParagraph(ctxt
);
3200 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->characters
!= NULL
))
3201 ctxt
->sax
->characters(ctxt
->userData
, out
, i
);
3203 ent
= htmlParseEntityRef(ctxt
, &name
);
3205 htmlCheckParagraph(ctxt
);
3206 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->characters
!= NULL
))
3207 ctxt
->sax
->characters(ctxt
->userData
, BAD_CAST
"&", 1);
3210 if ((ent
== NULL
) || !(ent
->value
> 0)) {
3211 htmlCheckParagraph(ctxt
);
3212 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->characters
!= NULL
)) {
3213 ctxt
->sax
->characters(ctxt
->userData
, BAD_CAST
"&", 1);
3214 ctxt
->sax
->characters(ctxt
->userData
, name
, xmlStrlen(name
));
3215 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3223 { out
[i
++]= c
; bits
= -6; }
3225 { out
[i
++]=((c
>> 6) & 0x1F) | 0xC0; bits
= 0; }
3226 else if (c
< 0x10000)
3227 { out
[i
++]=((c
>> 12) & 0x0F) | 0xE0; bits
= 6; }
3229 { out
[i
++]=((c
>> 18) & 0x07) | 0xF0; bits
= 12; }
3231 for ( ; bits
>= 0; bits
-= 6) {
3232 out
[i
++]= ((c
>> bits
) & 0x3F) | 0x80;
3236 htmlCheckParagraph(ctxt
);
3237 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->characters
!= NULL
))
3238 ctxt
->sax
->characters(ctxt
->userData
, out
, i
);
3246 * @ctxt: an HTML parser context
3247 * @name: the node name
3249 * Parse a content: comment, sub-element, reference or text.
3254 htmlParseContent(htmlParserCtxtPtr ctxt
) {
3255 xmlChar
*currentNode
;
3258 currentNode
= xmlStrdup(ctxt
->name
);
3259 depth
= ctxt
->nameNr
;
3261 long cons
= ctxt
->nbChars
;
3265 * Our tag or one of its parents or children is ending.
3267 if ((CUR
== '<') && (NXT(1) == '/')) {
3268 if (htmlParseEndTag(ctxt
) &&
3269 ((currentNode
!= NULL
) || (ctxt
->nameNr
== 0))) {
3270 if (currentNode
!= NULL
)
3271 xmlFree(currentNode
);
3274 continue; /* while */
3278 * Has this node been popped out during parsing of
3281 if ((ctxt
->nameNr
> 0) && (depth
>= ctxt
->nameNr
) &&
3282 (!xmlStrEqual(currentNode
, ctxt
->name
)))
3284 if (currentNode
!= NULL
) xmlFree(currentNode
);
3288 if ((CUR
!= 0) && ((xmlStrEqual(currentNode
, BAD_CAST
"script")) ||
3289 (xmlStrEqual(currentNode
, BAD_CAST
"style")))) {
3291 * Handle SCRIPT/STYLE separately
3293 htmlParseScript(ctxt
);
3296 * Sometimes DOCTYPE arrives in the middle of the document
3298 if ((CUR
== '<') && (NXT(1) == '!') &&
3299 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3300 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3301 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3303 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->error
!= NULL
))
3304 ctxt
->sax
->error(ctxt
->userData
,
3305 "Misplaced DOCTYPE declaration\n");
3306 ctxt
->wellFormed
= 0;
3307 htmlParseDocTypeDecl(ctxt
);
3311 * First case : a comment
3313 if ((CUR
== '<') && (NXT(1) == '!') &&
3314 (NXT(2) == '-') && (NXT(3) == '-')) {
3315 htmlParseComment(ctxt
);
3319 * Second case : a sub-element.
3321 else if (CUR
== '<') {
3322 htmlParseElement(ctxt
);
3326 * Third case : a reference. If it has not been resolved,
3327 * parsing returns its Name, create the node
3329 else if (CUR
== '&') {
3330 htmlParseReference(ctxt
);
3334 * Fourth : end of the resource
3336 else if (CUR
== 0) {
3337 htmlAutoCloseOnEnd(ctxt
);
3342 * Last case, text. Note that References are handled directly.
3345 htmlParseCharData(ctxt
);
3348 if (cons
== ctxt
->nbChars
) {
3349 if (ctxt
->node
!= NULL
) {
3350 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->error
!= NULL
))
3351 ctxt
->sax
->error(ctxt
->userData
,
3352 "detected an error in element content\n");
3353 ctxt
->wellFormed
= 0;
3360 if (currentNode
!= NULL
) xmlFree(currentNode
);
3365 * @ctxt: an HTML parser context
3367 * parse an HTML element, this is highly recursive
3369 * [39] element ::= EmptyElemTag | STag content ETag
3371 * [41] Attribute ::= Name Eq AttValue
3375 htmlParseElement(htmlParserCtxtPtr ctxt
) {
3377 xmlChar
*currentNode
= NULL
;
3378 const htmlElemDesc
* info
;
3379 htmlParserNodeInfo node_info
;
3381 int depth
= ctxt
->nameNr
;
3382 const xmlChar
*oldptr
;
3384 /* Capture start position */
3385 if (ctxt
->record_info
) {
3386 node_info
.begin_pos
= ctxt
->input
->consumed
+
3387 (CUR_PTR
- ctxt
->input
->base
);
3388 node_info
.begin_line
= ctxt
->input
->line
;
3391 oldname
= xmlStrdup(ctxt
->name
);
3392 htmlParseStartTag(ctxt
);
3395 if (oldname
== NULL
)
3396 xmlGenericError(xmlGenericErrorContext
,
3397 "Start of element %s\n", name
);
3398 else if (name
== NULL
)
3399 xmlGenericError(xmlGenericErrorContext
,
3400 "Start of element failed, was %s\n", oldname
);
3402 xmlGenericError(xmlGenericErrorContext
,
3403 "Start of element %s, was %s\n", name
, oldname
);
3405 if (((depth
== ctxt
->nameNr
) && (xmlStrEqual(oldname
, ctxt
->name
))) ||
3409 if (oldname
!= NULL
)
3413 if (oldname
!= NULL
)
3417 * Lookup the info for that element.
3419 info
= htmlTagLookup(name
);
3421 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->error
!= NULL
))
3422 ctxt
->sax
->error(ctxt
->userData
, "Tag %s invalid\n",
3424 ctxt
->wellFormed
= 0;
3425 } else if (info
->depr
) {
3426 /***************************
3427 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
3428 ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
3430 ***************************/
3434 * Check for an Empty Element labeled the XML/SGML way
3436 if ((CUR
== '/') && (NXT(1) == '>')) {
3438 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->endElement
!= NULL
))
3439 ctxt
->sax
->endElement(ctxt
->userData
, name
);
3440 oldname
= htmlnamePop(ctxt
);
3442 xmlGenericError(xmlGenericErrorContext
,"End of tag the XML way: popping out %s\n", oldname
);
3444 if (oldname
!= NULL
)
3452 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->error
!= NULL
))
3453 ctxt
->sax
->error(ctxt
->userData
,
3454 "Couldn't find end of Start Tag %s\n",
3456 ctxt
->wellFormed
= 0;
3459 * end of parsing of this node.
3461 if (xmlStrEqual(name
, ctxt
->name
)) {
3463 oldname
= htmlnamePop(ctxt
);
3465 xmlGenericError(xmlGenericErrorContext
,"End of start tag problem: popping out %s\n", oldname
);
3467 if (oldname
!= NULL
)
3472 * Capture end position and add node
3474 if ( currentNode
!= NULL
&& ctxt
->record_info
) {
3475 node_info
.end_pos
= ctxt
->input
->consumed
+
3476 (CUR_PTR
- ctxt
->input
->base
);
3477 node_info
.end_line
= ctxt
->input
->line
;
3478 node_info
.node
= ctxt
->node
;
3479 xmlParserAddNodeInfo(ctxt
, &node_info
);
3485 * Check for an Empty Element from DTD definition
3487 if ((info
!= NULL
) && (info
->empty
)) {
3488 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->endElement
!= NULL
))
3489 ctxt
->sax
->endElement(ctxt
->userData
, name
);
3490 oldname
= htmlnamePop(ctxt
);
3492 xmlGenericError(xmlGenericErrorContext
,"End of empty tag %s : popping out %s\n", name
, oldname
);
3494 if (oldname
!= NULL
)
3500 * Parse the content of the element:
3502 currentNode
= xmlStrdup(ctxt
->name
);
3503 depth
= ctxt
->nameNr
;
3504 while (IS_CHAR(CUR
)) {
3505 oldptr
= ctxt
->input
->cur
;
3506 htmlParseContent(ctxt
);
3507 if (oldptr
==ctxt
->input
->cur
) break;
3508 if (ctxt
->nameNr
< depth
) break;
3512 * Capture end position and add node
3514 if ( currentNode
!= NULL
&& ctxt
->record_info
) {
3515 node_info
.end_pos
= ctxt
->input
->consumed
+
3516 (CUR_PTR
- ctxt
->input
->base
);
3517 node_info
.end_line
= ctxt
->input
->line
;
3518 node_info
.node
= ctxt
->node
;
3519 xmlParserAddNodeInfo(ctxt
, &node_info
);
3521 if (!IS_CHAR(CUR
)) {
3522 htmlAutoCloseOnEnd(ctxt
);
3525 if (currentNode
!= NULL
)
3526 xmlFree(currentNode
);
3530 * htmlParseDocument :
3531 * @ctxt: an HTML parser context
3533 * parse an HTML document (and build a tree if using the standard SAX
3536 * Returns 0, -1 in case of error. the parser context is augmented
3537 * as a result of the parsing.
3541 htmlParseDocument(htmlParserCtxtPtr ctxt
) {
3546 htmlDefaultSAXHandlerInit();
3551 * SAX: beginning of the document processing.
3553 if ((ctxt
->sax
) && (ctxt
->sax
->setDocumentLocator
))
3554 ctxt
->sax
->setDocumentLocator(ctxt
->userData
, &xmlDefaultSAXLocator
);
3557 * Wipe out everything which is before the first '<'
3561 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->error
!= NULL
))
3562 ctxt
->sax
->error(ctxt
->userData
, "Document is empty\n");
3563 ctxt
->wellFormed
= 0;
3566 if ((ctxt
->sax
) && (ctxt
->sax
->startDocument
) && (!ctxt
->disableSAX
))
3567 ctxt
->sax
->startDocument(ctxt
->userData
);
3571 * Parse possible comments before any content
3573 while ((CUR
== '<') && (NXT(1) == '!') &&
3574 (NXT(2) == '-') && (NXT(3) == '-')) {
3575 htmlParseComment(ctxt
);
3581 * Then possibly doc type declaration(s) and more Misc
3582 * (doctypedecl Misc*)?
3584 if ((CUR
== '<') && (NXT(1) == '!') &&
3585 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3586 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3587 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3589 htmlParseDocTypeDecl(ctxt
);
3594 * Parse possible comments before any content
3596 while ((CUR
== '<') && (NXT(1) == '!') &&
3597 (NXT(2) == '-') && (NXT(3) == '-')) {
3598 htmlParseComment(ctxt
);
3603 * Time to start parsing the tree itself
3605 htmlParseContent(ctxt
);
3611 htmlAutoCloseOnEnd(ctxt
);
3615 * SAX: end of the document processing.
3617 if ((ctxt
->sax
) && (ctxt
->sax
->endDocument
!= NULL
))
3618 ctxt
->sax
->endDocument(ctxt
->userData
);
3620 if (ctxt
->myDoc
!= NULL
) {
3621 dtd
= xmlGetIntSubset(ctxt
->myDoc
);
3623 ctxt
->myDoc
->intSubset
=
3624 xmlCreateIntSubset(ctxt
->myDoc
, BAD_CAST
"HTML",
3625 BAD_CAST
"-//W3C//DTD HTML 4.0 Transitional//EN",
3626 BAD_CAST
"http://www.w3.org/TR/REC-html40/loose.dtd");
3628 if (! ctxt
->wellFormed
) return(-1);
3633 /************************************************************************
3635 * Parser contexts handling *
3637 ************************************************************************/
3640 * xmlInitParserCtxt:
3641 * @ctxt: an HTML parser context
3643 * Initialize a parser context
3647 htmlInitParserCtxt(htmlParserCtxtPtr ctxt
)
3649 htmlSAXHandler
*sax
;
3651 if (ctxt
== NULL
) return;
3652 memset(ctxt
, 0, sizeof(htmlParserCtxt
));
3654 sax
= (htmlSAXHandler
*) xmlMalloc(sizeof(htmlSAXHandler
));
3656 xmlGenericError(xmlGenericErrorContext
,
3657 "htmlInitParserCtxt: out of memory\n");
3660 memset(sax
, 0, sizeof(htmlSAXHandler
));
3662 /* Allocate the Input stack */
3663 ctxt
->inputTab
= (htmlParserInputPtr
*)
3664 xmlMalloc(5 * sizeof(htmlParserInputPtr
));
3665 if (ctxt
->inputTab
== NULL
) {
3666 xmlGenericError(xmlGenericErrorContext
,
3667 "htmlInitParserCtxt: out of memory\n");
3676 ctxt
->version
= NULL
;
3677 ctxt
->encoding
= NULL
;
3678 ctxt
->standalone
= -1;
3679 ctxt
->instate
= XML_PARSER_START
;
3681 /* Allocate the Node stack */
3682 ctxt
->nodeTab
= (htmlNodePtr
*) xmlMalloc(10 * sizeof(htmlNodePtr
));
3683 if (ctxt
->nodeTab
== NULL
) {
3684 xmlGenericError(xmlGenericErrorContext
,
3685 "htmlInitParserCtxt: out of memory\n");
3698 /* Allocate the Name stack */
3699 ctxt
->nameTab
= (xmlChar
**) xmlMalloc(10 * sizeof(xmlChar
*));
3700 if (ctxt
->nameTab
== NULL
) {
3701 xmlGenericError(xmlGenericErrorContext
,
3702 "htmlInitParserCtxt: out of memory\n");
3718 if (sax
== NULL
) ctxt
->sax
= &htmlDefaultSAXHandler
;
3721 memcpy(sax
, &htmlDefaultSAXHandler
, sizeof(htmlSAXHandler
));
3723 ctxt
->userData
= ctxt
;
3725 ctxt
->wellFormed
= 1;
3726 ctxt
->replaceEntities
= 0;
3727 ctxt
->linenumbers
= xmlLineNumbersDefaultValue
;
3729 ctxt
->record_info
= 0;
3732 ctxt
->checkIndex
= 0;
3733 ctxt
->catalogs
= NULL
;
3734 xmlInitNodeInfoSeq(&ctxt
->node_seq
);
3738 * htmlFreeParserCtxt:
3739 * @ctxt: an HTML parser context
3741 * Free all the memory used by a parser context. However the parsed
3742 * document in ctxt->myDoc is not freed.
3746 htmlFreeParserCtxt(htmlParserCtxtPtr ctxt
)
3748 xmlFreeParserCtxt(ctxt
);
3752 * htmlNewParserCtxt:
3754 * Allocate and initialize a new parser context.
3756 * Returns the xmlParserCtxtPtr or NULL
3759 static htmlParserCtxtPtr
3760 htmlNewParserCtxt(void)
3762 xmlParserCtxtPtr ctxt
;
3764 ctxt
= (xmlParserCtxtPtr
) xmlMalloc(sizeof(xmlParserCtxt
));
3766 xmlGenericError(xmlGenericErrorContext
,
3767 "xmlNewParserCtxt : cannot allocate context\n");
3770 memset(ctxt
, 0, sizeof(xmlParserCtxt
));
3771 htmlInitParserCtxt(ctxt
);
3776 * htmlCreateMemoryParserCtxt:
3777 * @buffer: a pointer to a char array
3778 * @size: the size of the array
3780 * Create a parser context for an HTML in-memory document.
3782 * Returns the new parser context or NULL
3784 static htmlParserCtxtPtr
3785 htmlCreateMemoryParserCtxt(const char *buffer
, int size
) {
3786 xmlParserCtxtPtr ctxt
;
3787 xmlParserInputPtr input
;
3788 xmlParserInputBufferPtr buf
;
3795 ctxt
= htmlNewParserCtxt();
3799 buf
= xmlParserInputBufferCreateMem(buffer
, size
, XML_CHAR_ENCODING_NONE
);
3800 if (buf
== NULL
) return(NULL
);
3802 input
= xmlNewInputStream(ctxt
);
3803 if (input
== NULL
) {
3804 xmlFreeParserCtxt(ctxt
);
3808 input
->filename
= NULL
;
3810 input
->base
= input
->buf
->buffer
->content
;
3811 input
->cur
= input
->buf
->buffer
->content
;
3812 input
->end
= &input
->buf
->buffer
->content
[input
->buf
->buffer
->use
];
3814 inputPush(ctxt
, input
);
3819 * htmlCreateDocParserCtxt :
3820 * @cur: a pointer to an array of xmlChar
3821 * @encoding: a free form C string describing the HTML document encoding, or NULL
3823 * Create a parser context for an HTML document.
3825 * TODO: check the need to add encoding handling there
3827 * Returns the new parser context or NULL
3829 static htmlParserCtxtPtr
3830 htmlCreateDocParserCtxt(xmlChar
*cur
, const char *encoding ATTRIBUTE_UNUSED
) {
3835 len
= xmlStrlen(cur
);
3836 return(htmlCreateMemoryParserCtxt((char *)cur
, len
));
3839 /************************************************************************
3841 * Progressive parsing interfaces *
3843 ************************************************************************/
3846 * htmlParseLookupSequence:
3847 * @ctxt: an HTML parser context
3848 * @first: the first char to lookup
3849 * @next: the next char to lookup or zero
3850 * @third: the next char to lookup or zero
3852 * Try to find if a sequence (first, next, third) or just (first next) or
3853 * (first) is available in the input stream.
3854 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
3855 * to avoid rescanning sequences of bytes, it DOES change the state of the
3856 * parser, do not use liberally.
3857 * This is basically similar to xmlParseLookupSequence()
3859 * Returns the index to the current parsing point if the full sequence
3860 * is available, -1 otherwise.
3863 htmlParseLookupSequence(htmlParserCtxtPtr ctxt
, xmlChar first
,
3864 xmlChar next
, xmlChar third
) {
3866 htmlParserInputPtr in
;
3871 if (in
== NULL
) return(-1);
3872 base
= in
->cur
- in
->base
;
3873 if (base
< 0) return(-1);
3874 if (ctxt
->checkIndex
> base
)
3875 base
= ctxt
->checkIndex
;
3876 if (in
->buf
== NULL
) {
3880 buf
= in
->buf
->buffer
->content
;
3881 len
= in
->buf
->buffer
->use
;
3883 /* take into account the sequence length */
3884 if (third
) len
-= 2;
3885 else if (next
) len
--;
3886 for (;base
< len
;base
++) {
3887 if (!incomment
&& (base
+ 4 < len
)) {
3888 if ((buf
[base
] == '<') && (buf
[base
+ 1] == '!') &&
3889 (buf
[base
+ 2] == '-') && (buf
[base
+ 3] == '-')) {
3892 /* do not increment base, some people use <!--> */
3897 if ((buf
[base
] == '-') && (buf
[base
+ 1] == '-') &&
3898 (buf
[base
+ 2] == '>')) {
3904 if (buf
[base
] == first
) {
3906 if ((buf
[base
+ 1] != next
) ||
3907 (buf
[base
+ 2] != third
)) continue;
3908 } else if (next
!= 0) {
3909 if (buf
[base
+ 1] != next
) continue;
3911 ctxt
->checkIndex
= 0;
3914 xmlGenericError(xmlGenericErrorContext
,
3915 "HPP: lookup '%c' found at %d\n",
3917 else if (third
== 0)
3918 xmlGenericError(xmlGenericErrorContext
,
3919 "HPP: lookup '%c%c' found at %d\n",
3922 xmlGenericError(xmlGenericErrorContext
,
3923 "HPP: lookup '%c%c%c' found at %d\n",
3924 first
, next
, third
, base
);
3926 return(base
- (in
->cur
- in
->base
));
3929 ctxt
->checkIndex
= base
;
3932 xmlGenericError(xmlGenericErrorContext
,
3933 "HPP: lookup '%c' failed\n", first
);
3934 else if (third
== 0)
3935 xmlGenericError(xmlGenericErrorContext
,
3936 "HPP: lookup '%c%c' failed\n", first
, next
);
3938 xmlGenericError(xmlGenericErrorContext
,
3939 "HPP: lookup '%c%c%c' failed\n", first
, next
, third
);
3945 * htmlParseTryOrFinish:
3946 * @ctxt: an HTML parser context
3947 * @terminate: last chunk indicator
3949 * Try to progress on parsing
3951 * Returns zero if no parsing was possible
3954 htmlParseTryOrFinish(htmlParserCtxtPtr ctxt
, int terminate
) {
3956 htmlParserInputPtr in
;
3961 switch (ctxt
->instate
) {
3962 case XML_PARSER_EOF
:
3963 xmlGenericError(xmlGenericErrorContext
,
3964 "HPP: try EOF\n"); break;
3965 case XML_PARSER_START
:
3966 xmlGenericError(xmlGenericErrorContext
,
3967 "HPP: try START\n"); break;
3968 case XML_PARSER_MISC
:
3969 xmlGenericError(xmlGenericErrorContext
,
3970 "HPP: try MISC\n");break;
3971 case XML_PARSER_COMMENT
:
3972 xmlGenericError(xmlGenericErrorContext
,
3973 "HPP: try COMMENT\n");break;
3974 case XML_PARSER_PROLOG
:
3975 xmlGenericError(xmlGenericErrorContext
,
3976 "HPP: try PROLOG\n");break;
3977 case XML_PARSER_START_TAG
:
3978 xmlGenericError(xmlGenericErrorContext
,
3979 "HPP: try START_TAG\n");break;
3980 case XML_PARSER_CONTENT
:
3981 xmlGenericError(xmlGenericErrorContext
,
3982 "HPP: try CONTENT\n");break;
3983 case XML_PARSER_CDATA_SECTION
:
3984 xmlGenericError(xmlGenericErrorContext
,
3985 "HPP: try CDATA_SECTION\n");break;
3986 case XML_PARSER_END_TAG
:
3987 xmlGenericError(xmlGenericErrorContext
,
3988 "HPP: try END_TAG\n");break;
3989 case XML_PARSER_ENTITY_DECL
:
3990 xmlGenericError(xmlGenericErrorContext
,
3991 "HPP: try ENTITY_DECL\n");break;
3992 case XML_PARSER_ENTITY_VALUE
:
3993 xmlGenericError(xmlGenericErrorContext
,
3994 "HPP: try ENTITY_VALUE\n");break;
3995 case XML_PARSER_ATTRIBUTE_VALUE
:
3996 xmlGenericError(xmlGenericErrorContext
,
3997 "HPP: try ATTRIBUTE_VALUE\n");break;
3998 case XML_PARSER_DTD
:
3999 xmlGenericError(xmlGenericErrorContext
,
4000 "HPP: try DTD\n");break;
4001 case XML_PARSER_EPILOG
:
4002 xmlGenericError(xmlGenericErrorContext
,
4003 "HPP: try EPILOG\n");break;
4005 xmlGenericError(xmlGenericErrorContext
,
4006 "HPP: try PI\n");break;
4007 case XML_PARSER_SYSTEM_LITERAL
:
4008 xmlGenericError(xmlGenericErrorContext
,
4009 "HPP: try SYSTEM_LITERAL\n");break;
4016 if (in
== NULL
) break;
4017 if (in
->buf
== NULL
)
4018 avail
= in
->length
- (in
->cur
- in
->base
);
4020 avail
= in
->buf
->buffer
->use
- (in
->cur
- in
->base
);
4021 if ((avail
== 0) && (terminate
)) {
4022 htmlAutoCloseOnEnd(ctxt
);
4023 if ((ctxt
->nameNr
== 0) && (ctxt
->instate
!= XML_PARSER_EOF
)) {
4025 * SAX: end of the document processing.
4027 ctxt
->instate
= XML_PARSER_EOF
;
4028 if ((ctxt
->sax
) && (ctxt
->sax
->endDocument
!= NULL
))
4029 ctxt
->sax
->endDocument(ctxt
->userData
);
4034 switch (ctxt
->instate
) {
4035 case XML_PARSER_EOF
:
4037 * Document parsing is done !
4040 case XML_PARSER_START
:
4042 * Very first chars read from the document flow.
4045 if (IS_BLANK(cur
)) {
4047 if (in
->buf
== NULL
)
4048 avail
= in
->length
- (in
->cur
- in
->base
);
4050 avail
= in
->buf
->buffer
->use
- (in
->cur
- in
->base
);
4052 if ((ctxt
->sax
) && (ctxt
->sax
->setDocumentLocator
))
4053 ctxt
->sax
->setDocumentLocator(ctxt
->userData
,
4054 &xmlDefaultSAXLocator
);
4055 if ((ctxt
->sax
) && (ctxt
->sax
->startDocument
) &&
4056 (!ctxt
->disableSAX
))
4057 ctxt
->sax
->startDocument(ctxt
->userData
);
4061 if ((cur
== '<') && (next
== '!') &&
4062 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4063 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4064 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4067 (htmlParseLookupSequence(ctxt
, '>', 0, 0) < 0))
4070 xmlGenericError(xmlGenericErrorContext
,
4071 "HPP: Parsing internal subset\n");
4073 htmlParseDocTypeDecl(ctxt
);
4074 ctxt
->instate
= XML_PARSER_PROLOG
;
4076 xmlGenericError(xmlGenericErrorContext
,
4077 "HPP: entering PROLOG\n");
4080 ctxt
->instate
= XML_PARSER_MISC
;
4083 xmlGenericError(xmlGenericErrorContext
,
4084 "HPP: entering MISC\n");
4087 case XML_PARSER_MISC
:
4089 if (in
->buf
== NULL
)
4090 avail
= in
->length
- (in
->cur
- in
->base
);
4092 avail
= in
->buf
->buffer
->use
- (in
->cur
- in
->base
);
4097 if ((cur
== '<') && (next
== '!') &&
4098 (in
->cur
[2] == '-') && (in
->cur
[3] == '-')) {
4100 (htmlParseLookupSequence(ctxt
, '-', '-', '>') < 0))
4103 xmlGenericError(xmlGenericErrorContext
,
4104 "HPP: Parsing Comment\n");
4106 htmlParseComment(ctxt
);
4107 ctxt
->instate
= XML_PARSER_MISC
;
4108 } else if ((cur
== '<') && (next
== '!') &&
4109 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4110 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4111 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4114 (htmlParseLookupSequence(ctxt
, '>', 0, 0) < 0))
4117 xmlGenericError(xmlGenericErrorContext
,
4118 "HPP: Parsing internal subset\n");
4120 htmlParseDocTypeDecl(ctxt
);
4121 ctxt
->instate
= XML_PARSER_PROLOG
;
4123 xmlGenericError(xmlGenericErrorContext
,
4124 "HPP: entering PROLOG\n");
4126 } else if ((cur
== '<') && (next
== '!') &&
4130 ctxt
->instate
= XML_PARSER_START_TAG
;
4132 xmlGenericError(xmlGenericErrorContext
,
4133 "HPP: entering START_TAG\n");
4137 case XML_PARSER_PROLOG
:
4139 if (in
->buf
== NULL
)
4140 avail
= in
->length
- (in
->cur
- in
->base
);
4142 avail
= in
->buf
->buffer
->use
- (in
->cur
- in
->base
);
4147 if ((cur
== '<') && (next
== '!') &&
4148 (in
->cur
[2] == '-') && (in
->cur
[3] == '-')) {
4150 (htmlParseLookupSequence(ctxt
, '-', '-', '>') < 0))
4153 xmlGenericError(xmlGenericErrorContext
,
4154 "HPP: Parsing Comment\n");
4156 htmlParseComment(ctxt
);
4157 ctxt
->instate
= XML_PARSER_PROLOG
;
4158 } else if ((cur
== '<') && (next
== '!') &&
4162 ctxt
->instate
= XML_PARSER_START_TAG
;
4164 xmlGenericError(xmlGenericErrorContext
,
4165 "HPP: entering START_TAG\n");
4169 case XML_PARSER_EPILOG
:
4170 if (in
->buf
== NULL
)
4171 avail
= in
->length
- (in
->cur
- in
->base
);
4173 avail
= in
->buf
->buffer
->use
- (in
->cur
- in
->base
);
4177 if (IS_BLANK(cur
)) {
4178 htmlParseCharData(ctxt
);
4184 if ((cur
== '<') && (next
== '!') &&
4185 (in
->cur
[2] == '-') && (in
->cur
[3] == '-')) {
4187 (htmlParseLookupSequence(ctxt
, '-', '-', '>') < 0))
4190 xmlGenericError(xmlGenericErrorContext
,
4191 "HPP: Parsing Comment\n");
4193 htmlParseComment(ctxt
);
4194 ctxt
->instate
= XML_PARSER_EPILOG
;
4195 } else if ((cur
== '<') && (next
== '!') &&
4199 ctxt
->errNo
= XML_ERR_DOCUMENT_END
;
4200 ctxt
->wellFormed
= 0;
4201 ctxt
->instate
= XML_PARSER_EOF
;
4203 xmlGenericError(xmlGenericErrorContext
,
4204 "HPP: entering EOF\n");
4206 if ((ctxt
->sax
) && (ctxt
->sax
->endDocument
!= NULL
))
4207 ctxt
->sax
->endDocument(ctxt
->userData
);
4211 case XML_PARSER_START_TAG
: {
4212 xmlChar
*name
, *oldname
;
4213 int depth
= ctxt
->nameNr
;
4214 const htmlElemDesc
* info
;
4220 ctxt
->instate
= XML_PARSER_CONTENT
;
4222 xmlGenericError(xmlGenericErrorContext
,
4223 "HPP: entering CONTENT\n");
4227 if (in
->cur
[1] == '/') {
4228 ctxt
->instate
= XML_PARSER_END_TAG
;
4229 ctxt
->checkIndex
= 0;
4231 xmlGenericError(xmlGenericErrorContext
,
4232 "HPP: entering END_TAG\n");
4237 (htmlParseLookupSequence(ctxt
, '>', 0, 0) < 0))
4240 oldname
= xmlStrdup(ctxt
->name
);
4241 htmlParseStartTag(ctxt
);
4244 if (oldname
== NULL
)
4245 xmlGenericError(xmlGenericErrorContext
,
4246 "Start of element %s\n", name
);
4247 else if (name
== NULL
)
4248 xmlGenericError(xmlGenericErrorContext
,
4249 "Start of element failed, was %s\n",
4252 xmlGenericError(xmlGenericErrorContext
,
4253 "Start of element %s, was %s\n",
4256 if (((depth
== ctxt
->nameNr
) &&
4257 (xmlStrEqual(oldname
, ctxt
->name
))) ||
4261 if (oldname
!= NULL
)
4265 if (oldname
!= NULL
)
4269 * Lookup the info for that element.
4271 info
= htmlTagLookup(name
);
4273 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->error
!= NULL
))
4274 ctxt
->sax
->error(ctxt
->userData
, "Tag %s invalid\n",
4276 ctxt
->wellFormed
= 0;
4277 } else if (info
->depr
) {
4278 /***************************
4279 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
4280 ctxt->sax->warning(ctxt->userData,
4281 "Tag %s is deprecated\n",
4283 ***************************/
4287 * Check for an Empty Element labeled the XML/SGML way
4289 if ((CUR
== '/') && (NXT(1) == '>')) {
4291 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->endElement
!= NULL
))
4292 ctxt
->sax
->endElement(ctxt
->userData
, name
);
4293 oldname
= htmlnamePop(ctxt
);
4295 xmlGenericError(xmlGenericErrorContext
,"End of tag the XML way: popping out %s\n",
4298 if (oldname
!= NULL
)
4300 ctxt
->instate
= XML_PARSER_CONTENT
;
4302 xmlGenericError(xmlGenericErrorContext
,
4303 "HPP: entering CONTENT\n");
4311 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->error
!= NULL
))
4312 ctxt
->sax
->error(ctxt
->userData
,
4313 "Couldn't find end of Start Tag %s\n",
4315 ctxt
->wellFormed
= 0;
4318 * end of parsing of this node.
4320 if (xmlStrEqual(name
, ctxt
->name
)) {
4322 oldname
= htmlnamePop(ctxt
);
4324 xmlGenericError(xmlGenericErrorContext
,
4325 "End of start tag problem: popping out %s\n", oldname
);
4327 if (oldname
!= NULL
)
4331 ctxt
->instate
= XML_PARSER_CONTENT
;
4333 xmlGenericError(xmlGenericErrorContext
,
4334 "HPP: entering CONTENT\n");
4340 * Check for an Empty Element from DTD definition
4342 if ((info
!= NULL
) && (info
->empty
)) {
4343 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->endElement
!= NULL
))
4344 ctxt
->sax
->endElement(ctxt
->userData
, name
);
4345 oldname
= htmlnamePop(ctxt
);
4347 xmlGenericError(xmlGenericErrorContext
,"End of empty tag %s : popping out %s\n", name
, oldname
);
4349 if (oldname
!= NULL
)
4352 ctxt
->instate
= XML_PARSER_CONTENT
;
4354 xmlGenericError(xmlGenericErrorContext
,
4355 "HPP: entering CONTENT\n");
4359 case XML_PARSER_CONTENT
: {
4362 * Handle preparsed entities and charRef
4364 if (ctxt
->token
!= 0) {
4365 xmlChar chr
[2] = { 0 , 0 } ;
4367 chr
[0] = (xmlChar
) ctxt
->token
;
4368 htmlCheckParagraph(ctxt
);
4369 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->characters
!= NULL
))
4370 ctxt
->sax
->characters(ctxt
->userData
, chr
, 1);
4372 ctxt
->checkIndex
= 0;
4374 if ((avail
== 1) && (terminate
)) {
4376 if ((cur
!= '<') && (cur
!= '&')) {
4377 if (ctxt
->sax
!= NULL
) {
4378 if (IS_BLANK(cur
)) {
4379 if (ctxt
->sax
->ignorableWhitespace
!= NULL
)
4380 ctxt
->sax
->ignorableWhitespace(
4381 ctxt
->userData
, &cur
, 1);
4383 htmlCheckParagraph(ctxt
);
4384 if (ctxt
->sax
->characters
!= NULL
)
4385 ctxt
->sax
->characters(
4386 ctxt
->userData
, &cur
, 1);
4390 ctxt
->checkIndex
= 0;
4399 cons
= ctxt
->nbChars
;
4400 if ((xmlStrEqual(ctxt
->name
, BAD_CAST
"script")) ||
4401 (xmlStrEqual(ctxt
->name
, BAD_CAST
"style"))) {
4403 * Handle SCRIPT/STYLE separately
4406 (htmlParseLookupSequence(ctxt
, '<', '/', 0) < 0))
4408 htmlParseScript(ctxt
);
4409 if ((cur
== '<') && (next
== '/')) {
4410 ctxt
->instate
= XML_PARSER_END_TAG
;
4411 ctxt
->checkIndex
= 0;
4413 xmlGenericError(xmlGenericErrorContext
,
4414 "HPP: entering END_TAG\n");
4420 * Sometimes DOCTYPE arrives in the middle of the document
4422 if ((cur
== '<') && (next
== '!') &&
4423 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4424 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4425 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4428 (htmlParseLookupSequence(ctxt
, '>', 0, 0) < 0))
4430 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->error
!= NULL
))
4431 ctxt
->sax
->error(ctxt
->userData
,
4432 "Misplaced DOCTYPE declaration\n");
4433 ctxt
->wellFormed
= 0;
4434 htmlParseDocTypeDecl(ctxt
);
4435 } else if ((cur
== '<') && (next
== '!') &&
4436 (in
->cur
[2] == '-') && (in
->cur
[3] == '-')) {
4438 (htmlParseLookupSequence(ctxt
, '-', '-', '>') < 0))
4441 xmlGenericError(xmlGenericErrorContext
,
4442 "HPP: Parsing Comment\n");
4444 htmlParseComment(ctxt
);
4445 ctxt
->instate
= XML_PARSER_CONTENT
;
4446 } else if ((cur
== '<') && (next
== '!') && (avail
< 4)) {
4448 } else if ((cur
== '<') && (next
== '/')) {
4449 ctxt
->instate
= XML_PARSER_END_TAG
;
4450 ctxt
->checkIndex
= 0;
4452 xmlGenericError(xmlGenericErrorContext
,
4453 "HPP: entering END_TAG\n");
4456 } else if (cur
== '<') {
4457 ctxt
->instate
= XML_PARSER_START_TAG
;
4458 ctxt
->checkIndex
= 0;
4460 xmlGenericError(xmlGenericErrorContext
,
4461 "HPP: entering START_TAG\n");
4464 } else if (cur
== '&') {
4466 (htmlParseLookupSequence(ctxt
, ';', 0, 0) < 0))
4469 xmlGenericError(xmlGenericErrorContext
,
4470 "HPP: Parsing Reference\n");
4472 /* TODO: check generation of subtrees if noent !!! */
4473 htmlParseReference(ctxt
);
4475 /* TODO Avoid the extra copy, handle directly !!!!!! */
4477 * Goal of the following test is :
4478 * - minimize calls to the SAX 'character' callback
4479 * when they are mergeable
4481 if ((ctxt
->inputNr
== 1) &&
4482 (avail
< HTML_PARSER_BIG_BUFFER_SIZE
)) {
4484 (htmlParseLookupSequence(ctxt
, '<', 0, 0) < 0))
4487 ctxt
->checkIndex
= 0;
4489 xmlGenericError(xmlGenericErrorContext
,
4490 "HPP: Parsing char data\n");
4492 htmlParseCharData(ctxt
);
4495 if (cons
== ctxt
->nbChars
) {
4496 if (ctxt
->node
!= NULL
) {
4497 if ((ctxt
->sax
!= NULL
) && (ctxt
->sax
->error
!= NULL
))
4498 ctxt
->sax
->error(ctxt
->userData
,
4499 "detected an error in element content\n");
4500 ctxt
->wellFormed
= 0;
4508 case XML_PARSER_END_TAG
:
4512 (htmlParseLookupSequence(ctxt
, '>', 0, 0) < 0))
4514 htmlParseEndTag(ctxt
);
4515 if (ctxt
->nameNr
== 0) {
4516 ctxt
->instate
= XML_PARSER_EPILOG
;
4518 ctxt
->instate
= XML_PARSER_CONTENT
;
4520 ctxt
->checkIndex
= 0;
4522 xmlGenericError(xmlGenericErrorContext
,
4523 "HPP: entering CONTENT\n");
4526 case XML_PARSER_CDATA_SECTION
:
4527 xmlGenericError(xmlGenericErrorContext
,
4528 "HPP: internal error, state == CDATA\n");
4529 ctxt
->instate
= XML_PARSER_CONTENT
;
4530 ctxt
->checkIndex
= 0;
4532 xmlGenericError(xmlGenericErrorContext
,
4533 "HPP: entering CONTENT\n");
4536 case XML_PARSER_DTD
:
4537 xmlGenericError(xmlGenericErrorContext
,
4538 "HPP: internal error, state == DTD\n");
4539 ctxt
->instate
= XML_PARSER_CONTENT
;
4540 ctxt
->checkIndex
= 0;
4542 xmlGenericError(xmlGenericErrorContext
,
4543 "HPP: entering CONTENT\n");
4546 case XML_PARSER_COMMENT
:
4547 xmlGenericError(xmlGenericErrorContext
,
4548 "HPP: internal error, state == COMMENT\n");
4549 ctxt
->instate
= XML_PARSER_CONTENT
;
4550 ctxt
->checkIndex
= 0;
4552 xmlGenericError(xmlGenericErrorContext
,
4553 "HPP: entering CONTENT\n");
4557 xmlGenericError(xmlGenericErrorContext
,
4558 "HPP: internal error, state == PI\n");
4559 ctxt
->instate
= XML_PARSER_CONTENT
;
4560 ctxt
->checkIndex
= 0;
4562 xmlGenericError(xmlGenericErrorContext
,
4563 "HPP: entering CONTENT\n");
4566 case XML_PARSER_ENTITY_DECL
:
4567 xmlGenericError(xmlGenericErrorContext
,
4568 "HPP: internal error, state == ENTITY_DECL\n");
4569 ctxt
->instate
= XML_PARSER_CONTENT
;
4570 ctxt
->checkIndex
= 0;
4572 xmlGenericError(xmlGenericErrorContext
,
4573 "HPP: entering CONTENT\n");
4576 case XML_PARSER_ENTITY_VALUE
:
4577 xmlGenericError(xmlGenericErrorContext
,
4578 "HPP: internal error, state == ENTITY_VALUE\n");
4579 ctxt
->instate
= XML_PARSER_CONTENT
;
4580 ctxt
->checkIndex
= 0;
4582 xmlGenericError(xmlGenericErrorContext
,
4583 "HPP: entering DTD\n");
4586 case XML_PARSER_ATTRIBUTE_VALUE
:
4587 xmlGenericError(xmlGenericErrorContext
,
4588 "HPP: internal error, state == ATTRIBUTE_VALUE\n");
4589 ctxt
->instate
= XML_PARSER_START_TAG
;
4590 ctxt
->checkIndex
= 0;
4592 xmlGenericError(xmlGenericErrorContext
,
4593 "HPP: entering START_TAG\n");
4596 case XML_PARSER_SYSTEM_LITERAL
:
4597 xmlGenericError(xmlGenericErrorContext
,
4598 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
4599 ctxt
->instate
= XML_PARSER_CONTENT
;
4600 ctxt
->checkIndex
= 0;
4602 xmlGenericError(xmlGenericErrorContext
,
4603 "HPP: entering CONTENT\n");
4606 case XML_PARSER_IGNORE
:
4607 xmlGenericError(xmlGenericErrorContext
,
4608 "HPP: internal error, state == XML_PARSER_IGNORE\n");
4609 ctxt
->instate
= XML_PARSER_CONTENT
;
4610 ctxt
->checkIndex
= 0;
4612 xmlGenericError(xmlGenericErrorContext
,
4613 "HPP: entering CONTENT\n");
4616 case XML_PARSER_PUBLIC_LITERAL
:
4617 xmlGenericError(xmlGenericErrorContext
,
4618 "HPP: internal error, state == XML_PARSER_LITERAL\n");
4619 ctxt
->instate
= XML_PARSER_CONTENT
;
4620 ctxt
->checkIndex
= 0;
4622 xmlGenericError(xmlGenericErrorContext
,
4623 "HPP: entering CONTENT\n");
4630 if ((avail
== 0) && (terminate
)) {
4631 htmlAutoCloseOnEnd(ctxt
);
4632 if ((ctxt
->nameNr
== 0) && (ctxt
->instate
!= XML_PARSER_EOF
)) {
4634 * SAX: end of the document processing.
4636 ctxt
->instate
= XML_PARSER_EOF
;
4637 if ((ctxt
->sax
) && (ctxt
->sax
->endDocument
!= NULL
))
4638 ctxt
->sax
->endDocument(ctxt
->userData
);
4641 if ((ctxt
->myDoc
!= NULL
) &&
4642 ((terminate
) || (ctxt
->instate
== XML_PARSER_EOF
) ||
4643 (ctxt
->instate
== XML_PARSER_EPILOG
))) {
4645 dtd
= xmlGetIntSubset(ctxt
->myDoc
);
4647 ctxt
->myDoc
->intSubset
=
4648 xmlCreateIntSubset(ctxt
->myDoc
, BAD_CAST
"HTML",
4649 BAD_CAST
"-//W3C//DTD HTML 4.0 Transitional//EN",
4650 BAD_CAST
"http://www.w3.org/TR/REC-html40/loose.dtd");
4653 xmlGenericError(xmlGenericErrorContext
, "HPP: done %d\n", ret
);
4660 * @ctxt: an XML parser context
4661 * @chunk: an char array
4662 * @size: the size in byte of the chunk
4663 * @terminate: last chunk indicator
4665 * Parse a Chunk of memory
4667 * Returns zero if no error, the xmlParserErrors otherwise.
4670 htmlParseChunk(htmlParserCtxtPtr ctxt
, const char *chunk
, int size
,
4672 if ((size
> 0) && (chunk
!= NULL
) && (ctxt
->input
!= NULL
) &&
4673 (ctxt
->input
->buf
!= NULL
) && (ctxt
->instate
!= XML_PARSER_EOF
)) {
4674 int base
= ctxt
->input
->base
- ctxt
->input
->buf
->buffer
->content
;
4675 int cur
= ctxt
->input
->cur
- ctxt
->input
->base
;
4677 xmlParserInputBufferPush(ctxt
->input
->buf
, size
, chunk
);
4678 ctxt
->input
->base
= ctxt
->input
->buf
->buffer
->content
+ base
;
4679 ctxt
->input
->cur
= ctxt
->input
->base
+ cur
;
4681 xmlGenericError(xmlGenericErrorContext
, "HPP: pushed %d\n", size
);
4684 if ((terminate
) || (ctxt
->input
->buf
->buffer
->use
> 80))
4685 htmlParseTryOrFinish(ctxt
, terminate
);
4686 } else if (ctxt
->instate
!= XML_PARSER_EOF
) {
4687 xmlParserInputBufferPush(ctxt
->input
->buf
, 0, "");
4688 htmlParseTryOrFinish(ctxt
, terminate
);
4691 if ((ctxt
->instate
!= XML_PARSER_EOF
) &&
4692 (ctxt
->instate
!= XML_PARSER_EPILOG
) &&
4693 (ctxt
->instate
!= XML_PARSER_MISC
)) {
4694 ctxt
->errNo
= XML_ERR_DOCUMENT_END
;
4695 ctxt
->wellFormed
= 0;
4697 if (ctxt
->instate
!= XML_PARSER_EOF
) {
4698 if ((ctxt
->sax
) && (ctxt
->sax
->endDocument
!= NULL
))
4699 ctxt
->sax
->endDocument(ctxt
->userData
);
4701 ctxt
->instate
= XML_PARSER_EOF
;
4703 return((xmlParserErrors
) ctxt
->errNo
);
4706 /************************************************************************
4708 * User entry points *
4710 ************************************************************************/
4713 * htmlCreatePushParserCtxt :
4714 * @sax: a SAX handler
4715 * @user_data: The user data returned on SAX callbacks
4716 * @chunk: a pointer to an array of chars
4717 * @size: number of chars in the array
4718 * @filename: an optional file name or URI
4719 * @enc: an optional encoding
4721 * Create a parser context for using the HTML parser in push mode
4722 * To allow content encoding detection, @size should be >= 4
4723 * The value of @filename is used for fetching external entities
4724 * and error/warning reports.
4726 * Returns the new parser context or NULL
4729 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax
, void *user_data
,
4730 const char *chunk
, int size
, const char *filename
,
4731 xmlCharEncoding enc
) {
4732 htmlParserCtxtPtr ctxt
;
4733 htmlParserInputPtr inputStream
;
4734 xmlParserInputBufferPtr buf
;
4738 buf
= xmlAllocParserInputBuffer(enc
);
4739 if (buf
== NULL
) return(NULL
);
4741 ctxt
= (htmlParserCtxtPtr
) xmlMalloc(sizeof(htmlParserCtxt
));
4746 memset(ctxt
, 0, sizeof(htmlParserCtxt
));
4747 htmlInitParserCtxt(ctxt
);
4749 if (ctxt
->sax
!= &htmlDefaultSAXHandler
)
4751 ctxt
->sax
= (htmlSAXHandlerPtr
) xmlMalloc(sizeof(htmlSAXHandler
));
4752 if (ctxt
->sax
== NULL
) {
4757 memcpy(ctxt
->sax
, sax
, sizeof(htmlSAXHandler
));
4758 if (user_data
!= NULL
)
4759 ctxt
->userData
= user_data
;
4761 if (filename
== NULL
) {
4762 ctxt
->directory
= NULL
;
4764 ctxt
->directory
= xmlParserGetDirectory(filename
);
4767 inputStream
= htmlNewInputStream(ctxt
);
4768 if (inputStream
== NULL
) {
4769 xmlFreeParserCtxt(ctxt
);
4773 if (filename
== NULL
)
4774 inputStream
->filename
= NULL
;
4776 inputStream
->filename
= xmlMemStrdup(filename
);
4777 inputStream
->buf
= buf
;
4778 inputStream
->base
= inputStream
->buf
->buffer
->content
;
4779 inputStream
->cur
= inputStream
->buf
->buffer
->content
;
4781 inputPush(ctxt
, inputStream
);
4783 if ((size
> 0) && (chunk
!= NULL
) && (ctxt
->input
!= NULL
) &&
4784 (ctxt
->input
->buf
!= NULL
)) {
4785 xmlParserInputBufferPush(ctxt
->input
->buf
, size
, chunk
);
4787 xmlGenericError(xmlGenericErrorContext
, "HPP: pushed %d\n", size
);
4796 * @cur: a pointer to an array of xmlChar
4797 * @encoding: a free form C string describing the HTML document encoding, or NULL
4798 * @sax: the SAX handler block
4799 * @userData: if using SAX, this pointer will be provided on callbacks.
4801 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
4802 * to handle parse events. If sax is NULL, fallback to the default DOM
4803 * behavior and return a tree.
4805 * Returns the resulting document tree unless SAX is NULL or the document is
4810 htmlSAXParseDoc(xmlChar
*cur
, const char *encoding
, htmlSAXHandlerPtr sax
, void *userData
) {
4812 htmlParserCtxtPtr ctxt
;
4816 if (cur
== NULL
) return(NULL
);
4819 ctxt
= htmlCreateDocParserCtxt(cur
, encoding
);
4820 if (ctxt
== NULL
) return(NULL
);
4823 ctxt
->userData
= userData
;
4826 htmlParseDocument(ctxt
);
4830 ctxt
->userData
= NULL
;
4832 htmlFreeParserCtxt(ctxt
);
4839 * @cur: a pointer to an array of xmlChar
4840 * @encoding: a free form C string describing the HTML document encoding, or NULL
4842 * parse an HTML in-memory document and build a tree.
4844 * Returns the resulting document tree
4848 htmlParseDoc(xmlChar
*cur
, const char *encoding
) {
4849 return(htmlSAXParseDoc(cur
, encoding
, NULL
, NULL
));
4854 * htmlCreateFileParserCtxt :
4855 * @filename: the filename
4856 * @encoding: a free form C string describing the HTML document encoding, or NULL
4858 * Create a parser context for a file content.
4859 * Automatic support for ZLIB/Compress compressed document is provided
4860 * by default if found at compile-time.
4862 * Returns the new parser context or NULL
4865 htmlCreateFileParserCtxt(const char *filename
, const char *encoding
)
4867 htmlParserCtxtPtr ctxt
;
4868 htmlParserInputPtr inputStream
;
4869 xmlParserInputBufferPtr buf
;
4870 /* htmlCharEncoding enc; */
4871 xmlChar
*content
, *content_line
= (xmlChar
*) "charset=";
4873 buf
= xmlParserInputBufferCreateFilename(filename
, XML_CHAR_ENCODING_NONE
);
4874 if (buf
== NULL
) return(NULL
);
4876 ctxt
= (htmlParserCtxtPtr
) xmlMalloc(sizeof(htmlParserCtxt
));
4878 xmlGenericError(xmlGenericErrorContext
, "malloc failed\n");
4881 memset(ctxt
, 0, sizeof(htmlParserCtxt
));
4882 htmlInitParserCtxt(ctxt
);
4883 inputStream
= (htmlParserInputPtr
) xmlMalloc(sizeof(htmlParserInput
));
4884 if (inputStream
== NULL
) {
4885 xmlGenericError(xmlGenericErrorContext
, "malloc failed\n");
4889 memset(inputStream
, 0, sizeof(htmlParserInput
));
4891 inputStream
->filename
= (char *)
4892 xmlNormalizeWindowsPath((xmlChar
*)filename
);
4893 inputStream
->line
= 1;
4894 inputStream
->col
= 1;
4895 inputStream
->buf
= buf
;
4896 inputStream
->directory
= NULL
;
4898 inputStream
->base
= inputStream
->buf
->buffer
->content
;
4899 inputStream
->cur
= inputStream
->buf
->buffer
->content
;
4900 inputStream
->free
= NULL
;
4902 inputPush(ctxt
, inputStream
);
4906 content
= xmlMalloc (xmlStrlen(content_line
) + strlen(encoding
) + 1);
4908 strcpy ((char *)content
, (char *)content_line
);
4909 strcat ((char *)content
, (char *)encoding
);
4910 htmlCheckEncoding (ctxt
, content
);
4919 * htmlSAXParseFile :
4920 * @filename: the filename
4921 * @encoding: a free form C string describing the HTML document encoding, or NULL
4922 * @sax: the SAX handler block
4923 * @userData: if using SAX, this pointer will be provided on callbacks.
4925 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4926 * compressed document is provided by default if found at compile-time.
4927 * It use the given SAX function block to handle the parsing callback.
4928 * If sax is NULL, fallback to the default DOM tree building routines.
4930 * Returns the resulting document tree unless SAX is NULL or the document is
4935 htmlSAXParseFile(const char *filename
, const char *encoding
, htmlSAXHandlerPtr sax
,
4938 htmlParserCtxtPtr ctxt
;
4939 htmlSAXHandlerPtr oldsax
= NULL
;
4943 ctxt
= htmlCreateFileParserCtxt(filename
, encoding
);
4944 if (ctxt
== NULL
) return(NULL
);
4948 ctxt
->userData
= userData
;
4951 htmlParseDocument(ctxt
);
4956 ctxt
->userData
= NULL
;
4958 htmlFreeParserCtxt(ctxt
);
4965 * @filename: the filename
4966 * @encoding: a free form C string describing the HTML document encoding, or NULL
4968 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4969 * compressed document is provided by default if found at compile-time.
4971 * Returns the resulting document tree
4975 htmlParseFile(const char *filename
, const char *encoding
) {
4976 return(htmlSAXParseFile(filename
, encoding
, NULL
, NULL
));
4980 * htmlHandleOmittedElem:
4983 * Set and return the previous value for handling HTML omitted tags.
4985 * Returns the last value for 0 for no handling, 1 for auto insertion.
4989 htmlHandleOmittedElem(int val
) {
4990 int old
= htmlOmittedDefaultValue
;
4992 htmlOmittedDefaultValue
= val
;
4996 #endif /* LIBXML_HTML_ENABLED */