dsrc isn't necessary for this repo
[client-tools.git] / src / external / 3rd / library / libxml / HTMLparser.c
blobc775b89c6dd6a8d42e9b618edda3ba3aa443290d
/*
 * HTMLparser.c : an HTML 4.0 non-verifying parser
 *
 * See Copyright for the status of this software.
 *
 * daniel@veillard.com
 */
9 #define IN_LIBXML
10 #include "libxml.h"
11 #ifdef LIBXML_HTML_ENABLED
13 #include <string.h>
14 #ifdef HAVE_CTYPE_H
15 #include <ctype.h>
16 #endif
17 #ifdef HAVE_STDLIB_H
18 #include <stdlib.h>
19 #endif
20 #ifdef HAVE_SYS_STAT_H
21 #include <sys/stat.h>
22 #endif
23 #ifdef HAVE_FCNTL_H
24 #include <fcntl.h>
25 #endif
26 #ifdef HAVE_UNISTD_H
27 #include <unistd.h>
28 #endif
29 #ifdef HAVE_ZLIB_H
30 #include <zlib.h>
31 #endif
33 #include <libxml/xmlmemory.h>
34 #include <libxml/tree.h>
35 #include <libxml/parser.h>
36 #include <libxml/parserInternals.h>
37 #include <libxml/xmlerror.h>
38 #include <libxml/HTMLparser.h>
39 #include <libxml/HTMLtree.h>
40 #include <libxml/entities.h>
41 #include <libxml/encoding.h>
42 #include <libxml/valid.h>
43 #include <libxml/xmlIO.h>
44 #include <libxml/globals.h>
46 #define HTML_MAX_NAMELEN 1000
47 #define HTML_PARSER_BIG_BUFFER_SIZE 1000
48 #define HTML_PARSER_BUFFER_SIZE 100
50 /* #define DEBUG */
51 /* #define DEBUG_PUSH */
53 static int htmlOmittedDefaultValue = 1;
55 xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
56 xmlChar end, xmlChar end2, xmlChar end3);
57 static void htmlParseComment(htmlParserCtxtPtr ctxt);
/************************************************************************
 *                                                                      *
 *              Parser stacks related functions and macros              *
 *                                                                      *
 ************************************************************************/
66 * Generic function for accessing stacks in the Parser Context
69 #define PUSH_AND_POP(scope, type, name) \
70 scope int html##name##Push(htmlParserCtxtPtr ctxt, type value) { \
71 if (ctxt->name##Nr >= ctxt->name##Max) { \
72 ctxt->name##Max *= 2; \
73 ctxt->name##Tab = (type *) xmlRealloc(ctxt->name##Tab, \
74 ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \
75 if (ctxt->name##Tab == NULL) { \
76 xmlGenericError(xmlGenericErrorContext, \
77 "realloc failed !\n"); \
78 return(0); \
79 } \
80 } \
81 ctxt->name##Tab[ctxt->name##Nr] = value; \
82 ctxt->name = value; \
83 return(ctxt->name##Nr++); \
84 } \
85 scope type html##name##Pop(htmlParserCtxtPtr ctxt) { \
86 type ret; \
87 if (ctxt->name##Nr <= 0) return(0); \
88 ctxt->name##Nr--; \
89 if (ctxt->name##Nr < 0) return(0); \
90 if (ctxt->name##Nr > 0) \
91 ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \
92 else \
93 ctxt->name = NULL; \
94 ret = ctxt->name##Tab[ctxt->name##Nr]; \
95 ctxt->name##Tab[ctxt->name##Nr] = 0; \
96 return(ret); \
97 } \
99 /* PUSH_AND_POP(static, xmlNodePtr, node) */
100 PUSH_AND_POP(static, xmlChar*, name)
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported. All of them expect a variable named ctxt to be in scope.
 *
 * Raw byte access (compare against ASCII values only):
 *   CUR_PTR   the current input pointer, ctxt->input->cur
 *   CUR       the byte at the current input pointer, as an int
 *   CURRENT   same expansion as CUR
 *   RAW       -1 while ctxt->token is set, the current byte otherwise
 *   NXT(n)    the byte n positions after the current one
 *   UPPER     toupper() of the current byte
 *   UPP(n)    toupper() of the byte n positions after the current one
 *
 * Moving forward:
 *   SKIP(n)   advance the input pointer and ctxt->nbChars by n
 *   NEXT      call xmlNextChar() and count one more char in ctxt->nbChars
 *   NEXTL(l)  advance by l bytes, keeping line/col up to date and clearing
 *             ctxt->token
 *   SKIP_BLANKS  htmlSkipBlankChars(ctxt)
 *
 * Buffer management:
 *   SHRINK    xmlParserInputShrink() on the current input
 *   GROW      xmlParserInputGrow() on the current input by INPUT_CHUNK
 *
 * Decoded access:
 *   CUR_CHAR(l)       htmlCurrentChar(); l receives the byte length
 *   CUR_SCHAR(s, l)   xmlStringCurrentChar() on string s
 *   COPY_BUF(l,b,i,v) store char v (l bytes long) at b[i] and advance i
 */

#define CUR_PTR ctxt->input->cur

/* Inported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))

#define CURRENT ((int) (*ctxt->input->cur))

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))

#define NXT(val) ctxt->input->cur[(val)]

#define UPPER (toupper(*ctxt->input->cur))

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val)

#define NEXT xmlNextChar(ctxt),ctxt->nbChars++

#define NEXTL(l) do {                                                   \
    if (*(ctxt->input->cur) == '\n') {                                  \
        ctxt->input->line++; ctxt->input->col = 1;                      \
    } else ctxt->input->col++;                                          \
    ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++;            \
  } while (0)

/************
    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);     \
    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 ************/

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

#define SHRINK xmlParserInputShrink(ctxt->input)

#define GROW xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)

/*
 * NOTE(review): expands to a bare if/else, so it must be used as a full
 * statement followed by ';' and never as the body of an unbraced if.
 */
#define COPY_BUF(l,b,i,v)                                               \
    if (l == 1) b[i++] = (xmlChar) v;                                   \
    else i += xmlCopyChar(l,&b[i],v)
180 * htmlCurrentChar:
181 * @ctxt: the HTML parser context
182 * @len: pointer to the length of the char read
184 * The current char value, if using UTF-8 this may actually span multiple
185 * bytes in the input buffer. Implement the end of line normalization:
186 * 2.11 End-of-Line Handling
187 * If the encoding is unspecified, in the case we find an ISO-Latin-1
188 * char, then the encoding converter is plugged in automatically.
190 * Returns the current char value and its length
193 static int
194 htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
195 if (ctxt->instate == XML_PARSER_EOF)
196 return(0);
198 if (ctxt->token != 0) {
199 *len = 0;
200 return(ctxt->token);
202 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
204 * We are supposed to handle UTF8, check it's valid
205 * From rfc2044: encoding of the Unicode values on UTF-8:
207 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
208 * 0000 0000-0000 007F 0xxxxxxx
209 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
210 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
212 * Check for the 0x110000 limit too
214 const unsigned char *cur = ctxt->input->cur;
215 unsigned char c;
216 unsigned int val;
218 c = *cur;
219 if (c & 0x80) {
220 if (cur[1] == 0)
221 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
222 if ((cur[1] & 0xc0) != 0x80)
223 goto encoding_error;
224 if ((c & 0xe0) == 0xe0) {
226 if (cur[2] == 0)
227 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
228 if ((cur[2] & 0xc0) != 0x80)
229 goto encoding_error;
230 if ((c & 0xf0) == 0xf0) {
231 if (cur[3] == 0)
232 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
233 if (((c & 0xf8) != 0xf0) ||
234 ((cur[3] & 0xc0) != 0x80))
235 goto encoding_error;
236 /* 4-byte code */
237 *len = 4;
238 val = (cur[0] & 0x7) << 18;
239 val |= (cur[1] & 0x3f) << 12;
240 val |= (cur[2] & 0x3f) << 6;
241 val |= cur[3] & 0x3f;
242 } else {
243 /* 3-byte code */
244 *len = 3;
245 val = (cur[0] & 0xf) << 12;
246 val |= (cur[1] & 0x3f) << 6;
247 val |= cur[2] & 0x3f;
249 } else {
250 /* 2-byte code */
251 *len = 2;
252 val = (cur[0] & 0x1f) << 6;
253 val |= cur[1] & 0x3f;
255 if (!IS_CHAR(val)) {
256 ctxt->errNo = XML_ERR_INVALID_ENCODING;
257 if ((ctxt->sax != NULL) &&
258 (ctxt->sax->error != NULL))
259 ctxt->sax->error(ctxt->userData,
260 "Char 0x%X out of allowed range\n", val);
261 ctxt->wellFormed = 0;
262 if (ctxt->recovery == 0) ctxt->disableSAX = 1;
264 return(val);
265 } else {
266 /* 1-byte code */
267 *len = 1;
268 return((int) *ctxt->input->cur);
272 * Assume it's a fixed length encoding (1) with
273 * a compatible encoding for the ASCII set, since
274 * XML constructs only use < 128 chars
276 *len = 1;
277 if ((int) *ctxt->input->cur < 0x80)
278 return((int) *ctxt->input->cur);
281 * Humm this is bad, do an automatic flow conversion
283 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
284 ctxt->charset = XML_CHAR_ENCODING_UTF8;
285 return(xmlCurrentChar(ctxt, len));
287 encoding_error:
289 * If we detect an UTF8 error that probably mean that the
290 * input encoding didn't get properly advertized in the
291 * declaration header. Report the error and switch the encoding
292 * to ISO-Latin-1 (if you don't like this policy, just declare the
293 * encoding !)
295 ctxt->errNo = XML_ERR_INVALID_ENCODING;
296 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
297 ctxt->sax->error(ctxt->userData,
298 "Input is not proper UTF-8, indicate encoding !\n");
299 ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
300 ctxt->input->cur[0], ctxt->input->cur[1],
301 ctxt->input->cur[2], ctxt->input->cur[3]);
304 ctxt->charset = XML_CHAR_ENCODING_8859_1;
305 *len = 1;
306 return((int) *ctxt->input->cur);
310 * htmlSkipBlankChars:
311 * @ctxt: the HTML parser context
313 * skip all blanks character found at that point in the input streams.
315 * Returns the number of space chars skipped
318 static int
319 htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
320 int res = 0;
322 while (IS_BLANK(*(ctxt->input->cur))) {
323 if ((*ctxt->input->cur == 0) &&
324 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
325 xmlPopInput(ctxt);
326 } else {
327 if (*(ctxt->input->cur) == '\n') {
328 ctxt->input->line++; ctxt->input->col = 1;
329 } else ctxt->input->col++;
330 ctxt->input->cur++;
331 ctxt->nbChars++;
332 if (*ctxt->input->cur == 0)
333 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
335 res++;
337 return(res);
/************************************************************************
 *                                                                      *
 *              The list of HTML elements and their properties          *
 *                                                                      *
 ************************************************************************/
349 * Start Tag: 1 means the start tag can be ommited
350 * End Tag: 1 means the end tag can be ommited
351 * 2 means it's forbidden (empty elements)
352 * 3 means the tag is stylistic and should be closed easily
353 * Depr: this element is deprecated
354 * DTD: 1 means that this element is valid only in the Loose DTD
355 * 2 means that this element is valid only in the Frameset DTD
357 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
359 static const htmlElemDesc
360 html40ElementTable[] = {
361 { "a", 0, 0, 0, 0, 0, 0, 1, "anchor " },
362 { "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form" },
363 { "acronym", 0, 0, 0, 0, 0, 0, 1, "" },
364 { "address", 0, 0, 0, 0, 0, 0, 0, "information on author " },
365 { "applet", 0, 0, 0, 0, 1, 1, 2, "java applet " },
366 { "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area " },
367 { "b", 0, 3, 0, 0, 0, 0, 1, "bold text style" },
368 { "base", 0, 2, 2, 1, 0, 0, 0, "document base uri " },
369 { "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " },
370 { "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride " },
371 { "big", 0, 3, 0, 0, 0, 0, 1, "large text style" },
372 { "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation " },
373 { "body", 1, 1, 0, 0, 0, 0, 0, "document body " },
374 { "br", 0, 2, 2, 1, 0, 0, 1, "forced line break " },
375 { "button", 0, 0, 0, 0, 0, 0, 2, "push button " },
376 { "caption", 0, 0, 0, 0, 0, 0, 0, "table caption " },
377 { "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center " },
378 { "cite", 0, 0, 0, 0, 0, 0, 1, "citation" },
379 { "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment" },
380 { "col", 0, 2, 2, 1, 0, 0, 0, "table column " },
381 { "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group " },
382 { "dd", 0, 1, 0, 0, 0, 0, 0, "definition description " },
383 { "del", 0, 0, 0, 0, 0, 0, 2, "deleted text " },
384 { "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition" },
385 { "dir", 0, 0, 0, 0, 1, 1, 0, "directory list" },
386 { "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container"},
387 { "dl", 0, 0, 0, 0, 0, 0, 0, "definition list " },
388 { "dt", 0, 1, 0, 0, 0, 0, 0, "definition term " },
389 { "em", 0, 3, 0, 0, 0, 0, 1, "emphasis" },
390 { "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group " },
391 { "font", 0, 3, 0, 0, 1, 1, 1, "local change to font " },
392 { "form", 0, 0, 0, 0, 0, 0, 0, "interactive form " },
393 { "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " },
394 { "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" },
395 { "h1", 0, 0, 0, 0, 0, 0, 0, "heading " },
396 { "h2", 0, 0, 0, 0, 0, 0, 0, "heading " },
397 { "h3", 0, 0, 0, 0, 0, 0, 0, "heading " },
398 { "h4", 0, 0, 0, 0, 0, 0, 0, "heading " },
399 { "h5", 0, 0, 0, 0, 0, 0, 0, "heading " },
400 { "h6", 0, 0, 0, 0, 0, 0, 0, "heading " },
401 { "head", 1, 1, 0, 0, 0, 0, 0, "document head " },
402 { "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " },
403 { "html", 1, 1, 0, 0, 0, 0, 0, "document root element " },
404 { "i", 0, 3, 0, 0, 0, 0, 1, "italic text style" },
405 { "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow " },
406 { "img", 0, 2, 2, 1, 0, 0, 1, "embedded image " },
407 { "input", 0, 2, 2, 1, 0, 0, 1, "form control " },
408 { "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text" },
409 { "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt " },
410 { "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user" },
411 { "label", 0, 0, 0, 0, 0, 0, 1, "form field label text " },
412 { "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend " },
413 { "li", 0, 1, 1, 0, 0, 0, 0, "list item " },
414 { "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link " },
415 { "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map " },
416 { "menu", 0, 0, 0, 0, 1, 1, 0, "menu list " },
417 { "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation " },
418 { "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering " },
419 { "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
420 { "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object " },
421 { "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list " },
422 { "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group " },
423 { "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " },
424 { "p", 0, 1, 0, 0, 0, 0, 0, "paragraph " },
425 { "param", 0, 2, 2, 1, 0, 0, 0, "named property value " },
426 { "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text " },
427 { "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation " },
428 { "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style" },
429 { "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc." },
430 { "script", 0, 0, 0, 0, 0, 0, 2, "script statements " },
431 { "select", 0, 0, 0, 0, 0, 0, 1, "option selector " },
432 { "small", 0, 3, 0, 0, 0, 0, 1, "small text style" },
433 { "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container " },
434 { "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text" },
435 { "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis" },
436 { "style", 0, 0, 0, 0, 0, 0, 0, "style info " },
437 { "sub", 0, 3, 0, 0, 0, 0, 1, "subscript" },
438 { "sup", 0, 3, 0, 0, 0, 0, 1, "superscript " },
439 { "table", 0, 0, 0, 0, 0, 0, 0, "&#160;" },
440 { "tbody", 1, 0, 0, 0, 0, 0, 0, "table body " },
441 { "td", 0, 0, 0, 0, 0, 0, 0, "table data cell" },
442 { "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field " },
443 { "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer " },
444 { "th", 0, 1, 0, 0, 0, 0, 0, "table header cell" },
445 { "thead", 0, 1, 0, 0, 0, 0, 0, "table header " },
446 { "title", 0, 0, 0, 0, 0, 0, 0, "document title " },
447 { "tr", 0, 0, 0, 0, 0, 0, 0, "table row " },
448 { "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style" },
449 { "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style" },
450 { "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list " },
451 { "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument" },
/*
 * start tags that imply the end of current element
 *
 * The array is a sequence of NULL-terminated groups. The first string of a
 * group is the start tag being opened; the remaining strings are the open
 * elements that this start tag closes. An extra NULL ends the whole list.
 */
static const char *htmlStartClose[] = {
    "form",       "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
                  "dl", "ul", "ol", "menu", "dir", "address", "pre",
                  "listing", "xmp", "head", NULL,
    "head",       "p", NULL,
    "title",      "p", NULL,
    "body",       "head", "style", "link", "title", "p", NULL,
    "li",         "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
                  "pre", "listing", "xmp", "head", "li", NULL,
    "hr",         "p", "head", NULL,
    "h1",         "p", "head", NULL,
    "h2",         "p", "head", NULL,
    "h3",         "p", "head", NULL,
    "h4",         "p", "head", NULL,
    "h5",         "p", "head", NULL,
    "h6",         "p", "head", NULL,
    "dir",        "p", "head", NULL,
    "address",    "p", "head", "ul", NULL,
    "pre",        "p", "head", "ul", NULL,
    "listing",    "p", "head", NULL,
    "xmp",        "p", "head", NULL,
    "blockquote", "p", "head", NULL,
    "dl",         "p", "dt", "menu", "dir", "address", "pre", "listing",
                  "xmp", "head", NULL,
    "dt",         "p", "menu", "dir", "address", "pre", "listing", "xmp",
                  "head", "dd", NULL,
    "dd",         "p", "menu", "dir", "address", "pre", "listing", "xmp",
                  "head", "dt", NULL,
    "ul",         "p", "head", "ol", "menu", "dir", "address", "pre",
                  "listing", "xmp", NULL,
    "ol",         "p", "head", "ul", NULL,
    "menu",       "p", "head", "ul", NULL,
    "p",          "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
    "div",        "p", "head", NULL,
    "noscript",   "p", "head", NULL,
    "center",     "font", "b", "i", "p", "head", NULL,
    "a",          "a", NULL,
    "caption",    "p", NULL,
    "colgroup",   "caption", "colgroup", "col", "p", NULL,
    "col",        "caption", "col", "p", NULL,
    "table",      "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
                  "listing", "xmp", "a", NULL,
    "th",         "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "td",         "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "tr",         "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
    "thead",      "caption", "col", "colgroup", NULL,
    "tfoot",      "th", "td", "tr", "caption", "col", "colgroup", "thead",
                  "tbody", "p", NULL,
    "tbody",      "th", "td", "tr", "caption", "col", "colgroup", "thead",
                  "tfoot", "tbody", "p", NULL,
    "optgroup",   "option", NULL,
    "option",     "option", NULL,
    "fieldset",   "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
                  "pre", "listing", "xmp", "a", NULL,
    NULL
};
/*
 * The list of HTML elements which are supposed not to have
 * CDATA content and where a p element will be implied
 * (NULL-terminated; consulted by htmlCheckParagraph()).
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *htmlNoContentElements[] = {
    "html", "head", "body",
    NULL
};
/*
 * The list of HTML attributes which are of content %Script;
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 *
 * The array has no terminator; it is walked using its sizeof.
 * "onreset" was previously misspelled "onrest", so the form reset handler
 * was never recognised as a script attribute.
 */
static const char *htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",
    "onchange",
    "onselect"
};
/*
 * This table is used by the htmlparser to know what to do with
 * broken html pages. By assigning different priorities to different
 * elements the parser can decide how to handle extra endtags.
 * Endtags are only allowed to close elements with lower or equal
 * priority.
 */
typedef struct {
    const char *name;   /* element name, NULL in the closing entry */
    int priority;       /* higher values are harder to close implicitly */
} elementPriority;

/*
 * The NULL-named entry both ends the table and carries the priority
 * returned for every element that is not listed.
 */
static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 } /* Default priority */
};
582 static const char** htmlStartCloseIndex[100];
583 static int htmlStartCloseIndexinitialized = 0;
585 /************************************************************************
587 * functions to handle HTML specific data *
589 ************************************************************************/
592 * htmlInitAutoClose:
594 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
595 * This is not reentrant. Call xmlInitParser() once before processing in
596 * case of use in multithreaded programs.
598 void
599 htmlInitAutoClose(void) {
600 int indx, i = 0;
602 if (htmlStartCloseIndexinitialized) return;
604 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
605 indx = 0;
606 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
607 htmlStartCloseIndex[indx++] = &htmlStartClose[i];
608 while (htmlStartClose[i] != NULL) i++;
609 i++;
611 htmlStartCloseIndexinitialized = 1;
615 * htmlTagLookup:
616 * @tag: The tag name in lowercase
618 * Lookup the HTML tag in the ElementTable
620 * Returns the related htmlElemDescPtr or NULL if not found.
622 const htmlElemDesc *
623 htmlTagLookup(const xmlChar *tag) {
624 unsigned int i;
626 for (i = 0; i < (sizeof(html40ElementTable) /
627 sizeof(html40ElementTable[0]));i++) {
628 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
629 return((const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) &html40ElementTable[i]);
631 return(NULL);
635 * htmlGetEndPriority:
636 * @name: The name of the element to look up the priority for.
638 * Return value: The "endtag" priority.
640 static int
641 htmlGetEndPriority (const xmlChar *name) {
642 int i = 0;
644 while ((htmlEndPriority[i].name != NULL) &&
645 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
646 i++;
648 return(htmlEndPriority[i].priority);
652 * htmlCheckAutoClose:
653 * @newtag: The new tag name
654 * @oldtag: The old tag name
656 * Checks whether the new tag is one of the registered valid tags for
657 * closing old.
658 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
660 * Returns 0 if no, 1 if yes.
662 static int
663 htmlCheckAutoClose(const xmlChar *newtag, const xmlChar *oldtag) {
664 int i, indx;
665 const char **closed = NULL;
667 if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
669 /* inefficient, but not a big deal */
670 for (indx = 0; indx < 100;indx++) {
671 closed = htmlStartCloseIndex[indx];
672 if (closed == NULL) return(0);
673 if (xmlStrEqual(BAD_CAST *closed, newtag)) break;
676 i = closed - htmlStartClose;
677 i++;
678 while (htmlStartClose[i] != NULL) {
679 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
680 return(1);
682 i++;
684 return(0);
688 * htmlAutoCloseOnClose:
689 * @ctxt: an HTML parser context
690 * @newtag: The new tag name
691 * @force: force the tag closure
693 * The HTML DTD allows an ending tag to implicitly close other tags.
695 static void
696 htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
697 const htmlElemDesc * info;
698 xmlChar *oldname;
699 int i, priority;
701 #ifdef DEBUG
702 xmlGenericError(xmlGenericErrorContext,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr);
703 for (i = 0;i < ctxt->nameNr;i++)
704 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
705 #endif
707 priority = htmlGetEndPriority (newtag);
709 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
711 if (xmlStrEqual(newtag, ctxt->nameTab[i])) break;
713 * A missplaced endtag can only close elements with lower
714 * or equal priority, so if we find an element with higher
715 * priority before we find an element with
716 * matching name, we just ignore this endtag
718 if (htmlGetEndPriority (ctxt->nameTab[i]) > priority) return;
720 if (i < 0) return;
722 while (!xmlStrEqual(newtag, ctxt->name)) {
723 info = htmlTagLookup(ctxt->name);
724 if ((info == NULL) || (info->endTag == 1)) {
725 #ifdef DEBUG
726 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
727 #endif
728 } else if (info->endTag == 3) {
729 #ifdef DEBUG
730 xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", newtag, ctxt->name);
732 #endif
733 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
734 ctxt->sax->error(ctxt->userData,
735 "Opening and ending tag mismatch: %s and %s\n",
736 newtag, ctxt->name);
737 ctxt->wellFormed = 0;
739 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
740 ctxt->sax->endElement(ctxt->userData, ctxt->name);
741 oldname = htmlnamePop(ctxt);
742 if (oldname != NULL) {
743 #ifdef DEBUG
744 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: popped %s\n", oldname);
745 #endif
746 xmlFree(oldname);
752 * htmlAutoCloseOnEnd:
753 * @ctxt: an HTML parser context
755 * Close all remaining tags at the end of the stream
757 static void
758 htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt) {
759 xmlChar *oldname;
760 int i;
762 if (ctxt->nameNr == 0)
763 return;
764 #ifdef DEBUG
765 xmlGenericError(xmlGenericErrorContext,"Close of stack: %d elements\n", ctxt->nameNr);
766 #endif
768 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
769 #ifdef DEBUG
770 xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
771 #endif
772 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
773 ctxt->sax->endElement(ctxt->userData, ctxt->name);
774 oldname = htmlnamePop(ctxt);
775 if (oldname != NULL) {
776 #ifdef DEBUG
777 xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnEnd: popped %s\n", oldname);
778 #endif
779 xmlFree(oldname);
785 * htmlAutoClose:
786 * @ctxt: an HTML parser context
787 * @newtag: The new tag name or NULL
789 * The HTML DTD allows a tag to implicitly close other tags.
790 * The list is kept in htmlStartClose array. This function is
791 * called when a new tag has been detected and generates the
792 * appropriates closes if possible/needed.
793 * If newtag is NULL this mean we are at the end of the resource
794 * and we should check
796 static void
797 htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
798 xmlChar *oldname;
799 while ((newtag != NULL) && (ctxt->name != NULL) &&
800 (htmlCheckAutoClose(newtag, ctxt->name))) {
801 #ifdef DEBUG
802 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: %s closes %s\n", newtag, ctxt->name);
803 #endif
804 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
805 ctxt->sax->endElement(ctxt->userData, ctxt->name);
806 oldname = htmlnamePop(ctxt);
807 if (oldname != NULL) {
808 #ifdef DEBUG
809 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
810 #endif
811 xmlFree(oldname);
814 if (newtag == NULL) {
815 htmlAutoCloseOnEnd(ctxt);
816 return;
818 while ((newtag == NULL) && (ctxt->name != NULL) &&
819 ((xmlStrEqual(ctxt->name, BAD_CAST"head")) ||
820 (xmlStrEqual(ctxt->name, BAD_CAST"body")) ||
821 (xmlStrEqual(ctxt->name, BAD_CAST"html")))) {
822 #ifdef DEBUG
823 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: EOF closes %s\n", ctxt->name);
824 #endif
825 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
826 ctxt->sax->endElement(ctxt->userData, ctxt->name);
827 oldname = htmlnamePop(ctxt);
828 if (oldname != NULL) {
829 #ifdef DEBUG
830 xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
831 #endif
832 xmlFree(oldname);
839 * htmlAutoCloseTag:
840 * @doc: the HTML document
841 * @name: The tag name
842 * @elem: the HTML element
844 * The HTML DTD allows a tag to implicitly close other tags.
845 * The list is kept in htmlStartClose array. This function checks
846 * if the element or one of it's children would autoclose the
847 * given tag.
849 * Returns 1 if autoclose, 0 otherwise
852 htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
853 htmlNodePtr child;
855 if (elem == NULL) return(1);
856 if (xmlStrEqual(name, elem->name)) return(0);
857 if (htmlCheckAutoClose(elem->name, name)) return(1);
858 child = elem->children;
859 while (child != NULL) {
860 if (htmlAutoCloseTag(doc, name, child)) return(1);
861 child = child->next;
863 return(0);
867 * htmlIsAutoClosed:
868 * @doc: the HTML document
869 * @elem: the HTML element
871 * The HTML DTD allows a tag to implicitly close other tags.
872 * The list is kept in htmlStartClose array. This function checks
873 * if a tag is autoclosed by one of it's child
875 * Returns 1 if autoclosed, 0 otherwise
878 htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
879 htmlNodePtr child;
881 if (elem == NULL) return(1);
882 child = elem->children;
883 while (child != NULL) {
884 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
885 child = child->next;
887 return(0);
891 * htmlCheckImplied:
892 * @ctxt: an HTML parser context
893 * @newtag: The new tag name
895 * The HTML DTD allows a tag to exists only implicitly
896 * called when a new tag has been detected and generates the
897 * appropriates implicit tags if missing
899 static void
900 htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
901 if (!htmlOmittedDefaultValue)
902 return;
903 if (xmlStrEqual(newtag, BAD_CAST"html"))
904 return;
905 if (ctxt->nameNr <= 0) {
906 #ifdef DEBUG
907 xmlGenericError(xmlGenericErrorContext,"Implied element html: pushed html\n");
908 #endif
909 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"html"));
910 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
911 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
913 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
914 return;
915 if ((ctxt->nameNr <= 1) &&
916 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
917 (xmlStrEqual(newtag, BAD_CAST"style")) ||
918 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
919 (xmlStrEqual(newtag, BAD_CAST"link")) ||
920 (xmlStrEqual(newtag, BAD_CAST"title")) ||
921 (xmlStrEqual(newtag, BAD_CAST"base")))) {
923 * dropped OBJECT ... i you put it first BODY will be
924 * assumed !
926 #ifdef DEBUG
927 xmlGenericError(xmlGenericErrorContext,"Implied element head: pushed head\n");
928 #endif
929 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
930 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
931 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
932 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
933 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
934 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
935 int i;
936 for (i = 0;i < ctxt->nameNr;i++) {
937 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
938 return;
940 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
941 return;
945 #ifdef DEBUG
946 xmlGenericError(xmlGenericErrorContext,"Implied element body: pushed body\n");
947 #endif
948 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
949 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
950 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
955 * htmlCheckParagraph
956 * @ctxt: an HTML parser context
958 * Check whether a p element need to be implied before inserting
959 * characters in the current element.
961 * Returns 1 if a paragraph has been inserted, 0 if not and -1
962 * in case of error.
965 static int
966 htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
967 const xmlChar *tag;
968 int i;
970 if (ctxt == NULL)
971 return(-1);
972 tag = ctxt->name;
973 if (tag == NULL) {
974 htmlAutoClose(ctxt, BAD_CAST"p");
975 htmlCheckImplied(ctxt, BAD_CAST"p");
976 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
977 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
978 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
979 return(1);
981 if (!htmlOmittedDefaultValue)
982 return(0);
983 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
984 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
985 #ifdef DEBUG
986 xmlGenericError(xmlGenericErrorContext,"Implied element paragraph\n");
987 #endif
988 htmlAutoClose(ctxt, BAD_CAST"p");
989 htmlCheckImplied(ctxt, BAD_CAST"p");
990 htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
991 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
992 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
993 return(1);
996 return(0);
1000 * htmlIsScriptAttribute:
1001 * @name: an attribute name
1003 * Check if an attribute is of content type Script
1005 * Returns 1 is the attribute is a script 0 otherwise
1008 htmlIsScriptAttribute(const xmlChar *name) {
1009 unsigned int i;
1011 if (name == NULL)
1012 return(0);
1014 * all script attributes start with 'on'
1016 if ((name[0] != 'o') || (name[1] != 'n'))
1017 return(0);
1018 for (i = 0;
1019 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1020 i++) {
1021 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1022 return(1);
1024 return(0);
1027 /************************************************************************
1029 * The list of HTML predefined entities *
1031 ************************************************************************/
1034 static const htmlEntityDesc html40EntitiesTable[] = {
1036 * the 4 absolute ones, plus apostrophe.
1038 { 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1039 { 38, "amp", "ampersand, U+0026 ISOnum" },
1040 { 39, "apos", "single quote" },
1041 { 60, "lt", "less-than sign, U+003C ISOnum" },
1042 { 62, "gt", "greater-than sign, U+003E ISOnum" },
1045 * A bunch still in the 128-255 range
1046 * Replacing them depend really on the charset used.
1048 { 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1049 { 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1050 { 162, "cent", "cent sign, U+00A2 ISOnum" },
1051 { 163, "pound","pound sign, U+00A3 ISOnum" },
1052 { 164, "curren","currency sign, U+00A4 ISOnum" },
1053 { 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1054 { 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1055 { 167, "sect", "section sign, U+00A7 ISOnum" },
1056 { 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1057 { 169, "copy", "copyright sign, U+00A9 ISOnum" },
1058 { 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1059 { 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1060 { 172, "not", "not sign, U+00AC ISOnum" },
1061 { 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1062 { 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1063 { 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1064 { 176, "deg", "degree sign, U+00B0 ISOnum" },
1065 { 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1066 { 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1067 { 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1068 { 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1069 { 181, "micro","micro sign, U+00B5 ISOnum" },
1070 { 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1071 { 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1072 { 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1073 { 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1074 { 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1075 { 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1076 { 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1077 { 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1078 { 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1079 { 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1080 { 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1081 { 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1082 { 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1083 { 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1084 { 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1085 { 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1086 { 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1087 { 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1088 { 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1089 { 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1090 { 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1091 { 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1092 { 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1093 { 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1094 { 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1095 { 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1096 { 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1097 { 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1098 { 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1099 { 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1100 { 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1101 { 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1102 { 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1103 { 215, "times","multiplication sign, U+00D7 ISOnum" },
1104 { 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1105 { 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1106 { 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1107 { 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1108 { 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1109 { 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1110 { 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1111 { 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1112 { 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1113 { 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1114 { 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1115 { 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1116 { 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1117 { 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1118 { 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1119 { 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1120 { 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1121 { 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1122 { 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1123 { 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1124 { 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1125 { 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1126 { 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1127 { 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1128 { 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1129 { 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1130 { 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1131 { 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1132 { 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1133 { 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1134 { 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1135 { 247, "divide","division sign, U+00F7 ISOnum" },
1136 { 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1137 { 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1138 { 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1139 { 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1140 { 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1141 { 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1142 { 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1143 { 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1145 { 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1146 { 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1147 { 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1148 { 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1149 { 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1152 * Anything below should really be kept as entities references
1154 { 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1156 { 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1157 { 732, "tilde","small tilde, U+02DC ISOdia" },
1159 { 913, "Alpha","greek capital letter alpha, U+0391" },
1160 { 914, "Beta", "greek capital letter beta, U+0392" },
1161 { 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1162 { 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1163 { 917, "Epsilon","greek capital letter epsilon, U+0395" },
1164 { 918, "Zeta", "greek capital letter zeta, U+0396" },
1165 { 919, "Eta", "greek capital letter eta, U+0397" },
1166 { 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1167 { 921, "Iota", "greek capital letter iota, U+0399" },
1168 { 922, "Kappa","greek capital letter kappa, U+039A" },
1169 { 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
1170 { 924, "Mu", "greek capital letter mu, U+039C" },
1171 { 925, "Nu", "greek capital letter nu, U+039D" },
1172 { 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1173 { 927, "Omicron","greek capital letter omicron, U+039F" },
1174 { 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1175 { 929, "Rho", "greek capital letter rho, U+03A1" },
1176 { 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1177 { 932, "Tau", "greek capital letter tau, U+03A4" },
1178 { 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1179 { 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1180 { 935, "Chi", "greek capital letter chi, U+03A7" },
1181 { 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1182 { 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1184 { 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1185 { 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1186 { 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1187 { 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1188 { 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1189 { 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1190 { 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1191 { 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1192 { 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1193 { 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1194 { 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1195 { 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1196 { 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1197 { 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1198 { 959, "omicron","greek small letter omicron, U+03BF NEW" },
1199 { 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1200 { 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1201 { 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1202 { 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1203 { 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1204 { 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1205 { 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1206 { 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1207 { 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1208 { 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1209 { 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1210 { 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1211 { 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1213 { 8194, "ensp", "en space, U+2002 ISOpub" },
1214 { 8195, "emsp", "em space, U+2003 ISOpub" },
1215 { 8201, "thinsp","thin space, U+2009 ISOpub" },
1216 { 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1217 { 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1218 { 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1219 { 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1220 { 8211, "ndash","en dash, U+2013 ISOpub" },
1221 { 8212, "mdash","em dash, U+2014 ISOpub" },
1222 { 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1223 { 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1224 { 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1225 { 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1226 { 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1227 { 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1228 { 8224, "dagger","dagger, U+2020 ISOpub" },
1229 { 8225, "Dagger","double dagger, U+2021 ISOpub" },
1231 { 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1232 { 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1234 { 8240, "permil","per mille sign, U+2030 ISOtech" },
1236 { 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1237 { 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1239 { 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1240 { 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1242 { 8254, "oline","overline = spacing overscore, U+203E NEW" },
1243 { 8260, "frasl","fraction slash, U+2044 NEW" },
1245 { 8364, "euro", "euro sign, U+20AC NEW" },
1247 { 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1248 { 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1249 { 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1250 { 8482, "trade","trade mark sign, U+2122 ISOnum" },
1251 { 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1252 { 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1253 { 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1254 { 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1255 { 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1256 { 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1257 { 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1258 { 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1259 { 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1260 { 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1261 { 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1262 { 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1264 { 8704, "forall","for all, U+2200 ISOtech" },
1265 { 8706, "part", "partial differential, U+2202 ISOtech" },
1266 { 8707, "exist","there exists, U+2203 ISOtech" },
1267 { 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1268 { 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1269 { 8712, "isin", "element of, U+2208 ISOtech" },
1270 { 8713, "notin","not an element of, U+2209 ISOtech" },
1271 { 8715, "ni", "contains as member, U+220B ISOtech" },
1272 { 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
1273 { 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
1274 { 8722, "minus","minus sign, U+2212 ISOtech" },
1275 { 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1276 { 8730, "radic","square root = radical sign, U+221A ISOtech" },
1277 { 8733, "prop", "proportional to, U+221D ISOtech" },
1278 { 8734, "infin","infinity, U+221E ISOtech" },
1279 { 8736, "ang", "angle, U+2220 ISOamso" },
1280 { 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1281 { 8744, "or", "logical or = vee, U+2228 ISOtech" },
1282 { 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1283 { 8746, "cup", "union = cup, U+222A ISOtech" },
1284 { 8747, "int", "integral, U+222B ISOtech" },
1285 { 8756, "there4","therefore, U+2234 ISOtech" },
1286 { 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1287 { 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1288 { 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1289 { 8800, "ne", "not equal to, U+2260 ISOtech" },
1290 { 8801, "equiv","identical to, U+2261 ISOtech" },
1291 { 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1292 { 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1293 { 8834, "sub", "subset of, U+2282 ISOtech" },
1294 { 8835, "sup", "superset of, U+2283 ISOtech" },
1295 { 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1296 { 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1297 { 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1298 { 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1299 { 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1300 { 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1301 { 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1302 { 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1303 { 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1304 { 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1305 { 8971, "rfloor","right floor, U+230B ISOamsc" },
1306 { 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1307 { 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1308 { 9674, "loz", "lozenge, U+25CA ISOpub" },
1310 { 9824, "spades","black spade suit, U+2660 ISOpub" },
1311 { 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1312 { 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1313 { 9830, "diams","black diamond suit, U+2666 ISOpub" },
1317 /************************************************************************
1319 * Commodity functions to handle entities *
1321 ************************************************************************/
/*
 * Macro used to grow the current buffer.
 *
 * Expects two variables in the expanding scope: `buffer` (an xmlChar *)
 * and `buffer_size` (its capacity, doubled here).  On allocation failure
 * the macro reports the error and executes `return(NULL)`, so it can only
 * be used inside functions returning a pointer.
 *
 * The result of xmlRealloc() is stored in a temporary first: assigning it
 * straight to `buffer` would lose the only reference to the old block
 * when the reallocation fails.  The old block is released before
 * returning -- this assumes `buffer` is owned exclusively by the
 * expanding function (TODO confirm at every call site).
 */
#define growBuffer(buffer) {						\
    xmlChar *buffer##_grown;						\
    buffer##_size *= 2;							\
    buffer##_grown = (xmlChar *) xmlRealloc(buffer,			\
                              buffer##_size * sizeof(xmlChar));	\
    if (buffer##_grown == NULL) {					\
	xmlGenericError(xmlGenericErrorContext, "realloc failed\n");	\
	xmlFree(buffer);						\
	buffer = NULL;							\
	return(NULL);							\
    }									\
    buffer = buffer##_grown;						\
}
1336 * htmlEntityLookup:
1337 * @name: the entity name
1339 * Lookup the given entity in EntitiesTable
1341 * TODO: the linear scan is really ugly, an hash table is really needed.
1343 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1345 const htmlEntityDesc *
1346 htmlEntityLookup(const xmlChar *name) {
1347 unsigned int i;
1349 for (i = 0;i < (sizeof(html40EntitiesTable)/
1350 sizeof(html40EntitiesTable[0]));i++) {
1351 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1352 #ifdef DEBUG
1353 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", name);
1354 #endif
1355 return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
1358 return(NULL);
1362 * htmlEntityValueLookup:
1363 * @value: the entity's unicode value
1365 * Lookup the given entity in EntitiesTable
1367 * TODO: the linear scan is really ugly, an hash table is really needed.
1369 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1371 const htmlEntityDesc *
1372 htmlEntityValueLookup(unsigned int value) {
1373 unsigned int i;
1374 #ifdef DEBUG
1375 unsigned int lv = 0;
1376 #endif
1378 for (i = 0;i < (sizeof(html40EntitiesTable)/
1379 sizeof(html40EntitiesTable[0]));i++) {
1380 if (html40EntitiesTable[i].value >= value) {
1381 if (html40EntitiesTable[i].value > value)
1382 break;
1383 #ifdef DEBUG
1384 xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", html40EntitiesTable[i].name);
1385 #endif
1386 return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
1388 #ifdef DEBUG
1389 if (lv > html40EntitiesTable[i].value) {
1390 xmlGenericError(xmlGenericErrorContext,
1391 "html40EntitiesTable[] is not sorted (%d > %d)!\n",
1392 lv, html40EntitiesTable[i].value);
1394 lv = html40EntitiesTable[i].value;
1395 #endif
1397 return(NULL);
1401 * UTF8ToHtml:
1402 * @out: a pointer to an array of bytes to store the result
1403 * @outlen: the length of @out
1404 * @in: a pointer to an array of UTF-8 chars
1405 * @inlen: the length of @in
1407 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1408 * plus HTML entities block of chars out.
1410 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1411 * The value of @inlen after return is the number of octets consumed
1412 * as the return value is positive, else unpredictable.
1413 * The value of @outlen after return is the number of octets consumed.
1416 UTF8ToHtml(unsigned char* out, int *outlen,
1417 const unsigned char* in, int *inlen) {
1418 const unsigned char* processed = in;
1419 const unsigned char* outend;
1420 const unsigned char* outstart = out;
1421 const unsigned char* instart = in;
1422 const unsigned char* inend;
1423 unsigned int c, d;
1424 int trailing;
1426 if (in == NULL) {
1428 * initialization nothing to do
1430 *outlen = 0;
1431 *inlen = 0;
1432 return(0);
1434 inend = in + (*inlen);
1435 outend = out + (*outlen);
1436 while (in < inend) {
1437 d = *in++;
1438 if (d < 0x80) { c= d; trailing= 0; }
1439 else if (d < 0xC0) {
1440 /* trailing byte in leading position */
1441 *outlen = out - outstart;
1442 *inlen = processed - instart;
1443 return(-2);
1444 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1445 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1446 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1447 else {
1448 /* no chance for this in Ascii */
1449 *outlen = out - outstart;
1450 *inlen = processed - instart;
1451 return(-2);
1454 if (inend - in < trailing) {
1455 break;
1458 for ( ; trailing; trailing--) {
1459 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1460 break;
1461 c <<= 6;
1462 c |= d & 0x3F;
1465 /* assertion: c is a single UTF-4 value */
1466 if (c < 0x80) {
1467 if (out + 1 >= outend)
1468 break;
1469 *out++ = c;
1470 } else {
1471 int len;
1472 const htmlEntityDesc * ent;
1475 * Try to lookup a predefined HTML entity for it
1478 ent = htmlEntityValueLookup(c);
1479 if (ent == NULL) {
1480 /* no chance for this in Ascii */
1481 *outlen = out - outstart;
1482 *inlen = processed - instart;
1483 return(-2);
1485 len = strlen(ent->name);
1486 if (out + 2 + len >= outend)
1487 break;
1488 *out++ = '&';
1489 memcpy(out, ent->name, len);
1490 out += len;
1491 *out++ = ';';
1493 processed = in;
1495 *outlen = out - outstart;
1496 *inlen = processed - instart;
1497 return(0);
1501 * htmlEncodeEntities:
1502 * @out: a pointer to an array of bytes to store the result
1503 * @outlen: the length of @out
1504 * @in: a pointer to an array of UTF-8 chars
1505 * @inlen: the length of @in
1506 * @quoteChar: the quote character to escape (' or ") or zero.
1508 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1509 * plus HTML entities block of chars out.
1511 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1512 * The value of @inlen after return is the number of octets consumed
1513 * as the return value is positive, else unpredictable.
1514 * The value of @outlen after return is the number of octets consumed.
1517 htmlEncodeEntities(unsigned char* out, int *outlen,
1518 const unsigned char* in, int *inlen, int quoteChar) {
1519 const unsigned char* processed = in;
1520 const unsigned char* outend = out + (*outlen);
1521 const unsigned char* outstart = out;
1522 const unsigned char* instart = in;
1523 const unsigned char* inend = in + (*inlen);
1524 unsigned int c, d;
1525 int trailing;
1527 while (in < inend) {
1528 d = *in++;
1529 if (d < 0x80) { c= d; trailing= 0; }
1530 else if (d < 0xC0) {
1531 /* trailing byte in leading position */
1532 *outlen = out - outstart;
1533 *inlen = processed - instart;
1534 return(-2);
1535 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1536 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1537 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1538 else {
1539 /* no chance for this in Ascii */
1540 *outlen = out - outstart;
1541 *inlen = processed - instart;
1542 return(-2);
1545 if (inend - in < trailing)
1546 break;
1548 while (trailing--) {
1549 if (((d= *in++) & 0xC0) != 0x80) {
1550 *outlen = out - outstart;
1551 *inlen = processed - instart;
1552 return(-2);
1554 c <<= 6;
1555 c |= d & 0x3F;
1558 /* assertion: c is a single UTF-4 value */
1559 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
1560 (c != '&') && (c != '<') && (c != '>')) {
1561 if (out >= outend)
1562 break;
1563 *out++ = c;
1564 } else {
1565 const htmlEntityDesc * ent;
1566 const char *cp;
1567 char nbuf[16];
1568 int len;
1571 * Try to lookup a predefined HTML entity for it
1573 ent = htmlEntityValueLookup(c);
1574 if (ent == NULL) {
1575 snprintf(nbuf, sizeof(nbuf), "#%u", c);
1576 cp = nbuf;
1578 else
1579 cp = ent->name;
1580 len = strlen(cp);
1581 if (out + 2 + len > outend)
1582 break;
1583 *out++ = '&';
1584 memcpy(out, cp, len);
1585 out += len;
1586 *out++ = ';';
1588 processed = in;
1590 *outlen = out - outstart;
1591 *inlen = processed - instart;
1592 return(0);
1596 * htmlDecodeEntities:
1597 * @ctxt: the parser context
1598 * @len: the len to decode (in bytes !), -1 for no size limit
1599 * @end: an end marker xmlChar, 0 if none
1600 * @end2: an end marker xmlChar, 0 if none
1601 * @end3: an end marker xmlChar, 0 if none
1603 * Substitute the HTML entities by their value
1605 * DEPRECATED !!!!
1607 * Returns A newly allocated string with the substitution done. The caller
1608 * must deallocate it !
1610 xmlChar *
1611 htmlDecodeEntities(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED, int len ATTRIBUTE_UNUSED,
1612 xmlChar end ATTRIBUTE_UNUSED, xmlChar end2 ATTRIBUTE_UNUSED, xmlChar end3 ATTRIBUTE_UNUSED) {
1613 static int deprecated = 0;
1614 if (!deprecated) {
1615 xmlGenericError(xmlGenericErrorContext,
1616 "htmlDecodeEntities() deprecated function reached\n");
1617 deprecated = 1;
1619 return(NULL);
1622 /************************************************************************
1624 * Commodity functions to handle streams *
1626 ************************************************************************/
1629 * htmlNewInputStream:
1630 * @ctxt: an HTML parser context
1632 * Create a new input stream structure
1633 * Returns the new input stream or NULL
1635 static htmlParserInputPtr
1636 htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1637 htmlParserInputPtr input;
1639 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1640 if (input == NULL) {
1641 ctxt->errNo = XML_ERR_NO_MEMORY;
1642 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1643 ctxt->sax->error(ctxt->userData,
1644 "malloc: couldn't allocate a new input stream\n");
1645 return(NULL);
1647 memset(input, 0, sizeof(htmlParserInput));
1648 input->filename = NULL;
1649 input->directory = NULL;
1650 input->base = NULL;
1651 input->cur = NULL;
1652 input->buf = NULL;
1653 input->line = 1;
1654 input->col = 1;
1655 input->buf = NULL;
1656 input->free = NULL;
1657 input->version = NULL;
1658 input->consumed = 0;
1659 input->length = 0;
1660 return(input);
1664 /************************************************************************
1666 * Commodity functions, cleanup needed ? *
1668 ************************************************************************/
/*
 * all tags allowing pc data from the html 4.01 loose dtd
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array but I don't want to risk any
 *       binary incompatibility
 *
 * Both the strings and the pointer table itself are read-only.
 */
static const char * const allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
    "blockquote", "body", "button", "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
    "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
    "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
    "small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
};
1685 * areBlanks:
1686 * @ctxt: an HTML parser context
1687 * @str: a xmlChar *
1688 * @len: the size of @str
1690 * Is this a sequence of blank chars that one can ignore ?
1692 * Returns 1 if ignorable 0 otherwise.
1695 static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
1696 unsigned int i;
1697 int j;
1698 xmlNodePtr lastChild;
1700 for (j = 0;j < len;j++)
1701 if (!(IS_BLANK(str[j]))) return(0);
1703 if (CUR == 0) return(1);
1704 if (CUR != '<') return(0);
1705 if (ctxt->name == NULL)
1706 return(1);
1707 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
1708 return(1);
1709 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
1710 return(1);
1711 if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
1712 return(1);
1713 if (ctxt->node == NULL) return(0);
1714 lastChild = xmlGetLastChild(ctxt->node);
1715 if (lastChild == NULL) {
1716 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
1717 (ctxt->node->content != NULL)) return(0);
1718 /* keep ws in constructs like ...<b> </b>...
1719 for all tags "b" allowing PCDATA */
1720 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
1721 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
1722 return(0);
1725 } else if (xmlNodeIsText(lastChild)) {
1726 return(0);
1727 } else {
1728 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
1729 for all tags "p" allowing PCDATA */
1730 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
1731 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
1732 return(0);
1736 return(1);
1740 * htmlNewDocNoDtD:
1741 * @URI: URI for the dtd, or NULL
1742 * @ExternalID: the external ID of the DTD, or NULL
1744 * Creates a new HTML document without a DTD node if @URI and @ExternalID
1745 * are NULL
1747 * Returns a new document, do not initialize the DTD if not provided
1749 htmlDocPtr
1750 htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
1751 xmlDocPtr cur;
1754 * Allocate a new document and fill the fields.
1756 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
1757 if (cur == NULL) {
1758 xmlGenericError(xmlGenericErrorContext,
1759 "htmlNewDocNoDtD : malloc failed\n");
1760 return(NULL);
1762 memset(cur, 0, sizeof(xmlDoc));
1764 cur->type = XML_HTML_DOCUMENT_NODE;
1765 cur->version = NULL;
1766 cur->intSubset = NULL;
1767 cur->doc = cur;
1768 cur->name = NULL;
1769 cur->children = NULL;
1770 cur->extSubset = NULL;
1771 cur->oldNs = NULL;
1772 cur->encoding = NULL;
1773 cur->standalone = 1;
1774 cur->compression = 0;
1775 cur->ids = NULL;
1776 cur->refs = NULL;
1777 cur->_private = NULL;
1778 if ((ExternalID != NULL) ||
1779 (URI != NULL))
1780 xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
1781 return(cur);
1785 * htmlNewDoc:
1786 * @URI: URI for the dtd, or NULL
1787 * @ExternalID: the external ID of the DTD, or NULL
1789 * Creates a new HTML document
1791 * Returns a new document
1793 htmlDocPtr
1794 htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
1795 if ((URI == NULL) && (ExternalID == NULL))
1796 return(htmlNewDocNoDtD(
1797 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
1798 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
1800 return(htmlNewDocNoDtD(URI, ExternalID));
1804 /************************************************************************
1806 * The parser itself *
1807 * Relates to http://www.w3.org/TR/html40 *
1809 ************************************************************************/
1811 /************************************************************************
1813 * The parser itself *
1815 ************************************************************************/
1818 * htmlParseHTMLName:
1819 * @ctxt: an HTML parser context
1821 * parse an HTML tag or attribute name, note that we convert it to lowercase
1822 * since HTML names are not case-sensitive.
1824 * Returns the Tag Name parsed or NULL
1827 static xmlChar *
1828 htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
1829 xmlChar *ret = NULL;
1830 int i = 0;
1831 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
1833 if (!IS_LETTER(CUR) && (CUR != '_') &&
1834 (CUR != ':')) return(NULL);
1836 while ((i < HTML_PARSER_BUFFER_SIZE) &&
1837 ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1838 (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
1839 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
1840 else loc[i] = CUR;
1841 i++;
1843 NEXT;
1846 ret = xmlStrndup(loc, i);
1848 return(ret);
1852 * htmlParseName:
1853 * @ctxt: an HTML parser context
1855 * parse an HTML name, this routine is case sensitive.
1857 * Returns the Name parsed or NULL
1860 static xmlChar *
1861 htmlParseName(htmlParserCtxtPtr ctxt) {
1862 xmlChar buf[HTML_MAX_NAMELEN];
1863 int len = 0;
1865 GROW;
1866 if (!IS_LETTER(CUR) && (CUR != '_')) {
1867 return(NULL);
1870 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1871 (CUR == '.') || (CUR == '-') ||
1872 (CUR == '_') || (CUR == ':') ||
1873 (IS_COMBINING(CUR)) ||
1874 (IS_EXTENDER(CUR))) {
1875 buf[len++] = CUR;
1876 NEXT;
1877 if (len >= HTML_MAX_NAMELEN) {
1878 xmlGenericError(xmlGenericErrorContext,
1879 "htmlParseName: reached HTML_MAX_NAMELEN limit\n");
1880 while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1881 (CUR == '.') || (CUR == '-') ||
1882 (CUR == '_') || (CUR == ':') ||
1883 (IS_COMBINING(CUR)) ||
1884 (IS_EXTENDER(CUR)))
1885 NEXT;
1886 break;
1889 return(xmlStrndup(buf, len));
1893 * htmlParseHTMLAttribute:
1894 * @ctxt: an HTML parser context
1895 * @stop: a char stop value
1897 * parse an HTML attribute value till the stop (quote), if
1898 * stop is 0 then it stops at the first space
1900 * Returns the attribute parsed or NULL
1903 static xmlChar *
1904 htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
1905 xmlChar *buffer = NULL;
1906 int buffer_size = 0;
1907 xmlChar *out = NULL;
1908 xmlChar *name = NULL;
1910 xmlChar *cur = NULL;
1911 const htmlEntityDesc * ent;
1914 * allocate a translation buffer.
1916 buffer_size = HTML_PARSER_BUFFER_SIZE;
1917 buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1918 if (buffer == NULL) {
1919 xmlGenericError(xmlGenericErrorContext,
1920 "htmlParseHTMLAttribute: malloc failed\n");
1921 return(NULL);
1923 out = buffer;
1926 * Ok loop until we reach one of the ending chars
1928 while ((CUR != 0) && (CUR != stop)) {
1929 if ((stop == 0) && (CUR == '>')) break;
1930 if ((stop == 0) && (IS_BLANK(CUR))) break;
1931 if (CUR == '&') {
1932 if (NXT(1) == '#') {
1933 unsigned int c;
1934 int bits;
1936 c = htmlParseCharRef(ctxt);
1937 if (c < 0x80)
1938 { *out++ = c; bits= -6; }
1939 else if (c < 0x800)
1940 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
1941 else if (c < 0x10000)
1942 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
1943 else
1944 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
1946 for ( ; bits >= 0; bits-= 6) {
1947 *out++ = ((c >> bits) & 0x3F) | 0x80;
1950 if (out - buffer > buffer_size - 100) {
1951 int indx = out - buffer;
1953 growBuffer(buffer);
1954 out = &buffer[indx];
1956 } else {
1957 ent = htmlParseEntityRef(ctxt, &name);
1958 if (name == NULL) {
1959 *out++ = '&';
1960 if (out - buffer > buffer_size - 100) {
1961 int indx = out - buffer;
1963 growBuffer(buffer);
1964 out = &buffer[indx];
1966 } else if (ent == NULL) {
1967 *out++ = '&';
1968 cur = name;
1969 while (*cur != 0) {
1970 if (out - buffer > buffer_size - 100) {
1971 int indx = out - buffer;
1973 growBuffer(buffer);
1974 out = &buffer[indx];
1976 *out++ = *cur++;
1978 xmlFree(name);
1979 } else {
1980 unsigned int c;
1981 int bits;
1983 if (out - buffer > buffer_size - 100) {
1984 int indx = out - buffer;
1986 growBuffer(buffer);
1987 out = &buffer[indx];
1989 c = (xmlChar)ent->value;
1990 if (c < 0x80)
1991 { *out++ = c; bits= -6; }
1992 else if (c < 0x800)
1993 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
1994 else if (c < 0x10000)
1995 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
1996 else
1997 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
1999 for ( ; bits >= 0; bits-= 6) {
2000 *out++ = ((c >> bits) & 0x3F) | 0x80;
2002 xmlFree(name);
2005 } else {
2006 unsigned int c;
2007 int bits, l;
2009 if (out - buffer > buffer_size - 100) {
2010 int indx = out - buffer;
2012 growBuffer(buffer);
2013 out = &buffer[indx];
2015 c = CUR_CHAR(l);
2016 if (c < 0x80)
2017 { *out++ = c; bits= -6; }
2018 else if (c < 0x800)
2019 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2020 else if (c < 0x10000)
2021 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2022 else
2023 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2025 for ( ; bits >= 0; bits-= 6) {
2026 *out++ = ((c >> bits) & 0x3F) | 0x80;
2028 NEXT;
2031 *out++ = 0;
2032 return(buffer);
2036 * htmlParseEntityRef:
2037 * @ctxt: an HTML parser context
2038 * @str: location to store the entity name
2040 * parse an HTML ENTITY references
2042 * [68] EntityRef ::= '&' Name ';'
2044 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2045 * if non-NULL *str will have to be freed by the caller.
2047 const htmlEntityDesc *
2048 htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
2049 xmlChar *name;
2050 const htmlEntityDesc * ent = NULL;
2051 *str = NULL;
2053 if (CUR == '&') {
2054 NEXT;
2055 name = htmlParseName(ctxt);
2056 if (name == NULL) {
2057 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2058 ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
2059 ctxt->wellFormed = 0;
2060 } else {
2061 GROW;
2062 if (CUR == ';') {
2063 *str = name;
2066 * Lookup the entity in the table.
2068 ent = htmlEntityLookup(name);
2069 if (ent != NULL) /* OK that's ugly !!! */
2070 NEXT;
2071 } else {
2072 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2073 ctxt->sax->error(ctxt->userData,
2074 "htmlParseEntityRef: expecting ';'\n");
2075 *str = name;
2079 return(ent);
2083 * htmlParseAttValue:
2084 * @ctxt: an HTML parser context
2086 * parse a value for an attribute
2087 * Note: the parser won't do substitution of entities here, this
2088 * will be handled later in xmlStringGetNodeList, unless it was
2089 * asked for ctxt->replaceEntities != 0
2091 * Returns the AttValue parsed or NULL.
2094 static xmlChar *
2095 htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2096 xmlChar *ret = NULL;
2098 if (CUR == '"') {
2099 NEXT;
2100 ret = htmlParseHTMLAttribute(ctxt, '"');
2101 if (CUR != '"') {
2102 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2103 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2104 ctxt->wellFormed = 0;
2105 } else
2106 NEXT;
2107 } else if (CUR == '\'') {
2108 NEXT;
2109 ret = htmlParseHTMLAttribute(ctxt, '\'');
2110 if (CUR != '\'') {
2111 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2112 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2113 ctxt->wellFormed = 0;
2114 } else
2115 NEXT;
2116 } else {
2118 * That's an HTMLism, the attribute value may not be quoted
2120 ret = htmlParseHTMLAttribute(ctxt, 0);
2121 if (ret == NULL) {
2122 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2123 ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
2124 ctxt->wellFormed = 0;
2127 return(ret);
2131 * htmlParseSystemLiteral:
2132 * @ctxt: an HTML parser context
2134 * parse an HTML Literal
2136 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2138 * Returns the SystemLiteral parsed or NULL
2141 static xmlChar *
2142 htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2143 const xmlChar *q;
2144 xmlChar *ret = NULL;
2146 if (CUR == '"') {
2147 NEXT;
2148 q = CUR_PTR;
2149 while ((IS_CHAR(CUR)) && (CUR != '"'))
2150 NEXT;
2151 if (!IS_CHAR(CUR)) {
2152 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2153 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2154 ctxt->wellFormed = 0;
2155 } else {
2156 ret = xmlStrndup(q, CUR_PTR - q);
2157 NEXT;
2159 } else if (CUR == '\'') {
2160 NEXT;
2161 q = CUR_PTR;
2162 while ((IS_CHAR(CUR)) && (CUR != '\''))
2163 NEXT;
2164 if (!IS_CHAR(CUR)) {
2165 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2166 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2167 ctxt->wellFormed = 0;
2168 } else {
2169 ret = xmlStrndup(q, CUR_PTR - q);
2170 NEXT;
2172 } else {
2173 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2174 ctxt->sax->error(ctxt->userData,
2175 "SystemLiteral \" or ' expected\n");
2176 ctxt->wellFormed = 0;
2179 return(ret);
2183 * htmlParsePubidLiteral:
2184 * @ctxt: an HTML parser context
2186 * parse an HTML public literal
2188 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2190 * Returns the PubidLiteral parsed or NULL.
2193 static xmlChar *
2194 htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2195 const xmlChar *q;
2196 xmlChar *ret = NULL;
2198 * Name ::= (Letter | '_') (NameChar)*
2200 if (CUR == '"') {
2201 NEXT;
2202 q = CUR_PTR;
2203 while (IS_PUBIDCHAR(CUR)) NEXT;
2204 if (CUR != '"') {
2205 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2206 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2207 ctxt->wellFormed = 0;
2208 } else {
2209 ret = xmlStrndup(q, CUR_PTR - q);
2210 NEXT;
2212 } else if (CUR == '\'') {
2213 NEXT;
2214 q = CUR_PTR;
2215 while ((IS_LETTER(CUR)) && (CUR != '\''))
2216 NEXT;
2217 if (!IS_LETTER(CUR)) {
2218 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2219 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2220 ctxt->wellFormed = 0;
2221 } else {
2222 ret = xmlStrndup(q, CUR_PTR - q);
2223 NEXT;
2225 } else {
2226 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2227 ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
2228 ctxt->wellFormed = 0;
2231 return(ret);
2235 * htmlParseScript:
2236 * @ctxt: an HTML parser context
2238 * parse the content of an HTML SCRIPT or STYLE element
2239 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2240 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2241 * http://www.w3.org/TR/html4/types.html#type-script
2242 * http://www.w3.org/TR/html4/types.html#h-6.15
2243 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2245 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2246 * element and the value of intrinsic event attributes. User agents must
2247 * not evaluate script data as HTML markup but instead must pass it on as
2248 * data to a script engine.
2249 * NOTES:
2250 * - The content is passed like CDATA
2251 * - the attributes for style and scripting "onXXX" are also described
2252 * as CDATA but SGML allows entities references in attributes so their
2253 * processing is identical as other attributes
2255 static void
2256 htmlParseScript(htmlParserCtxtPtr ctxt) {
2257 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
2258 int nbchar = 0;
2259 xmlChar cur;
2261 SHRINK;
2262 cur = CUR;
2263 while (IS_CHAR(cur)) {
2264 if ((cur == '<') && (NXT(1) == '!') && (NXT(2) == '-') &&
2265 (NXT(3) == '-')) {
2266 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2267 if (ctxt->sax->cdataBlock!= NULL) {
2269 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2271 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2274 nbchar = 0;
2275 htmlParseComment(ctxt);
2276 cur = CUR;
2277 continue;
2278 } else if ((cur == '<') && (NXT(1) == '/')) {
2280 * One should break here, the specification is clear:
2281 * Authors should therefore escape "</" within the content.
2282 * Escape mechanisms are specific to each scripting or
2283 * style sheet language.
2285 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2286 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2287 break; /* while */
2289 buf[nbchar++] = cur;
2290 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2291 if (ctxt->sax->cdataBlock!= NULL) {
2293 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2295 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2297 nbchar = 0;
2299 NEXT;
2300 cur = CUR;
2302 if (!(IS_CHAR(cur))) {
2303 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2304 ctxt->sax->error(ctxt->userData,
2305 "Invalid char in CDATA 0x%X\n", cur);
2306 ctxt->wellFormed = 0;
2307 NEXT;
2310 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2311 if (ctxt->sax->cdataBlock!= NULL) {
2313 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2315 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2322 * htmlParseCharData:
2323 * @ctxt: an HTML parser context
2325 * parse a CharData section.
2326 * if we are within a CDATA section ']]>' marks an end of section.
2328 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2331 static void
2332 htmlParseCharData(htmlParserCtxtPtr ctxt) {
2333 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2334 int nbchar = 0;
2335 int cur, l;
2337 SHRINK;
2338 cur = CUR_CHAR(l);
2339 while (((cur != '<') || (ctxt->token == '<')) &&
2340 ((cur != '&') || (ctxt->token == '&')) &&
2341 (IS_CHAR(cur))) {
2342 COPY_BUF(l,buf,nbchar,cur);
2343 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2345 * Ok the segment is to be consumed as chars.
2347 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2348 if (areBlanks(ctxt, buf, nbchar)) {
2349 if (ctxt->sax->ignorableWhitespace != NULL)
2350 ctxt->sax->ignorableWhitespace(ctxt->userData,
2351 buf, nbchar);
2352 } else {
2353 htmlCheckParagraph(ctxt);
2354 if (ctxt->sax->characters != NULL)
2355 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2358 nbchar = 0;
2360 NEXTL(l);
2361 cur = CUR_CHAR(l);
2363 if (nbchar != 0) {
2365 * Ok the segment is to be consumed as chars.
2367 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2368 if (areBlanks(ctxt, buf, nbchar)) {
2369 if (ctxt->sax->ignorableWhitespace != NULL)
2370 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2371 } else {
2372 htmlCheckParagraph(ctxt);
2373 if (ctxt->sax->characters != NULL)
2374 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2377 } else {
2379 * Loop detection
2381 if (cur == 0)
2382 ctxt->instate = XML_PARSER_EOF;
2387 * htmlParseExternalID:
2388 * @ctxt: an HTML parser context
2389 * @publicID: a xmlChar** receiving PubidLiteral
2391 * Parse an External ID or a Public ID
2393 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2394 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2396 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2398 * Returns the function returns SystemLiteral and in the second
2399 * case publicID receives PubidLiteral, is strict is off
2400 * it is possible to return NULL and have publicID set.
2403 static xmlChar *
2404 htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
2405 xmlChar *URI = NULL;
2407 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2408 (UPP(2) == 'S') && (UPP(3) == 'T') &&
2409 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2410 SKIP(6);
2411 if (!IS_BLANK(CUR)) {
2412 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2413 ctxt->sax->error(ctxt->userData,
2414 "Space required after 'SYSTEM'\n");
2415 ctxt->wellFormed = 0;
2417 SKIP_BLANKS;
2418 URI = htmlParseSystemLiteral(ctxt);
2419 if (URI == NULL) {
2420 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2421 ctxt->sax->error(ctxt->userData,
2422 "htmlParseExternalID: SYSTEM, no URI\n");
2423 ctxt->wellFormed = 0;
2425 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2426 (UPP(2) == 'B') && (UPP(3) == 'L') &&
2427 (UPP(4) == 'I') && (UPP(5) == 'C')) {
2428 SKIP(6);
2429 if (!IS_BLANK(CUR)) {
2430 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2431 ctxt->sax->error(ctxt->userData,
2432 "Space required after 'PUBLIC'\n");
2433 ctxt->wellFormed = 0;
2435 SKIP_BLANKS;
2436 *publicID = htmlParsePubidLiteral(ctxt);
2437 if (*publicID == NULL) {
2438 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2439 ctxt->sax->error(ctxt->userData,
2440 "htmlParseExternalID: PUBLIC, no Public Identifier\n");
2441 ctxt->wellFormed = 0;
2443 SKIP_BLANKS;
2444 if ((CUR == '"') || (CUR == '\'')) {
2445 URI = htmlParseSystemLiteral(ctxt);
2448 return(URI);
2452 * htmlParseComment:
2453 * @ctxt: an HTML parser context
2455 * Parse an XML (SGML) comment <!-- .... -->
2457 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2459 static void
2460 htmlParseComment(htmlParserCtxtPtr ctxt) {
2461 xmlChar *buf = NULL;
2462 int len;
2463 int size = HTML_PARSER_BUFFER_SIZE;
2464 int q, ql;
2465 int r, rl;
2466 int cur, l;
2467 xmlParserInputState state;
2470 * Check that there is a comment right here.
2472 if ((RAW != '<') || (NXT(1) != '!') ||
2473 (NXT(2) != '-') || (NXT(3) != '-')) return;
2475 state = ctxt->instate;
2476 ctxt->instate = XML_PARSER_COMMENT;
2477 SHRINK;
2478 SKIP(4);
2479 buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar));
2480 if (buf == NULL) {
2481 xmlGenericError(xmlGenericErrorContext,
2482 "malloc of %d byte failed\n", size);
2483 ctxt->instate = state;
2484 return;
2486 q = CUR_CHAR(ql);
2487 NEXTL(ql);
2488 r = CUR_CHAR(rl);
2489 NEXTL(rl);
2490 cur = CUR_CHAR(l);
2491 len = 0;
2492 while (IS_CHAR(cur) &&
2493 ((cur != '>') ||
2494 (r != '-') || (q != '-'))) {
2495 if (len + 5 >= size) {
2496 size *= 2;
2497 buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
2498 if (buf == NULL) {
2499 xmlGenericError(xmlGenericErrorContext,
2500 "realloc of %d byte failed\n", size);
2501 ctxt->instate = state;
2502 return;
2505 COPY_BUF(ql,buf,len,q);
2506 q = r;
2507 ql = rl;
2508 r = cur;
2509 rl = l;
2510 NEXTL(l);
2511 cur = CUR_CHAR(l);
2512 if (cur == 0) {
2513 SHRINK;
2514 GROW;
2515 cur = CUR_CHAR(l);
2518 buf[len] = 0;
2519 if (!IS_CHAR(cur)) {
2520 ctxt->errNo = XML_ERR_COMMENT_NOT_FINISHED;
2521 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2522 ctxt->sax->error(ctxt->userData,
2523 "Comment not terminated \n<!--%.50s\n", buf);
2524 ctxt->wellFormed = 0;
2525 xmlFree(buf);
2526 } else {
2527 NEXT;
2528 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
2529 (!ctxt->disableSAX))
2530 ctxt->sax->comment(ctxt->userData, buf);
2531 xmlFree(buf);
2533 ctxt->instate = state;
2537 * htmlParseCharRef:
2538 * @ctxt: an HTML parser context
2540 * parse Reference declarations
2542 * [66] CharRef ::= '&#' [0-9]+ ';' |
2543 * '&#x' [0-9a-fA-F]+ ';'
2545 * Returns the value parsed (as an int)
2548 htmlParseCharRef(htmlParserCtxtPtr ctxt) {
2549 int val = 0;
2551 if ((CUR == '&') && (NXT(1) == '#') &&
2552 (NXT(2) == 'x')) {
2553 SKIP(3);
2554 while (CUR != ';') {
2555 if ((CUR >= '0') && (CUR <= '9'))
2556 val = val * 16 + (CUR - '0');
2557 else if ((CUR >= 'a') && (CUR <= 'f'))
2558 val = val * 16 + (CUR - 'a') + 10;
2559 else if ((CUR >= 'A') && (CUR <= 'F'))
2560 val = val * 16 + (CUR - 'A') + 10;
2561 else {
2562 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2563 ctxt->sax->error(ctxt->userData,
2564 "htmlParseCharRef: invalid hexadecimal value\n");
2565 ctxt->wellFormed = 0;
2566 return(0);
2568 NEXT;
2570 if (CUR == ';')
2571 NEXT;
2572 } else if ((CUR == '&') && (NXT(1) == '#')) {
2573 SKIP(2);
2574 while (CUR != ';') {
2575 if ((CUR >= '0') && (CUR <= '9'))
2576 val = val * 10 + (CUR - '0');
2577 else {
2578 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2579 ctxt->sax->error(ctxt->userData,
2580 "htmlParseCharRef: invalid decimal value\n");
2581 ctxt->wellFormed = 0;
2582 return(0);
2584 NEXT;
2586 if (CUR == ';')
2587 NEXT;
2588 } else {
2589 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2590 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
2591 ctxt->wellFormed = 0;
2594 * Check the value IS_CHAR ...
2596 if (IS_CHAR(val)) {
2597 return(val);
2598 } else {
2599 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2600 ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
2601 val);
2602 ctxt->wellFormed = 0;
2604 return(0);
2609 * htmlParseDocTypeDecl :
2610 * @ctxt: an HTML parser context
2612 * parse a DOCTYPE declaration
2614 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
2615 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
2618 static void
2619 htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
2620 xmlChar *name;
2621 xmlChar *ExternalID = NULL;
2622 xmlChar *URI = NULL;
2625 * We know that '<!DOCTYPE' has been detected.
2627 SKIP(9);
2629 SKIP_BLANKS;
2632 * Parse the DOCTYPE name.
2634 name = htmlParseName(ctxt);
2635 if (name == NULL) {
2636 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2637 ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
2638 ctxt->wellFormed = 0;
2641 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
2644 SKIP_BLANKS;
2647 * Check for SystemID and ExternalID
2649 URI = htmlParseExternalID(ctxt, &ExternalID);
2650 SKIP_BLANKS;
2653 * We should be at the end of the DOCTYPE declaration.
2655 if (CUR != '>') {
2656 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2657 ctxt->sax->error(ctxt->userData, "DOCTYPE improperly terminated\n");
2658 ctxt->wellFormed = 0;
2659 /* We shouldn't try to resynchronize ... */
2661 NEXT;
2664 * Create or update the document accordingly to the DOCTYPE
2666 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
2667 (!ctxt->disableSAX))
2668 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
2671 * Cleanup, since we don't use all those identifiers
2673 if (URI != NULL) xmlFree(URI);
2674 if (ExternalID != NULL) xmlFree(ExternalID);
2675 if (name != NULL) xmlFree(name);
2679 * htmlParseAttribute:
2680 * @ctxt: an HTML parser context
2681 * @value: a xmlChar ** used to store the value of the attribute
2683 * parse an attribute
2685 * [41] Attribute ::= Name Eq AttValue
2687 * [25] Eq ::= S? '=' S?
2689 * With namespace:
2691 * [NS 11] Attribute ::= QName Eq AttValue
2693 * Also the case QName == xmlns:??? is handled independently as a namespace
2694 * definition.
2696 * Returns the attribute name, and the value in *value.
2699 static xmlChar *
2700 htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
2701 xmlChar *name, *val = NULL;
2703 *value = NULL;
2704 name = htmlParseHTMLName(ctxt);
2705 if (name == NULL) {
2706 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2707 ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
2708 ctxt->wellFormed = 0;
2709 return(NULL);
2713 * read the value
2715 SKIP_BLANKS;
2716 if (CUR == '=') {
2717 NEXT;
2718 SKIP_BLANKS;
2719 val = htmlParseAttValue(ctxt);
2720 /******
2721 } else {
2722 * TODO : some attribute must have values, some may not
2723 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2724 ctxt->sax->warning(ctxt->userData,
2725 "No value for attribute %s\n", name); */
2728 *value = val;
2729 return(name);
2733 * htmlCheckEncoding:
2734 * @ctxt: an HTML parser context
2735 * @attvalue: the attribute value
2737 * Checks an http-equiv attribute from a Meta tag to detect
2738 * the encoding
2739 * If a new encoding is detected the parser is switched to decode
2740 * it and pass UTF8
2742 static void
2743 htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
2744 const xmlChar *encoding;
2746 if ((ctxt == NULL) || (attvalue == NULL))
2747 return;
2749 /* do not change encoding */
2750 if (ctxt->input->encoding != NULL)
2751 return;
2753 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
2754 if (encoding != NULL) {
2755 encoding += 8;
2756 } else {
2757 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
2758 if (encoding != NULL)
2759 encoding += 9;
2761 if (encoding != NULL) {
2762 xmlCharEncoding enc;
2763 xmlCharEncodingHandlerPtr handler;
2765 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
2767 if (ctxt->input->encoding != NULL)
2768 xmlFree((xmlChar *) ctxt->input->encoding);
2769 ctxt->input->encoding = xmlStrdup(encoding);
2771 enc = xmlParseCharEncoding((const char *) encoding);
2773 * registered set of known encodings
2775 if (enc != XML_CHAR_ENCODING_ERROR) {
2776 xmlSwitchEncoding(ctxt, enc);
2777 ctxt->charset = XML_CHAR_ENCODING_UTF8;
2778 } else {
2780 * fallback for unknown encodings
2782 handler = xmlFindCharEncodingHandler((const char *) encoding);
2783 if (handler != NULL) {
2784 xmlSwitchToEncoding(ctxt, handler);
2785 ctxt->charset = XML_CHAR_ENCODING_UTF8;
2786 } else {
2787 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
2791 if ((ctxt->input->buf != NULL) &&
2792 (ctxt->input->buf->encoder != NULL) &&
2793 (ctxt->input->buf->raw != NULL) &&
2794 (ctxt->input->buf->buffer != NULL)) {
2795 int nbchars;
2796 int processed;
2799 * convert as much as possible to the parser reading buffer.
2801 processed = ctxt->input->cur - ctxt->input->base;
2802 xmlBufferShrink(ctxt->input->buf->buffer, processed);
2803 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
2804 ctxt->input->buf->buffer,
2805 ctxt->input->buf->raw);
2806 if (nbchars < 0) {
2807 ctxt->errNo = XML_ERR_INVALID_ENCODING;
2808 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2809 ctxt->sax->error(ctxt->userData,
2810 "htmlCheckEncoding: encoder error\n");
2812 ctxt->input->base =
2813 ctxt->input->cur = ctxt->input->buf->buffer->content;
2819 * htmlCheckMeta:
2820 * @ctxt: an HTML parser context
2821 * @atts: the attributes values
2823 * Checks an attributes from a Meta tag
2825 static void
2826 htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
2827 int i;
2828 const xmlChar *att, *value;
2829 int http = 0;
2830 const xmlChar *content = NULL;
2832 if ((ctxt == NULL) || (atts == NULL))
2833 return;
2835 i = 0;
2836 att = atts[i++];
2837 while (att != NULL) {
2838 value = atts[i++];
2839 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
2840 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
2841 http = 1;
2842 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
2843 content = value;
2844 att = atts[i++];
2846 if ((http) && (content != NULL))
2847 htmlCheckEncoding(ctxt, content);
2852 * htmlParseStartTag:
2853 * @ctxt: an HTML parser context
2855 * parse a start of tag either for rule element or
2856 * EmptyElement. In both case we don't parse the tag closing chars.
2858 * [40] STag ::= '<' Name (S Attribute)* S? '>'
2860 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
2862 * With namespace:
2864 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
2866 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
2870 static void
2871 htmlParseStartTag(htmlParserCtxtPtr ctxt) {
2872 xmlChar *name;
2873 xmlChar *attname;
2874 xmlChar *attvalue;
2875 const xmlChar **atts = NULL;
2876 int nbatts = 0;
2877 int maxatts = 0;
2878 int meta = 0;
2879 int i;
2881 if (CUR != '<') return;
2882 NEXT;
2884 GROW;
2885 name = htmlParseHTMLName(ctxt);
2886 if (name == NULL) {
2887 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2888 ctxt->sax->error(ctxt->userData,
2889 "htmlParseStartTag: invalid element name\n");
2890 ctxt->wellFormed = 0;
2891 /* Dump the bogus tag like browsers do */
2892 while ((IS_CHAR(CUR)) && (CUR != '>'))
2893 NEXT;
2894 return;
2896 if (xmlStrEqual(name, BAD_CAST"meta"))
2897 meta = 1;
2900 * Check for auto-closure of HTML elements.
2902 htmlAutoClose(ctxt, name);
2905 * Check for implied HTML elements.
2907 htmlCheckImplied(ctxt, name);
2910 * Avoid html at any level > 0, head at any level != 1
2911 * or any attempt to recurse body
2913 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
2914 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2915 ctxt->sax->error(ctxt->userData,
2916 "htmlParseStartTag: misplaced <html> tag\n");
2917 ctxt->wellFormed = 0;
2918 xmlFree(name);
2919 return;
2921 if ((ctxt->nameNr != 1) &&
2922 (xmlStrEqual(name, BAD_CAST"head"))) {
2923 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2924 ctxt->sax->error(ctxt->userData,
2925 "htmlParseStartTag: misplaced <head> tag\n");
2926 ctxt->wellFormed = 0;
2927 xmlFree(name);
2928 return;
2930 if (xmlStrEqual(name, BAD_CAST"body")) {
2931 int indx;
2932 for (indx = 0;indx < ctxt->nameNr;indx++) {
2933 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
2934 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2935 ctxt->sax->error(ctxt->userData,
2936 "htmlParseStartTag: misplaced <body> tag\n");
2937 ctxt->wellFormed = 0;
2938 xmlFree(name);
2939 return;
2945 * Now parse the attributes, it ends up with the ending
2947 * (S Attribute)* S?
2949 SKIP_BLANKS;
2950 while ((IS_CHAR(CUR)) &&
2951 (CUR != '>') &&
2952 ((CUR != '/') || (NXT(1) != '>'))) {
2953 long cons = ctxt->nbChars;
2955 GROW;
2956 attname = htmlParseAttribute(ctxt, &attvalue);
2957 if (attname != NULL) {
2960 * Well formedness requires at most one declaration of an attribute
2962 for (i = 0; i < nbatts;i += 2) {
2963 if (xmlStrEqual(atts[i], attname)) {
2964 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2965 ctxt->sax->error(ctxt->userData,
2966 "Attribute %s redefined\n",
2967 attname);
2968 ctxt->wellFormed = 0;
2969 xmlFree(attname);
2970 if (attvalue != NULL)
2971 xmlFree(attvalue);
2972 goto failed;
2977 * Add the pair to atts
2979 if (atts == NULL) {
2980 maxatts = 10;
2981 atts = (const xmlChar **) xmlMalloc(maxatts * sizeof(xmlChar *));
2982 if (atts == NULL) {
2983 xmlGenericError(xmlGenericErrorContext,
2984 "malloc of %ld byte failed\n",
2985 maxatts * (long)sizeof(xmlChar *));
2986 if (name != NULL) xmlFree(name);
2987 return;
2989 } else if (nbatts + 4 > maxatts) {
2990 maxatts *= 2;
2991 atts = (const xmlChar **) xmlRealloc((void *) atts,
2992 maxatts * sizeof(xmlChar *));
2993 if (atts == NULL) {
2994 xmlGenericError(xmlGenericErrorContext,
2995 "realloc of %ld byte failed\n",
2996 maxatts * (long)sizeof(xmlChar *));
2997 if (name != NULL) xmlFree(name);
2998 return;
3001 atts[nbatts++] = attname;
3002 atts[nbatts++] = attvalue;
3003 atts[nbatts] = NULL;
3004 atts[nbatts + 1] = NULL;
3006 else {
3007 /* Dump the bogus attribute string up to the next blank or
3008 * the end of the tag. */
3009 while ((IS_CHAR(CUR)) && !(IS_BLANK(CUR)) && (CUR != '>')
3010 && ((CUR != '/') || (NXT(1) != '>')))
3011 NEXT;
3014 failed:
3015 SKIP_BLANKS;
3016 if (cons == ctxt->nbChars) {
3017 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3018 ctxt->sax->error(ctxt->userData,
3019 "htmlParseStartTag: problem parsing attributes\n");
3020 ctxt->wellFormed = 0;
3021 break;
3026 * Handle specific association to the META tag
3028 if (meta)
3029 htmlCheckMeta(ctxt, atts);
3032 * SAX: Start of Element !
3034 htmlnamePush(ctxt, xmlStrdup(name));
3035 #ifdef DEBUG
3036 xmlGenericError(xmlGenericErrorContext,"Start of element %s: pushed %s\n", name, ctxt->name);
3037 #endif
3038 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
3039 ctxt->sax->startElement(ctxt->userData, name, atts);
3041 if (atts != NULL) {
3042 for (i = 0;i < nbatts;i++) {
3043 if (atts[i] != NULL)
3044 xmlFree((xmlChar *) atts[i]);
3046 xmlFree((void *) atts);
3048 if (name != NULL) xmlFree(name);
3052 * htmlParseEndTag:
3053 * @ctxt: an HTML parser context
3055 * parse an end of tag
3057 * [42] ETag ::= '</' Name S? '>'
3059 * With namespace
3061 * [NS 9] ETag ::= '</' QName S? '>'
3063 * Returns 1 if the current level should be closed.
3066 static int
3067 htmlParseEndTag(htmlParserCtxtPtr ctxt) {
3068 xmlChar *name;
3069 xmlChar *oldname;
3070 int i, ret;
3072 if ((CUR != '<') || (NXT(1) != '/')) {
3073 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3074 ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
3075 ctxt->wellFormed = 0;
3076 return(0);
3078 SKIP(2);
3080 name = htmlParseHTMLName(ctxt);
3081 if (name == NULL) return(0);
3084 * We should definitely be at the ending "S? '>'" part
3086 SKIP_BLANKS;
3087 if ((!IS_CHAR(CUR)) || (CUR != '>')) {
3088 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3089 ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
3090 ctxt->wellFormed = 0;
3091 } else
3092 NEXT;
3095 * If the name read is not one of the element in the parsing stack
3096 * then return, it's just an error.
3098 for (i = (ctxt->nameNr - 1);i >= 0;i--) {
3099 if (xmlStrEqual(name, ctxt->nameTab[i])) break;
3101 if (i < 0) {
3102 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3103 ctxt->sax->error(ctxt->userData,
3104 "Unexpected end tag : %s\n", name);
3105 xmlFree(name);
3106 ctxt->wellFormed = 0;
3107 return(0);
3112 * Check for auto-closure of HTML elements.
3115 htmlAutoCloseOnClose(ctxt, name);
3118 * Well formedness constraints, opening and closing must match.
3119 * With the exception that the autoclose may have popped stuff out
3120 * of the stack.
3122 if (!xmlStrEqual(name, ctxt->name)) {
3123 #ifdef DEBUG
3124 xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", name, ctxt->name);
3125 #endif
3126 if ((ctxt->name != NULL) &&
3127 (!xmlStrEqual(ctxt->name, name))) {
3128 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3129 ctxt->sax->error(ctxt->userData,
3130 "Opening and ending tag mismatch: %s and %s\n",
3131 name, ctxt->name);
3132 ctxt->wellFormed = 0;
3137 * SAX: End of Tag
3139 oldname = ctxt->name;
3140 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
3141 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3142 ctxt->sax->endElement(ctxt->userData, name);
3143 oldname = htmlnamePop(ctxt);
3144 if (oldname != NULL) {
3145 #ifdef DEBUG
3146 xmlGenericError(xmlGenericErrorContext,"End of tag %s: popping out %s\n", name, oldname);
3147 #endif
3148 xmlFree(oldname);
3149 #ifdef DEBUG
3150 } else {
3151 xmlGenericError(xmlGenericErrorContext,"End of tag %s: stack empty !!!\n", name);
3152 #endif
3154 ret = 1;
3155 } else {
3156 ret = 0;
3159 if (name != NULL)
3160 xmlFree(name);
3162 return(ret);
3167 * htmlParseReference:
3168 * @ctxt: an HTML parser context
3170 * parse and handle entity references in content,
3171 * this will end-up in a call to character() since this is either a
3172 * CharRef, or a predefined entity.
3174 static void
3175 htmlParseReference(htmlParserCtxtPtr ctxt) {
3176 const htmlEntityDesc * ent;
3177 xmlChar out[6];
3178 xmlChar *name;
3179 if (CUR != '&') return;
3181 if (NXT(1) == '#') {
3182 unsigned int c;
3183 int bits, i = 0;
3185 c = htmlParseCharRef(ctxt);
3186 if (c == 0)
3187 return;
3189 if (c < 0x80) { out[i++]= c; bits= -6; }
3190 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3191 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3192 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3194 for ( ; bits >= 0; bits-= 6) {
3195 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3197 out[i] = 0;
3199 htmlCheckParagraph(ctxt);
3200 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3201 ctxt->sax->characters(ctxt->userData, out, i);
3202 } else {
3203 ent = htmlParseEntityRef(ctxt, &name);
3204 if (name == NULL) {
3205 htmlCheckParagraph(ctxt);
3206 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3207 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3208 return;
3210 if ((ent == NULL) || !(ent->value > 0)) {
3211 htmlCheckParagraph(ctxt);
3212 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3213 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3214 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3215 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3217 } else {
3218 unsigned int c;
3219 int bits, i = 0;
3221 c = ent->value;
3222 if (c < 0x80)
3223 { out[i++]= c; bits= -6; }
3224 else if (c < 0x800)
3225 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3226 else if (c < 0x10000)
3227 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3228 else
3229 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3231 for ( ; bits >= 0; bits-= 6) {
3232 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3234 out[i] = 0;
3236 htmlCheckParagraph(ctxt);
3237 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3238 ctxt->sax->characters(ctxt->userData, out, i);
3240 xmlFree(name);
3245 * htmlParseContent:
3246 * @ctxt: an HTML parser context
3247 * @name: the node name
3249 * Parse a content: comment, sub-element, reference or text.
3253 static void
3254 htmlParseContent(htmlParserCtxtPtr ctxt) {
3255 xmlChar *currentNode;
3256 int depth;
3258 currentNode = xmlStrdup(ctxt->name);
3259 depth = ctxt->nameNr;
3260 while (1) {
3261 long cons = ctxt->nbChars;
3263 GROW;
3265 * Our tag or one of it's parent or children is ending.
3267 if ((CUR == '<') && (NXT(1) == '/')) {
3268 if (htmlParseEndTag(ctxt) &&
3269 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
3270 if (currentNode != NULL)
3271 xmlFree(currentNode);
3272 return;
3274 continue; /* while */
3278 * Has this node been popped out during parsing of
3279 * the next element
3281 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3282 (!xmlStrEqual(currentNode, ctxt->name)))
3284 if (currentNode != NULL) xmlFree(currentNode);
3285 return;
3288 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3289 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
3291 * Handle SCRIPT/STYLE separately
3293 htmlParseScript(ctxt);
3294 } else {
3296 * Sometimes DOCTYPE arrives in the middle of the document
3298 if ((CUR == '<') && (NXT(1) == '!') &&
3299 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3300 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3301 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3302 (UPP(8) == 'E')) {
3303 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3304 ctxt->sax->error(ctxt->userData,
3305 "Misplaced DOCTYPE declaration\n");
3306 ctxt->wellFormed = 0;
3307 htmlParseDocTypeDecl(ctxt);
3311 * First case : a comment
3313 if ((CUR == '<') && (NXT(1) == '!') &&
3314 (NXT(2) == '-') && (NXT(3) == '-')) {
3315 htmlParseComment(ctxt);
3319 * Second case : a sub-element.
3321 else if (CUR == '<') {
3322 htmlParseElement(ctxt);
3326 * Third case : a reference. If if has not been resolved,
3327 * parsing returns it's Name, create the node
3329 else if (CUR == '&') {
3330 htmlParseReference(ctxt);
3334 * Fourth : end of the resource
3336 else if (CUR == 0) {
3337 htmlAutoCloseOnEnd(ctxt);
3338 break;
3342 * Last case, text. Note that References are handled directly.
3344 else {
3345 htmlParseCharData(ctxt);
3348 if (cons == ctxt->nbChars) {
3349 if (ctxt->node != NULL) {
3350 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3351 ctxt->sax->error(ctxt->userData,
3352 "detected an error in element content\n");
3353 ctxt->wellFormed = 0;
3355 break;
3358 GROW;
3360 if (currentNode != NULL) xmlFree(currentNode);
3364 * htmlParseElement:
3365 * @ctxt: an HTML parser context
3367 * parse an HTML element, this is highly recursive
3369 * [39] element ::= EmptyElemTag | STag content ETag
3371 * [41] Attribute ::= Name Eq AttValue
3374 void
3375 htmlParseElement(htmlParserCtxtPtr ctxt) {
3376 xmlChar *name;
3377 xmlChar *currentNode = NULL;
3378 const htmlElemDesc * info;
3379 htmlParserNodeInfo node_info;
3380 xmlChar *oldname;
3381 int depth = ctxt->nameNr;
3382 const xmlChar *oldptr;
3384 /* Capture start position */
3385 if (ctxt->record_info) {
3386 node_info.begin_pos = ctxt->input->consumed +
3387 (CUR_PTR - ctxt->input->base);
3388 node_info.begin_line = ctxt->input->line;
3391 oldname = xmlStrdup(ctxt->name);
3392 htmlParseStartTag(ctxt);
3393 name = ctxt->name;
3394 #ifdef DEBUG
3395 if (oldname == NULL)
3396 xmlGenericError(xmlGenericErrorContext,
3397 "Start of element %s\n", name);
3398 else if (name == NULL)
3399 xmlGenericError(xmlGenericErrorContext,
3400 "Start of element failed, was %s\n", oldname);
3401 else
3402 xmlGenericError(xmlGenericErrorContext,
3403 "Start of element %s, was %s\n", name, oldname);
3404 #endif
3405 if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) ||
3406 (name == NULL)) {
3407 if (CUR == '>')
3408 NEXT;
3409 if (oldname != NULL)
3410 xmlFree(oldname);
3411 return;
3413 if (oldname != NULL)
3414 xmlFree(oldname);
3417 * Lookup the info for that element.
3419 info = htmlTagLookup(name);
3420 if (info == NULL) {
3421 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3422 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
3423 name);
3424 ctxt->wellFormed = 0;
3425 } else if (info->depr) {
3426 /***************************
3427 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
3428 ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
3429 name);
3430 ***************************/
3434 * Check for an Empty Element labeled the XML/SGML way
3436 if ((CUR == '/') && (NXT(1) == '>')) {
3437 SKIP(2);
3438 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3439 ctxt->sax->endElement(ctxt->userData, name);
3440 oldname = htmlnamePop(ctxt);
3441 #ifdef DEBUG
3442 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n", oldname);
3443 #endif
3444 if (oldname != NULL)
3445 xmlFree(oldname);
3446 return;
3449 if (CUR == '>') {
3450 NEXT;
3451 } else {
3452 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3453 ctxt->sax->error(ctxt->userData,
3454 "Couldn't find end of Start Tag %s\n",
3455 name);
3456 ctxt->wellFormed = 0;
3459 * end of parsing of this node.
3461 if (xmlStrEqual(name, ctxt->name)) {
3462 nodePop(ctxt);
3463 oldname = htmlnamePop(ctxt);
3464 #ifdef DEBUG
3465 xmlGenericError(xmlGenericErrorContext,"End of start tag problem: popping out %s\n", oldname);
3466 #endif
3467 if (oldname != NULL)
3468 xmlFree(oldname);
3472 * Capture end position and add node
3474 if ( currentNode != NULL && ctxt->record_info ) {
3475 node_info.end_pos = ctxt->input->consumed +
3476 (CUR_PTR - ctxt->input->base);
3477 node_info.end_line = ctxt->input->line;
3478 node_info.node = ctxt->node;
3479 xmlParserAddNodeInfo(ctxt, &node_info);
3481 return;
3485 * Check for an Empty Element from DTD definition
3487 if ((info != NULL) && (info->empty)) {
3488 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3489 ctxt->sax->endElement(ctxt->userData, name);
3490 oldname = htmlnamePop(ctxt);
3491 #ifdef DEBUG
3492 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
3493 #endif
3494 if (oldname != NULL)
3495 xmlFree(oldname);
3496 return;
3500 * Parse the content of the element:
3502 currentNode = xmlStrdup(ctxt->name);
3503 depth = ctxt->nameNr;
3504 while (IS_CHAR(CUR)) {
3505 oldptr = ctxt->input->cur;
3506 htmlParseContent(ctxt);
3507 if (oldptr==ctxt->input->cur) break;
3508 if (ctxt->nameNr < depth) break;
3512 * Capture end position and add node
3514 if ( currentNode != NULL && ctxt->record_info ) {
3515 node_info.end_pos = ctxt->input->consumed +
3516 (CUR_PTR - ctxt->input->base);
3517 node_info.end_line = ctxt->input->line;
3518 node_info.node = ctxt->node;
3519 xmlParserAddNodeInfo(ctxt, &node_info);
3521 if (!IS_CHAR(CUR)) {
3522 htmlAutoCloseOnEnd(ctxt);
3525 if (currentNode != NULL)
3526 xmlFree(currentNode);
3530 * htmlParseDocument :
3531 * @ctxt: an HTML parser context
3533 * parse an HTML document (and build a tree if using the standard SAX
3534 * interface).
3536 * Returns 0, -1 in case of error. the parser context is augmented
3537 * as a result of the parsing.
3541 htmlParseDocument(htmlParserCtxtPtr ctxt) {
3542 xmlDtdPtr dtd;
3544 xmlInitParser();
3546 htmlDefaultSAXHandlerInit();
3547 ctxt->html = 1;
3549 GROW;
3551 * SAX: beginning of the document processing.
3553 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3554 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
3557 * Wipe out everything which is before the first '<'
3559 SKIP_BLANKS;
3560 if (CUR == 0) {
3561 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3562 ctxt->sax->error(ctxt->userData, "Document is empty\n");
3563 ctxt->wellFormed = 0;
3566 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
3567 ctxt->sax->startDocument(ctxt->userData);
3571 * Parse possible comments before any content
3573 while ((CUR == '<') && (NXT(1) == '!') &&
3574 (NXT(2) == '-') && (NXT(3) == '-')) {
3575 htmlParseComment(ctxt);
3576 SKIP_BLANKS;
3581 * Then possibly doc type declaration(s) and more Misc
3582 * (doctypedecl Misc*)?
3584 if ((CUR == '<') && (NXT(1) == '!') &&
3585 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3586 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3587 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3588 (UPP(8) == 'E')) {
3589 htmlParseDocTypeDecl(ctxt);
3591 SKIP_BLANKS;
3594 * Parse possible comments before any content
3596 while ((CUR == '<') && (NXT(1) == '!') &&
3597 (NXT(2) == '-') && (NXT(3) == '-')) {
3598 htmlParseComment(ctxt);
3599 SKIP_BLANKS;
3603 * Time to start parsing the tree itself
3605 htmlParseContent(ctxt);
3608 * autoclose
3610 if (CUR == 0)
3611 htmlAutoCloseOnEnd(ctxt);
3615 * SAX: end of the document processing.
3617 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3618 ctxt->sax->endDocument(ctxt->userData);
3620 if (ctxt->myDoc != NULL) {
3621 dtd = xmlGetIntSubset(ctxt->myDoc);
3622 if (dtd == NULL)
3623 ctxt->myDoc->intSubset =
3624 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
3625 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
3626 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
3628 if (! ctxt->wellFormed) return(-1);
3629 return(0);
3633 /************************************************************************
3635 * Parser contexts handling *
3637 ************************************************************************/
3640 * xmlInitParserCtxt:
3641 * @ctxt: an HTML parser context
3643 * Initialize a parser context
3646 static void
3647 htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
3649 htmlSAXHandler *sax;
3651 if (ctxt == NULL) return;
3652 memset(ctxt, 0, sizeof(htmlParserCtxt));
3654 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
3655 if (sax == NULL) {
3656 xmlGenericError(xmlGenericErrorContext,
3657 "htmlInitParserCtxt: out of memory\n");
3659 else
3660 memset(sax, 0, sizeof(htmlSAXHandler));
3662 /* Allocate the Input stack */
3663 ctxt->inputTab = (htmlParserInputPtr *)
3664 xmlMalloc(5 * sizeof(htmlParserInputPtr));
3665 if (ctxt->inputTab == NULL) {
3666 xmlGenericError(xmlGenericErrorContext,
3667 "htmlInitParserCtxt: out of memory\n");
3668 ctxt->inputNr = 0;
3669 ctxt->inputMax = 0;
3670 ctxt->input = NULL;
3671 return;
3673 ctxt->inputNr = 0;
3674 ctxt->inputMax = 5;
3675 ctxt->input = NULL;
3676 ctxt->version = NULL;
3677 ctxt->encoding = NULL;
3678 ctxt->standalone = -1;
3679 ctxt->instate = XML_PARSER_START;
3681 /* Allocate the Node stack */
3682 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
3683 if (ctxt->nodeTab == NULL) {
3684 xmlGenericError(xmlGenericErrorContext,
3685 "htmlInitParserCtxt: out of memory\n");
3686 ctxt->nodeNr = 0;
3687 ctxt->nodeMax = 0;
3688 ctxt->node = NULL;
3689 ctxt->inputNr = 0;
3690 ctxt->inputMax = 0;
3691 ctxt->input = NULL;
3692 return;
3694 ctxt->nodeNr = 0;
3695 ctxt->nodeMax = 10;
3696 ctxt->node = NULL;
3698 /* Allocate the Name stack */
3699 ctxt->nameTab = (xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
3700 if (ctxt->nameTab == NULL) {
3701 xmlGenericError(xmlGenericErrorContext,
3702 "htmlInitParserCtxt: out of memory\n");
3703 ctxt->nameNr = 0;
3704 ctxt->nameMax = 10;
3705 ctxt->name = NULL;
3706 ctxt->nodeNr = 0;
3707 ctxt->nodeMax = 0;
3708 ctxt->node = NULL;
3709 ctxt->inputNr = 0;
3710 ctxt->inputMax = 0;
3711 ctxt->input = NULL;
3712 return;
3714 ctxt->nameNr = 0;
3715 ctxt->nameMax = 10;
3716 ctxt->name = NULL;
3718 if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
3719 else {
3720 ctxt->sax = sax;
3721 memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
3723 ctxt->userData = ctxt;
3724 ctxt->myDoc = NULL;
3725 ctxt->wellFormed = 1;
3726 ctxt->replaceEntities = 0;
3727 ctxt->linenumbers = xmlLineNumbersDefaultValue;
3728 ctxt->html = 1;
3729 ctxt->record_info = 0;
3730 ctxt->validate = 0;
3731 ctxt->nbChars = 0;
3732 ctxt->checkIndex = 0;
3733 ctxt->catalogs = NULL;
3734 xmlInitNodeInfoSeq(&ctxt->node_seq);
3738 * htmlFreeParserCtxt:
3739 * @ctxt: an HTML parser context
3741 * Free all the memory used by a parser context. However the parsed
3742 * document in ctxt->myDoc is not freed.
3745 void
3746 htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
3748 xmlFreeParserCtxt(ctxt);
3752 * htmlNewParserCtxt:
3754 * Allocate and initialize a new parser context.
3756 * Returns the xmlParserCtxtPtr or NULL
3759 static htmlParserCtxtPtr
3760 htmlNewParserCtxt(void)
3762 xmlParserCtxtPtr ctxt;
3764 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
3765 if (ctxt == NULL) {
3766 xmlGenericError(xmlGenericErrorContext,
3767 "xmlNewParserCtxt : cannot allocate context\n");
3768 return(NULL);
3770 memset(ctxt, 0, sizeof(xmlParserCtxt));
3771 htmlInitParserCtxt(ctxt);
3772 return(ctxt);
3776 * htmlCreateMemoryParserCtxt:
3777 * @buffer: a pointer to a char array
3778 * @size: the size of the array
3780 * Create a parser context for an HTML in-memory document.
3782 * Returns the new parser context or NULL
3784 static htmlParserCtxtPtr
3785 htmlCreateMemoryParserCtxt(const char *buffer, int size) {
3786 xmlParserCtxtPtr ctxt;
3787 xmlParserInputPtr input;
3788 xmlParserInputBufferPtr buf;
3790 if (buffer == NULL)
3791 return(NULL);
3792 if (size <= 0)
3793 return(NULL);
3795 ctxt = htmlNewParserCtxt();
3796 if (ctxt == NULL)
3797 return(NULL);
3799 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
3800 if (buf == NULL) return(NULL);
3802 input = xmlNewInputStream(ctxt);
3803 if (input == NULL) {
3804 xmlFreeParserCtxt(ctxt);
3805 return(NULL);
3808 input->filename = NULL;
3809 input->buf = buf;
3810 input->base = input->buf->buffer->content;
3811 input->cur = input->buf->buffer->content;
3812 input->end = &input->buf->buffer->content[input->buf->buffer->use];
3814 inputPush(ctxt, input);
3815 return(ctxt);
3819 * htmlCreateDocParserCtxt :
3820 * @cur: a pointer to an array of xmlChar
3821 * @encoding: a free form C string describing the HTML document encoding, or NULL
3823 * Create a parser context for an HTML document.
3825 * TODO: check the need to add encoding handling there
3827 * Returns the new parser context or NULL
3829 static htmlParserCtxtPtr
3830 htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding ATTRIBUTE_UNUSED) {
3831 int len;
3833 if (cur == NULL)
3834 return(NULL);
3835 len = xmlStrlen(cur);
3836 return(htmlCreateMemoryParserCtxt((char *)cur, len));
3839 /************************************************************************
3841 * Progressive parsing interfaces *
3843 ************************************************************************/
3846 * htmlParseLookupSequence:
3847 * @ctxt: an HTML parser context
3848 * @first: the first char to lookup
3849 * @next: the next char to lookup or zero
3850 * @third: the next char to lookup or zero
3852 * Try to find if a sequence (first, next, third) or just (first next) or
3853 * (first) is available in the input stream.
3854 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
3855 * to avoid rescanning sequences of bytes, it DOES change the state of the
3856 * parser, do not use liberally.
3857 * This is basically similar to xmlParseLookupSequence()
3859 * Returns the index to the current parsing point if the full sequence
3860 * is available, -1 otherwise.
3862 static int
3863 htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
3864 xmlChar next, xmlChar third) {
3865 int base, len;
3866 htmlParserInputPtr in;
3867 const xmlChar *buf;
3868 int incomment = 0;
3870 in = ctxt->input;
3871 if (in == NULL) return(-1);
3872 base = in->cur - in->base;
3873 if (base < 0) return(-1);
3874 if (ctxt->checkIndex > base)
3875 base = ctxt->checkIndex;
3876 if (in->buf == NULL) {
3877 buf = in->base;
3878 len = in->length;
3879 } else {
3880 buf = in->buf->buffer->content;
3881 len = in->buf->buffer->use;
3883 /* take into account the sequence length */
3884 if (third) len -= 2;
3885 else if (next) len --;
3886 for (;base < len;base++) {
3887 if (!incomment && (base + 4 < len)) {
3888 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
3889 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
3890 incomment = 1;
3892 /* do not increment base, some people use <!--> */
3894 if (incomment) {
3895 if (base + 3 < len)
3896 return(-1);
3897 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
3898 (buf[base + 2] == '>')) {
3899 incomment = 0;
3900 base += 2;
3902 continue;
3904 if (buf[base] == first) {
3905 if (third != 0) {
3906 if ((buf[base + 1] != next) ||
3907 (buf[base + 2] != third)) continue;
3908 } else if (next != 0) {
3909 if (buf[base + 1] != next) continue;
3911 ctxt->checkIndex = 0;
3912 #ifdef DEBUG_PUSH
3913 if (next == 0)
3914 xmlGenericError(xmlGenericErrorContext,
3915 "HPP: lookup '%c' found at %d\n",
3916 first, base);
3917 else if (third == 0)
3918 xmlGenericError(xmlGenericErrorContext,
3919 "HPP: lookup '%c%c' found at %d\n",
3920 first, next, base);
3921 else
3922 xmlGenericError(xmlGenericErrorContext,
3923 "HPP: lookup '%c%c%c' found at %d\n",
3924 first, next, third, base);
3925 #endif
3926 return(base - (in->cur - in->base));
3929 ctxt->checkIndex = base;
3930 #ifdef DEBUG_PUSH
3931 if (next == 0)
3932 xmlGenericError(xmlGenericErrorContext,
3933 "HPP: lookup '%c' failed\n", first);
3934 else if (third == 0)
3935 xmlGenericError(xmlGenericErrorContext,
3936 "HPP: lookup '%c%c' failed\n", first, next);
3937 else
3938 xmlGenericError(xmlGenericErrorContext,
3939 "HPP: lookup '%c%c%c' failed\n", first, next, third);
3940 #endif
3941 return(-1);
3945 * htmlParseTryOrFinish:
3946 * @ctxt: an HTML parser context
3947 * @terminate: last chunk indicator
3949 * Try to progress on parsing
3951 * Returns zero if no parsing was possible
3953 static int
3954 htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
3955 int ret = 0;
3956 htmlParserInputPtr in;
3957 int avail = 0;
3958 xmlChar cur, next;
3960 #ifdef DEBUG_PUSH
3961 switch (ctxt->instate) {
3962 case XML_PARSER_EOF:
3963 xmlGenericError(xmlGenericErrorContext,
3964 "HPP: try EOF\n"); break;
3965 case XML_PARSER_START:
3966 xmlGenericError(xmlGenericErrorContext,
3967 "HPP: try START\n"); break;
3968 case XML_PARSER_MISC:
3969 xmlGenericError(xmlGenericErrorContext,
3970 "HPP: try MISC\n");break;
3971 case XML_PARSER_COMMENT:
3972 xmlGenericError(xmlGenericErrorContext,
3973 "HPP: try COMMENT\n");break;
3974 case XML_PARSER_PROLOG:
3975 xmlGenericError(xmlGenericErrorContext,
3976 "HPP: try PROLOG\n");break;
3977 case XML_PARSER_START_TAG:
3978 xmlGenericError(xmlGenericErrorContext,
3979 "HPP: try START_TAG\n");break;
3980 case XML_PARSER_CONTENT:
3981 xmlGenericError(xmlGenericErrorContext,
3982 "HPP: try CONTENT\n");break;
3983 case XML_PARSER_CDATA_SECTION:
3984 xmlGenericError(xmlGenericErrorContext,
3985 "HPP: try CDATA_SECTION\n");break;
3986 case XML_PARSER_END_TAG:
3987 xmlGenericError(xmlGenericErrorContext,
3988 "HPP: try END_TAG\n");break;
3989 case XML_PARSER_ENTITY_DECL:
3990 xmlGenericError(xmlGenericErrorContext,
3991 "HPP: try ENTITY_DECL\n");break;
3992 case XML_PARSER_ENTITY_VALUE:
3993 xmlGenericError(xmlGenericErrorContext,
3994 "HPP: try ENTITY_VALUE\n");break;
3995 case XML_PARSER_ATTRIBUTE_VALUE:
3996 xmlGenericError(xmlGenericErrorContext,
3997 "HPP: try ATTRIBUTE_VALUE\n");break;
3998 case XML_PARSER_DTD:
3999 xmlGenericError(xmlGenericErrorContext,
4000 "HPP: try DTD\n");break;
4001 case XML_PARSER_EPILOG:
4002 xmlGenericError(xmlGenericErrorContext,
4003 "HPP: try EPILOG\n");break;
4004 case XML_PARSER_PI:
4005 xmlGenericError(xmlGenericErrorContext,
4006 "HPP: try PI\n");break;
4007 case XML_PARSER_SYSTEM_LITERAL:
4008 xmlGenericError(xmlGenericErrorContext,
4009 "HPP: try SYSTEM_LITERAL\n");break;
4011 #endif
4013 while (1) {
4015 in = ctxt->input;
4016 if (in == NULL) break;
4017 if (in->buf == NULL)
4018 avail = in->length - (in->cur - in->base);
4019 else
4020 avail = in->buf->buffer->use - (in->cur - in->base);
4021 if ((avail == 0) && (terminate)) {
4022 htmlAutoCloseOnEnd(ctxt);
4023 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4025 * SAX: end of the document processing.
4027 ctxt->instate = XML_PARSER_EOF;
4028 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4029 ctxt->sax->endDocument(ctxt->userData);
4032 if (avail < 1)
4033 goto done;
4034 switch (ctxt->instate) {
4035 case XML_PARSER_EOF:
4037 * Document parsing is done !
4039 goto done;
4040 case XML_PARSER_START:
4042 * Very first chars read from the document flow.
4044 cur = in->cur[0];
4045 if (IS_BLANK(cur)) {
4046 SKIP_BLANKS;
4047 if (in->buf == NULL)
4048 avail = in->length - (in->cur - in->base);
4049 else
4050 avail = in->buf->buffer->use - (in->cur - in->base);
4052 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4053 ctxt->sax->setDocumentLocator(ctxt->userData,
4054 &xmlDefaultSAXLocator);
4055 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4056 (!ctxt->disableSAX))
4057 ctxt->sax->startDocument(ctxt->userData);
4059 cur = in->cur[0];
4060 next = in->cur[1];
4061 if ((cur == '<') && (next == '!') &&
4062 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4063 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4064 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4065 (UPP(8) == 'E')) {
4066 if ((!terminate) &&
4067 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4068 goto done;
4069 #ifdef DEBUG_PUSH
4070 xmlGenericError(xmlGenericErrorContext,
4071 "HPP: Parsing internal subset\n");
4072 #endif
4073 htmlParseDocTypeDecl(ctxt);
4074 ctxt->instate = XML_PARSER_PROLOG;
4075 #ifdef DEBUG_PUSH
4076 xmlGenericError(xmlGenericErrorContext,
4077 "HPP: entering PROLOG\n");
4078 #endif
4079 } else {
4080 ctxt->instate = XML_PARSER_MISC;
4082 #ifdef DEBUG_PUSH
4083 xmlGenericError(xmlGenericErrorContext,
4084 "HPP: entering MISC\n");
4085 #endif
4086 break;
4087 case XML_PARSER_MISC:
4088 SKIP_BLANKS;
4089 if (in->buf == NULL)
4090 avail = in->length - (in->cur - in->base);
4091 else
4092 avail = in->buf->buffer->use - (in->cur - in->base);
4093 if (avail < 2)
4094 goto done;
4095 cur = in->cur[0];
4096 next = in->cur[1];
4097 if ((cur == '<') && (next == '!') &&
4098 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4099 if ((!terminate) &&
4100 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4101 goto done;
4102 #ifdef DEBUG_PUSH
4103 xmlGenericError(xmlGenericErrorContext,
4104 "HPP: Parsing Comment\n");
4105 #endif
4106 htmlParseComment(ctxt);
4107 ctxt->instate = XML_PARSER_MISC;
4108 } else if ((cur == '<') && (next == '!') &&
4109 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4110 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4111 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4112 (UPP(8) == 'E')) {
4113 if ((!terminate) &&
4114 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4115 goto done;
4116 #ifdef DEBUG_PUSH
4117 xmlGenericError(xmlGenericErrorContext,
4118 "HPP: Parsing internal subset\n");
4119 #endif
4120 htmlParseDocTypeDecl(ctxt);
4121 ctxt->instate = XML_PARSER_PROLOG;
4122 #ifdef DEBUG_PUSH
4123 xmlGenericError(xmlGenericErrorContext,
4124 "HPP: entering PROLOG\n");
4125 #endif
4126 } else if ((cur == '<') && (next == '!') &&
4127 (avail < 9)) {
4128 goto done;
4129 } else {
4130 ctxt->instate = XML_PARSER_START_TAG;
4131 #ifdef DEBUG_PUSH
4132 xmlGenericError(xmlGenericErrorContext,
4133 "HPP: entering START_TAG\n");
4134 #endif
4136 break;
4137 case XML_PARSER_PROLOG:
4138 SKIP_BLANKS;
4139 if (in->buf == NULL)
4140 avail = in->length - (in->cur - in->base);
4141 else
4142 avail = in->buf->buffer->use - (in->cur - in->base);
4143 if (avail < 2)
4144 goto done;
4145 cur = in->cur[0];
4146 next = in->cur[1];
4147 if ((cur == '<') && (next == '!') &&
4148 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4149 if ((!terminate) &&
4150 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4151 goto done;
4152 #ifdef DEBUG_PUSH
4153 xmlGenericError(xmlGenericErrorContext,
4154 "HPP: Parsing Comment\n");
4155 #endif
4156 htmlParseComment(ctxt);
4157 ctxt->instate = XML_PARSER_PROLOG;
4158 } else if ((cur == '<') && (next == '!') &&
4159 (avail < 4)) {
4160 goto done;
4161 } else {
4162 ctxt->instate = XML_PARSER_START_TAG;
4163 #ifdef DEBUG_PUSH
4164 xmlGenericError(xmlGenericErrorContext,
4165 "HPP: entering START_TAG\n");
4166 #endif
4168 break;
4169 case XML_PARSER_EPILOG:
4170 if (in->buf == NULL)
4171 avail = in->length - (in->cur - in->base);
4172 else
4173 avail = in->buf->buffer->use - (in->cur - in->base);
4174 if (avail < 1)
4175 goto done;
4176 cur = in->cur[0];
4177 if (IS_BLANK(cur)) {
4178 htmlParseCharData(ctxt);
4179 goto done;
4181 if (avail < 2)
4182 goto done;
4183 next = in->cur[1];
4184 if ((cur == '<') && (next == '!') &&
4185 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4186 if ((!terminate) &&
4187 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4188 goto done;
4189 #ifdef DEBUG_PUSH
4190 xmlGenericError(xmlGenericErrorContext,
4191 "HPP: Parsing Comment\n");
4192 #endif
4193 htmlParseComment(ctxt);
4194 ctxt->instate = XML_PARSER_EPILOG;
4195 } else if ((cur == '<') && (next == '!') &&
4196 (avail < 4)) {
4197 goto done;
4198 } else {
4199 ctxt->errNo = XML_ERR_DOCUMENT_END;
4200 ctxt->wellFormed = 0;
4201 ctxt->instate = XML_PARSER_EOF;
4202 #ifdef DEBUG_PUSH
4203 xmlGenericError(xmlGenericErrorContext,
4204 "HPP: entering EOF\n");
4205 #endif
4206 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4207 ctxt->sax->endDocument(ctxt->userData);
4208 goto done;
4210 break;
4211 case XML_PARSER_START_TAG: {
4212 xmlChar *name, *oldname;
4213 int depth = ctxt->nameNr;
4214 const htmlElemDesc * info;
4216 if (avail < 2)
4217 goto done;
4218 cur = in->cur[0];
4219 if (cur != '<') {
4220 ctxt->instate = XML_PARSER_CONTENT;
4221 #ifdef DEBUG_PUSH
4222 xmlGenericError(xmlGenericErrorContext,
4223 "HPP: entering CONTENT\n");
4224 #endif
4225 break;
4227 if (in->cur[1] == '/') {
4228 ctxt->instate = XML_PARSER_END_TAG;
4229 ctxt->checkIndex = 0;
4230 #ifdef DEBUG_PUSH
4231 xmlGenericError(xmlGenericErrorContext,
4232 "HPP: entering END_TAG\n");
4233 #endif
4234 break;
4236 if ((!terminate) &&
4237 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4238 goto done;
4240 oldname = xmlStrdup(ctxt->name);
4241 htmlParseStartTag(ctxt);
4242 name = ctxt->name;
4243 #ifdef DEBUG
4244 if (oldname == NULL)
4245 xmlGenericError(xmlGenericErrorContext,
4246 "Start of element %s\n", name);
4247 else if (name == NULL)
4248 xmlGenericError(xmlGenericErrorContext,
4249 "Start of element failed, was %s\n",
4250 oldname);
4251 else
4252 xmlGenericError(xmlGenericErrorContext,
4253 "Start of element %s, was %s\n",
4254 name, oldname);
4255 #endif
4256 if (((depth == ctxt->nameNr) &&
4257 (xmlStrEqual(oldname, ctxt->name))) ||
4258 (name == NULL)) {
4259 if (CUR == '>')
4260 NEXT;
4261 if (oldname != NULL)
4262 xmlFree(oldname);
4263 break;
4265 if (oldname != NULL)
4266 xmlFree(oldname);
4269 * Lookup the info for that element.
4271 info = htmlTagLookup(name);
4272 if (info == NULL) {
4273 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4274 ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
4275 name);
4276 ctxt->wellFormed = 0;
4277 } else if (info->depr) {
4278 /***************************
4279 if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
4280 ctxt->sax->warning(ctxt->userData,
4281 "Tag %s is deprecated\n",
4282 name);
4283 ***************************/
4287 * Check for an Empty Element labeled the XML/SGML way
4289 if ((CUR == '/') && (NXT(1) == '>')) {
4290 SKIP(2);
4291 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4292 ctxt->sax->endElement(ctxt->userData, name);
4293 oldname = htmlnamePop(ctxt);
4294 #ifdef DEBUG
4295 xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n",
4296 oldname);
4297 #endif
4298 if (oldname != NULL)
4299 xmlFree(oldname);
4300 ctxt->instate = XML_PARSER_CONTENT;
4301 #ifdef DEBUG_PUSH
4302 xmlGenericError(xmlGenericErrorContext,
4303 "HPP: entering CONTENT\n");
4304 #endif
4305 break;
4308 if (CUR == '>') {
4309 NEXT;
4310 } else {
4311 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4312 ctxt->sax->error(ctxt->userData,
4313 "Couldn't find end of Start Tag %s\n",
4314 name);
4315 ctxt->wellFormed = 0;
4318 * end of parsing of this node.
4320 if (xmlStrEqual(name, ctxt->name)) {
4321 nodePop(ctxt);
4322 oldname = htmlnamePop(ctxt);
4323 #ifdef DEBUG
4324 xmlGenericError(xmlGenericErrorContext,
4325 "End of start tag problem: popping out %s\n", oldname);
4326 #endif
4327 if (oldname != NULL)
4328 xmlFree(oldname);
4331 ctxt->instate = XML_PARSER_CONTENT;
4332 #ifdef DEBUG_PUSH
4333 xmlGenericError(xmlGenericErrorContext,
4334 "HPP: entering CONTENT\n");
4335 #endif
4336 break;
4340 * Check for an Empty Element from DTD definition
4342 if ((info != NULL) && (info->empty)) {
4343 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4344 ctxt->sax->endElement(ctxt->userData, name);
4345 oldname = htmlnamePop(ctxt);
4346 #ifdef DEBUG
4347 xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
4348 #endif
4349 if (oldname != NULL)
4350 xmlFree(oldname);
4352 ctxt->instate = XML_PARSER_CONTENT;
4353 #ifdef DEBUG_PUSH
4354 xmlGenericError(xmlGenericErrorContext,
4355 "HPP: entering CONTENT\n");
4356 #endif
4357 break;
4359 case XML_PARSER_CONTENT: {
4360 long cons;
4362 * Handle preparsed entities and charRef
4364 if (ctxt->token != 0) {
4365 xmlChar chr[2] = { 0 , 0 } ;
4367 chr[0] = (xmlChar) ctxt->token;
4368 htmlCheckParagraph(ctxt);
4369 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4370 ctxt->sax->characters(ctxt->userData, chr, 1);
4371 ctxt->token = 0;
4372 ctxt->checkIndex = 0;
4374 if ((avail == 1) && (terminate)) {
4375 cur = in->cur[0];
4376 if ((cur != '<') && (cur != '&')) {
4377 if (ctxt->sax != NULL) {
4378 if (IS_BLANK(cur)) {
4379 if (ctxt->sax->ignorableWhitespace != NULL)
4380 ctxt->sax->ignorableWhitespace(
4381 ctxt->userData, &cur, 1);
4382 } else {
4383 htmlCheckParagraph(ctxt);
4384 if (ctxt->sax->characters != NULL)
4385 ctxt->sax->characters(
4386 ctxt->userData, &cur, 1);
4389 ctxt->token = 0;
4390 ctxt->checkIndex = 0;
4391 in->cur++;
4392 break;
4395 if (avail < 2)
4396 goto done;
4397 cur = in->cur[0];
4398 next = in->cur[1];
4399 cons = ctxt->nbChars;
4400 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
4401 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
4403 * Handle SCRIPT/STYLE separately
4405 if ((!terminate) &&
4406 (htmlParseLookupSequence(ctxt, '<', '/', 0) < 0))
4407 goto done;
4408 htmlParseScript(ctxt);
4409 if ((cur == '<') && (next == '/')) {
4410 ctxt->instate = XML_PARSER_END_TAG;
4411 ctxt->checkIndex = 0;
4412 #ifdef DEBUG_PUSH
4413 xmlGenericError(xmlGenericErrorContext,
4414 "HPP: entering END_TAG\n");
4415 #endif
4416 break;
4418 } else {
4420 * Sometimes DOCTYPE arrives in the middle of the document
4422 if ((cur == '<') && (next == '!') &&
4423 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4424 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4425 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4426 (UPP(8) == 'E')) {
4427 if ((!terminate) &&
4428 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4429 goto done;
4430 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4431 ctxt->sax->error(ctxt->userData,
4432 "Misplaced DOCTYPE declaration\n");
4433 ctxt->wellFormed = 0;
4434 htmlParseDocTypeDecl(ctxt);
4435 } else if ((cur == '<') && (next == '!') &&
4436 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4437 if ((!terminate) &&
4438 (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4439 goto done;
4440 #ifdef DEBUG_PUSH
4441 xmlGenericError(xmlGenericErrorContext,
4442 "HPP: Parsing Comment\n");
4443 #endif
4444 htmlParseComment(ctxt);
4445 ctxt->instate = XML_PARSER_CONTENT;
4446 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
4447 goto done;
4448 } else if ((cur == '<') && (next == '/')) {
4449 ctxt->instate = XML_PARSER_END_TAG;
4450 ctxt->checkIndex = 0;
4451 #ifdef DEBUG_PUSH
4452 xmlGenericError(xmlGenericErrorContext,
4453 "HPP: entering END_TAG\n");
4454 #endif
4455 break;
4456 } else if (cur == '<') {
4457 ctxt->instate = XML_PARSER_START_TAG;
4458 ctxt->checkIndex = 0;
4459 #ifdef DEBUG_PUSH
4460 xmlGenericError(xmlGenericErrorContext,
4461 "HPP: entering START_TAG\n");
4462 #endif
4463 break;
4464 } else if (cur == '&') {
4465 if ((!terminate) &&
4466 (htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
4467 goto done;
4468 #ifdef DEBUG_PUSH
4469 xmlGenericError(xmlGenericErrorContext,
4470 "HPP: Parsing Reference\n");
4471 #endif
4472 /* TODO: check generation of subtrees if noent !!! */
4473 htmlParseReference(ctxt);
4474 } else {
4475 /* TODO Avoid the extra copy, handle directly !!!!!! */
4477 * Goal of the following test is :
4478 * - minimize calls to the SAX 'character' callback
4479 * when they are mergeable
4481 if ((ctxt->inputNr == 1) &&
4482 (avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
4483 if ((!terminate) &&
4484 (htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
4485 goto done;
4487 ctxt->checkIndex = 0;
4488 #ifdef DEBUG_PUSH
4489 xmlGenericError(xmlGenericErrorContext,
4490 "HPP: Parsing char data\n");
4491 #endif
4492 htmlParseCharData(ctxt);
4495 if (cons == ctxt->nbChars) {
4496 if (ctxt->node != NULL) {
4497 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4498 ctxt->sax->error(ctxt->userData,
4499 "detected an error in element content\n");
4500 ctxt->wellFormed = 0;
4502 NEXT;
4503 break;
4506 break;
4508 case XML_PARSER_END_TAG:
4509 if (avail < 2)
4510 goto done;
4511 if ((!terminate) &&
4512 (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4513 goto done;
4514 htmlParseEndTag(ctxt);
4515 if (ctxt->nameNr == 0) {
4516 ctxt->instate = XML_PARSER_EPILOG;
4517 } else {
4518 ctxt->instate = XML_PARSER_CONTENT;
4520 ctxt->checkIndex = 0;
4521 #ifdef DEBUG_PUSH
4522 xmlGenericError(xmlGenericErrorContext,
4523 "HPP: entering CONTENT\n");
4524 #endif
4525 break;
4526 case XML_PARSER_CDATA_SECTION:
4527 xmlGenericError(xmlGenericErrorContext,
4528 "HPP: internal error, state == CDATA\n");
4529 ctxt->instate = XML_PARSER_CONTENT;
4530 ctxt->checkIndex = 0;
4531 #ifdef DEBUG_PUSH
4532 xmlGenericError(xmlGenericErrorContext,
4533 "HPP: entering CONTENT\n");
4534 #endif
4535 break;
4536 case XML_PARSER_DTD:
4537 xmlGenericError(xmlGenericErrorContext,
4538 "HPP: internal error, state == DTD\n");
4539 ctxt->instate = XML_PARSER_CONTENT;
4540 ctxt->checkIndex = 0;
4541 #ifdef DEBUG_PUSH
4542 xmlGenericError(xmlGenericErrorContext,
4543 "HPP: entering CONTENT\n");
4544 #endif
4545 break;
4546 case XML_PARSER_COMMENT:
4547 xmlGenericError(xmlGenericErrorContext,
4548 "HPP: internal error, state == COMMENT\n");
4549 ctxt->instate = XML_PARSER_CONTENT;
4550 ctxt->checkIndex = 0;
4551 #ifdef DEBUG_PUSH
4552 xmlGenericError(xmlGenericErrorContext,
4553 "HPP: entering CONTENT\n");
4554 #endif
4555 break;
4556 case XML_PARSER_PI:
4557 xmlGenericError(xmlGenericErrorContext,
4558 "HPP: internal error, state == PI\n");
4559 ctxt->instate = XML_PARSER_CONTENT;
4560 ctxt->checkIndex = 0;
4561 #ifdef DEBUG_PUSH
4562 xmlGenericError(xmlGenericErrorContext,
4563 "HPP: entering CONTENT\n");
4564 #endif
4565 break;
4566 case XML_PARSER_ENTITY_DECL:
4567 xmlGenericError(xmlGenericErrorContext,
4568 "HPP: internal error, state == ENTITY_DECL\n");
4569 ctxt->instate = XML_PARSER_CONTENT;
4570 ctxt->checkIndex = 0;
4571 #ifdef DEBUG_PUSH
4572 xmlGenericError(xmlGenericErrorContext,
4573 "HPP: entering CONTENT\n");
4574 #endif
4575 break;
4576 case XML_PARSER_ENTITY_VALUE:
4577 xmlGenericError(xmlGenericErrorContext,
4578 "HPP: internal error, state == ENTITY_VALUE\n");
4579 ctxt->instate = XML_PARSER_CONTENT;
4580 ctxt->checkIndex = 0;
4581 #ifdef DEBUG_PUSH
4582 xmlGenericError(xmlGenericErrorContext,
4583 "HPP: entering DTD\n");
4584 #endif
4585 break;
4586 case XML_PARSER_ATTRIBUTE_VALUE:
4587 xmlGenericError(xmlGenericErrorContext,
4588 "HPP: internal error, state == ATTRIBUTE_VALUE\n");
4589 ctxt->instate = XML_PARSER_START_TAG;
4590 ctxt->checkIndex = 0;
4591 #ifdef DEBUG_PUSH
4592 xmlGenericError(xmlGenericErrorContext,
4593 "HPP: entering START_TAG\n");
4594 #endif
4595 break;
4596 case XML_PARSER_SYSTEM_LITERAL:
4597 xmlGenericError(xmlGenericErrorContext,
4598 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
4599 ctxt->instate = XML_PARSER_CONTENT;
4600 ctxt->checkIndex = 0;
4601 #ifdef DEBUG_PUSH
4602 xmlGenericError(xmlGenericErrorContext,
4603 "HPP: entering CONTENT\n");
4604 #endif
4605 break;
4606 case XML_PARSER_IGNORE:
4607 xmlGenericError(xmlGenericErrorContext,
4608 "HPP: internal error, state == XML_PARSER_IGNORE\n");
4609 ctxt->instate = XML_PARSER_CONTENT;
4610 ctxt->checkIndex = 0;
4611 #ifdef DEBUG_PUSH
4612 xmlGenericError(xmlGenericErrorContext,
4613 "HPP: entering CONTENT\n");
4614 #endif
4615 break;
4616 case XML_PARSER_PUBLIC_LITERAL:
4617 xmlGenericError(xmlGenericErrorContext,
4618 "HPP: internal error, state == XML_PARSER_LITERAL\n");
4619 ctxt->instate = XML_PARSER_CONTENT;
4620 ctxt->checkIndex = 0;
4621 #ifdef DEBUG_PUSH
4622 xmlGenericError(xmlGenericErrorContext,
4623 "HPP: entering CONTENT\n");
4624 #endif
4625 break;
4629 done:
4630 if ((avail == 0) && (terminate)) {
4631 htmlAutoCloseOnEnd(ctxt);
4632 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4634 * SAX: end of the document processing.
4636 ctxt->instate = XML_PARSER_EOF;
4637 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4638 ctxt->sax->endDocument(ctxt->userData);
4641 if ((ctxt->myDoc != NULL) &&
4642 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
4643 (ctxt->instate == XML_PARSER_EPILOG))) {
4644 xmlDtdPtr dtd;
4645 dtd = xmlGetIntSubset(ctxt->myDoc);
4646 if (dtd == NULL)
4647 ctxt->myDoc->intSubset =
4648 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
4649 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4650 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4652 #ifdef DEBUG_PUSH
4653 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
4654 #endif
4655 return(ret);
4659 * htmlParseChunk:
4660 * @ctxt: an XML parser context
4661 * @chunk: an char array
4662 * @size: the size in byte of the chunk
4663 * @terminate: last chunk indicator
4665 * Parse a Chunk of memory
4667 * Returns zero if no error, the xmlParserErrors otherwise.
4670 htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
4671 int terminate) {
4672 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4673 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
4674 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
4675 int cur = ctxt->input->cur - ctxt->input->base;
4677 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4678 ctxt->input->base = ctxt->input->buf->buffer->content + base;
4679 ctxt->input->cur = ctxt->input->base + cur;
4680 #ifdef DEBUG_PUSH
4681 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4682 #endif
4684 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
4685 htmlParseTryOrFinish(ctxt, terminate);
4686 } else if (ctxt->instate != XML_PARSER_EOF) {
4687 xmlParserInputBufferPush(ctxt->input->buf, 0, "");
4688 htmlParseTryOrFinish(ctxt, terminate);
4690 if (terminate) {
4691 if ((ctxt->instate != XML_PARSER_EOF) &&
4692 (ctxt->instate != XML_PARSER_EPILOG) &&
4693 (ctxt->instate != XML_PARSER_MISC)) {
4694 ctxt->errNo = XML_ERR_DOCUMENT_END;
4695 ctxt->wellFormed = 0;
4697 if (ctxt->instate != XML_PARSER_EOF) {
4698 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4699 ctxt->sax->endDocument(ctxt->userData);
4701 ctxt->instate = XML_PARSER_EOF;
4703 return((xmlParserErrors) ctxt->errNo);
/************************************************************************
 *									*
 *			User entry points				*
 *									*
 ************************************************************************/
4713 * htmlCreatePushParserCtxt :
4714 * @sax: a SAX handler
4715 * @user_data: The user data returned on SAX callbacks
4716 * @chunk: a pointer to an array of chars
4717 * @size: number of chars in the array
4718 * @filename: an optional file name or URI
4719 * @enc: an optional encoding
4721 * Create a parser context for using the HTML parser in push mode
4722 * To allow content encoding detection, @size should be >= 4
4723 * The value of @filename is used for fetching external entities
4724 * and error/warning reports.
4726 * Returns the new parser context or NULL
4728 htmlParserCtxtPtr
4729 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
4730 const char *chunk, int size, const char *filename,
4731 xmlCharEncoding enc) {
4732 htmlParserCtxtPtr ctxt;
4733 htmlParserInputPtr inputStream;
4734 xmlParserInputBufferPtr buf;
4736 xmlInitParser();
4738 buf = xmlAllocParserInputBuffer(enc);
4739 if (buf == NULL) return(NULL);
4741 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4742 if (ctxt == NULL) {
4743 xmlFree(buf);
4744 return(NULL);
4746 memset(ctxt, 0, sizeof(htmlParserCtxt));
4747 htmlInitParserCtxt(ctxt);
4748 if (sax != NULL) {
4749 if (ctxt->sax != &htmlDefaultSAXHandler)
4750 xmlFree(ctxt->sax);
4751 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
4752 if (ctxt->sax == NULL) {
4753 xmlFree(buf);
4754 xmlFree(ctxt);
4755 return(NULL);
4757 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
4758 if (user_data != NULL)
4759 ctxt->userData = user_data;
4761 if (filename == NULL) {
4762 ctxt->directory = NULL;
4763 } else {
4764 ctxt->directory = xmlParserGetDirectory(filename);
4767 inputStream = htmlNewInputStream(ctxt);
4768 if (inputStream == NULL) {
4769 xmlFreeParserCtxt(ctxt);
4770 return(NULL);
4773 if (filename == NULL)
4774 inputStream->filename = NULL;
4775 else
4776 inputStream->filename = xmlMemStrdup(filename);
4777 inputStream->buf = buf;
4778 inputStream->base = inputStream->buf->buffer->content;
4779 inputStream->cur = inputStream->buf->buffer->content;
4781 inputPush(ctxt, inputStream);
4783 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4784 (ctxt->input->buf != NULL)) {
4785 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4786 #ifdef DEBUG_PUSH
4787 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4788 #endif
4791 return(ctxt);
4795 * htmlSAXParseDoc :
4796 * @cur: a pointer to an array of xmlChar
4797 * @encoding: a free form C string describing the HTML document encoding, or NULL
4798 * @sax: the SAX handler block
4799 * @userData: if using SAX, this pointer will be provided on callbacks.
4801 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
4802 * to handle parse events. If sax is NULL, fallback to the default DOM
4803 * behavior and return a tree.
4805 * Returns the resulting document tree unless SAX is NULL or the document is
4806 * not well formed.
4809 htmlDocPtr
4810 htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
4811 htmlDocPtr ret;
4812 htmlParserCtxtPtr ctxt;
4814 xmlInitParser();
4816 if (cur == NULL) return(NULL);
4819 ctxt = htmlCreateDocParserCtxt(cur, encoding);
4820 if (ctxt == NULL) return(NULL);
4821 if (sax != NULL) {
4822 ctxt->sax = sax;
4823 ctxt->userData = userData;
4826 htmlParseDocument(ctxt);
4827 ret = ctxt->myDoc;
4828 if (sax != NULL) {
4829 ctxt->sax = NULL;
4830 ctxt->userData = NULL;
4832 htmlFreeParserCtxt(ctxt);
4834 return(ret);
4838 * htmlParseDoc :
4839 * @cur: a pointer to an array of xmlChar
4840 * @encoding: a free form C string describing the HTML document encoding, or NULL
4842 * parse an HTML in-memory document and build a tree.
4844 * Returns the resulting document tree
4847 htmlDocPtr
4848 htmlParseDoc(xmlChar *cur, const char *encoding) {
4849 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
4854 * htmlCreateFileParserCtxt :
4855 * @filename: the filename
4856 * @encoding: a free form C string describing the HTML document encoding, or NULL
4858 * Create a parser context for a file content.
4859 * Automatic support for ZLIB/Compress compressed document is provided
4860 * by default if found at compile-time.
4862 * Returns the new parser context or NULL
4864 htmlParserCtxtPtr
4865 htmlCreateFileParserCtxt(const char *filename, const char *encoding)
4867 htmlParserCtxtPtr ctxt;
4868 htmlParserInputPtr inputStream;
4869 xmlParserInputBufferPtr buf;
4870 /* htmlCharEncoding enc; */
4871 xmlChar *content, *content_line = (xmlChar *) "charset=";
4873 buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
4874 if (buf == NULL) return(NULL);
4876 ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4877 if (ctxt == NULL) {
4878 xmlGenericError(xmlGenericErrorContext, "malloc failed\n");
4879 return(NULL);
4881 memset(ctxt, 0, sizeof(htmlParserCtxt));
4882 htmlInitParserCtxt(ctxt);
4883 inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
4884 if (inputStream == NULL) {
4885 xmlGenericError(xmlGenericErrorContext, "malloc failed\n");
4886 xmlFree(ctxt);
4887 return(NULL);
4889 memset(inputStream, 0, sizeof(htmlParserInput));
4891 inputStream->filename = (char *)
4892 xmlNormalizeWindowsPath((xmlChar *)filename);
4893 inputStream->line = 1;
4894 inputStream->col = 1;
4895 inputStream->buf = buf;
4896 inputStream->directory = NULL;
4898 inputStream->base = inputStream->buf->buffer->content;
4899 inputStream->cur = inputStream->buf->buffer->content;
4900 inputStream->free = NULL;
4902 inputPush(ctxt, inputStream);
4904 /* set encoding */
4905 if (encoding) {
4906 content = xmlMalloc (xmlStrlen(content_line) + strlen(encoding) + 1);
4907 if (content) {
4908 strcpy ((char *)content, (char *)content_line);
4909 strcat ((char *)content, (char *)encoding);
4910 htmlCheckEncoding (ctxt, content);
4911 xmlFree (content);
4915 return(ctxt);
4919 * htmlSAXParseFile :
4920 * @filename: the filename
4921 * @encoding: a free form C string describing the HTML document encoding, or NULL
4922 * @sax: the SAX handler block
4923 * @userData: if using SAX, this pointer will be provided on callbacks.
4925 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4926 * compressed document is provided by default if found at compile-time.
4927 * It use the given SAX function block to handle the parsing callback.
4928 * If sax is NULL, fallback to the default DOM tree building routines.
4930 * Returns the resulting document tree unless SAX is NULL or the document is
4931 * not well formed.
4934 htmlDocPtr
4935 htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
4936 void *userData) {
4937 htmlDocPtr ret;
4938 htmlParserCtxtPtr ctxt;
4939 htmlSAXHandlerPtr oldsax = NULL;
4941 xmlInitParser();
4943 ctxt = htmlCreateFileParserCtxt(filename, encoding);
4944 if (ctxt == NULL) return(NULL);
4945 if (sax != NULL) {
4946 oldsax = ctxt->sax;
4947 ctxt->sax = sax;
4948 ctxt->userData = userData;
4951 htmlParseDocument(ctxt);
4953 ret = ctxt->myDoc;
4954 if (sax != NULL) {
4955 ctxt->sax = oldsax;
4956 ctxt->userData = NULL;
4958 htmlFreeParserCtxt(ctxt);
4960 return(ret);
4964 * htmlParseFile :
4965 * @filename: the filename
4966 * @encoding: a free form C string describing the HTML document encoding, or NULL
4968 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4969 * compressed document is provided by default if found at compile-time.
4971 * Returns the resulting document tree
4974 htmlDocPtr
4975 htmlParseFile(const char *filename, const char *encoding) {
4976 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
4980 * htmlHandleOmittedElem:
4981 * @val: int 0 or 1
4983 * Set and return the previous value for handling HTML omitted tags.
4985 * Returns the last value for 0 for no handling, 1 for auto insertion.
4989 htmlHandleOmittedElem(int val) {
4990 int old = htmlOmittedDefaultValue;
4992 htmlOmittedDefaultValue = val;
4993 return(old);
4996 #endif /* LIBXML_HTML_ENABLED */