src/external/3rd/library/libxml/HTMLparser.c

   1 /*
   2  * HTMLparser.c : an HTML 4.0 non-verifying parser
   3  *
   4  * See Copyright for the status of this software.
   5  *
   6  * daniel@veillard.com
   7  */
   8
   9 #define IN_LIBXML
  10 #include "libxml.h"
  11 #ifdef LIBXML_HTML_ENABLED
  12
  13 #include <string.h>
  14 #ifdef HAVE_CTYPE_H
  15 #include <ctype.h>
  16 #endif
  17 #ifdef HAVE_STDLIB_H
  18 #include <stdlib.h>
  19 #endif
  20 #ifdef HAVE_SYS_STAT_H
  21 #include <sys/stat.h>
  22 #endif
  23 #ifdef HAVE_FCNTL_H
  24 #include <fcntl.h>
  25 #endif
  26 #ifdef HAVE_UNISTD_H
  27 #include <unistd.h>
  28 #endif
  29 #ifdef HAVE_ZLIB_H
  30 #include <zlib.h>
  31 #endif
  32
  33 #include <libxml/xmlmemory.h>
  34 #include <libxml/tree.h>
  35 #include <libxml/parser.h>
  36 #include <libxml/parserInternals.h>
  37 #include <libxml/xmlerror.h>
  38 #include <libxml/HTMLparser.h>
  39 #include <libxml/HTMLtree.h>
  40 #include <libxml/entities.h>
  41 #include <libxml/encoding.h>
  42 #include <libxml/valid.h>
  43 #include <libxml/xmlIO.h>
  44 #include <libxml/globals.h>
  45
  46 #define HTML_MAX_NAMELEN 1000
  47 #define HTML_PARSER_BIG_BUFFER_SIZE 1000
  48 #define HTML_PARSER_BUFFER_SIZE 100
  49
  50 /* #define DEBUG */
  51 /* #define DEBUG_PUSH */
  52
     /* NOTE(review): presumably the default setting for handling omitted
      * (implied) tags; its readers are not visible in this part of the
      * file - confirm before relying on this description. */
  53 static int htmlOmittedDefaultValue = 1;
  54
     /* Forward declarations. */
  55 xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
  56                              xmlChar end, xmlChar  end2, xmlChar end3);
  57 static void htmlParseComment(htmlParserCtxtPtr ctxt);
  58
  59 /************************************************************************
  60  *                                                                      *
  61  *              Parser stacks related functions and macros              *
  62  *                                                                      *
  63  ************************************************************************/
  64
  65 /*
  66  * Generic function for accessing stacks in the Parser Context
  67  */
  68
  69 #define PUSH_AND_POP(scope, type, name)                                 \
  70 scope int html##name##Push(htmlParserCtxtPtr ctxt, type value) {        \
  71     if (ctxt->name##Nr >= ctxt->name##Max) {                            \
  72         ctxt->name##Max *= 2;                                           \
  73         ctxt->name##Tab = (type *) xmlRealloc(ctxt->name##Tab,          \
  74                      ctxt->name##Max * sizeof(ctxt->name##Tab[0]));     \
  75         if (ctxt->name##Tab == NULL) {                                  \
  76             xmlGenericError(xmlGenericErrorContext,                     \
  77                                 "realloc failed !\n");                  \
  78             return(0);                                                  \
  79         }                                                               \
  80     }                                                                   \
  81     ctxt->name##Tab[ctxt->name##Nr] = value;                            \
  82     ctxt->name = value;                                                 \
  83     return(ctxt->name##Nr++);                                           \
  84 }                                                                       \
  85 scope type html##name##Pop(htmlParserCtxtPtr ctxt) {                    \
  86     type ret;                                                           \
  87     if (ctxt->name##Nr <= 0) return(0);                                 \
  88     ctxt->name##Nr--;                                                   \
  89     if (ctxt->name##Nr < 0) return(0);                                  \
  90     if (ctxt->name##Nr > 0)                                             \
  91         ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1];               \
  92     else                                                                \
  93         ctxt->name = NULL;                                              \
  94     ret = ctxt->name##Tab[ctxt->name##Nr];                              \
  95     ctxt->name##Tab[ctxt->name##Nr] = 0;                                \
  96     return(ret);                                                        \
  97 }                                                                       \
  98
  99 /* PUSH_AND_POP(static, xmlNodePtr, node) */
 100 PUSH_AND_POP(static, xmlChar*, name)
 101
 102 /*
 103  * Macros for accessing the content. Those should be used only by the parser,
 104  * and not exported.
 105  *
 106  * Dirty macros, i.e. one needs to make assumptions about the context to use them
 107  *
 108  *   CUR_PTR return the current pointer to the xmlChar to be parsed.
 109  *   CUR     returns the current xmlChar value, i.e. a 8 bit value if compiled
 110  *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
 111  *           in UNICODE mode. This should be used internally by the parser
 112  *           only to compare to ASCII values otherwise it would break when
 113  *           running with UTF-8 encoding.
 114  *   NXT(n)  returns the n'th next xmlChar. Same as CUR it should be used only
 115  *           to compare on ASCII based substring.
 116  *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR
 117  *           it should be used only to compare on ASCII based substring.
 118  *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
 119  *           strings within the parser.
 120  *
 121  * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
 122  *
 123  *   CURRENT Returns the current char value, with the full decoding of
 124  *           UTF-8 if we are using this mode. It returns an int.
 125  *   NEXT    Skip to the next character, this does the proper decoding
 126  *           in UTF-8 mode. It also pop-up unfinished entities on the fly.
 127  *   COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
 128  */
 129
 130 #define UPPER (toupper(*ctxt->input->cur))
 131
 132 #define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val)
 133
 134 #define NXT(val) ctxt->input->cur[(val)]
 135
 136 #define UPP(val) (toupper(ctxt->input->cur[(val)]))
 137
 138 #define CUR_PTR ctxt->input->cur
 139
 140 #define SHRINK  xmlParserInputShrink(ctxt->input)
 141
 142 #define GROW  xmlParserInputGrow(ctxt->input, INPUT_CHUNK)
 143
 144 #define CURRENT ((int) (*ctxt->input->cur))
 145
 146 #define SKIP_BLANKS htmlSkipBlankChars(ctxt)
 147
 148 /* Inported from XML */
 149
 150 /* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
 151 #define CUR ((int) (*ctxt->input->cur))
 152 #define NEXT xmlNextChar(ctxt),ctxt->nbChars++
 153
 154 #define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
 155 #define NXT(val) ctxt->input->cur[(val)]
 156 #define CUR_PTR ctxt->input->cur
 157
 158
 159 #define NEXTL(l) do {                                                   \
 160     if (*(ctxt->input->cur) == '\n') {                                  \
 161         ctxt->input->line++; ctxt->input->col = 1;                      \
 162     } else ctxt->input->col++;                                          \
 163     ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++;            \
 164   } while (0)
 165
 166 /************
 167     \
 168     if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);     \
 169     if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 170  ************/
 171
 172 #define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
 173 #define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)
 174
 175 #define COPY_BUF(l,b,i,v)                                               \
 176     if (l == 1) b[i++] = (xmlChar) v;                                   \
 177     else i += xmlCopyChar(l,&b[i],v)
 178
 179 /**
 180  * htmlCurrentChar:
 181  * @ctxt:  the HTML parser context
 182  * @len:  pointer to the length of the char read
 183  *
 184  * The current char value, if using UTF-8 this may actually span multiple
 185  * bytes in the input buffer. Implement the end of line normalization:
 186  * 2.11 End-of-Line Handling
 187  * If the encoding is unspecified, in the case we find an ISO-Latin-1
 188  * char, then the encoding converter is plugged in automatically.
 189  *
 190  * Returns the current char value and its length
 191  */
 192
 193 static int
 194 htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
 195     if (ctxt->instate == XML_PARSER_EOF)
 196         return(0);
 197
 198     if (ctxt->token != 0) {
 199         *len = 0;
 200         return(ctxt->token);
 201     }
 202     if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
 203         /*
 204          * We are supposed to handle UTF8, check it's valid
 205          * From rfc2044: encoding of the Unicode values on UTF-8:
 206          *
 207          * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
 208          * 0000 0000-0000 007F   0xxxxxxx
 209          * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
 210          * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
 211          *
 212          * Check for the 0x110000 limit too
 213          */
 214         const unsigned char *cur = ctxt->input->cur;
 215         unsigned char c;
 216         unsigned int val;
 217
 218         c = *cur;
 219         if (c & 0x80) {
 220             if (cur[1] == 0)
 221                 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
 222             if ((cur[1] & 0xc0) != 0x80)
 223                 goto encoding_error;
 224             if ((c & 0xe0) == 0xe0) {
 225
 226                 if (cur[2] == 0)
 227                     xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
 228                 if ((cur[2] & 0xc0) != 0x80)
 229                     goto encoding_error;
 230                 if ((c & 0xf0) == 0xf0) {
 231                     if (cur[3] == 0)
 232                         xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
 233                     if (((c & 0xf8) != 0xf0) ||
 234                         ((cur[3] & 0xc0) != 0x80))
 235                         goto encoding_error;
 236                     /* 4-byte code */
 237                     *len = 4;
 238                     val = (cur[0] & 0x7) << 18;
 239                     val |= (cur[1] & 0x3f) << 12;
 240                     val |= (cur[2] & 0x3f) << 6;
 241                     val |= cur[3] & 0x3f;
 242                 } else {
 243                   /* 3-byte code */
 244                     *len = 3;
 245                     val = (cur[0] & 0xf) << 12;
 246                     val |= (cur[1] & 0x3f) << 6;
 247                     val |= cur[2] & 0x3f;
 248                 }
 249             } else {
 250               /* 2-byte code */
 251                 *len = 2;
 252                 val = (cur[0] & 0x1f) << 6;
 253                 val |= cur[1] & 0x3f;
 254             }
 255             if (!IS_CHAR(val)) {
 256                 ctxt->errNo = XML_ERR_INVALID_ENCODING;
 257                 if ((ctxt->sax != NULL) &&
 258                     (ctxt->sax->error != NULL))
 259                     ctxt->sax->error(ctxt->userData,
 260                                      "Char 0x%X out of allowed range\n", val);
 261                 ctxt->wellFormed = 0;
 262                 if (ctxt->recovery == 0) ctxt->disableSAX = 1;
 263             }
 264             return(val);
 265         } else {
 266             /* 1-byte code */
 267             *len = 1;
 268             return((int) *ctxt->input->cur);
 269         }
 270     }
 271     /*
 272      * Assume it's a fixed length encoding (1) with
 273      * a compatible encoding for the ASCII set, since
 274      * XML constructs only use < 128 chars
 275      */
 276     *len = 1;
 277     if ((int) *ctxt->input->cur < 0x80)
 278         return((int) *ctxt->input->cur);
 279
 280     /*
 281      * Humm this is bad, do an automatic flow conversion
 282      */
 283     xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
 284     ctxt->charset = XML_CHAR_ENCODING_UTF8;
 285     return(xmlCurrentChar(ctxt, len));
 286
 287 encoding_error:
 288     /*
 289      * If we detect an UTF8 error that probably mean that the
 290      * input encoding didn't get properly advertized in the
 291      * declaration header. Report the error and switch the encoding
 292      * to ISO-Latin-1 (if you don't like this policy, just declare the
 293      * encoding !)
 294      */
 295     ctxt->errNo = XML_ERR_INVALID_ENCODING;
 296     if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
 297         ctxt->sax->error(ctxt->userData,
 298                          "Input is not proper UTF-8, indicate encoding !\n");
 299         ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
 300                         ctxt->input->cur[0], ctxt->input->cur[1],
 301                         ctxt->input->cur[2], ctxt->input->cur[3]);
 302     }
 303
 304     ctxt->charset = XML_CHAR_ENCODING_8859_1;
 305     *len = 1;
 306     return((int) *ctxt->input->cur);
 307 }
 308
 309 /**
 310  * htmlSkipBlankChars:
 311  * @ctxt:  the HTML parser context
 312  *
 313  * skip all blanks character found at that point in the input streams.
 314  *
 315  * Returns the number of space chars skipped
 316  */
 317
 318 static int
 319 htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
 320     int res = 0;
 321
 322     while (IS_BLANK(*(ctxt->input->cur))) {
 323         if ((*ctxt->input->cur == 0) &&
 324             (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
 325                 xmlPopInput(ctxt);
 326         } else {
 327             if (*(ctxt->input->cur) == '\n') {
 328                 ctxt->input->line++; ctxt->input->col = 1;
 329             } else ctxt->input->col++;
 330             ctxt->input->cur++;
 331             ctxt->nbChars++;
 332             if (*ctxt->input->cur == 0)
 333                 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
 334         }
 335         res++;
 336     }
 337     return(res);
 338 }
 339
 340
 341
 342 /************************************************************************
 343  *                                                                      *
 344  *              The list of HTML elements and their properties          *
 345  *                                                                      *
 346  ************************************************************************/
 347
 348 /*
      *  One descriptor per HTML 4.0 element. htmlTagLookup() scans this
      *  table linearly and matches the element name with xmlStrcasecmp().
      *  Each row lists its fields in the column order given on the last
      *  line of this comment.
      *
 349  *  Start Tag: 1 means the start tag can be omitted
 350  *  End Tag:   1 means the end tag can be omitted
 351  *             2 means it's forbidden (empty elements)
 352  *             3 means the tag is stylistic and should be closed easily
 353  *  Depr:      this element is deprecated
 354  *  DTD:       1 means that this element is valid only in the Loose DTD
 355  *             2 means that this element is valid only in the Frameset DTD
 356  *
 357  * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
 358  */
 359 static const htmlElemDesc
 360 html40ElementTable[] = {
 361 { "a",          0, 0, 0, 0, 0, 0, 1, "anchor " },
 362 { "abbr",       0, 0, 0, 0, 0, 0, 1, "abbreviated form" },
 363 { "acronym",    0, 0, 0, 0, 0, 0, 1, "" },
 364 { "address",    0, 0, 0, 0, 0, 0, 0, "information on author " },
 365 { "applet",     0, 0, 0, 0, 1, 1, 2, "java applet " },
 366 { "area",       0, 2, 2, 1, 0, 0, 0, "client-side image map area " },
 367 { "b",          0, 3, 0, 0, 0, 0, 1, "bold text style" },
 368 { "base",       0, 2, 2, 1, 0, 0, 0, "document base uri " },
 369 { "basefont",   0, 2, 2, 1, 1, 1, 1, "base font size " },
 370 { "bdo",        0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride " },
 371 { "big",        0, 3, 0, 0, 0, 0, 1, "large text style" },
 372 { "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation " },
 373 { "body",       1, 1, 0, 0, 0, 0, 0, "document body " },
 374 { "br",         0, 2, 2, 1, 0, 0, 1, "forced line break " },
 375 { "button",     0, 0, 0, 0, 0, 0, 2, "push button " },
 376 { "caption",    0, 0, 0, 0, 0, 0, 0, "table caption " },
 377 { "center",     0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center " },
 378 { "cite",       0, 0, 0, 0, 0, 0, 1, "citation" },
 379 { "code",       0, 0, 0, 0, 0, 0, 1, "computer code fragment" },
 380 { "col",        0, 2, 2, 1, 0, 0, 0, "table column " },
 381 { "colgroup",   0, 1, 0, 0, 0, 0, 0, "table column group " },
 382 { "dd",         0, 1, 0, 0, 0, 0, 0, "definition description " },
 383 { "del",        0, 0, 0, 0, 0, 0, 2, "deleted text " },
 384 { "dfn",        0, 0, 0, 0, 0, 0, 1, "instance definition" },
 385 { "dir",        0, 0, 0, 0, 1, 1, 0, "directory list" },
 386 { "div",        0, 0, 0, 0, 0, 0, 0, "generic language/style container"},
 387 { "dl",         0, 0, 0, 0, 0, 0, 0, "definition list " },
 388 { "dt",         0, 1, 0, 0, 0, 0, 0, "definition term " },
 389 { "em",         0, 3, 0, 0, 0, 0, 1, "emphasis" },
 390 { "fieldset",   0, 0, 0, 0, 0, 0, 0, "form control group " },
 391 { "font",       0, 3, 0, 0, 1, 1, 1, "local change to font " },
 392 { "form",       0, 0, 0, 0, 0, 0, 0, "interactive form " },
 393 { "frame",      0, 2, 2, 1, 0, 2, 0, "subwindow " },
 394 { "frameset",   0, 0, 0, 0, 0, 2, 0, "window subdivision" },
 395 { "h1",         0, 0, 0, 0, 0, 0, 0, "heading " },
 396 { "h2",         0, 0, 0, 0, 0, 0, 0, "heading " },
 397 { "h3",         0, 0, 0, 0, 0, 0, 0, "heading " },
 398 { "h4",         0, 0, 0, 0, 0, 0, 0, "heading " },
 399 { "h5",         0, 0, 0, 0, 0, 0, 0, "heading " },
 400 { "h6",         0, 0, 0, 0, 0, 0, 0, "heading " },
 401 { "head",       1, 1, 0, 0, 0, 0, 0, "document head " },
 402 { "hr",         0, 2, 2, 1, 0, 0, 0, "horizontal rule " },
 403 { "html",       1, 1, 0, 0, 0, 0, 0, "document root element " },
 404 { "i",          0, 3, 0, 0, 0, 0, 1, "italic text style" },
 405 { "iframe",     0, 0, 0, 0, 0, 1, 2, "inline subwindow " },
 406 { "img",        0, 2, 2, 1, 0, 0, 1, "embedded image " },
 407 { "input",      0, 2, 2, 1, 0, 0, 1, "form control " },
 408 { "ins",        0, 0, 0, 0, 0, 0, 2, "inserted text" },
 409 { "isindex",    0, 2, 2, 1, 1, 1, 0, "single line prompt " },
 410 { "kbd",        0, 0, 0, 0, 0, 0, 1, "text to be entered by the user" },
 411 { "label",      0, 0, 0, 0, 0, 0, 1, "form field label text " },
 412 { "legend",     0, 0, 0, 0, 0, 0, 0, "fieldset legend " },
 413 { "li",         0, 1, 1, 0, 0, 0, 0, "list item " },
 414 { "link",       0, 2, 2, 1, 0, 0, 0, "a media-independent link " },
 415 { "map",        0, 0, 0, 0, 0, 0, 2, "client-side image map " },
 416 { "menu",       0, 0, 0, 0, 1, 1, 0, "menu list " },
 417 { "meta",       0, 2, 2, 1, 0, 0, 0, "generic metainformation " },
 418 { "noframes",   0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering " },
 419 { "noscript",   0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },
 420 { "object",     0, 0, 0, 0, 0, 0, 2, "generic embedded object " },
 421 { "ol",         0, 0, 0, 0, 0, 0, 0, "ordered list " },
 422 { "optgroup",   0, 0, 0, 0, 0, 0, 0, "option group " },
 423 { "option",     0, 1, 0, 0, 0, 0, 0, "selectable choice " },
 424 { "p",          0, 1, 0, 0, 0, 0, 0, "paragraph " },
 425 { "param",      0, 2, 2, 1, 0, 0, 0, "named property value " },
 426 { "pre",        0, 0, 0, 0, 0, 0, 0, "preformatted text " },
 427 { "q",          0, 0, 0, 0, 0, 0, 1, "short inline quotation " },
 428 { "s",          0, 3, 0, 0, 1, 1, 1, "strike-through text style" },
 429 { "samp",       0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc." },
 430 { "script",     0, 0, 0, 0, 0, 0, 2, "script statements " },
 431 { "select",     0, 0, 0, 0, 0, 0, 1, "option selector " },
 432 { "small",      0, 3, 0, 0, 0, 0, 1, "small text style" },
 433 { "span",       0, 0, 0, 0, 0, 0, 1, "generic language/style container " },
 434 { "strike",     0, 3, 0, 0, 1, 1, 1, "strike-through text" },
 435 { "strong",     0, 3, 0, 0, 0, 0, 1, "strong emphasis" },
 436 { "style",      0, 0, 0, 0, 0, 0, 0, "style info " },
 437 { "sub",        0, 3, 0, 0, 0, 0, 1, "subscript" },
 438 { "sup",        0, 3, 0, 0, 0, 0, 1, "superscript " },
 439 { "table",      0, 0, 0, 0, 0, 0, 0, "&#160;" },
 440 { "tbody",      1, 0, 0, 0, 0, 0, 0, "table body " },
 441 { "td",         0, 0, 0, 0, 0, 0, 0, "table data cell" },
 442 { "textarea",   0, 0, 0, 0, 0, 0, 1, "multi-line text field " },
 443 { "tfoot",      0, 1, 0, 0, 0, 0, 0, "table footer " },
 444 { "th",         0, 1, 0, 0, 0, 0, 0, "table header cell" },
 445 { "thead",      0, 1, 0, 0, 0, 0, 0, "table header " },
 446 { "title",      0, 0, 0, 0, 0, 0, 0, "document title " },
 447 { "tr",         0, 0, 0, 0, 0, 0, 0, "table row " },
 448 { "tt",         0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style" },
 449 { "u",          0, 3, 0, 0, 1, 1, 1, "underlined text style" },
 450 { "ul",         0, 0, 0, 0, 0, 0, 0, "unordered list " },
 451 { "var",        0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument" },
 452 };
 453
 454 /*
 455  * start tags that imply the end of current element
 456  */
 457 static const char *htmlStartClose[] = {
 458 "form",         "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
 459                 "dl", "ul", "ol", "menu", "dir", "address", "pre",
 460                 "listing", "xmp", "head", NULL,
 461 "head",         "p", NULL,
 462 "title",        "p", NULL,
 463 "body",         "head", "style", "link", "title", "p", NULL,
 464 "li",           "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
 465                 "pre", "listing", "xmp", "head", "li", NULL,
 466 "hr",           "p", "head", NULL,
 467 "h1",           "p", "head", NULL,
 468 "h2",           "p", "head", NULL,
 469 "h3",           "p", "head", NULL,
 470 "h4",           "p", "head", NULL,
 471 "h5",           "p", "head", NULL,
 472 "h6",           "p", "head", NULL,
 473 "dir",          "p", "head", NULL,
 474 "address",      "p", "head", "ul", NULL,
 475 "pre",          "p", "head", "ul", NULL,
 476 "listing",      "p", "head", NULL,
 477 "xmp",          "p", "head", NULL,
 478 "blockquote",   "p", "head", NULL,
 479 "dl",           "p", "dt", "menu", "dir", "address", "pre", "listing",
 480                 "xmp", "head", NULL,
 481 "dt",           "p", "menu", "dir", "address", "pre", "listing", "xmp",
 482                 "head", "dd", NULL,
 483 "dd",           "p", "menu", "dir", "address", "pre", "listing", "xmp",
 484                 "head", "dt", NULL,
 485 "ul",           "p", "head", "ol", "menu", "dir", "address", "pre",
 486                 "listing", "xmp", NULL,
 487 "ol",           "p", "head", "ul", NULL,
 488 "menu",         "p", "head", "ul", NULL,
 489 "p",            "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
 490 "div",          "p", "head", NULL,
 491 "noscript",     "p", "head", NULL,
 492 "center",       "font", "b", "i", "p", "head", NULL,
 493 "a",            "a", NULL,
 494 "caption",      "p", NULL,
 495 "colgroup",     "caption", "colgroup", "col", "p", NULL,
 496 "col",          "caption", "col", "p", NULL,
 497 "table",        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
 498                 "listing", "xmp", "a", NULL,
 499 "th",           "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
 500 "td",           "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
 501 "tr",           "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
 502 "thead",        "caption", "col", "colgroup", NULL,
 503 "tfoot",        "th", "td", "tr", "caption", "col", "colgroup", "thead",
 504                 "tbody", "p", NULL,
 505 "tbody",        "th", "td", "tr", "caption", "col", "colgroup", "thead",
 506                 "tfoot", "tbody", "p", NULL,
 507 "optgroup",     "option", NULL,
 508 "option",       "option", NULL,
 509 "fieldset",     "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
 510                 "pre", "listing", "xmp", "a", NULL,
 511 NULL
 512 };
 513
 514 /*
 515  * The list of HTML elements which are supposed not to have
 516  * CDATA content and where a p element will be implied
 517  *
 518  * TODO: extend that list by reading the HTML SGML DTD on
 519  *       implied paragraph
 520  */
 521 static const char *htmlNoContentElements[] = {
 522     "html",
 523     "head",
 524     "body",
 525     NULL
 526 };
 527
 528 /*
 529  * The list of HTML attributes which are of content %Script;
 530  * NOTE: when adding ones, check htmlIsScriptAttribute() since
 531  *       it assumes the name starts with 'on'
 532  */
 533 static const char *htmlScriptAttributes[] = {
 534     "onclick",
 535     "ondblclick",
 536     "onmousedown",
 537     "onmouseup",
 538     "onmouseover",
 539     "onmousemove",
 540     "onmouseout",
 541     "onkeypress",
 542     "onkeydown",
 543     "onkeyup",
 544     "onload",
 545     "onunload",
 546     "onfocus",
 547     "onblur",
 548     "onsubmit",
 549     "onrest",
 550     "onchange",
 551     "onselect"
 552 };
 553
 554 /*
 555  * This table is used by the htmlparser to know what to do with
 556  * broken html pages. By assigning different priorities to different
 557  * elements the parser can decide how to handle extra endtags.
 558  * Endtags are only allowed to close elements with lower or equal
 559  * priority.
 560  */
 561
 562 typedef struct {
 563     const char *name;
 564     int priority;
 565 } elementPriority;
 566
 567 static const elementPriority htmlEndPriority[] = {
 568     {"div",   150},
 569     {"td",    160},
 570     {"th",    160},
 571     {"tr",    170},
 572     {"thead", 180},
 573     {"tbody", 180},
 574     {"tfoot", 180},
 575     {"table", 190},
 576     {"head",  200},
 577     {"body",  200},
 578     {"html",  220},
 579     {NULL,    100} /* Default priority */
 580 };
 581
 582 static const char** htmlStartCloseIndex[100];
 583 static int htmlStartCloseIndexinitialized = 0;
 584
 585 /************************************************************************
 586  *                                                                      *
 587  *              functions to handle HTML specific data                  *
 588  *                                                                      *
 589  ************************************************************************/
 590
 591 /**
 592  * htmlInitAutoClose:
 593  *
 594  * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
 595  * This is not reentrant. Call xmlInitParser() once before processing in
 596  * case of use in multithreaded programs.
 597  */
 598 void
 599 htmlInitAutoClose(void) {
 600     int indx, i = 0;
 601
 602     if (htmlStartCloseIndexinitialized) return;
 603
 604     for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
 605     indx = 0;
 606     while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
 607         htmlStartCloseIndex[indx++] = &htmlStartClose[i];
 608         while (htmlStartClose[i] != NULL) i++;
 609         i++;
 610     }
 611     htmlStartCloseIndexinitialized = 1;
 612 }
 613
 614 /**
 615  * htmlTagLookup:
 616  * @tag:  The tag name in lowercase
 617  *
 618  * Lookup the HTML tag in the ElementTable
 619  *
 620  * Returns the related htmlElemDescPtr or NULL if not found.
 621  */
 622 const htmlElemDesc *
 623 htmlTagLookup(const xmlChar *tag) {
 624     unsigned int i;
 625
 626     for (i = 0; i < (sizeof(html40ElementTable) /
 627                      sizeof(html40ElementTable[0]));i++) {
 628         if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
 629             return((const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) (const htmlElemDescPtr) &html40ElementTable[i]);
 630     }
 631     return(NULL);
 632 }
 633
 634 /**
 635  * htmlGetEndPriority:
 636  * @name: The name of the element to look up the priority for.
 637  *
 638  * Return value: The "endtag" priority.
 639  **/
 640 static int
 641 htmlGetEndPriority (const xmlChar *name) {
 642         int i = 0;
 643
 644         while ((htmlEndPriority[i].name != NULL) &&
 645                (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
 646             i++;
 647
 648         return(htmlEndPriority[i].priority);
 649 }
 650
 651 /**
 652  * htmlCheckAutoClose:
 653  * @newtag:  The new tag name
 654  * @oldtag:  The old tag name
 655  *
 656  * Checks whether the new tag is one of the registered valid tags for
 657  * closing old.
 658  * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
 659  *
 660  * Returns 0 if no, 1 if yes.
 661  */
 662 static int
 663 htmlCheckAutoClose(const xmlChar *newtag, const xmlChar *oldtag) {
 664     int i, indx;
 665     const char **closed = NULL;
 666
 667     if (htmlStartCloseIndexinitialized == 0) htmlInitAutoClose();
 668
 669     /* inefficient, but not a big deal */
 670     for (indx = 0; indx < 100;indx++) {
 671         closed = htmlStartCloseIndex[indx];
 672         if (closed == NULL) return(0);
 673         if (xmlStrEqual(BAD_CAST *closed, newtag)) break;
 674     }
 675
 676     i = closed - htmlStartClose;
 677     i++;
 678     while (htmlStartClose[i] != NULL) {
 679         if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
 680             return(1);
 681         }
 682         i++;
 683     }
 684     return(0);
 685 }
 686
 687 /**
 688  * htmlAutoCloseOnClose:
 689  * @ctxt:  an HTML parser context
 690  * @newtag:  The new tag name
 691  * @force:  force the tag closure
 692  *
 693  * The HTML DTD allows an ending tag to implicitly close other tags.
 694  */
 695 static void
 696 htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
 697     const htmlElemDesc * info;
 698     xmlChar *oldname;
 699     int i, priority;
 700
 701 #ifdef DEBUG
 702     xmlGenericError(xmlGenericErrorContext,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr);
 703     for (i = 0;i < ctxt->nameNr;i++)
 704         xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
 705 #endif
 706
 707     priority = htmlGetEndPriority (newtag);
 708
 709     for (i = (ctxt->nameNr - 1);i >= 0;i--) {
 710
 711         if (xmlStrEqual(newtag, ctxt->nameTab[i])) break;
 712         /*
 713          * A missplaced endtag can only close elements with lower
 714          * or equal priority, so if we find an element with higher
 715          * priority before we find an element with
 716          * matching name, we just ignore this endtag
 717          */
 718         if (htmlGetEndPriority (ctxt->nameTab[i]) > priority) return;
 719     }
 720     if (i < 0) return;
 721
 722     while (!xmlStrEqual(newtag, ctxt->name)) {
 723         info = htmlTagLookup(ctxt->name);
 724         if ((info == NULL) || (info->endTag == 1)) {
 725 #ifdef DEBUG
 726             xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
 727 #endif
 728         } else if (info->endTag == 3) {
 729 #ifdef DEBUG
 730             xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", newtag, ctxt->name);
 731
 732 #endif
 733             if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
 734                 ctxt->sax->error(ctxt->userData,
 735                  "Opening and ending tag mismatch: %s and %s\n",
 736                                  newtag, ctxt->name);
 737             ctxt->wellFormed = 0;
 738         }
 739         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
 740             ctxt->sax->endElement(ctxt->userData, ctxt->name);
 741         oldname = htmlnamePop(ctxt);
 742         if (oldname != NULL) {
 743 #ifdef DEBUG
 744             xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: popped %s\n", oldname);
 745 #endif
 746             xmlFree(oldname);
 747         }
 748     }
 749 }
 750
 751 /**
 752  * htmlAutoCloseOnEnd:
 753  * @ctxt:  an HTML parser context
 754  *
 755  * Close all remaining tags at the end of the stream
 756  */
 757 static void
 758 htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt) {
 759     xmlChar *oldname;
 760     int i;
 761
 762     if (ctxt->nameNr == 0)
 763         return;
 764 #ifdef DEBUG
 765     xmlGenericError(xmlGenericErrorContext,"Close of stack: %d elements\n", ctxt->nameNr);
 766 #endif
 767
 768     for (i = (ctxt->nameNr - 1);i >= 0;i--) {
 769 #ifdef DEBUG
 770         xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
 771 #endif
 772         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
 773             ctxt->sax->endElement(ctxt->userData, ctxt->name);
 774         oldname = htmlnamePop(ctxt);
 775         if (oldname != NULL) {
 776 #ifdef DEBUG
 777             xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnEnd: popped %s\n", oldname);
 778 #endif
 779             xmlFree(oldname);
 780         }
 781     }
 782 }
 783
 784 /**
 785  * htmlAutoClose:
 786  * @ctxt:  an HTML parser context
 787  * @newtag:  The new tag name or NULL
 788  *
 789  * The HTML DTD allows a tag to implicitly close other tags.
 790  * The list is kept in htmlStartClose array. This function is
 791  * called when a new tag has been detected and generates the
 792  * appropriates closes if possible/needed.
 793  * If newtag is NULL this mean we are at the end of the resource
 794  * and we should check
 795  */
 796 static void
 797 htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
 798     xmlChar *oldname;
 799     while ((newtag != NULL) && (ctxt->name != NULL) &&
 800            (htmlCheckAutoClose(newtag, ctxt->name))) {
 801 #ifdef DEBUG
 802         xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: %s closes %s\n", newtag, ctxt->name);
 803 #endif
 804         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
 805             ctxt->sax->endElement(ctxt->userData, ctxt->name);
 806         oldname = htmlnamePop(ctxt);
 807         if (oldname != NULL) {
 808 #ifdef DEBUG
 809             xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
 810 #endif
 811             xmlFree(oldname);
 812         }
 813     }
 814     if (newtag == NULL) {
 815         htmlAutoCloseOnEnd(ctxt);
 816         return;
 817     }
 818     while ((newtag == NULL) && (ctxt->name != NULL) &&
 819            ((xmlStrEqual(ctxt->name, BAD_CAST"head")) ||
 820             (xmlStrEqual(ctxt->name, BAD_CAST"body")) ||
 821             (xmlStrEqual(ctxt->name, BAD_CAST"html")))) {
 822 #ifdef DEBUG
 823         xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: EOF closes %s\n", ctxt->name);
 824 #endif
 825         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
 826             ctxt->sax->endElement(ctxt->userData, ctxt->name);
 827         oldname = htmlnamePop(ctxt);
 828         if (oldname != NULL) {
 829 #ifdef DEBUG
 830             xmlGenericError(xmlGenericErrorContext,"htmlAutoClose: popped %s\n", oldname);
 831 #endif
 832             xmlFree(oldname);
 833         }
 834    }
 835
 836 }
 837
 838 /**
 839  * htmlAutoCloseTag:
 840  * @doc:  the HTML document
 841  * @name:  The tag name
 842  * @elem:  the HTML element
 843  *
 844  * The HTML DTD allows a tag to implicitly close other tags.
 845  * The list is kept in htmlStartClose array. This function checks
 846  * if the element or one of it's children would autoclose the
 847  * given tag.
 848  *
 849  * Returns 1 if autoclose, 0 otherwise
 850  */
 851 int
 852 htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
 853     htmlNodePtr child;
 854
 855     if (elem == NULL) return(1);
 856     if (xmlStrEqual(name, elem->name)) return(0);
 857     if (htmlCheckAutoClose(elem->name, name)) return(1);
 858     child = elem->children;
 859     while (child != NULL) {
 860         if (htmlAutoCloseTag(doc, name, child)) return(1);
 861         child = child->next;
 862     }
 863     return(0);
 864 }
 865
 866 /**
 867  * htmlIsAutoClosed:
 868  * @doc:  the HTML document
 869  * @elem:  the HTML element
 870  *
 871  * The HTML DTD allows a tag to implicitly close other tags.
 872  * The list is kept in htmlStartClose array. This function checks
 873  * if a tag is autoclosed by one of it's child
 874  *
 875  * Returns 1 if autoclosed, 0 otherwise
 876  */
 877 int
 878 htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
 879     htmlNodePtr child;
 880
 881     if (elem == NULL) return(1);
 882     child = elem->children;
 883     while (child != NULL) {
 884         if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
 885         child = child->next;
 886     }
 887     return(0);
 888 }
 889
 890 /**
 891  * htmlCheckImplied:
 892  * @ctxt:  an HTML parser context
 893  * @newtag:  The new tag name
 894  *
 895  * The HTML DTD allows a tag to exists only implicitly
 896  * called when a new tag has been detected and generates the
 897  * appropriates implicit tags if missing
 898  */
 899 static void
 900 htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
 901     if (!htmlOmittedDefaultValue)
 902         return;
 903     if (xmlStrEqual(newtag, BAD_CAST"html"))
 904         return;
 905     if (ctxt->nameNr <= 0) {
 906 #ifdef DEBUG
 907         xmlGenericError(xmlGenericErrorContext,"Implied element html: pushed html\n");
 908 #endif
 909         htmlnamePush(ctxt, xmlStrdup(BAD_CAST"html"));
 910         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
 911             ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
 912     }
 913     if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
 914         return;
 915     if ((ctxt->nameNr <= 1) &&
 916         ((xmlStrEqual(newtag, BAD_CAST"script")) ||
 917          (xmlStrEqual(newtag, BAD_CAST"style")) ||
 918          (xmlStrEqual(newtag, BAD_CAST"meta")) ||
 919          (xmlStrEqual(newtag, BAD_CAST"link")) ||
 920          (xmlStrEqual(newtag, BAD_CAST"title")) ||
 921          (xmlStrEqual(newtag, BAD_CAST"base")))) {
 922             /*
 923              * dropped OBJECT ... i you put it first BODY will be
 924              * assumed !
 925              */
 926 #ifdef DEBUG
 927             xmlGenericError(xmlGenericErrorContext,"Implied element head: pushed head\n");
 928 #endif
 929             htmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
 930             if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
 931                 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
 932     } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
 933                (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
 934                (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
 935         int i;
 936         for (i = 0;i < ctxt->nameNr;i++) {
 937             if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
 938                 return;
 939             }
 940             if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
 941                 return;
 942             }
 943         }
 944
 945 #ifdef DEBUG
 946         xmlGenericError(xmlGenericErrorContext,"Implied element body: pushed body\n");
 947 #endif
 948         htmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
 949         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
 950             ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
 951     }
 952 }
 953
 954 /**
 955  * htmlCheckParagraph
 956  * @ctxt:  an HTML parser context
 957  *
 958  * Check whether a p element need to be implied before inserting
 959  * characters in the current element.
 960  *
 961  * Returns 1 if a paragraph has been inserted, 0 if not and -1
 962  *         in case of error.
 963  */
 964
 965 static int
 966 htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
 967     const xmlChar *tag;
 968     int i;
 969
 970     if (ctxt == NULL)
 971         return(-1);
 972     tag = ctxt->name;
 973     if (tag == NULL) {
 974         htmlAutoClose(ctxt, BAD_CAST"p");
 975         htmlCheckImplied(ctxt, BAD_CAST"p");
 976         htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
 977         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
 978             ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
 979         return(1);
 980     }
 981     if (!htmlOmittedDefaultValue)
 982         return(0);
 983     for (i = 0; htmlNoContentElements[i] != NULL; i++) {
 984         if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
 985 #ifdef DEBUG
 986             xmlGenericError(xmlGenericErrorContext,"Implied element paragraph\n");
 987 #endif
 988             htmlAutoClose(ctxt, BAD_CAST"p");
 989             htmlCheckImplied(ctxt, BAD_CAST"p");
 990             htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
 991             if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
 992                 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
 993             return(1);
 994         }
 995     }
 996     return(0);
 997 }
 998
 999 /**
1000  * htmlIsScriptAttribute:
1001  * @name:  an attribute name
1002  *
1003  * Check if an attribute is of content type Script
1004  *
1005  * Returns 1 is the attribute is a script 0 otherwise
1006  */
1007 int
1008 htmlIsScriptAttribute(const xmlChar *name) {
1009     unsigned int i;
1010
1011     if (name == NULL)
1012         return(0);
1013     /*
1014      * all script attributes start with 'on'
1015      */
1016     if ((name[0] != 'o') || (name[1] != 'n'))
1017         return(0);
1018     for (i = 0;
1019          i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1020          i++) {
1021         if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1022             return(1);
1023     }
1024     return(0);
1025 }
1026
1027 /************************************************************************
1028  *                                                                      *
1029  *              The list of HTML predefined entities                    *
1030  *                                                                      *
1031  ************************************************************************/
1032
1033
/*
 * Table of the predefined HTML entities. Each entry holds, in order,
 * the numeric character value, the entity name (without '&' and ';')
 * and a human-readable description string.
 *
 * The entries must stay sorted by ascending value:
 * htmlEntityValueLookup() stops scanning at the first entry whose
 * value is larger than the one it is looking for.
 */
static const htmlEntityDesc  html40EntitiesTable[] = {
/*
 * the 4 absolute ones, plus apostrophe.
 */
{ 34,   "quot", "quotation mark = APL quote, U+0022 ISOnum" },
{ 38,   "amp",  "ampersand, U+0026 ISOnum" },
{ 39,   "apos", "single quote" },
{ 60,   "lt",   "less-than sign, U+003C ISOnum" },
{ 62,   "gt",   "greater-than sign, U+003E ISOnum" },

/*
 * A bunch still in the 128-255 range.
 * Replacing them really depends on the charset used.
 */
{ 160,  "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
{ 161,  "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
{ 162,  "cent", "cent sign, U+00A2 ISOnum" },
{ 163,  "pound","pound sign, U+00A3 ISOnum" },
{ 164,  "curren","currency sign, U+00A4 ISOnum" },
{ 165,  "yen",  "yen sign = yuan sign, U+00A5 ISOnum" },
{ 166,  "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
{ 167,  "sect", "section sign, U+00A7 ISOnum" },
{ 168,  "uml",  "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
{ 169,  "copy", "copyright sign, U+00A9 ISOnum" },
{ 170,  "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
{ 171,  "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
{ 172,  "not",  "not sign, U+00AC ISOnum" },
{ 173,  "shy",  "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
{ 174,  "reg",  "registered sign = registered trade mark sign, U+00AE ISOnum" },
{ 175,  "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
{ 176,  "deg",  "degree sign, U+00B0 ISOnum" },
{ 177,  "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
{ 178,  "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
{ 179,  "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
{ 180,  "acute","acute accent = spacing acute, U+00B4 ISOdia" },
{ 181,  "micro","micro sign, U+00B5 ISOnum" },
{ 182,  "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
{ 183,  "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
{ 184,  "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
{ 185,  "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
{ 186,  "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
{ 187,  "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
{ 188,  "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
{ 189,  "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
{ 190,  "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
{ 191,  "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
{ 192,  "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
{ 193,  "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
{ 194,  "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
{ 195,  "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
{ 196,  "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
{ 197,  "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
{ 198,  "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
{ 199,  "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
{ 200,  "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
{ 201,  "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
{ 202,  "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
{ 203,  "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
{ 204,  "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
{ 205,  "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
{ 206,  "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
{ 207,  "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
{ 208,  "ETH",  "latin capital letter ETH, U+00D0 ISOlat1" },
{ 209,  "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
{ 210,  "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
{ 211,  "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
{ 212,  "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
{ 213,  "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
{ 214,  "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
{ 215,  "times","multiplication sign, U+00D7 ISOnum" },
{ 216,  "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
{ 217,  "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
{ 218,  "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
{ 219,  "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
{ 220,  "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
{ 221,  "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
{ 222,  "THORN","latin capital letter THORN, U+00DE ISOlat1" },
{ 223,  "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
{ 224,  "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
{ 225,  "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
{ 226,  "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
{ 227,  "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
{ 228,  "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
{ 229,  "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
{ 230,  "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
{ 231,  "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
{ 232,  "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
{ 233,  "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
{ 234,  "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
{ 235,  "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
{ 236,  "igrave","latin small letter i with grave, U+00EC ISOlat1" },
{ 237,  "iacute","latin small letter i with acute, U+00ED ISOlat1" },
{ 238,  "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
{ 239,  "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
{ 240,  "eth",  "latin small letter eth, U+00F0 ISOlat1" },
{ 241,  "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
{ 242,  "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
{ 243,  "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
{ 244,  "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
{ 245,  "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
{ 246,  "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
{ 247,  "divide","division sign, U+00F7 ISOnum" },
{ 248,  "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
{ 249,  "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
{ 250,  "uacute","latin small letter u with acute, U+00FA ISOlat1" },
{ 251,  "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
{ 252,  "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
{ 253,  "yacute","latin small letter y with acute, U+00FD ISOlat1" },
{ 254,  "thorn","latin small letter thorn with, U+00FE ISOlat1" },
{ 255,  "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },

{ 338,  "OElig","latin capital ligature OE, U+0152 ISOlat2" },
{ 339,  "oelig","latin small ligature oe, U+0153 ISOlat2" },
{ 352,  "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
{ 353,  "scaron","latin small letter s with caron, U+0161 ISOlat2" },
{ 376,  "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },

/*
 * Anything below should really be kept as entity references
 */
{ 402,  "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },

{ 710,  "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
{ 732,  "tilde","small tilde, U+02DC ISOdia" },

{ 913,  "Alpha","greek capital letter alpha, U+0391" },
{ 914,  "Beta", "greek capital letter beta, U+0392" },
{ 915,  "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
{ 916,  "Delta","greek capital letter delta, U+0394 ISOgrk3" },
{ 917,  "Epsilon","greek capital letter epsilon, U+0395" },
{ 918,  "Zeta", "greek capital letter zeta, U+0396" },
{ 919,  "Eta",  "greek capital letter eta, U+0397" },
{ 920,  "Theta","greek capital letter theta, U+0398 ISOgrk3" },
{ 921,  "Iota", "greek capital letter iota, U+0399" },
{ 922,  "Kappa","greek capital letter kappa, U+039A" },
{ 923,  "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
{ 924,  "Mu",   "greek capital letter mu, U+039C" },
{ 925,  "Nu",   "greek capital letter nu, U+039D" },
{ 926,  "Xi",   "greek capital letter xi, U+039E ISOgrk3" },
{ 927,  "Omicron","greek capital letter omicron, U+039F" },
{ 928,  "Pi",   "greek capital letter pi, U+03A0 ISOgrk3" },
{ 929,  "Rho",  "greek capital letter rho, U+03A1" },
{ 931,  "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
{ 932,  "Tau",  "greek capital letter tau, U+03A4" },
{ 933,  "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
{ 934,  "Phi",  "greek capital letter phi, U+03A6 ISOgrk3" },
{ 935,  "Chi",  "greek capital letter chi, U+03A7" },
{ 936,  "Psi",  "greek capital letter psi, U+03A8 ISOgrk3" },
{ 937,  "Omega","greek capital letter omega, U+03A9 ISOgrk3" },

{ 945,  "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
{ 946,  "beta", "greek small letter beta, U+03B2 ISOgrk3" },
{ 947,  "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
{ 948,  "delta","greek small letter delta, U+03B4 ISOgrk3" },
{ 949,  "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
{ 950,  "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
{ 951,  "eta",  "greek small letter eta, U+03B7 ISOgrk3" },
{ 952,  "theta","greek small letter theta, U+03B8 ISOgrk3" },
{ 953,  "iota", "greek small letter iota, U+03B9 ISOgrk3" },
{ 954,  "kappa","greek small letter kappa, U+03BA ISOgrk3" },
{ 955,  "lambda","greek small letter lambda, U+03BB ISOgrk3" },
{ 956,  "mu",   "greek small letter mu, U+03BC ISOgrk3" },
{ 957,  "nu",   "greek small letter nu, U+03BD ISOgrk3" },
{ 958,  "xi",   "greek small letter xi, U+03BE ISOgrk3" },
{ 959,  "omicron","greek small letter omicron, U+03BF NEW" },
{ 960,  "pi",   "greek small letter pi, U+03C0 ISOgrk3" },
{ 961,  "rho",  "greek small letter rho, U+03C1 ISOgrk3" },
{ 962,  "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
{ 963,  "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
{ 964,  "tau",  "greek small letter tau, U+03C4 ISOgrk3" },
{ 965,  "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
{ 966,  "phi",  "greek small letter phi, U+03C6 ISOgrk3" },
{ 967,  "chi",  "greek small letter chi, U+03C7 ISOgrk3" },
{ 968,  "psi",  "greek small letter psi, U+03C8 ISOgrk3" },
{ 969,  "omega","greek small letter omega, U+03C9 ISOgrk3" },
{ 977,  "thetasym","greek small letter theta symbol, U+03D1 NEW" },
{ 978,  "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
{ 982,  "piv",  "greek pi symbol, U+03D6 ISOgrk3" },

{ 8194, "ensp", "en space, U+2002 ISOpub" },
{ 8195, "emsp", "em space, U+2003 ISOpub" },
{ 8201, "thinsp","thin space, U+2009 ISOpub" },
{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
{ 8205, "zwj",  "zero width joiner, U+200D NEW RFC 2070" },
{ 8206, "lrm",  "left-to-right mark, U+200E NEW RFC 2070" },
{ 8207, "rlm",  "right-to-left mark, U+200F NEW RFC 2070" },
{ 8211, "ndash","en dash, U+2013 ISOpub" },
{ 8212, "mdash","em dash, U+2014 ISOpub" },
{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
{ 8224, "dagger","dagger, U+2020 ISOpub" },
{ 8225, "Dagger","double dagger, U+2021 ISOpub" },

{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },

{ 8240, "permil","per mille sign, U+2030 ISOtech" },

{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },

{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },

{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
{ 8260, "frasl","fraction slash, U+2044 NEW" },

{ 8364, "euro", "euro sign, U+20AC NEW" },

{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },

{ 8704, "forall","for all, U+2200 ISOtech" },
{ 8706, "part", "partial differential, U+2202 ISOtech" },
{ 8707, "exist","there exists, U+2203 ISOtech" },
{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
{ 8712, "isin", "element of, U+2208 ISOtech" },
{ 8713, "notin","not an element of, U+2209 ISOtech" },
{ 8715, "ni",   "contains as member, U+220B ISOtech" },
{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
{ 8721, "sum",  "n-ary summation, U+2211 ISOamsb" },
{ 8722, "minus","minus sign, U+2212 ISOtech" },
{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
{ 8733, "prop", "proportional to, U+221D ISOtech" },
{ 8734, "infin","infinity, U+221E ISOtech" },
{ 8736, "ang",  "angle, U+2220 ISOamso" },
{ 8743, "and",  "logical and = wedge, U+2227 ISOtech" },
{ 8744, "or",   "logical or = vee, U+2228 ISOtech" },
{ 8745, "cap",  "intersection = cap, U+2229 ISOtech" },
{ 8746, "cup",  "union = cup, U+222A ISOtech" },
{ 8747, "int",  "integral, U+222B ISOtech" },
{ 8756, "there4","therefore, U+2234 ISOtech" },
{ 8764, "sim",  "tilde operator = varies with = similar to, U+223C ISOtech" },
{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
{ 8800, "ne",   "not equal to, U+2260 ISOtech" },
{ 8801, "equiv","identical to, U+2261 ISOtech" },
{ 8804, "le",   "less-than or equal to, U+2264 ISOtech" },
{ 8805, "ge",   "greater-than or equal to, U+2265 ISOtech" },
{ 8834, "sub",  "subset of, U+2282 ISOtech" },
{ 8835, "sup",  "superset of, U+2283 ISOtech" },
{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
{ 8971, "rfloor","right floor, U+230B ISOamsc" },
{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
{ 9674, "loz",  "lozenge, U+25CA ISOpub" },

{ 9824, "spades","black spade suit, U+2660 ISOpub" },
{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
{ 9830, "diams","black diamond suit, U+2666 ISOpub" },

};
1316
1317 /************************************************************************
1318  *                                                                      *
1319  *              Commodity functions to handle entities                  *
1320  *                                                                      *
1321  ************************************************************************/
1322
/*
 * Macro used to grow the current buffer.
 *
 * Doubles the companion variable <buffer>_size and reallocates @buffer
 * to hold that many xmlChar. It expands to a braced block that, on
 * allocation failure, reports the error, frees the old buffer (which
 * xmlRealloc leaves untouched on failure, so it would otherwise leak)
 * and makes the *enclosing function* return NULL. It can therefore only
 * be used inside functions returning a pointer.
 * NOTE(review): freeing assumes the enclosing function is the sole
 * owner of @buffer when it bails out - confirm at each use site.
 */
#define growBuffer(buffer) {                                            \
    xmlChar *growBuffer_tmp;                                            \
    buffer##_size *= 2;                                                 \
    growBuffer_tmp = (xmlChar *) xmlRealloc(buffer,                     \
                                 buffer##_size * sizeof(xmlChar));      \
    if (growBuffer_tmp == NULL) {                                       \
        xmlGenericError(xmlGenericErrorContext, "realloc failed\n");    \
        xmlFree(buffer);                                                \
        return(NULL);                                                   \
    }                                                                   \
    buffer = growBuffer_tmp;                                            \
}
1334
1335 /**
1336  * htmlEntityLookup:
1337  * @name: the entity name
1338  *
1339  * Lookup the given entity in EntitiesTable
1340  *
1341  * TODO: the linear scan is really ugly, an hash table is really needed.
1342  *
1343  * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1344  */
1345 const htmlEntityDesc *
1346 htmlEntityLookup(const xmlChar *name) {
1347     unsigned int i;
1348
1349     for (i = 0;i < (sizeof(html40EntitiesTable)/
1350                     sizeof(html40EntitiesTable[0]));i++) {
1351         if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1352 #ifdef DEBUG
1353             xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", name);
1354 #endif
1355             return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
1356         }
1357     }
1358     return(NULL);
1359 }
1360
1361 /**
1362  * htmlEntityValueLookup:
1363  * @value: the entity's unicode value
1364  *
1365  * Lookup the given entity in EntitiesTable
1366  *
1367  * TODO: the linear scan is really ugly, an hash table is really needed.
1368  *
1369  * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1370  */
1371 const htmlEntityDesc *
1372 htmlEntityValueLookup(unsigned int value) {
1373     unsigned int i;
1374 #ifdef DEBUG
1375     unsigned int lv = 0;
1376 #endif
1377
1378     for (i = 0;i < (sizeof(html40EntitiesTable)/
1379                     sizeof(html40EntitiesTable[0]));i++) {
1380         if (html40EntitiesTable[i].value >= value) {
1381             if (html40EntitiesTable[i].value > value)
1382                 break;
1383 #ifdef DEBUG
1384             xmlGenericError(xmlGenericErrorContext,"Found entity %s\n", html40EntitiesTable[i].name);
1385 #endif
1386             return((const htmlEntityDescPtr) &html40EntitiesTable[i]);
1387         }
1388 #ifdef DEBUG
1389         if (lv > html40EntitiesTable[i].value) {
1390             xmlGenericError(xmlGenericErrorContext,
1391                     "html40EntitiesTable[] is not sorted (%d > %d)!\n",
1392                     lv, html40EntitiesTable[i].value);
1393         }
1394         lv = html40EntitiesTable[i].value;
1395 #endif
1396     }
1397     return(NULL);
1398 }
1399
1400 /**
1401  * UTF8ToHtml:
1402  * @out:  a pointer to an array of bytes to store the result
1403  * @outlen:  the length of @out
1404  * @in:  a pointer to an array of UTF-8 chars
1405  * @inlen:  the length of @in
1406  *
1407  * Take a block of UTF-8 chars in and try to convert it to an ASCII
1408  * plus HTML entities block of chars out.
1409  *
1410  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1411  * The value of @inlen after return is the number of octets consumed
1412  *     as the return value is positive, else unpredictable.
1413  * The value of @outlen after return is the number of octets consumed.
1414  */
1415 int
1416 UTF8ToHtml(unsigned char* out, int *outlen,
1417               const unsigned char* in, int *inlen) {
1418     const unsigned char* processed = in;
1419     const unsigned char* outend;
1420     const unsigned char* outstart = out;
1421     const unsigned char* instart = in;
1422     const unsigned char* inend;
1423     unsigned int c, d;
1424     int trailing;
1425
1426     if (in == NULL) {
1427         /*
1428          * initialization nothing to do
1429          */
1430         *outlen = 0;
1431         *inlen = 0;
1432         return(0);
1433     }
1434     inend = in + (*inlen);
1435     outend = out + (*outlen);
1436     while (in < inend) {
1437         d = *in++;
1438         if      (d < 0x80)  { c= d; trailing= 0; }
1439         else if (d < 0xC0) {
1440             /* trailing byte in leading position */
1441             *outlen = out - outstart;
1442             *inlen = processed - instart;
1443             return(-2);
1444         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
1445         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
1446         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
1447         else {
1448             /* no chance for this in Ascii */
1449             *outlen = out - outstart;
1450             *inlen = processed - instart;
1451             return(-2);
1452         }
1453
1454         if (inend - in < trailing) {
1455             break;
1456         }
1457
1458         for ( ; trailing; trailing--) {
1459             if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1460                 break;
1461             c <<= 6;
1462             c |= d & 0x3F;
1463         }
1464
1465         /* assertion: c is a single UTF-4 value */
1466         if (c < 0x80) {
1467             if (out + 1 >= outend)
1468                 break;
1469             *out++ = c;
1470         } else {
1471             int len;
1472             const htmlEntityDesc * ent;
1473
1474             /*
1475              * Try to lookup a predefined HTML entity for it
1476              */
1477
1478             ent = htmlEntityValueLookup(c);
1479             if (ent == NULL) {
1480                 /* no chance for this in Ascii */
1481                 *outlen = out - outstart;
1482                 *inlen = processed - instart;
1483                 return(-2);
1484             }
1485             len = strlen(ent->name);
1486             if (out + 2 + len >= outend)
1487                 break;
1488             *out++ = '&';
1489             memcpy(out, ent->name, len);
1490             out += len;
1491             *out++ = ';';
1492         }
1493         processed = in;
1494     }
1495     *outlen = out - outstart;
1496     *inlen = processed - instart;
1497     return(0);
1498 }
1499
1500 /**
1501  * htmlEncodeEntities:
1502  * @out:  a pointer to an array of bytes to store the result
1503  * @outlen:  the length of @out
1504  * @in:  a pointer to an array of UTF-8 chars
1505  * @inlen:  the length of @in
1506  * @quoteChar: the quote character to escape (' or ") or zero.
1507  *
1508  * Take a block of UTF-8 chars in and try to convert it to an ASCII
1509  * plus HTML entities block of chars out.
1510  *
1511  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1512  * The value of @inlen after return is the number of octets consumed
1513  *     as the return value is positive, else unpredictable.
1514  * The value of @outlen after return is the number of octets consumed.
1515  */
1516 int
1517 htmlEncodeEntities(unsigned char* out, int *outlen,
1518                    const unsigned char* in, int *inlen, int quoteChar) {
1519     const unsigned char* processed = in;
1520     const unsigned char* outend = out + (*outlen);
1521     const unsigned char* outstart = out;
1522     const unsigned char* instart = in;
1523     const unsigned char* inend = in + (*inlen);
1524     unsigned int c, d;
1525     int trailing;
1526
1527     while (in < inend) {
1528         d = *in++;
1529         if      (d < 0x80)  { c= d; trailing= 0; }
1530         else if (d < 0xC0) {
1531             /* trailing byte in leading position */
1532             *outlen = out - outstart;
1533             *inlen = processed - instart;
1534             return(-2);
1535         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
1536         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
1537         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
1538         else {
1539             /* no chance for this in Ascii */
1540             *outlen = out - outstart;
1541             *inlen = processed - instart;
1542             return(-2);
1543         }
1544
1545         if (inend - in < trailing)
1546             break;
1547
1548         while (trailing--) {
1549             if (((d= *in++) & 0xC0) != 0x80) {
1550                 *outlen = out - outstart;
1551                 *inlen = processed - instart;
1552                 return(-2);
1553             }
1554             c <<= 6;
1555             c |= d & 0x3F;
1556         }
1557
1558         /* assertion: c is a single UTF-4 value */
1559         if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
1560             (c != '&') && (c != '<') && (c != '>')) {
1561             if (out >= outend)
1562                 break;
1563             *out++ = c;
1564         } else {
1565             const htmlEntityDesc * ent;
1566             const char *cp;
1567             char nbuf[16];
1568             int len;
1569
1570             /*
1571              * Try to lookup a predefined HTML entity for it
1572              */
1573             ent = htmlEntityValueLookup(c);
1574             if (ent == NULL) {
1575                 snprintf(nbuf, sizeof(nbuf), "#%u", c);
1576                 cp = nbuf;
1577             }
1578             else
1579                 cp = ent->name;
1580             len = strlen(cp);
1581             if (out + 2 + len > outend)
1582                 break;
1583             *out++ = '&';
1584             memcpy(out, cp, len);
1585             out += len;
1586             *out++ = ';';
1587         }
1588         processed = in;
1589     }
1590     *outlen = out - outstart;
1591     *inlen = processed - instart;
1592     return(0);
1593 }
1594
1595 /**
1596  * htmlDecodeEntities:
1597  * @ctxt:  the parser context
1598  * @len:  the len to decode (in bytes !), -1 for no size limit
1599  * @end:  an end marker xmlChar, 0 if none
1600  * @end2:  an end marker xmlChar, 0 if none
1601  * @end3:  an end marker xmlChar, 0 if none
1602  *
1603  * Substitute the HTML entities by their value
1604  *
1605  * DEPRECATED !!!!
1606  *
1607  * Returns A newly allocated string with the substitution done. The caller
1608  *      must deallocate it !
1609  */
1610 xmlChar *
1611 htmlDecodeEntities(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED, int len ATTRIBUTE_UNUSED,
1612           xmlChar end ATTRIBUTE_UNUSED, xmlChar  end2 ATTRIBUTE_UNUSED, xmlChar end3 ATTRIBUTE_UNUSED) {
1613     static int deprecated = 0;
1614     if (!deprecated) {
1615         xmlGenericError(xmlGenericErrorContext,
1616                 "htmlDecodeEntities() deprecated function reached\n");
1617         deprecated = 1;
1618     }
1619     return(NULL);
1620 }
1621
1622 /************************************************************************
1623  *                                                                      *
1624  *              Commodity functions to handle streams                   *
1625  *                                                                      *
1626  ************************************************************************/
1627
1628 /**
1629  * htmlNewInputStream:
1630  * @ctxt:  an HTML parser context
1631  *
1632  * Create a new input stream structure
1633  * Returns the new input stream or NULL
1634  */
1635 static htmlParserInputPtr
1636 htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1637     htmlParserInputPtr input;
1638
1639     input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1640     if (input == NULL) {
1641         ctxt->errNo = XML_ERR_NO_MEMORY;
1642         if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1643             ctxt->sax->error(ctxt->userData,
1644                              "malloc: couldn't allocate a new input stream\n");
1645         return(NULL);
1646     }
1647     memset(input, 0, sizeof(htmlParserInput));
1648     input->filename = NULL;
1649     input->directory = NULL;
1650     input->base = NULL;
1651     input->cur = NULL;
1652     input->buf = NULL;
1653     input->line = 1;
1654     input->col = 1;
1655     input->buf = NULL;
1656     input->free = NULL;
1657     input->version = NULL;
1658     input->consumed = 0;
1659     input->length = 0;
1660     return(input);
1661 }
1662
1663
1664 /************************************************************************
1665  *                                                                      *
1666  *              Commodity functions, cleanup needed ?                   *
1667  *                                                                      *
1668  ************************************************************************/
/*
 * Names of all the tags allowing PCDATA in the HTML 4.01 loose DTD.
 * NOTE: it might be more appropriate to integrate this information
 * into the html40ElementTable array but I don't want to risk any
 * binary incompatibility
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
1683
1684 /**
1685  * areBlanks:
1686  * @ctxt:  an HTML parser context
1687  * @str:  a xmlChar *
1688  * @len:  the size of @str
1689  *
1690  * Is this a sequence of blank chars that one can ignore ?
1691  *
1692  * Returns 1 if ignorable 0 otherwise.
1693  */
1694
1695 static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
1696     unsigned int i;
1697     int j;
1698     xmlNodePtr lastChild;
1699
1700     for (j = 0;j < len;j++)
1701         if (!(IS_BLANK(str[j]))) return(0);
1702
1703     if (CUR == 0) return(1);
1704     if (CUR != '<') return(0);
1705     if (ctxt->name == NULL)
1706         return(1);
1707     if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
1708         return(1);
1709     if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
1710         return(1);
1711     if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
1712         return(1);
1713     if (ctxt->node == NULL) return(0);
1714     lastChild = xmlGetLastChild(ctxt->node);
1715     if (lastChild == NULL) {
1716         if ((ctxt->node->type != XML_ELEMENT_NODE) &&
1717             (ctxt->node->content != NULL)) return(0);
1718         /* keep ws in constructs like ...<b> </b>...
1719            for all tags "b" allowing PCDATA */
1720         for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
1721             if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
1722                 return(0);
1723             }
1724         }
1725     } else if (xmlNodeIsText(lastChild)) {
1726         return(0);
1727     } else {
1728         /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
1729            for all tags "p" allowing PCDATA */
1730         for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
1731             if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
1732                 return(0);
1733             }
1734         }
1735     }
1736     return(1);
1737 }
1738
1739 /**
1740  * htmlNewDocNoDtD:
1741  * @URI:  URI for the dtd, or NULL
1742  * @ExternalID:  the external ID of the DTD, or NULL
1743  *
1744  * Creates a new HTML document without a DTD node if @URI and @ExternalID
1745  * are NULL
1746  *
1747  * Returns a new document, do not initialize the DTD if not provided
1748  */
1749 htmlDocPtr
1750 htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
1751     xmlDocPtr cur;
1752
1753     /*
1754      * Allocate a new document and fill the fields.
1755      */
1756     cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
1757     if (cur == NULL) {
1758         xmlGenericError(xmlGenericErrorContext,
1759                 "htmlNewDocNoDtD : malloc failed\n");
1760         return(NULL);
1761     }
1762     memset(cur, 0, sizeof(xmlDoc));
1763
1764     cur->type = XML_HTML_DOCUMENT_NODE;
1765     cur->version = NULL;
1766     cur->intSubset = NULL;
1767     cur->doc = cur;
1768     cur->name = NULL;
1769     cur->children = NULL;
1770     cur->extSubset = NULL;
1771     cur->oldNs = NULL;
1772     cur->encoding = NULL;
1773     cur->standalone = 1;
1774     cur->compression = 0;
1775     cur->ids = NULL;
1776     cur->refs = NULL;
1777     cur->_private = NULL;
1778     if ((ExternalID != NULL) ||
1779         (URI != NULL))
1780         xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
1781     return(cur);
1782 }
1783
1784 /**
1785  * htmlNewDoc:
1786  * @URI:  URI for the dtd, or NULL
1787  * @ExternalID:  the external ID of the DTD, or NULL
1788  *
1789  * Creates a new HTML document
1790  *
1791  * Returns a new document
1792  */
1793 htmlDocPtr
1794 htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
1795     if ((URI == NULL) && (ExternalID == NULL))
1796         return(htmlNewDocNoDtD(
1797                     BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
1798                     BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
1799
1800     return(htmlNewDocNoDtD(URI, ExternalID));
1801 }
1802
1803
/************************************************************************
 *                                                                      *
 *                      The parser itself                               *
 *      Relates to http://www.w3.org/TR/html40                          *
 *                                                                      *
 ************************************************************************/
1816
1817 /**
1818  * htmlParseHTMLName:
1819  * @ctxt:  an HTML parser context
1820  *
1821  * parse an HTML tag or attribute name, note that we convert it to lowercase
1822  * since HTML names are not case-sensitive.
1823  *
1824  * Returns the Tag Name parsed or NULL
1825  */
1826
1827 static xmlChar *
1828 htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
1829     xmlChar *ret = NULL;
1830     int i = 0;
1831     xmlChar loc[HTML_PARSER_BUFFER_SIZE];
1832
1833     if (!IS_LETTER(CUR) && (CUR != '_') &&
1834         (CUR != ':')) return(NULL);
1835
1836     while ((i < HTML_PARSER_BUFFER_SIZE) &&
1837            ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1838            (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
1839         if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
1840         else loc[i] = CUR;
1841         i++;
1842
1843         NEXT;
1844     }
1845
1846     ret = xmlStrndup(loc, i);
1847
1848     return(ret);
1849 }
1850
1851 /**
1852  * htmlParseName:
1853  * @ctxt:  an HTML parser context
1854  *
1855  * parse an HTML name, this routine is case sensitive.
1856  *
1857  * Returns the Name parsed or NULL
1858  */
1859
1860 static xmlChar *
1861 htmlParseName(htmlParserCtxtPtr ctxt) {
1862     xmlChar buf[HTML_MAX_NAMELEN];
1863     int len = 0;
1864
1865     GROW;
1866     if (!IS_LETTER(CUR) && (CUR != '_')) {
1867         return(NULL);
1868     }
1869
1870     while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1871            (CUR == '.') || (CUR == '-') ||
1872            (CUR == '_') || (CUR == ':') ||
1873            (IS_COMBINING(CUR)) ||
1874            (IS_EXTENDER(CUR))) {
1875         buf[len++] = CUR;
1876         NEXT;
1877         if (len >= HTML_MAX_NAMELEN) {
1878             xmlGenericError(xmlGenericErrorContext,
1879                "htmlParseName: reached HTML_MAX_NAMELEN limit\n");
1880             while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
1881                    (CUR == '.') || (CUR == '-') ||
1882                    (CUR == '_') || (CUR == ':') ||
1883                    (IS_COMBINING(CUR)) ||
1884                    (IS_EXTENDER(CUR)))
1885                  NEXT;
1886             break;
1887         }
1888     }
1889     return(xmlStrndup(buf, len));
1890 }
1891
1892 /**
1893  * htmlParseHTMLAttribute:
1894  * @ctxt:  an HTML parser context
1895  * @stop:  a char stop value
1896  *
1897  * parse an HTML attribute value till the stop (quote), if
1898  * stop is 0 then it stops at the first space
1899  *
1900  * Returns the attribute parsed or NULL
1901  */
1902
1903 static xmlChar *
1904 htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
1905     xmlChar *buffer = NULL;
1906     int buffer_size = 0;
1907     xmlChar *out = NULL;
1908     xmlChar *name = NULL;
1909
1910     xmlChar *cur = NULL;
1911     const htmlEntityDesc * ent;
1912
1913     /*
1914      * allocate a translation buffer.
1915      */
1916     buffer_size = HTML_PARSER_BUFFER_SIZE;
1917     buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
1918     if (buffer == NULL) {
1919         xmlGenericError(xmlGenericErrorContext,
1920                         "htmlParseHTMLAttribute: malloc failed\n");
1921         return(NULL);
1922     }
1923     out = buffer;
1924
1925     /*
1926      * Ok loop until we reach one of the ending chars
1927      */
1928     while ((CUR != 0) && (CUR != stop)) {
1929         if ((stop == 0) && (CUR == '>')) break;
1930         if ((stop == 0) && (IS_BLANK(CUR))) break;
1931         if (CUR == '&') {
1932             if (NXT(1) == '#') {
1933                 unsigned int c;
1934                 int bits;
1935
1936                 c = htmlParseCharRef(ctxt);
1937                 if      (c <    0x80)
1938                         { *out++  = c;                bits= -6; }
1939                 else if (c <   0x800)
1940                         { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
1941                 else if (c < 0x10000)
1942                         { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
1943                 else
1944                         { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
1945
1946                 for ( ; bits >= 0; bits-= 6) {
1947                     *out++  = ((c >> bits) & 0x3F) | 0x80;
1948                 }
1949
1950                 if (out - buffer > buffer_size - 100) {
1951                         int indx = out - buffer;
1952
1953                         growBuffer(buffer);
1954                         out = &buffer[indx];
1955                 }
1956             } else {
1957                 ent = htmlParseEntityRef(ctxt, &name);
1958                 if (name == NULL) {
1959                     *out++ = '&';
1960                     if (out - buffer > buffer_size - 100) {
1961                         int indx = out - buffer;
1962
1963                         growBuffer(buffer);
1964                         out = &buffer[indx];
1965                     }
1966                 } else if (ent == NULL) {
1967                     *out++ = '&';
1968                     cur = name;
1969                     while (*cur != 0) {
1970                         if (out - buffer > buffer_size - 100) {
1971                             int indx = out - buffer;
1972
1973                             growBuffer(buffer);
1974                             out = &buffer[indx];
1975                         }
1976                         *out++ = *cur++;
1977                     }
1978                     xmlFree(name);
1979                 } else {
1980                     unsigned int c;
1981                     int bits;
1982
1983                     if (out - buffer > buffer_size - 100) {
1984                         int indx = out - buffer;
1985
1986                         growBuffer(buffer);
1987                         out = &buffer[indx];
1988                     }
1989                     c = (xmlChar)ent->value;
1990                     if      (c <    0x80)
1991                         { *out++  = c;                bits= -6; }
1992                     else if (c <   0x800)
1993                         { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
1994                     else if (c < 0x10000)
1995                         { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
1996                     else
1997                         { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
1998
1999                     for ( ; bits >= 0; bits-= 6) {
2000                         *out++  = ((c >> bits) & 0x3F) | 0x80;
2001                     }
2002                     xmlFree(name);
2003                 }
2004             }
2005         } else {
2006             unsigned int c;
2007             int bits, l;
2008
2009             if (out - buffer > buffer_size - 100) {
2010                 int indx = out - buffer;
2011
2012                 growBuffer(buffer);
2013                 out = &buffer[indx];
2014             }
2015             c = CUR_CHAR(l);
2016             if      (c <    0x80)
2017                     { *out++  = c;                bits= -6; }
2018             else if (c <   0x800)
2019                     { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2020             else if (c < 0x10000)
2021                     { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2022             else
2023                     { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2024
2025             for ( ; bits >= 0; bits-= 6) {
2026                 *out++  = ((c >> bits) & 0x3F) | 0x80;
2027             }
2028             NEXT;
2029         }
2030     }
2031     *out++ = 0;
2032     return(buffer);
2033 }
2034
2035 /**
2036  * htmlParseEntityRef:
2037  * @ctxt:  an HTML parser context
2038  * @str:  location to store the entity name
2039  *
2040  * parse an HTML ENTITY references
2041  *
2042  * [68] EntityRef ::= '&' Name ';'
2043  *
2044  * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2045  *         if non-NULL *str will have to be freed by the caller.
2046  */
2047 const htmlEntityDesc *
2048 htmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) {
2049     xmlChar *name;
2050     const htmlEntityDesc * ent = NULL;
2051     *str = NULL;
2052
2053     if (CUR == '&') {
2054         NEXT;
2055         name = htmlParseName(ctxt);
2056         if (name == NULL) {
2057             if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2058                 ctxt->sax->error(ctxt->userData, "htmlParseEntityRef: no name\n");
2059             ctxt->wellFormed = 0;
2060         } else {
2061             GROW;
2062             if (CUR == ';') {
2063                 *str = name;
2064
2065                 /*
2066                  * Lookup the entity in the table.
2067                  */
2068                 ent = htmlEntityLookup(name);
2069                 if (ent != NULL) /* OK that's ugly !!! */
2070                     NEXT;
2071             } else {
2072                 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2073                     ctxt->sax->error(ctxt->userData,
2074                                      "htmlParseEntityRef: expecting ';'\n");
2075                 *str = name;
2076             }
2077         }
2078     }
2079     return(ent);
2080 }
2081
2082 /**
2083  * htmlParseAttValue:
2084  * @ctxt:  an HTML parser context
2085  *
2086  * parse a value for an attribute
2087  * Note: the parser won't do substitution of entities here, this
2088  * will be handled later in xmlStringGetNodeList, unless it was
2089  * asked for ctxt->replaceEntities != 0
2090  *
2091  * Returns the AttValue parsed or NULL.
2092  */
2093
2094 static xmlChar *
2095 htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2096     xmlChar *ret = NULL;
2097
2098     if (CUR == '"') {
2099         NEXT;
2100         ret = htmlParseHTMLAttribute(ctxt, '"');
2101         if (CUR != '"') {
2102             if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2103                 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2104             ctxt->wellFormed = 0;
2105         } else
2106             NEXT;
2107     } else if (CUR == '\'') {
2108         NEXT;
2109         ret = htmlParseHTMLAttribute(ctxt, '\'');
2110         if (CUR != '\'') {
2111             if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2112                 ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
2113             ctxt->wellFormed = 0;
2114         } else
2115             NEXT;
2116     } else {
2117         /*
2118          * That's an HTMLism, the attribute value may not be quoted
2119          */
2120         ret = htmlParseHTMLAttribute(ctxt, 0);
2121         if (ret == NULL) {
2122             if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2123                 ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
2124             ctxt->wellFormed = 0;
2125         }
2126     }
2127     return(ret);
2128 }
2129
2130 /**
2131  * htmlParseSystemLiteral:
2132  * @ctxt:  an HTML parser context
2133  *
2134  * parse an HTML Literal
2135  *
2136  * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2137  *
2138  * Returns the SystemLiteral parsed or NULL
2139  */
2140
2141 static xmlChar *
2142 htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2143     const xmlChar *q;
2144     xmlChar *ret = NULL;
2145
2146     if (CUR == '"') {
2147         NEXT;
2148         q = CUR_PTR;
2149         while ((IS_CHAR(CUR)) && (CUR != '"'))
2150             NEXT;
2151         if (!IS_CHAR(CUR)) {
2152             if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2153                 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2154             ctxt->wellFormed = 0;
2155         } else {
2156             ret = xmlStrndup(q, CUR_PTR - q);
2157             NEXT;
2158         }
2159     } else if (CUR == '\'') {
2160         NEXT;
2161         q = CUR_PTR;
2162         while ((IS_CHAR(CUR)) && (CUR != '\''))
2163             NEXT;
2164         if (!IS_CHAR(CUR)) {
2165             if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2166                 ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
2167             ctxt->wellFormed = 0;
2168         } else {
2169             ret = xmlStrndup(q, CUR_PTR - q);
2170             NEXT;
2171         }
2172     } else {
2173         if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2174             ctxt->sax->error(ctxt->userData,
2175                              "SystemLiteral \" or ' expected\n");
2176         ctxt->wellFormed = 0;
2177     }
2178
2179     return(ret);
2180 }
2181
2182 /**
2183  * htmlParsePubidLiteral:
2184  * @ctxt:  an HTML parser context
2185  *
2186  * parse an HTML public literal
2187  *
2188  * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2189  *
2190  * Returns the PubidLiteral parsed or NULL.
2191  */
2192
2193 static xmlChar *
2194 htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2195     const xmlChar *q;
2196     xmlChar *ret = NULL;
2197     /*
2198      * Name ::= (Letter | '_') (NameChar)*
2199      */
2200     if (CUR == '"') {
2201         NEXT;
2202         q = CUR_PTR;
2203         while (IS_PUBIDCHAR(CUR)) NEXT;
2204         if (CUR != '"') {
2205             if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2206                 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2207             ctxt->wellFormed = 0;
2208         } else {
2209             ret = xmlStrndup(q, CUR_PTR - q);
2210             NEXT;
2211         }
2212     } else if (CUR == '\'') {
2213         NEXT;
2214         q = CUR_PTR;
2215         while ((IS_LETTER(CUR)) && (CUR != '\''))
2216             NEXT;
2217         if (!IS_LETTER(CUR)) {
2218             if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2219                 ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
2220             ctxt->wellFormed = 0;
2221         } else {
2222             ret = xmlStrndup(q, CUR_PTR - q);
2223             NEXT;
2224         }
2225     } else {
2226         if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2227             ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
2228         ctxt->wellFormed = 0;
2229     }
2230
2231     return(ret);
2232 }
2233
2234 /**
2235  * htmlParseScript:
2236  * @ctxt:  an HTML parser context
2237  *
2238  * parse the content of an HTML SCRIPT or STYLE element
2239  * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2240  * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2241  * http://www.w3.org/TR/html4/types.html#type-script
2242  * http://www.w3.org/TR/html4/types.html#h-6.15
2243  * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2244  *
2245  * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2246  * element and the value of intrinsic event attributes. User agents must
2247  * not evaluate script data as HTML markup but instead must pass it on as
2248  * data to a script engine.
2249  * NOTES:
2250  * - The content is passed like CDATA
2251  * - the attributes for style and scripting "onXXX" are also described
2252  *   as CDATA but SGML allows entities references in attributes so their
2253  *   processing is identical as other attributes
2254  */
2255 static void
2256 htmlParseScript(htmlParserCtxtPtr ctxt) {
2257     xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
2258     int nbchar = 0;
2259     xmlChar cur;
2260
2261     SHRINK;
2262     cur = CUR;
2263     while (IS_CHAR(cur)) {
2264         if ((cur == '<') && (NXT(1) == '!') && (NXT(2) == '-') &&
2265             (NXT(3) == '-')) {
2266             if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2267                 if (ctxt->sax->cdataBlock!= NULL) {
2268                     /*
2269                      * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2270                      */
2271                     ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2272                 }
2273             }
2274             nbchar = 0;
2275             htmlParseComment(ctxt);
2276             cur = CUR;
2277             continue;
2278         } else if ((cur == '<') && (NXT(1) == '/')) {
2279             /*
2280              * One should break here, the specification is clear:
2281              * Authors should therefore escape "</" within the content.
2282              * Escape mechanisms are specific to each scripting or
2283              * style sheet language.
2284              */
2285             if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2286                 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2287                 break; /* while */
2288         }
2289         buf[nbchar++] = cur;
2290         if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2291             if (ctxt->sax->cdataBlock!= NULL) {
2292                 /*
2293                  * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2294                  */
2295                 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2296             }
2297             nbchar = 0;
2298         }
2299         NEXT;
2300         cur = CUR;
2301     }
2302     if (!(IS_CHAR(cur))) {
2303         if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2304             ctxt->sax->error(ctxt->userData,
2305                 "Invalid char in CDATA 0x%X\n", cur);
2306         ctxt->wellFormed = 0;
2307         NEXT;
2308     }
2309
2310     if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2311         if (ctxt->sax->cdataBlock!= NULL) {
2312             /*
2313              * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2314              */
2315             ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2316         }
2317     }
2318 }
2319
2320
2321 /**
2322  * htmlParseCharData:
2323  * @ctxt:  an HTML parser context
2324  *
2325  * parse a CharData section.
2326  * if we are within a CDATA section ']]>' marks an end of section.
2327  *
2328  * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2329  */
2330
2331 static void
2332 htmlParseCharData(htmlParserCtxtPtr ctxt) {
2333     xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2334     int nbchar = 0;
2335     int cur, l;
2336
2337     SHRINK;
2338     cur = CUR_CHAR(l);
2339     while (((cur != '<') || (ctxt->token == '<')) &&
2340            ((cur != '&') || (ctxt->token == '&')) &&
2341            (IS_CHAR(cur))) {
2342         COPY_BUF(l,buf,nbchar,cur);
2343         if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2344             /*
2345              * Ok the segment is to be consumed as chars.
2346              */
2347             if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2348                 if (areBlanks(ctxt, buf, nbchar)) {
2349                     if (ctxt->sax->ignorableWhitespace != NULL)
2350                         ctxt->sax->ignorableWhitespace(ctxt->userData,
2351                                                        buf, nbchar);
2352                 } else {
2353                     htmlCheckParagraph(ctxt);
2354                     if (ctxt->sax->characters != NULL)
2355                         ctxt->sax->characters(ctxt->userData, buf, nbchar);
2356                 }
2357             }
2358             nbchar = 0;
2359         }
2360         NEXTL(l);
2361         cur = CUR_CHAR(l);
2362     }
2363     if (nbchar != 0) {
2364         /*
2365          * Ok the segment is to be consumed as chars.
2366          */
2367         if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2368             if (areBlanks(ctxt, buf, nbchar)) {
2369                 if (ctxt->sax->ignorableWhitespace != NULL)
2370                     ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2371             } else {
2372                 htmlCheckParagraph(ctxt);
2373                 if (ctxt->sax->characters != NULL)
2374                     ctxt->sax->characters(ctxt->userData, buf, nbchar);
2375             }
2376         }
2377     } else {
2378         /*
2379          * Loop detection
2380          */
2381         if (cur == 0)
2382             ctxt->instate = XML_PARSER_EOF;
2383     }
2384 }
2385
2386 /**
2387  * htmlParseExternalID:
2388  * @ctxt:  an HTML parser context
2389  * @publicID:  a xmlChar** receiving PubidLiteral
2390  *
2391  * Parse an External ID or a Public ID
2392  *
2393  * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2394  *                   | 'PUBLIC' S PubidLiteral S SystemLiteral
2395  *
2396  * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2397  *
2398  * Returns the function returns SystemLiteral and in the second
2399  *                case publicID receives PubidLiteral, is strict is off
2400  *                it is possible to return NULL and have publicID set.
2401  */
2402
2403 static xmlChar *
2404 htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
2405     xmlChar *URI = NULL;
2406
2407     if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2408          (UPP(2) == 'S') && (UPP(3) == 'T') &&
2409          (UPP(4) == 'E') && (UPP(5) == 'M')) {
2410         SKIP(6);
2411         if (!IS_BLANK(CUR)) {
2412             if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2413                 ctxt->sax->error(ctxt->userData,
2414                     "Space required after 'SYSTEM'\n");
2415             ctxt->wellFormed = 0;
2416         }
2417         SKIP_BLANKS;
2418         URI = htmlParseSystemLiteral(ctxt);
2419         if (URI == NULL) {
2420             if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2421                 ctxt->sax->error(ctxt->userData,
2422                   "htmlParseExternalID: SYSTEM, no URI\n");
2423             ctxt->wellFormed = 0;
2424         }
2425     } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2426                (UPP(2) == 'B') && (UPP(3) == 'L') &&
2427                (UPP(4) == 'I') && (UPP(5) == 'C')) {
2428         SKIP(6);
2429         if (!IS_BLANK(CUR)) {
2430             if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2431                 ctxt->sax->error(ctxt->userData,
2432                     "Space required after 'PUBLIC'\n");
2433             ctxt->wellFormed = 0;
2434         }
2435         SKIP_BLANKS;
2436         *publicID = htmlParsePubidLiteral(ctxt);
2437         if (*publicID == NULL) {
2438             if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2439                 ctxt->sax->error(ctxt->userData,
2440                   "htmlParseExternalID: PUBLIC, no Public Identifier\n");
2441             ctxt->wellFormed = 0;
2442         }
2443         SKIP_BLANKS;
2444         if ((CUR == '"') || (CUR == '\'')) {
2445             URI = htmlParseSystemLiteral(ctxt);
2446         }
2447     }
2448     return(URI);
2449 }
2450
2451 /**
2452  * htmlParseComment:
2453  * @ctxt:  an HTML parser context
2454  *
2455  * Parse an XML (SGML) comment <!-- .... -->
2456  *
2457  * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2458  */
2459 static void
2460 htmlParseComment(htmlParserCtxtPtr ctxt) {
2461     xmlChar *buf = NULL;
2462     int len;
2463     int size = HTML_PARSER_BUFFER_SIZE;
2464     int q, ql;
2465     int r, rl;
2466     int cur, l;
2467     xmlParserInputState state;
2468
2469     /*
2470      * Check that there is a comment right here.
2471      */
2472     if ((RAW != '<') || (NXT(1) != '!') ||
2473         (NXT(2) != '-') || (NXT(3) != '-')) return;
2474
2475     state = ctxt->instate;
2476     ctxt->instate = XML_PARSER_COMMENT;
2477     SHRINK;
2478     SKIP(4);
2479     buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar));
2480     if (buf == NULL) {
2481         xmlGenericError(xmlGenericErrorContext,
2482                 "malloc of %d byte failed\n", size);
2483         ctxt->instate = state;
2484         return;
2485     }
2486     q = CUR_CHAR(ql);
2487     NEXTL(ql);
2488     r = CUR_CHAR(rl);
2489     NEXTL(rl);
2490     cur = CUR_CHAR(l);
2491     len = 0;
2492     while (IS_CHAR(cur) &&
2493            ((cur != '>') ||
2494             (r != '-') || (q != '-'))) {
2495         if (len + 5 >= size) {
2496             size *= 2;
2497             buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
2498             if (buf == NULL) {
2499                 xmlGenericError(xmlGenericErrorContext,
2500                         "realloc of %d byte failed\n", size);
2501                 ctxt->instate = state;
2502                 return;
2503             }
2504         }
2505         COPY_BUF(ql,buf,len,q);
2506         q = r;
2507         ql = rl;
2508         r = cur;
2509         rl = l;
2510         NEXTL(l);
2511         cur = CUR_CHAR(l);
2512         if (cur == 0) {
2513             SHRINK;
2514             GROW;
2515             cur = CUR_CHAR(l);
2516         }
2517     }
2518     buf[len] = 0;
2519     if (!IS_CHAR(cur)) {
2520         ctxt->errNo = XML_ERR_COMMENT_NOT_FINISHED;
2521         if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2522             ctxt->sax->error(ctxt->userData,
2523                              "Comment not terminated \n<!--%.50s\n", buf);
2524         ctxt->wellFormed = 0;
2525         xmlFree(buf);
2526     } else {
2527         NEXT;
2528         if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
2529             (!ctxt->disableSAX))
2530             ctxt->sax->comment(ctxt->userData, buf);
2531         xmlFree(buf);
2532     }
2533     ctxt->instate = state;
2534 }
2535
2536 /**
2537  * htmlParseCharRef:
2538  * @ctxt:  an HTML parser context
2539  *
2540  * parse Reference declarations
2541  *
2542  * [66] CharRef ::= '&#' [0-9]+ ';' |
2543  *                  '&#x' [0-9a-fA-F]+ ';'
2544  *
2545  * Returns the value parsed (as an int)
2546  */
2547 int
2548 htmlParseCharRef(htmlParserCtxtPtr ctxt) {
2549     int val = 0;
2550
2551     if ((CUR == '&') && (NXT(1) == '#') &&
2552         (NXT(2) == 'x')) {
2553         SKIP(3);
2554         while (CUR != ';') {
2555             if ((CUR >= '0') && (CUR <= '9'))
2556                 val = val * 16 + (CUR - '0');
2557             else if ((CUR >= 'a') && (CUR <= 'f'))
2558                 val = val * 16 + (CUR - 'a') + 10;
2559             else if ((CUR >= 'A') && (CUR <= 'F'))
2560                 val = val * 16 + (CUR - 'A') + 10;
2561             else {
2562                 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2563                     ctxt->sax->error(ctxt->userData,
2564                          "htmlParseCharRef: invalid hexadecimal value\n");
2565                 ctxt->wellFormed = 0;
2566                 return(0);
2567             }
2568             NEXT;
2569         }
2570         if (CUR == ';')
2571             NEXT;
2572     } else if  ((CUR == '&') && (NXT(1) == '#')) {
2573         SKIP(2);
2574         while (CUR != ';') {
2575             if ((CUR >= '0') && (CUR <= '9'))
2576                 val = val * 10 + (CUR - '0');
2577             else {
2578                 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2579                     ctxt->sax->error(ctxt->userData,
2580                          "htmlParseCharRef: invalid decimal value\n");
2581                 ctxt->wellFormed = 0;
2582                 return(0);
2583             }
2584             NEXT;
2585         }
2586         if (CUR == ';')
2587             NEXT;
2588     } else {
2589         if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2590             ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid value\n");
2591         ctxt->wellFormed = 0;
2592     }
2593     /*
2594      * Check the value IS_CHAR ...
2595      */
2596     if (IS_CHAR(val)) {
2597         return(val);
2598     } else {
2599         if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2600             ctxt->sax->error(ctxt->userData, "htmlParseCharRef: invalid xmlChar value %d\n",
2601                              val);
2602         ctxt->wellFormed = 0;
2603     }
2604     return(0);
2605 }
2606
2607
2608 /**
2609  * htmlParseDocTypeDecl :
2610  * @ctxt:  an HTML parser context
2611  *
2612  * parse a DOCTYPE declaration
2613  *
2614  * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
2615  *                      ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
2616  */
2617
2618 static void
2619 htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
2620     xmlChar *name;
2621     xmlChar *ExternalID = NULL;
2622     xmlChar *URI = NULL;
2623
2624     /*
2625      * We know that '<!DOCTYPE' has been detected.
2626      */
2627     SKIP(9);
2628
2629     SKIP_BLANKS;
2630
2631     /*
2632      * Parse the DOCTYPE name.
2633      */
2634     name = htmlParseName(ctxt);
2635     if (name == NULL) {
2636         if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2637             ctxt->sax->error(ctxt->userData, "htmlParseDocTypeDecl : no DOCTYPE name !\n");
2638         ctxt->wellFormed = 0;
2639     }
2640     /*
2641      * Check that upper(name) == "HTML" !!!!!!!!!!!!!
2642      */
2643
2644     SKIP_BLANKS;
2645
2646     /*
2647      * Check for SystemID and ExternalID
2648      */
2649     URI = htmlParseExternalID(ctxt, &ExternalID);
2650     SKIP_BLANKS;
2651
2652     /*
2653      * We should be at the end of the DOCTYPE declaration.
2654      */
2655     if (CUR != '>') {
2656         if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2657             ctxt->sax->error(ctxt->userData, "DOCTYPE improperly terminated\n");
2658         ctxt->wellFormed = 0;
2659         /* We shouldn't try to resynchronize ... */
2660     }
2661     NEXT;
2662
2663     /*
2664      * Create or update the document accordingly to the DOCTYPE
2665      */
2666     if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
2667         (!ctxt->disableSAX))
2668         ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
2669
2670     /*
2671      * Cleanup, since we don't use all those identifiers
2672      */
2673     if (URI != NULL) xmlFree(URI);
2674     if (ExternalID != NULL) xmlFree(ExternalID);
2675     if (name != NULL) xmlFree(name);
2676 }
2677
2678 /**
2679  * htmlParseAttribute:
2680  * @ctxt:  an HTML parser context
2681  * @value:  a xmlChar ** used to store the value of the attribute
2682  *
2683  * parse an attribute
2684  *
2685  * [41] Attribute ::= Name Eq AttValue
2686  *
2687  * [25] Eq ::= S? '=' S?
2688  *
2689  * With namespace:
2690  *
2691  * [NS 11] Attribute ::= QName Eq AttValue
2692  *
2693  * Also the case QName == xmlns:??? is handled independently as a namespace
2694  * definition.
2695  *
2696  * Returns the attribute name, and the value in *value.
2697  */
2698
2699 static xmlChar *
2700 htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
2701     xmlChar *name, *val = NULL;
2702
2703     *value = NULL;
2704     name = htmlParseHTMLName(ctxt);
2705     if (name == NULL) {
2706         if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2707             ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
2708         ctxt->wellFormed = 0;
2709         return(NULL);
2710     }
2711
2712     /*
2713      * read the value
2714      */
2715     SKIP_BLANKS;
2716     if (CUR == '=') {
2717         NEXT;
2718         SKIP_BLANKS;
2719         val = htmlParseAttValue(ctxt);
2720         /******
2721     } else {
2722         * TODO : some attribute must have values, some may not
2723         if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2724             ctxt->sax->warning(ctxt->userData,
2725                "No value for attribute %s\n", name); */
2726     }
2727
2728     *value = val;
2729     return(name);
2730 }
2731
2732 /**
2733  * htmlCheckEncoding:
2734  * @ctxt:  an HTML parser context
2735  * @attvalue: the attribute value
2736  *
2737  * Checks an http-equiv attribute from a Meta tag to detect
2738  * the encoding
2739  * If a new encoding is detected the parser is switched to decode
2740  * it and pass UTF8
2741  */
2742 static void
2743 htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
2744     const xmlChar *encoding;
2745
2746     if ((ctxt == NULL) || (attvalue == NULL))
2747         return;
2748
2749     /* do not change encoding */
2750     if (ctxt->input->encoding != NULL)
2751         return;
2752
2753     encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
2754     if (encoding != NULL) {
2755         encoding += 8;
2756     } else {
2757         encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
2758         if (encoding != NULL)
2759             encoding += 9;
2760     }
2761     if (encoding != NULL) {
2762         xmlCharEncoding enc;
2763         xmlCharEncodingHandlerPtr handler;
2764
2765         while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
2766
2767         if (ctxt->input->encoding != NULL)
2768             xmlFree((xmlChar *) ctxt->input->encoding);
2769         ctxt->input->encoding = xmlStrdup(encoding);
2770
2771         enc = xmlParseCharEncoding((const char *) encoding);
2772         /*
2773          * registered set of known encodings
2774          */
2775         if (enc != XML_CHAR_ENCODING_ERROR) {
2776             xmlSwitchEncoding(ctxt, enc);
2777             ctxt->charset = XML_CHAR_ENCODING_UTF8;
2778         } else {
2779             /*
2780              * fallback for unknown encodings
2781              */
2782             handler = xmlFindCharEncodingHandler((const char *) encoding);
2783             if (handler != NULL) {
2784                 xmlSwitchToEncoding(ctxt, handler);
2785                 ctxt->charset = XML_CHAR_ENCODING_UTF8;
2786             } else {
2787                 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
2788             }
2789         }
2790
2791         if ((ctxt->input->buf != NULL) &&
2792             (ctxt->input->buf->encoder != NULL) &&
2793             (ctxt->input->buf->raw != NULL) &&
2794             (ctxt->input->buf->buffer != NULL)) {
2795             int nbchars;
2796             int processed;
2797
2798             /*
2799              * convert as much as possible to the parser reading buffer.
2800              */
2801             processed = ctxt->input->cur - ctxt->input->base;
2802             xmlBufferShrink(ctxt->input->buf->buffer, processed);
2803             nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
2804                                        ctxt->input->buf->buffer,
2805                                        ctxt->input->buf->raw);
2806             if (nbchars < 0) {
2807                 ctxt->errNo = XML_ERR_INVALID_ENCODING;
2808                 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2809                     ctxt->sax->error(ctxt->userData,
2810                      "htmlCheckEncoding: encoder error\n");
2811             }
2812             ctxt->input->base =
2813             ctxt->input->cur = ctxt->input->buf->buffer->content;
2814         }
2815     }
2816 }
2817
2818 /**
2819  * htmlCheckMeta:
2820  * @ctxt:  an HTML parser context
2821  * @atts:  the attributes values
2822  *
2823  * Checks an attributes from a Meta tag
2824  */
2825 static void
2826 htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
2827     int i;
2828     const xmlChar *att, *value;
2829     int http = 0;
2830     const xmlChar *content = NULL;
2831
2832     if ((ctxt == NULL) || (atts == NULL))
2833         return;
2834
2835     i = 0;
2836     att = atts[i++];
2837     while (att != NULL) {
2838         value = atts[i++];
2839         if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
2840          && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
2841             http = 1;
2842         else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
2843             content = value;
2844         att = atts[i++];
2845     }
2846     if ((http) && (content != NULL))
2847         htmlCheckEncoding(ctxt, content);
2848
2849 }
2850
2851 /**
2852  * htmlParseStartTag:
2853  * @ctxt:  an HTML parser context
2854  *
2855  * parse a start of tag either for rule element or
2856  * EmptyElement. In both case we don't parse the tag closing chars.
2857  *
2858  * [40] STag ::= '<' Name (S Attribute)* S? '>'
2859  *
2860  * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
2861  *
2862  * With namespace:
2863  *
2864  * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
2865  *
2866  * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
2867  *
2868  */
2869
2870 static void
2871 htmlParseStartTag(htmlParserCtxtPtr ctxt) {
2872     xmlChar *name;
2873     xmlChar *attname;
2874     xmlChar *attvalue;
2875     const xmlChar **atts = NULL;
2876     int nbatts = 0;
2877     int maxatts = 0;
2878     int meta = 0;
2879     int i;
2880
2881     if (CUR != '<') return;
2882     NEXT;
2883
2884     GROW;
2885     name = htmlParseHTMLName(ctxt);
2886     if (name == NULL) {
2887         if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2888             ctxt->sax->error(ctxt->userData,
2889              "htmlParseStartTag: invalid element name\n");
2890         ctxt->wellFormed = 0;
2891         /* Dump the bogus tag like browsers do */
2892         while ((IS_CHAR(CUR)) && (CUR != '>'))
2893             NEXT;
2894         return;
2895     }
2896     if (xmlStrEqual(name, BAD_CAST"meta"))
2897         meta = 1;
2898
2899     /*
2900      * Check for auto-closure of HTML elements.
2901      */
2902     htmlAutoClose(ctxt, name);
2903
2904     /*
2905      * Check for implied HTML elements.
2906      */
2907     htmlCheckImplied(ctxt, name);
2908
2909     /*
2910      * Avoid html at any level > 0, head at any level != 1
2911      * or any attempt to recurse body
2912      */
2913     if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
2914         if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2915             ctxt->sax->error(ctxt->userData,
2916              "htmlParseStartTag: misplaced <html> tag\n");
2917         ctxt->wellFormed = 0;
2918         xmlFree(name);
2919         return;
2920     }
2921     if ((ctxt->nameNr != 1) &&
2922         (xmlStrEqual(name, BAD_CAST"head"))) {
2923         if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2924             ctxt->sax->error(ctxt->userData,
2925              "htmlParseStartTag: misplaced <head> tag\n");
2926         ctxt->wellFormed = 0;
2927         xmlFree(name);
2928         return;
2929     }
2930     if (xmlStrEqual(name, BAD_CAST"body")) {
2931         int indx;
2932         for (indx = 0;indx < ctxt->nameNr;indx++) {
2933             if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
2934                 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2935                     ctxt->sax->error(ctxt->userData,
2936                      "htmlParseStartTag: misplaced <body> tag\n");
2937                 ctxt->wellFormed = 0;
2938                 xmlFree(name);
2939                 return;
2940             }
2941         }
2942     }
2943
2944     /*
2945      * Now parse the attributes, it ends up with the ending
2946      *
2947      * (S Attribute)* S?
2948      */
2949     SKIP_BLANKS;
2950     while ((IS_CHAR(CUR)) &&
2951            (CUR != '>') &&
2952            ((CUR != '/') || (NXT(1) != '>'))) {
2953         long cons = ctxt->nbChars;
2954
2955         GROW;
2956         attname = htmlParseAttribute(ctxt, &attvalue);
2957         if (attname != NULL) {
2958
2959             /*
2960              * Well formedness requires at most one declaration of an attribute
2961              */
2962             for (i = 0; i < nbatts;i += 2) {
2963                 if (xmlStrEqual(atts[i], attname)) {
2964                     if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2965                         ctxt->sax->error(ctxt->userData,
2966                                          "Attribute %s redefined\n",
2967                                          attname);
2968                     ctxt->wellFormed = 0;
2969                     xmlFree(attname);
2970                     if (attvalue != NULL)
2971                         xmlFree(attvalue);
2972                     goto failed;
2973                 }
2974             }
2975
2976             /*
2977              * Add the pair to atts
2978              */
2979             if (atts == NULL) {
2980                 maxatts = 10;
2981                 atts = (const xmlChar **) xmlMalloc(maxatts * sizeof(xmlChar *));
2982                 if (atts == NULL) {
2983                     xmlGenericError(xmlGenericErrorContext,
2984                             "malloc of %ld byte failed\n",
2985                             maxatts * (long)sizeof(xmlChar *));
2986                     if (name != NULL) xmlFree(name);
2987                     return;
2988                 }
2989             } else if (nbatts + 4 > maxatts) {
2990                 maxatts *= 2;
2991                 atts = (const xmlChar **) xmlRealloc((void *) atts,
2992                                                      maxatts * sizeof(xmlChar *));
2993                 if (atts == NULL) {
2994                     xmlGenericError(xmlGenericErrorContext,
2995                             "realloc of %ld byte failed\n",
2996                             maxatts * (long)sizeof(xmlChar *));
2997                     if (name != NULL) xmlFree(name);
2998                     return;
2999                 }
3000             }
3001             atts[nbatts++] = attname;
3002             atts[nbatts++] = attvalue;
3003             atts[nbatts] = NULL;
3004             atts[nbatts + 1] = NULL;
3005         }
3006         else {
3007             /* Dump the bogus attribute string up to the next blank or
3008              * the end of the tag. */
3009             while ((IS_CHAR(CUR)) && !(IS_BLANK(CUR)) && (CUR != '>')
3010              && ((CUR != '/') || (NXT(1) != '>')))
3011                 NEXT;
3012         }
3013
3014 failed:
3015         SKIP_BLANKS;
3016         if (cons == ctxt->nbChars) {
3017             if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3018                 ctxt->sax->error(ctxt->userData,
3019                  "htmlParseStartTag: problem parsing attributes\n");
3020             ctxt->wellFormed = 0;
3021             break;
3022         }
3023     }
3024
3025     /*
3026      * Handle specific association to the META tag
3027      */
3028     if (meta)
3029         htmlCheckMeta(ctxt, atts);
3030
3031     /*
3032      * SAX: Start of Element !
3033      */
3034     htmlnamePush(ctxt, xmlStrdup(name));
3035 #ifdef DEBUG
3036     xmlGenericError(xmlGenericErrorContext,"Start of element %s: pushed %s\n", name, ctxt->name);
3037 #endif
3038     if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
3039         ctxt->sax->startElement(ctxt->userData, name, atts);
3040
3041     if (atts != NULL) {
3042         for (i = 0;i < nbatts;i++) {
3043             if (atts[i] != NULL)
3044                 xmlFree((xmlChar *) atts[i]);
3045         }
3046         xmlFree((void *) atts);
3047     }
3048     if (name != NULL) xmlFree(name);
3049 }
3050
3051 /**
3052  * htmlParseEndTag:
3053  * @ctxt:  an HTML parser context
3054  *
3055  * parse an end of tag
3056  *
3057  * [42] ETag ::= '</' Name S? '>'
3058  *
3059  * With namespace
3060  *
3061  * [NS 9] ETag ::= '</' QName S? '>'
3062  *
3063  * Returns 1 if the current level should be closed.
3064  */
3065
3066 static int
3067 htmlParseEndTag(htmlParserCtxtPtr ctxt) {
3068     xmlChar *name;
3069     xmlChar *oldname;
3070     int i, ret;
3071
3072     if ((CUR != '<') || (NXT(1) != '/')) {
3073         if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3074             ctxt->sax->error(ctxt->userData, "htmlParseEndTag: '</' not found\n");
3075         ctxt->wellFormed = 0;
3076         return(0);
3077     }
3078     SKIP(2);
3079
3080     name = htmlParseHTMLName(ctxt);
3081     if (name == NULL) return(0);
3082
3083     /*
3084      * We should definitely be at the ending "S? '>'" part
3085      */
3086     SKIP_BLANKS;
3087     if ((!IS_CHAR(CUR)) || (CUR != '>')) {
3088         if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3089             ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
3090         ctxt->wellFormed = 0;
3091     } else
3092         NEXT;
3093
3094     /*
3095      * If the name read is not one of the element in the parsing stack
3096      * then return, it's just an error.
3097      */
3098     for (i = (ctxt->nameNr - 1);i >= 0;i--) {
3099         if (xmlStrEqual(name, ctxt->nameTab[i])) break;
3100     }
3101     if (i < 0) {
3102         if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3103             ctxt->sax->error(ctxt->userData,
3104              "Unexpected end tag : %s\n", name);
3105         xmlFree(name);
3106         ctxt->wellFormed = 0;
3107         return(0);
3108     }
3109
3110
3111     /*
3112      * Check for auto-closure of HTML elements.
3113      */
3114
3115     htmlAutoCloseOnClose(ctxt, name);
3116
3117     /*
3118      * Well formedness constraints, opening and closing must match.
3119      * With the exception that the autoclose may have popped stuff out
3120      * of the stack.
3121      */
3122     if (!xmlStrEqual(name, ctxt->name)) {
3123 #ifdef DEBUG
3124         xmlGenericError(xmlGenericErrorContext,"End of tag %s: expecting %s\n", name, ctxt->name);
3125 #endif
3126         if ((ctxt->name != NULL) &&
3127             (!xmlStrEqual(ctxt->name, name))) {
3128             if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3129                 ctxt->sax->error(ctxt->userData,
3130                  "Opening and ending tag mismatch: %s and %s\n",
3131                                  name, ctxt->name);
3132             ctxt->wellFormed = 0;
3133         }
3134     }
3135
3136     /*
3137      * SAX: End of Tag
3138      */
3139     oldname = ctxt->name;
3140     if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
3141         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3142             ctxt->sax->endElement(ctxt->userData, name);
3143         oldname = htmlnamePop(ctxt);
3144         if (oldname != NULL) {
3145 #ifdef DEBUG
3146             xmlGenericError(xmlGenericErrorContext,"End of tag %s: popping out %s\n", name, oldname);
3147 #endif
3148             xmlFree(oldname);
3149 #ifdef DEBUG
3150         } else {
3151             xmlGenericError(xmlGenericErrorContext,"End of tag %s: stack empty !!!\n", name);
3152 #endif
3153         }
3154         ret = 1;
3155     } else {
3156         ret = 0;
3157     }
3158
3159     if (name != NULL)
3160         xmlFree(name);
3161
3162     return(ret);
3163 }
3164
3165
3166 /**
3167  * htmlParseReference:
3168  * @ctxt:  an HTML parser context
3169  *
3170  * parse and handle entity references in content,
3171  * this will end-up in a call to character() since this is either a
3172  * CharRef, or a predefined entity.
3173  */
3174 static void
3175 htmlParseReference(htmlParserCtxtPtr ctxt) {
3176     const htmlEntityDesc * ent;
3177     xmlChar out[6];
3178     xmlChar *name;
3179     if (CUR != '&') return;
3180
3181     if (NXT(1) == '#') {
3182         unsigned int c;
3183         int bits, i = 0;
3184
3185         c = htmlParseCharRef(ctxt);
3186         if (c == 0)
3187             return;
3188
3189         if      (c <    0x80) { out[i++]= c;                bits= -6; }
3190         else if (c <   0x800) { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
3191         else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
3192         else                  { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
3193
3194         for ( ; bits >= 0; bits-= 6) {
3195             out[i++]= ((c >> bits) & 0x3F) | 0x80;
3196         }
3197         out[i] = 0;
3198
3199         htmlCheckParagraph(ctxt);
3200         if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3201             ctxt->sax->characters(ctxt->userData, out, i);
3202     } else {
3203         ent = htmlParseEntityRef(ctxt, &name);
3204         if (name == NULL) {
3205             htmlCheckParagraph(ctxt);
3206             if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3207                 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3208             return;
3209         }
3210         if ((ent == NULL) || !(ent->value > 0)) {
3211             htmlCheckParagraph(ctxt);
3212             if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3213                 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3214                 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3215                 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3216             }
3217         } else {
3218             unsigned int c;
3219             int bits, i = 0;
3220
3221             c = ent->value;
3222             if      (c <    0x80)
3223                     { out[i++]= c;                bits= -6; }
3224             else if (c <   0x800)
3225                     { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
3226             else if (c < 0x10000)
3227                     { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
3228             else
3229                     { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
3230
3231             for ( ; bits >= 0; bits-= 6) {
3232                 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3233             }
3234             out[i] = 0;
3235
3236             htmlCheckParagraph(ctxt);
3237             if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3238                 ctxt->sax->characters(ctxt->userData, out, i);
3239         }
3240         xmlFree(name);
3241     }
3242 }
3243
3244 /**
3245  * htmlParseContent:
3246  * @ctxt:  an HTML parser context
3247  * @name:  the node name
3248  *
3249  * Parse a content: comment, sub-element, reference or text.
3250  *
3251  */
3252
3253 static void
3254 htmlParseContent(htmlParserCtxtPtr ctxt) {
3255     xmlChar *currentNode;
3256     int depth;
3257
3258     currentNode = xmlStrdup(ctxt->name);
3259     depth = ctxt->nameNr;
3260     while (1) {
3261         long cons = ctxt->nbChars;
3262
3263         GROW;
3264         /*
3265          * Our tag or one of it's parent or children is ending.
3266          */
3267         if ((CUR == '<') && (NXT(1) == '/')) {
3268             if (htmlParseEndTag(ctxt) &&
3269                 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
3270                 if (currentNode != NULL)
3271                     xmlFree(currentNode);
3272                 return;
3273             }
3274             continue; /* while */
3275         }
3276
3277         /*
3278          * Has this node been popped out during parsing of
3279          * the next element
3280          */
3281         if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3282             (!xmlStrEqual(currentNode, ctxt->name)))
3283              {
3284             if (currentNode != NULL) xmlFree(currentNode);
3285             return;
3286         }
3287
3288         if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3289             (xmlStrEqual(currentNode, BAD_CAST"style")))) {
3290             /*
3291              * Handle SCRIPT/STYLE separately
3292              */
3293             htmlParseScript(ctxt);
3294         } else {
3295             /*
3296              * Sometimes DOCTYPE arrives in the middle of the document
3297              */
3298             if ((CUR == '<') && (NXT(1) == '!') &&
3299                 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3300                 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3301                 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3302                 (UPP(8) == 'E')) {
3303                 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3304                     ctxt->sax->error(ctxt->userData,
3305                                      "Misplaced DOCTYPE declaration\n");
3306                 ctxt->wellFormed = 0;
3307                 htmlParseDocTypeDecl(ctxt);
3308             }
3309
3310             /*
3311              * First case :  a comment
3312              */
3313             if ((CUR == '<') && (NXT(1) == '!') &&
3314                 (NXT(2) == '-') && (NXT(3) == '-')) {
3315                 htmlParseComment(ctxt);
3316             }
3317
3318             /*
3319              * Second case :  a sub-element.
3320              */
3321             else if (CUR == '<') {
3322                 htmlParseElement(ctxt);
3323             }
3324
3325             /*
3326              * Third case : a reference. If if has not been resolved,
3327              *    parsing returns it's Name, create the node
3328              */
3329             else if (CUR == '&') {
3330                 htmlParseReference(ctxt);
3331             }
3332
3333             /*
3334              * Fourth : end of the resource
3335              */
3336             else if (CUR == 0) {
3337                 htmlAutoCloseOnEnd(ctxt);
3338                 break;
3339             }
3340
3341             /*
3342              * Last case, text. Note that References are handled directly.
3343              */
3344             else {
3345                 htmlParseCharData(ctxt);
3346             }
3347
3348             if (cons == ctxt->nbChars) {
3349                 if (ctxt->node != NULL) {
3350                     if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3351                         ctxt->sax->error(ctxt->userData,
3352                                          "detected an error in element content\n");
3353                     ctxt->wellFormed = 0;
3354                 }
3355                 break;
3356             }
3357         }
3358         GROW;
3359     }
3360     if (currentNode != NULL) xmlFree(currentNode);
3361 }
3362
3363 /**
3364  * htmlParseElement:
3365  * @ctxt:  an HTML parser context
3366  *
3367  * parse an HTML element, this is highly recursive
3368  *
3369  * [39] element ::= EmptyElemTag | STag content ETag
3370  *
3371  * [41] Attribute ::= Name Eq AttValue
3372  */
3373
3374 void
3375 htmlParseElement(htmlParserCtxtPtr ctxt) {
3376     xmlChar *name;
3377     xmlChar *currentNode = NULL;
3378     const htmlElemDesc * info;
3379     htmlParserNodeInfo node_info;
3380     xmlChar *oldname;
3381     int depth = ctxt->nameNr;
3382     const xmlChar *oldptr;
3383
3384     /* Capture start position */
3385     if (ctxt->record_info) {
3386         node_info.begin_pos = ctxt->input->consumed +
3387                           (CUR_PTR - ctxt->input->base);
3388         node_info.begin_line = ctxt->input->line;
3389     }
3390
3391     oldname = xmlStrdup(ctxt->name);
3392     htmlParseStartTag(ctxt);
3393     name = ctxt->name;
3394 #ifdef DEBUG
3395     if (oldname == NULL)
3396         xmlGenericError(xmlGenericErrorContext,
3397                 "Start of element %s\n", name);
3398     else if (name == NULL)
3399         xmlGenericError(xmlGenericErrorContext,
3400                 "Start of element failed, was %s\n", oldname);
3401     else
3402         xmlGenericError(xmlGenericErrorContext,
3403                 "Start of element %s, was %s\n", name, oldname);
3404 #endif
3405     if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) ||
3406         (name == NULL)) {
3407         if (CUR == '>')
3408             NEXT;
3409         if (oldname != NULL)
3410             xmlFree(oldname);
3411         return;
3412     }
3413     if (oldname != NULL)
3414         xmlFree(oldname);
3415
3416     /*
3417      * Lookup the info for that element.
3418      */
3419     info = htmlTagLookup(name);
3420     if (info == NULL) {
3421         if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3422             ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
3423                              name);
3424         ctxt->wellFormed = 0;
3425     } else if (info->depr) {
3426 /***************************
3427         if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
3428             ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
3429                                name);
3430  ***************************/
3431     }
3432
3433     /*
3434      * Check for an Empty Element labeled the XML/SGML way
3435      */
3436     if ((CUR == '/') && (NXT(1) == '>')) {
3437         SKIP(2);
3438         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3439             ctxt->sax->endElement(ctxt->userData, name);
3440         oldname = htmlnamePop(ctxt);
3441 #ifdef DEBUG
3442         xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n", oldname);
3443 #endif
3444         if (oldname != NULL)
3445             xmlFree(oldname);
3446         return;
3447     }
3448
3449     if (CUR == '>') {
3450         NEXT;
3451     } else {
3452         if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3453             ctxt->sax->error(ctxt->userData,
3454                              "Couldn't find end of Start Tag %s\n",
3455                              name);
3456         ctxt->wellFormed = 0;
3457
3458         /*
3459          * end of parsing of this node.
3460          */
3461         if (xmlStrEqual(name, ctxt->name)) {
3462             nodePop(ctxt);
3463             oldname = htmlnamePop(ctxt);
3464 #ifdef DEBUG
3465             xmlGenericError(xmlGenericErrorContext,"End of start tag problem: popping out %s\n", oldname);
3466 #endif
3467             if (oldname != NULL)
3468                 xmlFree(oldname);
3469         }
3470
3471         /*
3472          * Capture end position and add node
3473          */
3474         if ( currentNode != NULL && ctxt->record_info ) {
3475            node_info.end_pos = ctxt->input->consumed +
3476                               (CUR_PTR - ctxt->input->base);
3477            node_info.end_line = ctxt->input->line;
3478            node_info.node = ctxt->node;
3479            xmlParserAddNodeInfo(ctxt, &node_info);
3480         }
3481         return;
3482     }
3483
3484     /*
3485      * Check for an Empty Element from DTD definition
3486      */
3487     if ((info != NULL) && (info->empty)) {
3488         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3489             ctxt->sax->endElement(ctxt->userData, name);
3490         oldname = htmlnamePop(ctxt);
3491 #ifdef DEBUG
3492         xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
3493 #endif
3494         if (oldname != NULL)
3495             xmlFree(oldname);
3496         return;
3497     }
3498
3499     /*
3500      * Parse the content of the element:
3501      */
3502     currentNode = xmlStrdup(ctxt->name);
3503     depth = ctxt->nameNr;
3504     while (IS_CHAR(CUR)) {
3505         oldptr = ctxt->input->cur;
3506         htmlParseContent(ctxt);
3507         if (oldptr==ctxt->input->cur) break;
3508         if (ctxt->nameNr < depth) break;
3509     }
3510
3511     /*
3512      * Capture end position and add node
3513      */
3514     if ( currentNode != NULL && ctxt->record_info ) {
3515        node_info.end_pos = ctxt->input->consumed +
3516                           (CUR_PTR - ctxt->input->base);
3517        node_info.end_line = ctxt->input->line;
3518        node_info.node = ctxt->node;
3519        xmlParserAddNodeInfo(ctxt, &node_info);
3520     }
3521     if (!IS_CHAR(CUR)) {
3522         htmlAutoCloseOnEnd(ctxt);
3523     }
3524
3525     if (currentNode != NULL)
3526         xmlFree(currentNode);
3527 }
3528
3529 /**
3530  * htmlParseDocument :
3531  * @ctxt:  an HTML parser context
3532  *
3533  * parse an HTML document (and build a tree if using the standard SAX
3534  * interface).
3535  *
3536  * Returns 0, -1 in case of error. the parser context is augmented
3537  *                as a result of the parsing.
3538  */
3539
3540 int
3541 htmlParseDocument(htmlParserCtxtPtr ctxt) {
3542     xmlDtdPtr dtd;
3543
3544     xmlInitParser();
3545
3546     htmlDefaultSAXHandlerInit();
3547     ctxt->html = 1;
3548
3549     GROW;
3550     /*
3551      * SAX: beginning of the document processing.
3552      */
3553     if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3554         ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
3555
3556     /*
3557      * Wipe out everything which is before the first '<'
3558      */
3559     SKIP_BLANKS;
3560     if (CUR == 0) {
3561         if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3562             ctxt->sax->error(ctxt->userData, "Document is empty\n");
3563         ctxt->wellFormed = 0;
3564     }
3565
3566     if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
3567         ctxt->sax->startDocument(ctxt->userData);
3568
3569
3570     /*
3571      * Parse possible comments before any content
3572      */
3573     while ((CUR == '<') && (NXT(1) == '!') &&
3574            (NXT(2) == '-') && (NXT(3) == '-')) {
3575         htmlParseComment(ctxt);
3576         SKIP_BLANKS;
3577     }
3578
3579
3580     /*
3581      * Then possibly doc type declaration(s) and more Misc
3582      * (doctypedecl Misc*)?
3583      */
3584     if ((CUR == '<') && (NXT(1) == '!') &&
3585         (UPP(2) == 'D') && (UPP(3) == 'O') &&
3586         (UPP(4) == 'C') && (UPP(5) == 'T') &&
3587         (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3588         (UPP(8) == 'E')) {
3589         htmlParseDocTypeDecl(ctxt);
3590     }
3591     SKIP_BLANKS;
3592
3593     /*
3594      * Parse possible comments before any content
3595      */
3596     while ((CUR == '<') && (NXT(1) == '!') &&
3597            (NXT(2) == '-') && (NXT(3) == '-')) {
3598         htmlParseComment(ctxt);
3599         SKIP_BLANKS;
3600     }
3601
3602     /*
3603      * Time to start parsing the tree itself
3604      */
3605     htmlParseContent(ctxt);
3606
3607     /*
3608      * autoclose
3609      */
3610     if (CUR == 0)
3611         htmlAutoCloseOnEnd(ctxt);
3612
3613
3614     /*
3615      * SAX: end of the document processing.
3616      */
3617     if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3618         ctxt->sax->endDocument(ctxt->userData);
3619
3620     if (ctxt->myDoc != NULL) {
3621         dtd = xmlGetIntSubset(ctxt->myDoc);
3622         if (dtd == NULL)
3623             ctxt->myDoc->intSubset =
3624                 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
3625                     BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
3626                     BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
3627     }
3628     if (! ctxt->wellFormed) return(-1);
3629     return(0);
3630 }
3631
3632
3633 /************************************************************************
3634  *                                                                      *
3635  *                      Parser contexts handling                        *
3636  *                                                                      *
3637  ************************************************************************/
3638
3639 /**
3640  * xmlInitParserCtxt:
3641  * @ctxt:  an HTML parser context
3642  *
3643  * Initialize a parser context
3644  */
3645
3646 static void
3647 htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
3648 {
3649     htmlSAXHandler *sax;
3650
3651     if (ctxt == NULL) return;
3652     memset(ctxt, 0, sizeof(htmlParserCtxt));
3653
3654     sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
3655     if (sax == NULL) {
3656         xmlGenericError(xmlGenericErrorContext,
3657                 "htmlInitParserCtxt: out of memory\n");
3658     }
3659     else
3660         memset(sax, 0, sizeof(htmlSAXHandler));
3661
3662     /* Allocate the Input stack */
3663     ctxt->inputTab = (htmlParserInputPtr *)
3664                       xmlMalloc(5 * sizeof(htmlParserInputPtr));
3665     if (ctxt->inputTab == NULL) {
3666         xmlGenericError(xmlGenericErrorContext,
3667                 "htmlInitParserCtxt: out of memory\n");
3668         ctxt->inputNr = 0;
3669         ctxt->inputMax = 0;
3670         ctxt->input = NULL;
3671         return;
3672     }
3673     ctxt->inputNr = 0;
3674     ctxt->inputMax = 5;
3675     ctxt->input = NULL;
3676     ctxt->version = NULL;
3677     ctxt->encoding = NULL;
3678     ctxt->standalone = -1;
3679     ctxt->instate = XML_PARSER_START;
3680
3681     /* Allocate the Node stack */
3682     ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
3683     if (ctxt->nodeTab == NULL) {
3684         xmlGenericError(xmlGenericErrorContext,
3685                 "htmlInitParserCtxt: out of memory\n");
3686         ctxt->nodeNr = 0;
3687         ctxt->nodeMax = 0;
3688         ctxt->node = NULL;
3689         ctxt->inputNr = 0;
3690         ctxt->inputMax = 0;
3691         ctxt->input = NULL;
3692         return;
3693     }
3694     ctxt->nodeNr = 0;
3695     ctxt->nodeMax = 10;
3696     ctxt->node = NULL;
3697
3698     /* Allocate the Name stack */
3699     ctxt->nameTab = (xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
3700     if (ctxt->nameTab == NULL) {
3701         xmlGenericError(xmlGenericErrorContext,
3702                 "htmlInitParserCtxt: out of memory\n");
3703         ctxt->nameNr = 0;
3704         ctxt->nameMax = 10;
3705         ctxt->name = NULL;
3706         ctxt->nodeNr = 0;
3707         ctxt->nodeMax = 0;
3708         ctxt->node = NULL;
3709         ctxt->inputNr = 0;
3710         ctxt->inputMax = 0;
3711         ctxt->input = NULL;
3712         return;
3713     }
3714     ctxt->nameNr = 0;
3715     ctxt->nameMax = 10;
3716     ctxt->name = NULL;
3717
3718     if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
3719     else {
3720         ctxt->sax = sax;
3721         memcpy(sax, &htmlDefaultSAXHandler, sizeof(htmlSAXHandler));
3722     }
3723     ctxt->userData = ctxt;
3724     ctxt->myDoc = NULL;
3725     ctxt->wellFormed = 1;
3726     ctxt->replaceEntities = 0;
3727     ctxt->linenumbers = xmlLineNumbersDefaultValue;
3728     ctxt->html = 1;
3729     ctxt->record_info = 0;
3730     ctxt->validate = 0;
3731     ctxt->nbChars = 0;
3732     ctxt->checkIndex = 0;
3733     ctxt->catalogs = NULL;
3734     xmlInitNodeInfoSeq(&ctxt->node_seq);
3735 }
3736
3737 /**
3738  * htmlFreeParserCtxt:
3739  * @ctxt:  an HTML parser context
3740  *
3741  * Free all the memory used by a parser context. However the parsed
3742  * document in ctxt->myDoc is not freed.
3743  */
3744
3745 void
3746 htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
3747 {
3748     xmlFreeParserCtxt(ctxt);
3749 }
3750
3751 /**
3752  * htmlNewParserCtxt:
3753  *
3754  * Allocate and initialize a new parser context.
3755  *
3756  * Returns the xmlParserCtxtPtr or NULL
3757  */
3758
3759 static htmlParserCtxtPtr
3760 htmlNewParserCtxt(void)
3761 {
3762     xmlParserCtxtPtr ctxt;
3763
3764     ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
3765     if (ctxt == NULL) {
3766         xmlGenericError(xmlGenericErrorContext,
3767                 "xmlNewParserCtxt : cannot allocate context\n");
3768         return(NULL);
3769     }
3770     memset(ctxt, 0, sizeof(xmlParserCtxt));
3771     htmlInitParserCtxt(ctxt);
3772     return(ctxt);
3773 }
3774
3775 /**
3776  * htmlCreateMemoryParserCtxt:
3777  * @buffer:  a pointer to a char array
3778  * @size:  the size of the array
3779  *
3780  * Create a parser context for an HTML in-memory document.
3781  *
3782  * Returns the new parser context or NULL
3783  */
3784 static htmlParserCtxtPtr
3785 htmlCreateMemoryParserCtxt(const char *buffer, int size) {
3786     xmlParserCtxtPtr ctxt;
3787     xmlParserInputPtr input;
3788     xmlParserInputBufferPtr buf;
3789
3790     if (buffer == NULL)
3791         return(NULL);
3792     if (size <= 0)
3793         return(NULL);
3794
3795     ctxt = htmlNewParserCtxt();
3796     if (ctxt == NULL)
3797         return(NULL);
3798
3799     buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
3800     if (buf == NULL) return(NULL);
3801
3802     input = xmlNewInputStream(ctxt);
3803     if (input == NULL) {
3804         xmlFreeParserCtxt(ctxt);
3805         return(NULL);
3806     }
3807
3808     input->filename = NULL;
3809     input->buf = buf;
3810     input->base = input->buf->buffer->content;
3811     input->cur = input->buf->buffer->content;
3812     input->end = &input->buf->buffer->content[input->buf->buffer->use];
3813
3814     inputPush(ctxt, input);
3815     return(ctxt);
3816 }
3817
3818 /**
3819  * htmlCreateDocParserCtxt :
3820  * @cur:  a pointer to an array of xmlChar
3821  * @encoding:  a free form C string describing the HTML document encoding, or NULL
3822  *
3823  * Create a parser context for an HTML document.
3824  *
3825  * TODO: check the need to add encoding handling there
3826  *
3827  * Returns the new parser context or NULL
3828  */
3829 static htmlParserCtxtPtr
3830 htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding ATTRIBUTE_UNUSED) {
3831     int len;
3832
3833     if (cur == NULL)
3834         return(NULL);
3835     len = xmlStrlen(cur);
3836     return(htmlCreateMemoryParserCtxt((char *)cur, len));
3837 }
3838
3839 /************************************************************************
3840  *                                                                      *
3841  *              Progressive parsing interfaces                          *
3842  *                                                                      *
3843  ************************************************************************/
3844
3845 /**
3846  * htmlParseLookupSequence:
3847  * @ctxt:  an HTML parser context
3848  * @first:  the first char to lookup
3849  * @next:  the next char to lookup or zero
3850  * @third:  the next char to lookup or zero
3851  *
3852  * Try to find if a sequence (first, next, third) or  just (first next) or
3853  * (first) is available in the input stream.
3854  * This function has a side effect of (possibly) incrementing ctxt->checkIndex
3855  * to avoid rescanning sequences of bytes, it DOES change the state of the
3856  * parser, do not use liberally.
3857  * This is basically similar to xmlParseLookupSequence()
3858  *
3859  * Returns the index to the current parsing point if the full sequence
3860  *      is available, -1 otherwise.
3861  */
3862 static int
3863 htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
3864                        xmlChar next, xmlChar third) {
3865     int base, len;
3866     htmlParserInputPtr in;
3867     const xmlChar *buf;
3868     int incomment = 0;
3869
3870     in = ctxt->input;
3871     if (in == NULL) return(-1);
3872     base = in->cur - in->base;
3873     if (base < 0) return(-1);
3874     if (ctxt->checkIndex > base)
3875         base = ctxt->checkIndex;
3876     if (in->buf == NULL) {
3877         buf = in->base;
3878         len = in->length;
3879     } else {
3880         buf = in->buf->buffer->content;
3881         len = in->buf->buffer->use;
3882     }
3883     /* take into account the sequence length */
3884     if (third) len -= 2;
3885     else if (next) len --;
3886     for (;base < len;base++) {
3887         if (!incomment && (base + 4 < len)) {
3888             if ((buf[base] == '<') && (buf[base + 1] == '!') &&
3889                 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
3890                 incomment = 1;
3891             }
3892             /* do not increment base, some people use <!--> */
3893         }
3894         if (incomment) {
3895             if (base + 3 < len)
3896                 return(-1);
3897             if ((buf[base] == '-') && (buf[base + 1] == '-') &&
3898                 (buf[base + 2] == '>')) {
3899                 incomment = 0;
3900                 base += 2;
3901             }
3902             continue;
3903         }
3904         if (buf[base] == first) {
3905             if (third != 0) {
3906                 if ((buf[base + 1] != next) ||
3907                     (buf[base + 2] != third)) continue;
3908             } else if (next != 0) {
3909                 if (buf[base + 1] != next) continue;
3910             }
3911             ctxt->checkIndex = 0;
3912 #ifdef DEBUG_PUSH
3913             if (next == 0)
3914                 xmlGenericError(xmlGenericErrorContext,
3915                         "HPP: lookup '%c' found at %d\n",
3916                         first, base);
3917             else if (third == 0)
3918                 xmlGenericError(xmlGenericErrorContext,
3919                         "HPP: lookup '%c%c' found at %d\n",
3920                         first, next, base);
3921             else
3922                 xmlGenericError(xmlGenericErrorContext,
3923                         "HPP: lookup '%c%c%c' found at %d\n",
3924                         first, next, third, base);
3925 #endif
3926             return(base - (in->cur - in->base));
3927         }
3928     }
3929     ctxt->checkIndex = base;
3930 #ifdef DEBUG_PUSH
3931     if (next == 0)
3932         xmlGenericError(xmlGenericErrorContext,
3933                 "HPP: lookup '%c' failed\n", first);
3934     else if (third == 0)
3935         xmlGenericError(xmlGenericErrorContext,
3936                 "HPP: lookup '%c%c' failed\n", first, next);
3937     else
3938         xmlGenericError(xmlGenericErrorContext,
3939                 "HPP: lookup '%c%c%c' failed\n", first, next, third);
3940 #endif
3941     return(-1);
3942 }
3943
3944 /**
3945  * htmlParseTryOrFinish:
3946  * @ctxt:  an HTML parser context
3947  * @terminate:  last chunk indicator
3948  *
3949  * Try to progress on parsing
3950  *
3951  * Returns zero if no parsing was possible
3952  */
3953 static int
3954 htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
3955     int ret = 0;
3956     htmlParserInputPtr in;
3957     int avail = 0;
3958     xmlChar cur, next;
3959
3960 #ifdef DEBUG_PUSH
3961     switch (ctxt->instate) {
3962         case XML_PARSER_EOF:
3963             xmlGenericError(xmlGenericErrorContext,
3964                     "HPP: try EOF\n"); break;
3965         case XML_PARSER_START:
3966             xmlGenericError(xmlGenericErrorContext,
3967                     "HPP: try START\n"); break;
3968         case XML_PARSER_MISC:
3969             xmlGenericError(xmlGenericErrorContext,
3970                     "HPP: try MISC\n");break;
3971         case XML_PARSER_COMMENT:
3972             xmlGenericError(xmlGenericErrorContext,
3973                     "HPP: try COMMENT\n");break;
3974         case XML_PARSER_PROLOG:
3975             xmlGenericError(xmlGenericErrorContext,
3976                     "HPP: try PROLOG\n");break;
3977         case XML_PARSER_START_TAG:
3978             xmlGenericError(xmlGenericErrorContext,
3979                     "HPP: try START_TAG\n");break;
3980         case XML_PARSER_CONTENT:
3981             xmlGenericError(xmlGenericErrorContext,
3982                     "HPP: try CONTENT\n");break;
3983         case XML_PARSER_CDATA_SECTION:
3984             xmlGenericError(xmlGenericErrorContext,
3985                     "HPP: try CDATA_SECTION\n");break;
3986         case XML_PARSER_END_TAG:
3987             xmlGenericError(xmlGenericErrorContext,
3988                     "HPP: try END_TAG\n");break;
3989         case XML_PARSER_ENTITY_DECL:
3990             xmlGenericError(xmlGenericErrorContext,
3991                     "HPP: try ENTITY_DECL\n");break;
3992         case XML_PARSER_ENTITY_VALUE:
3993             xmlGenericError(xmlGenericErrorContext,
3994                     "HPP: try ENTITY_VALUE\n");break;
3995         case XML_PARSER_ATTRIBUTE_VALUE:
3996             xmlGenericError(xmlGenericErrorContext,
3997                     "HPP: try ATTRIBUTE_VALUE\n");break;
3998         case XML_PARSER_DTD:
3999             xmlGenericError(xmlGenericErrorContext,
4000                     "HPP: try DTD\n");break;
4001         case XML_PARSER_EPILOG:
4002             xmlGenericError(xmlGenericErrorContext,
4003                     "HPP: try EPILOG\n");break;
4004         case XML_PARSER_PI:
4005             xmlGenericError(xmlGenericErrorContext,
4006                     "HPP: try PI\n");break;
4007         case XML_PARSER_SYSTEM_LITERAL:
4008             xmlGenericError(xmlGenericErrorContext,
4009                     "HPP: try SYSTEM_LITERAL\n");break;
4010     }
4011 #endif
4012
4013     while (1) {
4014
4015         in = ctxt->input;
4016         if (in == NULL) break;
4017         if (in->buf == NULL)
4018             avail = in->length - (in->cur - in->base);
4019         else
4020             avail = in->buf->buffer->use - (in->cur - in->base);
4021         if ((avail == 0) && (terminate)) {
4022             htmlAutoCloseOnEnd(ctxt);
4023             if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4024                 /*
4025                  * SAX: end of the document processing.
4026                  */
4027                 ctxt->instate = XML_PARSER_EOF;
4028                 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4029                     ctxt->sax->endDocument(ctxt->userData);
4030             }
4031         }
4032         if (avail < 1)
4033             goto done;
4034         switch (ctxt->instate) {
4035             case XML_PARSER_EOF:
4036                 /*
4037                  * Document parsing is done !
4038                  */
4039                 goto done;
4040             case XML_PARSER_START:
4041                 /*
4042                  * Very first chars read from the document flow.
4043                  */
4044                 cur = in->cur[0];
4045                 if (IS_BLANK(cur)) {
4046                     SKIP_BLANKS;
4047                     if (in->buf == NULL)
4048                         avail = in->length - (in->cur - in->base);
4049                     else
4050                         avail = in->buf->buffer->use - (in->cur - in->base);
4051                 }
4052                 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4053                     ctxt->sax->setDocumentLocator(ctxt->userData,
4054                                                   &xmlDefaultSAXLocator);
4055                 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4056                     (!ctxt->disableSAX))
4057                     ctxt->sax->startDocument(ctxt->userData);
4058
4059                 cur = in->cur[0];
4060                 next = in->cur[1];
4061                 if ((cur == '<') && (next == '!') &&
4062                     (UPP(2) == 'D') && (UPP(3) == 'O') &&
4063                     (UPP(4) == 'C') && (UPP(5) == 'T') &&
4064                     (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4065                     (UPP(8) == 'E')) {
4066                     if ((!terminate) &&
4067                         (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4068                         goto done;
4069 #ifdef DEBUG_PUSH
4070                     xmlGenericError(xmlGenericErrorContext,
4071                             "HPP: Parsing internal subset\n");
4072 #endif
4073                     htmlParseDocTypeDecl(ctxt);
4074                     ctxt->instate = XML_PARSER_PROLOG;
4075 #ifdef DEBUG_PUSH
4076                     xmlGenericError(xmlGenericErrorContext,
4077                             "HPP: entering PROLOG\n");
4078 #endif
4079                 } else {
4080                     ctxt->instate = XML_PARSER_MISC;
4081                 }
4082 #ifdef DEBUG_PUSH
4083                 xmlGenericError(xmlGenericErrorContext,
4084                         "HPP: entering MISC\n");
4085 #endif
4086                 break;
4087             case XML_PARSER_MISC:
4088                 SKIP_BLANKS;
4089                 if (in->buf == NULL)
4090                     avail = in->length - (in->cur - in->base);
4091                 else
4092                     avail = in->buf->buffer->use - (in->cur - in->base);
4093                 if (avail < 2)
4094                     goto done;
4095                 cur = in->cur[0];
4096                 next = in->cur[1];
4097                 if ((cur == '<') && (next == '!') &&
4098                     (in->cur[2] == '-') && (in->cur[3] == '-')) {
4099                     if ((!terminate) &&
4100                         (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4101                         goto done;
4102 #ifdef DEBUG_PUSH
4103                     xmlGenericError(xmlGenericErrorContext,
4104                             "HPP: Parsing Comment\n");
4105 #endif
4106                     htmlParseComment(ctxt);
4107                     ctxt->instate = XML_PARSER_MISC;
4108                 } else if ((cur == '<') && (next == '!') &&
4109                     (UPP(2) == 'D') && (UPP(3) == 'O') &&
4110                     (UPP(4) == 'C') && (UPP(5) == 'T') &&
4111                     (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4112                     (UPP(8) == 'E')) {
4113                     if ((!terminate) &&
4114                         (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4115                         goto done;
4116 #ifdef DEBUG_PUSH
4117                     xmlGenericError(xmlGenericErrorContext,
4118                             "HPP: Parsing internal subset\n");
4119 #endif
4120                     htmlParseDocTypeDecl(ctxt);
4121                     ctxt->instate = XML_PARSER_PROLOG;
4122 #ifdef DEBUG_PUSH
4123                     xmlGenericError(xmlGenericErrorContext,
4124                             "HPP: entering PROLOG\n");
4125 #endif
4126                 } else if ((cur == '<') && (next == '!') &&
4127                            (avail < 9)) {
4128                     goto done;
4129                 } else {
4130                     ctxt->instate = XML_PARSER_START_TAG;
4131 #ifdef DEBUG_PUSH
4132                     xmlGenericError(xmlGenericErrorContext,
4133                             "HPP: entering START_TAG\n");
4134 #endif
4135                 }
4136                 break;
4137             case XML_PARSER_PROLOG:
4138                 SKIP_BLANKS;
4139                 if (in->buf == NULL)
4140                     avail = in->length - (in->cur - in->base);
4141                 else
4142                     avail = in->buf->buffer->use - (in->cur - in->base);
4143                 if (avail < 2)
4144                     goto done;
4145                 cur = in->cur[0];
4146                 next = in->cur[1];
4147                 if ((cur == '<') && (next == '!') &&
4148                     (in->cur[2] == '-') && (in->cur[3] == '-')) {
4149                     if ((!terminate) &&
4150                         (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4151                         goto done;
4152 #ifdef DEBUG_PUSH
4153                     xmlGenericError(xmlGenericErrorContext,
4154                             "HPP: Parsing Comment\n");
4155 #endif
4156                     htmlParseComment(ctxt);
4157                     ctxt->instate = XML_PARSER_PROLOG;
4158                 } else if ((cur == '<') && (next == '!') &&
4159                            (avail < 4)) {
4160                     goto done;
4161                 } else {
4162                     ctxt->instate = XML_PARSER_START_TAG;
4163 #ifdef DEBUG_PUSH
4164                     xmlGenericError(xmlGenericErrorContext,
4165                             "HPP: entering START_TAG\n");
4166 #endif
4167                 }
4168                 break;
4169             case XML_PARSER_EPILOG:
4170                 if (in->buf == NULL)
4171                     avail = in->length - (in->cur - in->base);
4172                 else
4173                     avail = in->buf->buffer->use - (in->cur - in->base);
4174                 if (avail < 1)
4175                     goto done;
4176                 cur = in->cur[0];
4177                 if (IS_BLANK(cur)) {
4178                     htmlParseCharData(ctxt);
4179                     goto done;
4180                 }
4181                 if (avail < 2)
4182                     goto done;
4183                 next = in->cur[1];
4184                 if ((cur == '<') && (next == '!') &&
4185                     (in->cur[2] == '-') && (in->cur[3] == '-')) {
4186                     if ((!terminate) &&
4187                         (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4188                         goto done;
4189 #ifdef DEBUG_PUSH
4190                     xmlGenericError(xmlGenericErrorContext,
4191                             "HPP: Parsing Comment\n");
4192 #endif
4193                     htmlParseComment(ctxt);
4194                     ctxt->instate = XML_PARSER_EPILOG;
4195                 } else if ((cur == '<') && (next == '!') &&
4196                            (avail < 4)) {
4197                     goto done;
4198                 } else {
4199                     ctxt->errNo = XML_ERR_DOCUMENT_END;
4200                     ctxt->wellFormed = 0;
4201                     ctxt->instate = XML_PARSER_EOF;
4202 #ifdef DEBUG_PUSH
4203                     xmlGenericError(xmlGenericErrorContext,
4204                             "HPP: entering EOF\n");
4205 #endif
4206                     if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4207                         ctxt->sax->endDocument(ctxt->userData);
4208                     goto done;
4209                 }
4210                 break;
4211             case XML_PARSER_START_TAG: {
4212                 xmlChar *name, *oldname;
4213                 int depth = ctxt->nameNr;
4214                 const htmlElemDesc * info;
4215
4216                 if (avail < 2)
4217                     goto done;
4218                 cur = in->cur[0];
4219                 if (cur != '<') {
4220                     ctxt->instate = XML_PARSER_CONTENT;
4221 #ifdef DEBUG_PUSH
4222                     xmlGenericError(xmlGenericErrorContext,
4223                             "HPP: entering CONTENT\n");
4224 #endif
4225                     break;
4226                 }
4227                 if (in->cur[1] == '/') {
4228                     ctxt->instate = XML_PARSER_END_TAG;
4229                     ctxt->checkIndex = 0;
4230 #ifdef DEBUG_PUSH
4231                     xmlGenericError(xmlGenericErrorContext,
4232                             "HPP: entering END_TAG\n");
4233 #endif
4234                     break;
4235                 }
4236                 if ((!terminate) &&
4237                     (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4238                     goto done;
4239
4240                 oldname = xmlStrdup(ctxt->name);
4241                 htmlParseStartTag(ctxt);
4242                 name = ctxt->name;
4243 #ifdef DEBUG
4244                 if (oldname == NULL)
4245                     xmlGenericError(xmlGenericErrorContext,
4246                             "Start of element %s\n", name);
4247                 else if (name == NULL)
4248                     xmlGenericError(xmlGenericErrorContext,
4249                             "Start of element failed, was %s\n",
4250                             oldname);
4251                 else
4252                     xmlGenericError(xmlGenericErrorContext,
4253                             "Start of element %s, was %s\n",
4254                             name, oldname);
4255 #endif
4256                 if (((depth == ctxt->nameNr) &&
4257                      (xmlStrEqual(oldname, ctxt->name))) ||
4258                     (name == NULL)) {
4259                     if (CUR == '>')
4260                         NEXT;
4261                     if (oldname != NULL)
4262                         xmlFree(oldname);
4263                     break;
4264                 }
4265                 if (oldname != NULL)
4266                     xmlFree(oldname);
4267
4268                 /*
4269                  * Lookup the info for that element.
4270                  */
4271                 info = htmlTagLookup(name);
4272                 if (info == NULL) {
4273                     if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4274                         ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
4275                                          name);
4276                     ctxt->wellFormed = 0;
4277                 } else if (info->depr) {
4278                     /***************************
4279                     if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
4280                         ctxt->sax->warning(ctxt->userData,
4281                                            "Tag %s is deprecated\n",
4282                                            name);
4283                      ***************************/
4284                 }
4285
4286                 /*
4287                  * Check for an Empty Element labeled the XML/SGML way
4288                  */
4289                 if ((CUR == '/') && (NXT(1) == '>')) {
4290                     SKIP(2);
4291                     if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4292                         ctxt->sax->endElement(ctxt->userData, name);
4293                     oldname = htmlnamePop(ctxt);
4294 #ifdef DEBUG
4295                     xmlGenericError(xmlGenericErrorContext,"End of tag the XML way: popping out %s\n",
4296                             oldname);
4297 #endif
4298                     if (oldname != NULL)
4299                         xmlFree(oldname);
4300                     ctxt->instate = XML_PARSER_CONTENT;
4301 #ifdef DEBUG_PUSH
4302                     xmlGenericError(xmlGenericErrorContext,
4303                             "HPP: entering CONTENT\n");
4304 #endif
4305                     break;
4306                 }
4307
4308                 if (CUR == '>') {
4309                     NEXT;
4310                 } else {
4311                     if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4312                         ctxt->sax->error(ctxt->userData,
4313                                          "Couldn't find end of Start Tag %s\n",
4314                                          name);
4315                     ctxt->wellFormed = 0;
4316
4317                     /*
4318                      * end of parsing of this node.
4319                      */
4320                     if (xmlStrEqual(name, ctxt->name)) {
4321                         nodePop(ctxt);
4322                         oldname = htmlnamePop(ctxt);
4323 #ifdef DEBUG
4324                         xmlGenericError(xmlGenericErrorContext,
4325                          "End of start tag problem: popping out %s\n", oldname);
4326 #endif
4327                         if (oldname != NULL)
4328                             xmlFree(oldname);
4329                     }
4330
4331                     ctxt->instate = XML_PARSER_CONTENT;
4332 #ifdef DEBUG_PUSH
4333                     xmlGenericError(xmlGenericErrorContext,
4334                             "HPP: entering CONTENT\n");
4335 #endif
4336                     break;
4337                 }
4338
4339                 /*
4340                  * Check for an Empty Element from DTD definition
4341                  */
4342                 if ((info != NULL) && (info->empty)) {
4343                     if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4344                         ctxt->sax->endElement(ctxt->userData, name);
4345                     oldname = htmlnamePop(ctxt);
4346 #ifdef DEBUG
4347                     xmlGenericError(xmlGenericErrorContext,"End of empty tag %s : popping out %s\n", name, oldname);
4348 #endif
4349                     if (oldname != NULL)
4350                         xmlFree(oldname);
4351                 }
4352                 ctxt->instate = XML_PARSER_CONTENT;
4353 #ifdef DEBUG_PUSH
4354                 xmlGenericError(xmlGenericErrorContext,
4355                         "HPP: entering CONTENT\n");
4356 #endif
4357                 break;
4358             }
4359             case XML_PARSER_CONTENT: {
4360                 long cons;
4361                 /*
4362                  * Handle preparsed entities and charRef
4363                  */
4364                 if (ctxt->token != 0) {
4365                     xmlChar chr[2] = { 0 , 0 } ;
4366
4367                     chr[0] = (xmlChar) ctxt->token;
4368                     htmlCheckParagraph(ctxt);
4369                     if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4370                         ctxt->sax->characters(ctxt->userData, chr, 1);
4371                     ctxt->token = 0;
4372                     ctxt->checkIndex = 0;
4373                 }
4374                 if ((avail == 1) && (terminate)) {
4375                     cur = in->cur[0];
4376                     if ((cur != '<') && (cur != '&')) {
4377                         if (ctxt->sax != NULL) {
4378                             if (IS_BLANK(cur)) {
4379                                 if (ctxt->sax->ignorableWhitespace != NULL)
4380                                     ctxt->sax->ignorableWhitespace(
4381                                             ctxt->userData, &cur, 1);
4382                             } else {
4383                                 htmlCheckParagraph(ctxt);
4384                                 if (ctxt->sax->characters != NULL)
4385                                     ctxt->sax->characters(
4386                                             ctxt->userData, &cur, 1);
4387                             }
4388                         }
4389                         ctxt->token = 0;
4390                         ctxt->checkIndex = 0;
4391                         in->cur++;
4392                         break;
4393                     }
4394                 }
4395                 if (avail < 2)
4396                     goto done;
4397                 cur = in->cur[0];
4398                 next = in->cur[1];
4399                 cons = ctxt->nbChars;
4400                 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
4401                     (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
4402                     /*
4403                      * Handle SCRIPT/STYLE separately
4404                      */
4405                     if ((!terminate) &&
4406                         (htmlParseLookupSequence(ctxt, '<', '/', 0) < 0))
4407                         goto done;
4408                     htmlParseScript(ctxt);
4409                     if ((cur == '<') && (next == '/')) {
4410                         ctxt->instate = XML_PARSER_END_TAG;
4411                         ctxt->checkIndex = 0;
4412 #ifdef DEBUG_PUSH
4413                         xmlGenericError(xmlGenericErrorContext,
4414                                 "HPP: entering END_TAG\n");
4415 #endif
4416                         break;
4417                     }
4418                 } else {
4419                     /*
4420                      * Sometimes DOCTYPE arrives in the middle of the document
4421                      */
4422                     if ((cur == '<') && (next == '!') &&
4423                         (UPP(2) == 'D') && (UPP(3) == 'O') &&
4424                         (UPP(4) == 'C') && (UPP(5) == 'T') &&
4425                         (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4426                         (UPP(8) == 'E')) {
4427                         if ((!terminate) &&
4428                             (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4429                             goto done;
4430                         if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4431                             ctxt->sax->error(ctxt->userData,
4432                                  "Misplaced DOCTYPE declaration\n");
4433                         ctxt->wellFormed = 0;
4434                         htmlParseDocTypeDecl(ctxt);
4435                     } else if ((cur == '<') && (next == '!') &&
4436                         (in->cur[2] == '-') && (in->cur[3] == '-')) {
4437                         if ((!terminate) &&
4438                             (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
4439                             goto done;
4440 #ifdef DEBUG_PUSH
4441                         xmlGenericError(xmlGenericErrorContext,
4442                                 "HPP: Parsing Comment\n");
4443 #endif
4444                         htmlParseComment(ctxt);
4445                         ctxt->instate = XML_PARSER_CONTENT;
4446                     } else if ((cur == '<') && (next == '!') && (avail < 4)) {
4447                         goto done;
4448                     } else if ((cur == '<') && (next == '/')) {
4449                         ctxt->instate = XML_PARSER_END_TAG;
4450                         ctxt->checkIndex = 0;
4451 #ifdef DEBUG_PUSH
4452                         xmlGenericError(xmlGenericErrorContext,
4453                                 "HPP: entering END_TAG\n");
4454 #endif
4455                         break;
4456                     } else if (cur == '<') {
4457                         ctxt->instate = XML_PARSER_START_TAG;
4458                         ctxt->checkIndex = 0;
4459 #ifdef DEBUG_PUSH
4460                         xmlGenericError(xmlGenericErrorContext,
4461                                 "HPP: entering START_TAG\n");
4462 #endif
4463                         break;
4464                     } else if (cur == '&') {
4465                         if ((!terminate) &&
4466                             (htmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
4467                             goto done;
4468 #ifdef DEBUG_PUSH
4469                         xmlGenericError(xmlGenericErrorContext,
4470                                 "HPP: Parsing Reference\n");
4471 #endif
4472                         /* TODO: check generation of subtrees if noent !!! */
4473                         htmlParseReference(ctxt);
4474                     } else {
4475                         /* TODO Avoid the extra copy, handle directly !!!!!! */
4476                         /*
4477                          * Goal of the following test is :
4478                          *  - minimize calls to the SAX 'character' callback
4479                          *    when they are mergeable
4480                          */
4481                         if ((ctxt->inputNr == 1) &&
4482                             (avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
4483                             if ((!terminate) &&
4484                                 (htmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
4485                                 goto done;
4486                         }
4487                         ctxt->checkIndex = 0;
4488 #ifdef DEBUG_PUSH
4489                         xmlGenericError(xmlGenericErrorContext,
4490                                 "HPP: Parsing char data\n");
4491 #endif
4492                         htmlParseCharData(ctxt);
4493                     }
4494                 }
4495                 if (cons == ctxt->nbChars) {
4496                     if (ctxt->node != NULL) {
4497                         if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4498                             ctxt->sax->error(ctxt->userData,
4499                                  "detected an error in element content\n");
4500                         ctxt->wellFormed = 0;
4501                     }
4502                     NEXT;
4503                     break;
4504                 }
4505
4506                 break;
4507             }
4508             case XML_PARSER_END_TAG:
4509                 if (avail < 2)
4510                     goto done;
4511                 if ((!terminate) &&
4512                     (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
4513                     goto done;
4514                 htmlParseEndTag(ctxt);
4515                 if (ctxt->nameNr == 0) {
4516                     ctxt->instate = XML_PARSER_EPILOG;
4517                 } else {
4518                     ctxt->instate = XML_PARSER_CONTENT;
4519                 }
4520                 ctxt->checkIndex = 0;
4521 #ifdef DEBUG_PUSH
4522                 xmlGenericError(xmlGenericErrorContext,
4523                         "HPP: entering CONTENT\n");
4524 #endif
4525                 break;
4526             case XML_PARSER_CDATA_SECTION:
4527                 xmlGenericError(xmlGenericErrorContext,
4528                         "HPP: internal error, state == CDATA\n");
4529                 ctxt->instate = XML_PARSER_CONTENT;
4530                 ctxt->checkIndex = 0;
4531 #ifdef DEBUG_PUSH
4532                 xmlGenericError(xmlGenericErrorContext,
4533                         "HPP: entering CONTENT\n");
4534 #endif
4535                 break;
4536             case XML_PARSER_DTD:
4537                 xmlGenericError(xmlGenericErrorContext,
4538                         "HPP: internal error, state == DTD\n");
4539                 ctxt->instate = XML_PARSER_CONTENT;
4540                 ctxt->checkIndex = 0;
4541 #ifdef DEBUG_PUSH
4542                 xmlGenericError(xmlGenericErrorContext,
4543                         "HPP: entering CONTENT\n");
4544 #endif
4545                 break;
4546             case XML_PARSER_COMMENT:
4547                 xmlGenericError(xmlGenericErrorContext,
4548                         "HPP: internal error, state == COMMENT\n");
4549                 ctxt->instate = XML_PARSER_CONTENT;
4550                 ctxt->checkIndex = 0;
4551 #ifdef DEBUG_PUSH
4552                 xmlGenericError(xmlGenericErrorContext,
4553                         "HPP: entering CONTENT\n");
4554 #endif
4555                 break;
4556             case XML_PARSER_PI:
4557                 xmlGenericError(xmlGenericErrorContext,
4558                         "HPP: internal error, state == PI\n");
4559                 ctxt->instate = XML_PARSER_CONTENT;
4560                 ctxt->checkIndex = 0;
4561 #ifdef DEBUG_PUSH
4562                 xmlGenericError(xmlGenericErrorContext,
4563                         "HPP: entering CONTENT\n");
4564 #endif
4565                 break;
4566             case XML_PARSER_ENTITY_DECL:
4567                 xmlGenericError(xmlGenericErrorContext,
4568                         "HPP: internal error, state == ENTITY_DECL\n");
4569                 ctxt->instate = XML_PARSER_CONTENT;
4570                 ctxt->checkIndex = 0;
4571 #ifdef DEBUG_PUSH
4572                 xmlGenericError(xmlGenericErrorContext,
4573                         "HPP: entering CONTENT\n");
4574 #endif
4575                 break;
4576             case XML_PARSER_ENTITY_VALUE:
4577                 xmlGenericError(xmlGenericErrorContext,
4578                         "HPP: internal error, state == ENTITY_VALUE\n");
4579                 ctxt->instate = XML_PARSER_CONTENT;
4580                 ctxt->checkIndex = 0;
4581 #ifdef DEBUG_PUSH
4582                 xmlGenericError(xmlGenericErrorContext,
4583                         "HPP: entering DTD\n");
4584 #endif
4585                 break;
4586             case XML_PARSER_ATTRIBUTE_VALUE:
4587                 xmlGenericError(xmlGenericErrorContext,
4588                         "HPP: internal error, state == ATTRIBUTE_VALUE\n");
4589                 ctxt->instate = XML_PARSER_START_TAG;
4590                 ctxt->checkIndex = 0;
4591 #ifdef DEBUG_PUSH
4592                 xmlGenericError(xmlGenericErrorContext,
4593                         "HPP: entering START_TAG\n");
4594 #endif
4595                 break;
4596             case XML_PARSER_SYSTEM_LITERAL:
4597                 xmlGenericError(xmlGenericErrorContext,
4598                         "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
4599                 ctxt->instate = XML_PARSER_CONTENT;
4600                 ctxt->checkIndex = 0;
4601 #ifdef DEBUG_PUSH
4602                 xmlGenericError(xmlGenericErrorContext,
4603                         "HPP: entering CONTENT\n");
4604 #endif
4605                 break;
4606             case XML_PARSER_IGNORE:
4607                 xmlGenericError(xmlGenericErrorContext,
4608                         "HPP: internal error, state == XML_PARSER_IGNORE\n");
4609                 ctxt->instate = XML_PARSER_CONTENT;
4610                 ctxt->checkIndex = 0;
4611 #ifdef DEBUG_PUSH
4612                 xmlGenericError(xmlGenericErrorContext,
4613                         "HPP: entering CONTENT\n");
4614 #endif
4615                 break;
4616             case XML_PARSER_PUBLIC_LITERAL:
4617                 xmlGenericError(xmlGenericErrorContext,
4618                         "HPP: internal error, state == XML_PARSER_LITERAL\n");
4619                 ctxt->instate = XML_PARSER_CONTENT;
4620                 ctxt->checkIndex = 0;
4621 #ifdef DEBUG_PUSH
4622                 xmlGenericError(xmlGenericErrorContext,
4623                         "HPP: entering CONTENT\n");
4624 #endif
4625                 break;
4626
4627         }
4628     }
4629 done:
4630     if ((avail == 0) && (terminate)) {
4631         htmlAutoCloseOnEnd(ctxt);
4632         if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4633             /*
4634              * SAX: end of the document processing.
4635              */
4636             ctxt->instate = XML_PARSER_EOF;
4637             if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4638                 ctxt->sax->endDocument(ctxt->userData);
4639         }
4640     }
4641     if ((ctxt->myDoc != NULL) &&
4642         ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
4643          (ctxt->instate == XML_PARSER_EPILOG))) {
4644         xmlDtdPtr dtd;
4645         dtd = xmlGetIntSubset(ctxt->myDoc);
4646         if (dtd == NULL)
4647             ctxt->myDoc->intSubset =
4648                 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
4649                     BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4650                     BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4651     }
4652 #ifdef DEBUG_PUSH
4653     xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
4654 #endif
4655     return(ret);
4656 }
4657
4658 /**
4659  * htmlParseChunk:
4660  * @ctxt:  an XML parser context
4661  * @chunk:  an char array
4662  * @size:  the size in byte of the chunk
4663  * @terminate:  last chunk indicator
4664  *
4665  * Parse a Chunk of memory
4666  *
4667  * Returns zero if no error, the xmlParserErrors otherwise.
4668  */
4669 int
4670 htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
4671               int terminate) {
4672     if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4673         (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF))  {
4674         int base = ctxt->input->base - ctxt->input->buf->buffer->content;
4675         int cur = ctxt->input->cur - ctxt->input->base;
4676
4677         xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4678         ctxt->input->base = ctxt->input->buf->buffer->content + base;
4679         ctxt->input->cur = ctxt->input->base + cur;
4680 #ifdef DEBUG_PUSH
4681         xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4682 #endif
4683
4684         if ((terminate) || (ctxt->input->buf->buffer->use > 80))
4685             htmlParseTryOrFinish(ctxt, terminate);
4686     } else if (ctxt->instate != XML_PARSER_EOF) {
4687         xmlParserInputBufferPush(ctxt->input->buf, 0, "");
4688         htmlParseTryOrFinish(ctxt, terminate);
4689     }
4690     if (terminate) {
4691         if ((ctxt->instate != XML_PARSER_EOF) &&
4692             (ctxt->instate != XML_PARSER_EPILOG) &&
4693             (ctxt->instate != XML_PARSER_MISC)) {
4694             ctxt->errNo = XML_ERR_DOCUMENT_END;
4695             ctxt->wellFormed = 0;
4696         }
4697         if (ctxt->instate != XML_PARSER_EOF) {
4698             if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4699                 ctxt->sax->endDocument(ctxt->userData);
4700         }
4701         ctxt->instate = XML_PARSER_EOF;
4702     }
4703     return((xmlParserErrors) ctxt->errNo);
4704 }
4705
4706 /************************************************************************
4707  *                                                                      *
4708  *                      User entry points                               *
4709  *                                                                      *
4710  ************************************************************************/
4711
4712 /**
4713  * htmlCreatePushParserCtxt :
4714  * @sax:  a SAX handler
4715  * @user_data:  The user data returned on SAX callbacks
4716  * @chunk:  a pointer to an array of chars
4717  * @size:  number of chars in the array
4718  * @filename:  an optional file name or URI
4719  * @enc:  an optional encoding
4720  *
4721  * Create a parser context for using the HTML parser in push mode
4722  * To allow content encoding detection, @size should be >= 4
4723  * The value of @filename is used for fetching external entities
4724  * and error/warning reports.
4725  *
4726  * Returns the new parser context or NULL
4727  */
4728 htmlParserCtxtPtr
4729 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
4730                          const char *chunk, int size, const char *filename,
4731                          xmlCharEncoding enc) {
4732     htmlParserCtxtPtr ctxt;
4733     htmlParserInputPtr inputStream;
4734     xmlParserInputBufferPtr buf;
4735
4736     xmlInitParser();
4737
4738     buf = xmlAllocParserInputBuffer(enc);
4739     if (buf == NULL) return(NULL);
4740
4741     ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4742     if (ctxt == NULL) {
4743         xmlFree(buf);
4744         return(NULL);
4745     }
4746     memset(ctxt, 0, sizeof(htmlParserCtxt));
4747     htmlInitParserCtxt(ctxt);
4748     if (sax != NULL) {
4749         if (ctxt->sax != &htmlDefaultSAXHandler)
4750             xmlFree(ctxt->sax);
4751         ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
4752         if (ctxt->sax == NULL) {
4753             xmlFree(buf);
4754             xmlFree(ctxt);
4755             return(NULL);
4756         }
4757         memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
4758         if (user_data != NULL)
4759             ctxt->userData = user_data;
4760     }
4761     if (filename == NULL) {
4762         ctxt->directory = NULL;
4763     } else {
4764         ctxt->directory = xmlParserGetDirectory(filename);
4765     }
4766
4767     inputStream = htmlNewInputStream(ctxt);
4768     if (inputStream == NULL) {
4769         xmlFreeParserCtxt(ctxt);
4770         return(NULL);
4771     }
4772
4773     if (filename == NULL)
4774         inputStream->filename = NULL;
4775     else
4776         inputStream->filename = xmlMemStrdup(filename);
4777     inputStream->buf = buf;
4778     inputStream->base = inputStream->buf->buffer->content;
4779     inputStream->cur = inputStream->buf->buffer->content;
4780
4781     inputPush(ctxt, inputStream);
4782
4783     if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4784         (ctxt->input->buf != NULL))  {
4785         xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
4786 #ifdef DEBUG_PUSH
4787         xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
4788 #endif
4789     }
4790
4791     return(ctxt);
4792 }
4793
4794 /**
4795  * htmlSAXParseDoc :
4796  * @cur:  a pointer to an array of xmlChar
4797  * @encoding:  a free form C string describing the HTML document encoding, or NULL
4798  * @sax:  the SAX handler block
4799  * @userData: if using SAX, this pointer will be provided on callbacks.
4800  *
4801  * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
4802  * to handle parse events. If sax is NULL, fallback to the default DOM
4803  * behavior and return a tree.
4804  *
4805  * Returns the resulting document tree unless SAX is NULL or the document is
4806  *     not well formed.
4807  */
4808
4809 htmlDocPtr
4810 htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
4811     htmlDocPtr ret;
4812     htmlParserCtxtPtr ctxt;
4813
4814     xmlInitParser();
4815
4816     if (cur == NULL) return(NULL);
4817
4818
4819     ctxt = htmlCreateDocParserCtxt(cur, encoding);
4820     if (ctxt == NULL) return(NULL);
4821     if (sax != NULL) {
4822         ctxt->sax = sax;
4823         ctxt->userData = userData;
4824     }
4825
4826     htmlParseDocument(ctxt);
4827     ret = ctxt->myDoc;
4828     if (sax != NULL) {
4829         ctxt->sax = NULL;
4830         ctxt->userData = NULL;
4831     }
4832     htmlFreeParserCtxt(ctxt);
4833
4834     return(ret);
4835 }
4836
4837 /**
4838  * htmlParseDoc :
4839  * @cur:  a pointer to an array of xmlChar
4840  * @encoding:  a free form C string describing the HTML document encoding, or NULL
4841  *
4842  * parse an HTML in-memory document and build a tree.
4843  *
4844  * Returns the resulting document tree
4845  */
4846
4847 htmlDocPtr
4848 htmlParseDoc(xmlChar *cur, const char *encoding) {
4849     return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
4850 }
4851
4852
4853 /**
4854  * htmlCreateFileParserCtxt :
4855  * @filename:  the filename
4856  * @encoding:  a free form C string describing the HTML document encoding, or NULL
4857  *
4858  * Create a parser context for a file content.
4859  * Automatic support for ZLIB/Compress compressed document is provided
4860  * by default if found at compile-time.
4861  *
4862  * Returns the new parser context or NULL
4863  */
4864 htmlParserCtxtPtr
4865 htmlCreateFileParserCtxt(const char *filename, const char *encoding)
4866 {
4867     htmlParserCtxtPtr ctxt;
4868     htmlParserInputPtr inputStream;
4869     xmlParserInputBufferPtr buf;
4870     /* htmlCharEncoding enc; */
4871     xmlChar *content, *content_line = (xmlChar *) "charset=";
4872
4873     buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
4874     if (buf == NULL) return(NULL);
4875
4876     ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
4877     if (ctxt == NULL) {
4878         xmlGenericError(xmlGenericErrorContext, "malloc failed\n");
4879         return(NULL);
4880     }
4881     memset(ctxt, 0, sizeof(htmlParserCtxt));
4882     htmlInitParserCtxt(ctxt);
4883     inputStream = (htmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
4884     if (inputStream == NULL) {
4885         xmlGenericError(xmlGenericErrorContext, "malloc failed\n");
4886         xmlFree(ctxt);
4887         return(NULL);
4888     }
4889     memset(inputStream, 0, sizeof(htmlParserInput));
4890
4891     inputStream->filename = (char *)
4892         xmlNormalizeWindowsPath((xmlChar *)filename);
4893     inputStream->line = 1;
4894     inputStream->col = 1;
4895     inputStream->buf = buf;
4896     inputStream->directory = NULL;
4897
4898     inputStream->base = inputStream->buf->buffer->content;
4899     inputStream->cur = inputStream->buf->buffer->content;
4900     inputStream->free = NULL;
4901
4902     inputPush(ctxt, inputStream);
4903
4904     /* set encoding */
4905     if (encoding) {
4906         content = xmlMalloc (xmlStrlen(content_line) + strlen(encoding) + 1);
4907         if (content) {
4908             strcpy ((char *)content, (char *)content_line);
4909             strcat ((char *)content, (char *)encoding);
4910             htmlCheckEncoding (ctxt, content);
4911             xmlFree (content);
4912         }
4913     }
4914
4915     return(ctxt);
4916 }
4917
4918 /**
4919  * htmlSAXParseFile :
4920  * @filename:  the filename
4921  * @encoding:  a free form C string describing the HTML document encoding, or NULL
4922  * @sax:  the SAX handler block
4923  * @userData: if using SAX, this pointer will be provided on callbacks.
4924  *
4925  * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4926  * compressed document is provided by default if found at compile-time.
4927  * It use the given SAX function block to handle the parsing callback.
4928  * If sax is NULL, fallback to the default DOM tree building routines.
4929  *
4930  * Returns the resulting document tree unless SAX is NULL or the document is
4931  *     not well formed.
4932  */
4933
4934 htmlDocPtr
4935 htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
4936                  void *userData) {
4937     htmlDocPtr ret;
4938     htmlParserCtxtPtr ctxt;
4939     htmlSAXHandlerPtr oldsax = NULL;
4940
4941     xmlInitParser();
4942
4943     ctxt = htmlCreateFileParserCtxt(filename, encoding);
4944     if (ctxt == NULL) return(NULL);
4945     if (sax != NULL) {
4946         oldsax = ctxt->sax;
4947         ctxt->sax = sax;
4948         ctxt->userData = userData;
4949     }
4950
4951     htmlParseDocument(ctxt);
4952
4953     ret = ctxt->myDoc;
4954     if (sax != NULL) {
4955         ctxt->sax = oldsax;
4956         ctxt->userData = NULL;
4957     }
4958     htmlFreeParserCtxt(ctxt);
4959
4960     return(ret);
4961 }
4962
4963 /**
4964  * htmlParseFile :
4965  * @filename:  the filename
4966  * @encoding:  a free form C string describing the HTML document encoding, or NULL
4967  *
4968  * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
4969  * compressed document is provided by default if found at compile-time.
4970  *
4971  * Returns the resulting document tree
4972  */
4973
4974 htmlDocPtr
4975 htmlParseFile(const char *filename, const char *encoding) {
4976     return(htmlSAXParseFile(filename, encoding, NULL, NULL));
4977 }
4978
4979 /**
4980  * htmlHandleOmittedElem:
4981  * @val:  int 0 or 1
4982  *
4983  * Set and return the previous value for handling HTML omitted tags.
4984  *
4985  * Returns the last value for 0 for no handling, 1 for auto insertion.
4986  */
4987
4988 int
4989 htmlHandleOmittedElem(int val) {
4990     int old = htmlOmittedDefaultValue;
4991
4992     htmlOmittedDefaultValue = val;
4993     return(old);
4994 }
4995
4996 #endif /* LIBXML_HTML_ENABLED */