third_party/libxml/src/HTMLparser.c

   1 /*
   2  * HTMLparser.c : an HTML 4.0 non-verifying parser
   3  *
   4  * See Copyright for the status of this software.
   5  *
   6  * daniel@veillard.com
   7  */
   8
   9 #define IN_LIBXML
  10 #include "libxml.h"
  11 #ifdef LIBXML_HTML_ENABLED
  12
  13 #include <string.h>
  14 #ifdef HAVE_CTYPE_H
  15 #include <ctype.h>
  16 #endif
  17 #ifdef HAVE_STDLIB_H
  18 #include <stdlib.h>
  19 #endif
  20 #ifdef HAVE_SYS_STAT_H
  21 #include <sys/stat.h>
  22 #endif
  23 #ifdef HAVE_FCNTL_H
  24 #include <fcntl.h>
  25 #endif
  26 #ifdef HAVE_UNISTD_H
  27 #include <unistd.h>
  28 #endif
  29 #ifdef HAVE_ZLIB_H
  30 #include <zlib.h>
  31 #endif
  32
  33 #include <libxml/xmlmemory.h>
  34 #include <libxml/tree.h>
  35 #include <libxml/parser.h>
  36 #include <libxml/parserInternals.h>
  37 #include <libxml/xmlerror.h>
  38 #include <libxml/HTMLparser.h>
  39 #include <libxml/HTMLtree.h>
  40 #include <libxml/entities.h>
  41 #include <libxml/encoding.h>
  42 #include <libxml/valid.h>
  43 #include <libxml/xmlIO.h>
  44 #include <libxml/globals.h>
  45 #include <libxml/uri.h>
  46
  47 #define HTML_MAX_NAMELEN 1000
  48 #define HTML_PARSER_BIG_BUFFER_SIZE 1000
  49 #define HTML_PARSER_BUFFER_SIZE 100
  50
  51 /* #define DEBUG */
  52 /* #define DEBUG_PUSH */
  53
  54 static int htmlOmittedDefaultValue = 1;
  55
  56 xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
  57                              xmlChar end, xmlChar  end2, xmlChar end3);
  58 static void htmlParseComment(htmlParserCtxtPtr ctxt);
  59
  60 /************************************************************************
  61  *                                                                      *
  62  *              Some factorized error routines                          *
  63  *                                                                      *
  64  ************************************************************************/
  65
  66 /**
  67  * htmlErrMemory:
  68  * @ctxt:  an HTML parser context
  69  * @extra:  extra informations
  70  *
  71  * Handle a redefinition of attribute error
  72  */
  73 static void
  74 htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
  75 {
  76     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
  77         (ctxt->instate == XML_PARSER_EOF))
  78         return;
  79     if (ctxt != NULL) {
  80         ctxt->errNo = XML_ERR_NO_MEMORY;
  81         ctxt->instate = XML_PARSER_EOF;
  82         ctxt->disableSAX = 1;
  83     }
  84     if (extra)
  85         __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
  86                         XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
  87                         NULL, NULL, 0, 0,
  88                         "Memory allocation failed : %s\n", extra);
  89     else
  90         __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
  91                         XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
  92                         NULL, NULL, 0, 0, "Memory allocation failed\n");
  93 }
  94
  95 /**
  96  * htmlParseErr:
  97  * @ctxt:  an HTML parser context
  98  * @error:  the error number
  99  * @msg:  the error message
 100  * @str1:  string infor
 101  * @str2:  string infor
 102  *
 103  * Handle a fatal parser error, i.e. violating Well-Formedness constraints
 104  */
 105 static void
 106 htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
 107              const char *msg, const xmlChar *str1, const xmlChar *str2)
 108 {
 109     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
 110         (ctxt->instate == XML_PARSER_EOF))
 111         return;
 112     if (ctxt != NULL)
 113         ctxt->errNo = error;
 114     __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
 115                     XML_ERR_ERROR, NULL, 0,
 116                     (const char *) str1, (const char *) str2,
 117                     NULL, 0, 0,
 118                     msg, str1, str2);
 119     if (ctxt != NULL)
 120         ctxt->wellFormed = 0;
 121 }
 122
 123 /**
 124  * htmlParseErrInt:
 125  * @ctxt:  an HTML parser context
 126  * @error:  the error number
 127  * @msg:  the error message
 128  * @val:  integer info
 129  *
 130  * Handle a fatal parser error, i.e. violating Well-Formedness constraints
 131  */
 132 static void
 133 htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
 134              const char *msg, int val)
 135 {
 136     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
 137         (ctxt->instate == XML_PARSER_EOF))
 138         return;
 139     if (ctxt != NULL)
 140         ctxt->errNo = error;
 141     __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
 142                     XML_ERR_ERROR, NULL, 0, NULL, NULL,
 143                     NULL, val, 0, msg, val);
 144     if (ctxt != NULL)
 145         ctxt->wellFormed = 0;
 146 }
 147
 148 /************************************************************************
 149  *                                                                      *
 150  *      Parser stacks related functions and macros              *
 151  *                                                                      *
 152  ************************************************************************/
 153
 154 /**
 155  * htmlnamePush:
 156  * @ctxt:  an HTML parser context
 157  * @value:  the element name
 158  *
 159  * Pushes a new element name on top of the name stack
 160  *
 161  * Returns 0 in case of error, the index in the stack otherwise
 162  */
 163 static int
 164 htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
 165 {
 166     if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
 167         ctxt->html = 3;
 168     if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
 169         ctxt->html = 10;
 170     if (ctxt->nameNr >= ctxt->nameMax) {
 171         ctxt->nameMax *= 2;
 172         ctxt->nameTab = (const xmlChar * *)
 173                          xmlRealloc((xmlChar * *)ctxt->nameTab,
 174                                     ctxt->nameMax *
 175                                     sizeof(ctxt->nameTab[0]));
 176         if (ctxt->nameTab == NULL) {
 177             htmlErrMemory(ctxt, NULL);
 178             return (0);
 179         }
 180     }
 181     ctxt->nameTab[ctxt->nameNr] = value;
 182     ctxt->name = value;
 183     return (ctxt->nameNr++);
 184 }
 185 /**
 186  * htmlnamePop:
 187  * @ctxt: an HTML parser context
 188  *
 189  * Pops the top element name from the name stack
 190  *
 191  * Returns the name just removed
 192  */
 193 static const xmlChar *
 194 htmlnamePop(htmlParserCtxtPtr ctxt)
 195 {
 196     const xmlChar *ret;
 197
 198     if (ctxt->nameNr <= 0)
 199         return (NULL);
 200     ctxt->nameNr--;
 201     if (ctxt->nameNr < 0)
 202         return (NULL);
 203     if (ctxt->nameNr > 0)
 204         ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
 205     else
 206         ctxt->name = NULL;
 207     ret = ctxt->nameTab[ctxt->nameNr];
 208     ctxt->nameTab[ctxt->nameNr] = NULL;
 209     return (ret);
 210 }
 211
 212 /**
 213  * htmlNodeInfoPush:
 214  * @ctxt:  an HTML parser context
 215  * @value:  the node info
 216  *
 217  * Pushes a new element name on top of the node info stack
 218  *
 219  * Returns 0 in case of error, the index in the stack otherwise
 220  */
 221 static int
 222 htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
 223 {
 224     if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
 225         if (ctxt->nodeInfoMax == 0)
 226                 ctxt->nodeInfoMax = 5;
 227         ctxt->nodeInfoMax *= 2;
 228         ctxt->nodeInfoTab = (htmlParserNodeInfo *)
 229                          xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
 230                                     ctxt->nodeInfoMax *
 231                                     sizeof(ctxt->nodeInfoTab[0]));
 232         if (ctxt->nodeInfoTab == NULL) {
 233             htmlErrMemory(ctxt, NULL);
 234             return (0);
 235         }
 236     }
 237     ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
 238     ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
 239     return (ctxt->nodeInfoNr++);
 240 }
 241
 242 /**
 243  * htmlNodeInfoPop:
 244  * @ctxt:  an HTML parser context
 245  *
 246  * Pops the top element name from the node info stack
 247  *
 248  * Returns 0 in case of error, the pointer to NodeInfo otherwise
 249  */
 250 static htmlParserNodeInfo *
 251 htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
 252 {
 253     if (ctxt->nodeInfoNr <= 0)
 254         return (NULL);
 255     ctxt->nodeInfoNr--;
 256     if (ctxt->nodeInfoNr < 0)
 257         return (NULL);
 258     if (ctxt->nodeInfoNr > 0)
 259         ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
 260     else
 261         ctxt->nodeInfo = NULL;
 262     return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
 263 }
 264
/*
 * Macros for accessing the content. These should be used only by the parser,
 * and must not be exported.
 *
 * Dirty macros, i.e. one needs to make assumptions about the context to use
 * them
 *
 *   CUR_PTR returns the current pointer to the xmlChar to be parsed.
 *   CUR     returns the current xmlChar value, i.e. an 8 bit value if compiled
 *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
 *           in UNICODE mode. This should be used internally by the parser
 *           only to compare to ASCII values, otherwise it would break when
 *           running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR, it should be used only
 *           to compare on ASCII based substrings.
 *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR,
 *           it should be used only to compare on ASCII based substrings.
 *   SKIP(n) skips n xmlChars, and must also be used only to skip ASCII defined
 *           strings without newlines within the parser.
 *
 * Clean macros, not dependent on an ASCII context, expect UTF-8 encoding
 *
 *   CURRENT returns the current char value, with the full decoding of
 *           UTF-8 if we are using this mode. It returns an int.
 *   NEXT    skips to the next character, this does the proper decoding
 *           in UTF-8 mode. It also pops unfinished entities on the fly.
 *   NEXTL(l) skips the current unicode character of l xmlChars long.
 *   COPY(to) copies one char to *to, incrementing CUR_PTR and to accordingly
 */
 293
 294 #define UPPER (toupper(*ctxt->input->cur))
 295
 296 #define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val)
 297
 298 #define NXT(val) ctxt->input->cur[(val)]
 299
 300 #define UPP(val) (toupper(ctxt->input->cur[(val)]))
 301
 302 #define CUR_PTR ctxt->input->cur
 303
 304 #define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
 305                    (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
 306         xmlParserInputShrink(ctxt->input)
 307
 308 #define GROW if ((ctxt->progressive == 0) &&                            \
 309                  (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))   \
 310         xmlParserInputGrow(ctxt->input, INPUT_CHUNK)
 311
 312 #define CURRENT ((int) (*ctxt->input->cur))
 313
 314 #define SKIP_BLANKS htmlSkipBlankChars(ctxt)
 315
 316 /* Inported from XML */
 317
 318 /* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
 319 #define CUR ((int) (*ctxt->input->cur))
 320 #define NEXT xmlNextChar(ctxt)
 321
 322 #define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
 323
 324
 325 #define NEXTL(l) do {                                                   \
 326     if (*(ctxt->input->cur) == '\n') {                                  \
 327         ctxt->input->line++; ctxt->input->col = 1;                      \
 328     } else ctxt->input->col++;                                          \
 329     ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++;            \
 330   } while (0)
 331
 332 /************
 333     \
 334     if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);     \
 335     if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 336  ************/
 337
 338 #define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
 339 #define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)
 340
 341 #define COPY_BUF(l,b,i,v)                                               \
 342     if (l == 1) b[i++] = (xmlChar) v;                                   \
 343     else i += xmlCopyChar(l,&b[i],v)
 344
 345 /**
 346  * htmlFindEncoding:
 347  * @the HTML parser context
 348  *
 349  * Ty to find and encoding in the current data available in the input
 350  * buffer this is needed to try to switch to the proper encoding when
 351  * one face a character error.
 352  * That's an heuristic, since it's operating outside of parsing it could
 353  * try to use a meta which had been commented out, that's the reason it
 354  * should only be used in case of error, not as a default.
 355  *
 356  * Returns an encoding string or NULL if not found, the string need to
 357  *   be freed
 358  */
 359 static xmlChar *
 360 htmlFindEncoding(xmlParserCtxtPtr ctxt) {
 361     const xmlChar *start, *cur, *end;
 362
 363     if ((ctxt == NULL) || (ctxt->input == NULL) ||
 364         (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
 365         (ctxt->input->buf->encoder != NULL))
 366         return(NULL);
 367     if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
 368         return(NULL);
 369
 370     start = ctxt->input->cur;
 371     end = ctxt->input->end;
 372     /* we also expect the input buffer to be zero terminated */
 373     if (*end != 0)
 374         return(NULL);
 375
 376     cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
 377     if (cur == NULL)
 378         return(NULL);
 379     cur = xmlStrcasestr(cur, BAD_CAST  "CONTENT");
 380     if (cur == NULL)
 381         return(NULL);
 382     cur = xmlStrcasestr(cur, BAD_CAST  "CHARSET=");
 383     if (cur == NULL)
 384         return(NULL);
 385     cur += 8;
 386     start = cur;
 387     while (((*cur >= 'A') && (*cur <= 'Z')) ||
 388            ((*cur >= 'a') && (*cur <= 'z')) ||
 389            ((*cur >= '0') && (*cur <= '9')) ||
 390            (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
 391            cur++;
 392     if (cur == start)
 393         return(NULL);
 394     return(xmlStrndup(start, cur - start));
 395 }
 396
 397 /**
 398  * htmlCurrentChar:
 399  * @ctxt:  the HTML parser context
 400  * @len:  pointer to the length of the char read
 401  *
 402  * The current char value, if using UTF-8 this may actually span multiple
 403  * bytes in the input buffer. Implement the end of line normalization:
 404  * 2.11 End-of-Line Handling
 405  * If the encoding is unspecified, in the case we find an ISO-Latin-1
 406  * char, then the encoding converter is plugged in automatically.
 407  *
 408  * Returns the current char value and its length
 409  */
 410
 411 static int
 412 htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
 413     if (ctxt->instate == XML_PARSER_EOF)
 414         return(0);
 415
 416     if (ctxt->token != 0) {
 417         *len = 0;
 418         return(ctxt->token);
 419     }
 420     if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
 421         /*
 422          * We are supposed to handle UTF8, check it's valid
 423          * From rfc2044: encoding of the Unicode values on UTF-8:
 424          *
 425          * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
 426          * 0000 0000-0000 007F   0xxxxxxx
 427          * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
 428          * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
 429          *
 430          * Check for the 0x110000 limit too
 431          */
 432         const unsigned char *cur = ctxt->input->cur;
 433         unsigned char c;
 434         unsigned int val;
 435
 436         c = *cur;
 437         if (c & 0x80) {
 438             if (cur[1] == 0) {
 439                 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
 440                 cur = ctxt->input->cur;
 441             }
 442             if ((cur[1] & 0xc0) != 0x80)
 443                 goto encoding_error;
 444             if ((c & 0xe0) == 0xe0) {
 445
 446                 if (cur[2] == 0) {
 447                     xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
 448                     cur = ctxt->input->cur;
 449                 }
 450                 if ((cur[2] & 0xc0) != 0x80)
 451                     goto encoding_error;
 452                 if ((c & 0xf0) == 0xf0) {
 453                     if (cur[3] == 0) {
 454                         xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
 455                         cur = ctxt->input->cur;
 456                     }
 457                     if (((c & 0xf8) != 0xf0) ||
 458                         ((cur[3] & 0xc0) != 0x80))
 459                         goto encoding_error;
 460                     /* 4-byte code */
 461                     *len = 4;
 462                     val = (cur[0] & 0x7) << 18;
 463                     val |= (cur[1] & 0x3f) << 12;
 464                     val |= (cur[2] & 0x3f) << 6;
 465                     val |= cur[3] & 0x3f;
 466                 } else {
 467                   /* 3-byte code */
 468                     *len = 3;
 469                     val = (cur[0] & 0xf) << 12;
 470                     val |= (cur[1] & 0x3f) << 6;
 471                     val |= cur[2] & 0x3f;
 472                 }
 473             } else {
 474               /* 2-byte code */
 475                 *len = 2;
 476                 val = (cur[0] & 0x1f) << 6;
 477                 val |= cur[1] & 0x3f;
 478             }
 479             if (!IS_CHAR(val)) {
 480                 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
 481                                 "Char 0x%X out of allowed range\n", val);
 482             }
 483             return(val);
 484         } else {
 485             if ((*ctxt->input->cur == 0) &&
 486                 (ctxt->input->cur < ctxt->input->end)) {
 487                     htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
 488                                 "Char 0x%X out of allowed range\n", 0);
 489                 *len = 1;
 490                 return(' ');
 491             }
 492             /* 1-byte code */
 493             *len = 1;
 494             return((int) *ctxt->input->cur);
 495         }
 496     }
 497     /*
 498      * Assume it's a fixed length encoding (1) with
 499      * a compatible encoding for the ASCII set, since
 500      * XML constructs only use < 128 chars
 501      */
 502     *len = 1;
 503     if ((int) *ctxt->input->cur < 0x80)
 504         return((int) *ctxt->input->cur);
 505
 506     /*
 507      * Humm this is bad, do an automatic flow conversion
 508      */
 509     {
 510         xmlChar * guess;
 511         xmlCharEncodingHandlerPtr handler;
 512
 513         guess = htmlFindEncoding(ctxt);
 514         if (guess == NULL) {
 515             xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
 516         } else {
 517             if (ctxt->input->encoding != NULL)
 518                 xmlFree((xmlChar *) ctxt->input->encoding);
 519             ctxt->input->encoding = guess;
 520             handler = xmlFindCharEncodingHandler((const char *) guess);
 521             if (handler != NULL) {
 522                 xmlSwitchToEncoding(ctxt, handler);
 523             } else {
 524                 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
 525                              "Unsupported encoding %s", guess, NULL);
 526             }
 527         }
 528         ctxt->charset = XML_CHAR_ENCODING_UTF8;
 529     }
 530
 531     return(xmlCurrentChar(ctxt, len));
 532
 533 encoding_error:
 534     /*
 535      * If we detect an UTF8 error that probably mean that the
 536      * input encoding didn't get properly advertized in the
 537      * declaration header. Report the error and switch the encoding
 538      * to ISO-Latin-1 (if you don't like this policy, just declare the
 539      * encoding !)
 540      */
 541     {
 542         char buffer[150];
 543
 544         if (ctxt->input->end - ctxt->input->cur >= 4) {
 545             snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
 546                             ctxt->input->cur[0], ctxt->input->cur[1],
 547                             ctxt->input->cur[2], ctxt->input->cur[3]);
 548         } else {
 549             snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
 550         }
 551         htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
 552                      "Input is not proper UTF-8, indicate encoding !\n",
 553                      BAD_CAST buffer, NULL);
 554     }
 555
 556     ctxt->charset = XML_CHAR_ENCODING_8859_1;
 557     *len = 1;
 558     return((int) *ctxt->input->cur);
 559 }
 560
 561 /**
 562  * htmlSkipBlankChars:
 563  * @ctxt:  the HTML parser context
 564  *
 565  * skip all blanks character found at that point in the input streams.
 566  *
 567  * Returns the number of space chars skipped
 568  */
 569
 570 static int
 571 htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
 572     int res = 0;
 573
 574     while (IS_BLANK_CH(*(ctxt->input->cur))) {
 575         if ((*ctxt->input->cur == 0) &&
 576             (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
 577                 xmlPopInput(ctxt);
 578         } else {
 579             if (*(ctxt->input->cur) == '\n') {
 580                 ctxt->input->line++; ctxt->input->col = 1;
 581             } else ctxt->input->col++;
 582             ctxt->input->cur++;
 583             ctxt->nbChars++;
 584             if (*ctxt->input->cur == 0)
 585                 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
 586         }
 587         res++;
 588     }
 589     return(res);
 590 }
 591
 592
 593
 594 /************************************************************************
 595  *                                                                      *
 596  *      The list of HTML elements and their properties          *
 597  *                                                                      *
 598  ************************************************************************/
 599
/*
 *  Start Tag: 1 means the start tag can be omitted
 *  End Tag:   1 means the end tag can be omitted
 *             2 means it's forbidden (empty elements)
 *             3 means the tag is stylistic and should be closed easily
 *  Depr:      this element is deprecated
 *  DTD:       1 means that this element is valid only in the Loose DTD
 *             2 means that this element is valid only in the Frameset DTD
 *
 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
        , subElements , impliedsubelt , Attributes, userdata
 */
 612
 613 /* Definitions and a couple of vars for HTML Elements */
 614
 615 #define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
 616 #define NB_FONTSTYLE 8
 617 #define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
 618 #define NB_PHRASE 10
 619 #define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
 620 #define NB_SPECIAL 16
 621 #define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
 622 #define NB_INLINE NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL
 623 #define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
 624 #define NB_BLOCK NB_HEADING + NB_LIST + 14
 625 #define FORMCTRL "input", "select", "textarea", "label", "button"
 626 #define NB_FORMCTRL 5
 627 #define PCDATA
 628 #define NB_PCDATA 0
 629 #define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
 630 #define NB_HEADING 6
 631 #define LIST "ul", "ol", "dir", "menu"
 632 #define NB_LIST 4
 633 #define MODIFIER
 634 #define NB_MODIFIER 0
 635 #define FLOW BLOCK,INLINE
 636 #define NB_FLOW NB_BLOCK + NB_INLINE
 637 #define EMPTY NULL
 638
 639
 640 static const char* const html_flow[] = { FLOW, NULL } ;
 641 static const char* const html_inline[] = { INLINE, NULL } ;
 642
 643 /* placeholders: elts with content but no subelements */
 644 static const char* const html_pcdata[] = { NULL } ;
 645 #define html_cdata html_pcdata
 646
 647
 648 /* ... and for HTML Attributes */
 649
 650 #define COREATTRS "id", "class", "style", "title"
 651 #define NB_COREATTRS 4
 652 #define I18N "lang", "dir"
 653 #define NB_I18N 2
 654 #define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
 655 #define NB_EVENTS 9
 656 #define ATTRS COREATTRS,I18N,EVENTS
 657 #define NB_ATTRS NB_NB_COREATTRS + NB_I18N + NB_EVENTS
 658 #define CELLHALIGN "align", "char", "charoff"
 659 #define NB_CELLHALIGN 3
 660 #define CELLVALIGN "valign"
 661 #define NB_CELLVALIGN 1
 662
 663 static const char* const html_attrs[] = { ATTRS, NULL } ;
 664 static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
 665 static const char* const core_attrs[] = { COREATTRS, NULL } ;
 666 static const char* const i18n_attrs[] = { I18N, NULL } ;
 667
 668
 669 /* Other declarations that should go inline ... */
 670 static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
 671         "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
 672         "tabindex", "onfocus", "onblur", NULL } ;
 673 static const char* const target_attr[] = { "target", NULL } ;
 674 static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
 675 static const char* const alt_attr[] = { "alt", NULL } ;
 676 static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
 677 static const char* const href_attrs[] = { "href", NULL } ;
 678 static const char* const clear_attrs[] = { "clear", NULL } ;
 679 static const char* const inline_p[] = { INLINE, "p", NULL } ;
 680
 681 static const char* const flow_param[] = { FLOW, "param", NULL } ;
 682 static const char* const applet_attrs[] = { COREATTRS , "codebase",
 683                 "archive", "alt", "name", "height", "width", "align",
 684                 "hspace", "vspace", NULL } ;
 685 static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
 686         "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
 687 static const char* const basefont_attrs[] =
 688         { "id", "size", "color", "face", NULL } ;
 689 static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
 690 static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
 691 static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
 692 static const char* const body_depr[] = { "background", "bgcolor", "text",
 693         "link", "vlink", "alink", NULL } ;
 694 static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
 695         "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
 696
 697
 698 static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
 699 static const char* const col_elt[] = { "col", NULL } ;
 700 static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
 701 static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
 702 static const char* const dl_contents[] = { "dt", "dd", NULL } ;
 703 static const char* const compact_attr[] = { "compact", NULL } ;
 704 static const char* const label_attr[] = { "label", NULL } ;
 705 static const char* const fieldset_contents[] = { FLOW, "legend" } ;
 706 static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
 707 static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
 708 static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
 709 static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
 710 static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
 711 static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
 712 static const char* const head_attrs[] = { I18N, "profile", NULL } ;
 713 static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
 714 static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
 715 static const char* const version_attr[] = { "version", NULL } ;
 716 static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
 717 static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
 718 static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
 719 static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
 720 static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
/*
 * Every list below is a NULL-terminated array of names that is referenced
 * by name from the html40ElementTable initializers.
 * NOTE(review): the upper-case tokens (COREATTRS, I18N, ATTRS, BLOCK, FLOW,
 * PHRASE, MODIFIER, CELLHALIGN, CELLVALIGN) are presumably macros expanding
 * to comma-separated string lists defined earlier in this file - confirm.
 */
static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
static const char* const align_attr[] = { "align", NULL } ;
static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
static const char* const map_contents[] = { BLOCK, "area", NULL } ;
static const char* const name_attr[] = { "name", NULL } ;
static const char* const action_attr[] = { "action", NULL } ;
static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
static const char* const content_attr[] = { "content", NULL } ;
static const char* const type_attr[] = { "type", NULL } ;
static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
static const char* const object_contents[] = { FLOW, "param", NULL } ;
static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
static const char* const option_elt[] = { "option", NULL } ;
static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
static const char* const width_attr[] = { "width", NULL } ;
static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
static const char* const language_attr[] = { "language", NULL } ;
static const char* const select_content[] = { "optgroup", "option", NULL } ;
static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
static const char* const tr_elt[] = { "tr", NULL } ;
static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
static const char* const tr_contents[] = { "th", "td", NULL } ;
static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
static const char* const li_elt[] = { "li", NULL } ;
static const char* const ul_depr[] = { "type", "compact", NULL} ;
static const char* const dir_attr[] = { "dir", NULL} ;
 762
/*
 * DECL casts one of the "const char* const" name lists to the
 * "const char**" form used in the table initializers below.
 */
#define DECL (const char**)

/*
 * Table of the known HTML elements: one htmlElemDesc initializer per
 * element, listed in alphabetical order of the element name.
 * htmlTagLookup() scans it linearly, comparing the name member
 * case-insensitively. htmlAutoCloseOnClose() emits a tag mismatch error
 * for each element it pops whose entry has an endTag member equal to 3.
 * NOTE(review): the meaning of the positional fields is given by the
 * htmlElemDesc declaration (presumably in libxml/HTMLparser.h), which is
 * not visible here - confirm against it before editing an entry.
 */
static const htmlElemDesc
html40ElementTable[] = {
{ "a",          0, 0, 0, 0, 0, 0, 1, "anchor ",
        DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
},
{ "abbr",       0, 0, 0, 0, 0, 0, 1, "abbreviated form",
        DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "acronym",    0, 0, 0, 0, 0, 0, 1, "",
        DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "address",    0, 0, 0, 0, 0, 0, 0, "information on author ",
        DECL inline_p  , NULL , DECL html_attrs, NULL, NULL
},
{ "applet",     0, 0, 0, 0, 1, 1, 2, "java applet ",
        DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
},
{ "area",       0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
        EMPTY ,  NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
},
{ "b",          0, 3, 0, 0, 0, 0, 1, "bold text style",
        DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "base",       0, 2, 2, 1, 0, 0, 0, "document base uri ",
        EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
},
{ "basefont",   0, 2, 2, 1, 1, 1, 1, "base font size " ,
        EMPTY , NULL , NULL, DECL basefont_attrs, NULL
},
{ "bdo",        0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
        DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
},
{ "big",        0, 3, 0, 0, 0, 0, 1, "large text style",
        DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
        DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
},
{ "body",       1, 1, 0, 0, 0, 0, 0, "document body ",
        DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
},
{ "br",         0, 2, 2, 1, 0, 0, 1, "forced line break ",
        EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
},
{ "button",     0, 0, 0, 0, 0, 0, 2, "push button ",
        DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
},
{ "caption",    0, 0, 0, 0, 0, 0, 0, "table caption ",
        DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "center",     0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
        DECL html_flow , NULL , NULL, DECL html_attrs, NULL
},
{ "cite",       0, 0, 0, 0, 0, 0, 1, "citation",
        DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "code",       0, 0, 0, 0, 0, 0, 1, "computer code fragment",
        DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "col",        0, 2, 2, 1, 0, 0, 0, "table column ",
        EMPTY , NULL , DECL col_attrs , NULL, NULL
},
{ "colgroup",   0, 1, 0, 0, 0, 0, 0, "table column group ",
        DECL col_elt , "col" , DECL col_attrs , NULL, NULL
},
{ "dd",         0, 1, 0, 0, 0, 0, 0, "definition description ",
        DECL html_flow , NULL , DECL html_attrs, NULL, NULL
},
{ "del",        0, 0, 0, 0, 0, 0, 2, "deleted text ",
        DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
},
{ "dfn",        0, 0, 0, 0, 0, 0, 1, "instance definition",
        DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "dir",        0, 0, 0, 0, 1, 1, 0, "directory list",
        DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
},
{ "div",        0, 0, 0, 0, 0, 0, 0, "generic language/style container",
        DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "dl",         0, 0, 0, 0, 0, 0, 0, "definition list ",
        DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
},
{ "dt",         0, 1, 0, 0, 0, 0, 0, "definition term ",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "em",         0, 3, 0, 0, 0, 0, 1, "emphasis",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "embed",      0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
        EMPTY, NULL, DECL embed_attrs, NULL, NULL
},
{ "fieldset",   0, 0, 0, 0, 0, 0, 0, "form control group ",
        DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
},
{ "font",       0, 3, 0, 0, 1, 1, 1, "local change to font ",
        DECL html_inline, NULL, NULL, DECL font_attrs, NULL
},
{ "form",       0, 0, 0, 0, 0, 0, 0, "interactive form ",
        DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
},
{ "frame",      0, 2, 2, 1, 0, 2, 0, "subwindow " ,
        EMPTY, NULL, NULL, DECL frame_attrs, NULL
},
{ "frameset",   0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
        DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
},
{ "h1",         0, 0, 0, 0, 0, 0, 0, "heading ",
        DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h2",         0, 0, 0, 0, 0, 0, 0, "heading ",
        DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h3",         0, 0, 0, 0, 0, 0, 0, "heading ",
        DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h4",         0, 0, 0, 0, 0, 0, 0, "heading ",
        DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h5",         0, 0, 0, 0, 0, 0, 0, "heading ",
        DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h6",         0, 0, 0, 0, 0, 0, 0, "heading ",
        DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "head",       1, 1, 0, 0, 0, 0, 0, "document head ",
        DECL head_contents, NULL, DECL head_attrs, NULL, NULL
},
{ "hr",         0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
        EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
},
{ "html",       1, 1, 0, 0, 0, 0, 0, "document root element ",
        DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
},
{ "i",          0, 3, 0, 0, 0, 0, 1, "italic text style",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "iframe",     0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
        DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
},
{ "img",        0, 2, 2, 1, 0, 0, 1, "embedded image ",
        EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
},
{ "input",      0, 2, 2, 1, 0, 0, 1, "form control ",
        EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
},
{ "ins",        0, 0, 0, 0, 0, 0, 2, "inserted text",
        DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
},
{ "isindex",    0, 2, 2, 1, 1, 1, 0, "single line prompt ",
        EMPTY, NULL, NULL, DECL prompt_attrs, NULL
},
{ "kbd",        0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "label",      0, 0, 0, 0, 0, 0, 1, "form field label text ",
        DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
},
{ "legend",     0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
        DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
},
{ "li",         0, 1, 1, 0, 0, 0, 0, "list item ",
        DECL html_flow, NULL, DECL html_attrs, NULL, NULL
},
{ "link",       0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
        EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
},
{ "map",        0, 0, 0, 0, 0, 0, 2, "client-side image map ",
        DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
},
{ "menu",       0, 0, 0, 0, 1, 1, 0, "menu list ",
        DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
},
{ "meta",       0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
        EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
},
{ "noframes",   0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
        DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
},
{ "noscript",   0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
        DECL html_flow, "div", DECL html_attrs, NULL, NULL
},
{ "object",     0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
        DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
},
{ "ol",         0, 0, 0, 0, 0, 0, 0, "ordered list ",
        DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
},
{ "optgroup",   0, 0, 0, 0, 0, 0, 0, "option group ",
        DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
},
{ "option",     0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
        DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
},
{ "p",          0, 1, 0, 0, 0, 0, 0, "paragraph ",
        DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "param",      0, 2, 2, 1, 0, 0, 0, "named property value ",
        EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
},
{ "pre",        0, 0, 0, 0, 0, 0, 0, "preformatted text ",
        DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
},
{ "q",          0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
        DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
},
{ "s",          0, 3, 0, 0, 1, 1, 1, "strike-through text style",
        DECL html_inline, NULL, NULL, DECL html_attrs, NULL
},
{ "samp",       0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "script",     0, 0, 0, 0, 0, 0, 2, "script statements ",
        DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
},
{ "select",     0, 0, 0, 0, 0, 0, 1, "option selector ",
        DECL select_content, NULL, DECL select_attrs, NULL, NULL
},
{ "small",      0, 3, 0, 0, 0, 0, 1, "small text style",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "span",       0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "strike",     0, 3, 0, 0, 1, 1, 1, "strike-through text",
        DECL html_inline, NULL, NULL, DECL html_attrs, NULL
},
{ "strong",     0, 3, 0, 0, 0, 0, 1, "strong emphasis",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "style",      0, 0, 0, 0, 0, 0, 0, "style info ",
        DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
},
{ "sub",        0, 3, 0, 0, 0, 0, 1, "subscript",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "sup",        0, 3, 0, 0, 0, 0, 1, "superscript ",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "table",      0, 0, 0, 0, 0, 0, 0, "",
        DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
},
{ "tbody",      1, 0, 0, 0, 0, 0, 0, "table body ",
        DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
},
{ "td",         0, 0, 0, 0, 0, 0, 0, "table data cell",
        DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
},
{ "textarea",   0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
        DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
},
{ "tfoot",      0, 1, 0, 0, 0, 0, 0, "table footer ",
        DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
},
{ "th",         0, 1, 0, 0, 0, 0, 0, "table header cell",
        DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
},
{ "thead",      0, 1, 0, 0, 0, 0, 0, "table header ",
        DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
},
{ "title",      0, 0, 0, 0, 0, 0, 0, "document title ",
        DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
},
{ "tr",         0, 0, 0, 0, 0, 0, 0, "table row ",
        DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
},
{ "tt",         0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "u",          0, 3, 0, 0, 1, 1, 1, "underlined text style",
        DECL html_inline, NULL, NULL, DECL html_attrs, NULL
},
{ "ul",         0, 0, 0, 0, 0, 0, 0, "unordered list ",
        DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
},
{ "var",        0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
}
};
1044
/*
 * Start tags that imply the end of the current element.
 *
 * The array is a flat sequence of NULL-terminated groups. In each group
 * the first string is the name of a start tag and the following strings
 * are the names of the open elements that this start tag closes. An extra
 * NULL after the last group terminates the whole table.
 * htmlInitAutoClose() records the address of the first string of each
 * group in htmlStartCloseIndex, and htmlCheckAutoClose() walks a group to
 * answer whether a new tag closes a given open element.
 */
static const char * const htmlStartClose[] = {
"form",         "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
                "dl", "ul", "ol", "menu", "dir", "address", "pre",
                "listing", "xmp", "head", NULL,
"head",         "p", NULL,
"title",        "p", NULL,
"body",         "head", "style", "link", "title", "p", NULL,
"frameset",     "head", "style", "link", "title", "p", NULL,
"li",           "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
                "pre", "listing", "xmp", "head", "li", NULL,
"hr",           "p", "head", NULL,
"h1",           "p", "head", NULL,
"h2",           "p", "head", NULL,
"h3",           "p", "head", NULL,
"h4",           "p", "head", NULL,
"h5",           "p", "head", NULL,
"h6",           "p", "head", NULL,
"dir",          "p", "head", NULL,
"address",      "p", "head", "ul", NULL,
"pre",          "p", "head", "ul", NULL,
"listing",      "p", "head", NULL,
"xmp",          "p", "head", NULL,
"blockquote",   "p", "head", NULL,
"dl",           "p", "dt", "menu", "dir", "address", "pre", "listing",
                "xmp", "head", NULL,
"dt",           "p", "menu", "dir", "address", "pre", "listing", "xmp",
                "head", "dd", NULL,
"dd",           "p", "menu", "dir", "address", "pre", "listing", "xmp",
                "head", "dt", NULL,
"ul",           "p", "head", "ol", "menu", "dir", "address", "pre",
                "listing", "xmp", NULL,
"ol",           "p", "head", "ul", NULL,
"menu",         "p", "head", "ul", NULL,
"p",            "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL,
"div",          "p", "head", NULL,
"noscript",     "p", "head", NULL,
"center",       "font", "b", "i", "p", "head", NULL,
"a",            "a", NULL,
"caption",      "p", NULL,
"colgroup",     "caption", "colgroup", "col", "p", NULL,
"col",          "caption", "col", "p", NULL,
"table",        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
                "listing", "xmp", "a", NULL,
"th",           "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
"td",           "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
"tr",           "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
"thead",        "caption", "col", "colgroup", NULL,
"tfoot",        "th", "td", "tr", "caption", "col", "colgroup", "thead",
                "tbody", "p", NULL,
"tbody",        "th", "td", "tr", "caption", "col", "colgroup", "thead",
                "tfoot", "tbody", "p", NULL,
"optgroup",     "option", NULL,
"option",       "option", NULL,
"fieldset",     "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
                "pre", "listing", "xmp", "a", NULL,
NULL
};
1105
/*
 * The list of HTML elements which are not supposed to have
 * CDATA content and where a p element will be implied.
 * The list is NULL-terminated; htmlCheckParagraph() pushes a "p"
 * element when the name of the current element is found in it.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = {
    "html",
    "head",
    NULL
};
1118
/*
 * The list of HTML attributes which are of content %Script;
 * (the HTML 4 intrinsic event attributes).
 * The array has no NULL terminator: htmlIsScriptAttribute() derives the
 * number of entries with sizeof.
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 */
static const char *const htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",  /* form reset event; was misspelled "onrest" */
    "onchange",
    "onselect"
};
1144
/*
 * This table is used by the HTML parser to know what to do with
 * broken HTML pages. By assigning different priorities to different
 * elements the parser can decide how to handle extra end tags.
 * End tags are only allowed to close elements with lower or equal
 * priority.
 */

/* One entry of the priority table: an element name and its priority. */
typedef struct {
    const char *name;
    int priority;
} elementPriority;

/*
 * The entry whose name is NULL terminates the table; its priority is the
 * value htmlGetEndPriority() returns for any name not listed before it.
 */
static const elementPriority htmlEndPriority[] = {
    {"div",   150},
    {"td",    160},
    {"th",    160},
    {"tr",    170},
    {"thead", 180},
    {"tbody", 180},
    {"tfoot", 180},
    {"table", 190},
    {"head",  200},
    {"body",  200},
    {"html",  220},
    {NULL,    100} /* Default priority */
};
1172
/*
 * Index into htmlStartClose filled by htmlInitAutoClose(): each used slot
 * points at the first string of one group, unused slots are NULL.
 * htmlStartCloseIndexinitialized is set to 1 once the index is built.
 */
static const char** htmlStartCloseIndex[100];
static int htmlStartCloseIndexinitialized = 0;
1175
1176 /************************************************************************
1177  *                                                                      *
1178  *      functions to handle HTML specific data                  *
1179  *                                                                      *
1180  ************************************************************************/
1181
1182 /**
1183  * htmlInitAutoClose:
1184  *
1185  * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1186  * This is not reentrant. Call xmlInitParser() once before processing in
1187  * case of use in multithreaded programs.
1188  */
1189 void
1190 htmlInitAutoClose(void) {
1191     int indx, i = 0;
1192
1193     if (htmlStartCloseIndexinitialized) return;
1194
1195     for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1196     indx = 0;
1197     while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
1198         htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
1199         while (htmlStartClose[i] != NULL) i++;
1200         i++;
1201     }
1202     htmlStartCloseIndexinitialized = 1;
1203 }
1204
1205 /**
1206  * htmlTagLookup:
1207  * @tag:  The tag name in lowercase
1208  *
1209  * Lookup the HTML tag in the ElementTable
1210  *
1211  * Returns the related htmlElemDescPtr or NULL if not found.
1212  */
1213 const htmlElemDesc *
1214 htmlTagLookup(const xmlChar *tag) {
1215     unsigned int i;
1216
1217     for (i = 0; i < (sizeof(html40ElementTable) /
1218                      sizeof(html40ElementTable[0]));i++) {
1219         if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
1220             return((htmlElemDescPtr) &html40ElementTable[i]);
1221     }
1222     return(NULL);
1223 }
1224
1225 /**
1226  * htmlGetEndPriority:
1227  * @name: The name of the element to look up the priority for.
1228  *
1229  * Return value: The "endtag" priority.
1230  **/
1231 static int
1232 htmlGetEndPriority (const xmlChar *name) {
1233     int i = 0;
1234
1235     while ((htmlEndPriority[i].name != NULL) &&
1236            (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1237         i++;
1238
1239     return(htmlEndPriority[i].priority);
1240 }
1241
1242
1243 /**
1244  * htmlCheckAutoClose:
1245  * @newtag:  The new tag name
1246  * @oldtag:  The old tag name
1247  *
1248  * Checks whether the new tag is one of the registered valid tags for
1249  * closing old.
1250  * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1251  *
1252  * Returns 0 if no, 1 if yes.
1253  */
1254 static int
1255 htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1256 {
1257     int i, indx;
1258     const char **closed = NULL;
1259
1260     if (htmlStartCloseIndexinitialized == 0)
1261         htmlInitAutoClose();
1262
1263     /* inefficient, but not a big deal */
1264     for (indx = 0; indx < 100; indx++) {
1265         closed = htmlStartCloseIndex[indx];
1266         if (closed == NULL)
1267             return (0);
1268         if (xmlStrEqual(BAD_CAST * closed, newtag))
1269             break;
1270     }
1271
1272     i = closed - htmlStartClose;
1273     i++;
1274     while (htmlStartClose[i] != NULL) {
1275         if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
1276             return (1);
1277         }
1278         i++;
1279     }
1280     return (0);
1281 }
1282
1283 /**
1284  * htmlAutoCloseOnClose:
1285  * @ctxt:  an HTML parser context
1286  * @newtag:  The new tag name
1287  * @force:  force the tag closure
1288  *
1289  * The HTML DTD allows an ending tag to implicitly close other tags.
1290  */
1291 static void
1292 htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1293 {
1294     const htmlElemDesc *info;
1295     int i, priority;
1296
1297     priority = htmlGetEndPriority(newtag);
1298
1299     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1300
1301         if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1302             break;
1303         /*
1304          * A missplaced endtag can only close elements with lower
1305          * or equal priority, so if we find an element with higher
1306          * priority before we find an element with
1307          * matching name, we just ignore this endtag
1308          */
1309         if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1310             return;
1311     }
1312     if (i < 0)
1313         return;
1314
1315     while (!xmlStrEqual(newtag, ctxt->name)) {
1316         info = htmlTagLookup(ctxt->name);
1317         if ((info != NULL) && (info->endTag == 3)) {
1318             htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1319                          "Opening and ending tag mismatch: %s and %s\n",
1320                          newtag, ctxt->name);
1321         }
1322         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1323             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1324         htmlnamePop(ctxt);
1325     }
1326 }
1327
1328 /**
1329  * htmlAutoCloseOnEnd:
1330  * @ctxt:  an HTML parser context
1331  *
1332  * Close all remaining tags at the end of the stream
1333  */
1334 static void
1335 htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1336 {
1337     int i;
1338
1339     if (ctxt->nameNr == 0)
1340         return;
1341     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1342         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1343             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1344         htmlnamePop(ctxt);
1345     }
1346 }
1347
1348 /**
1349  * htmlAutoClose:
1350  * @ctxt:  an HTML parser context
1351  * @newtag:  The new tag name or NULL
1352  *
1353  * The HTML DTD allows a tag to implicitly close other tags.
1354  * The list is kept in htmlStartClose array. This function is
1355  * called when a new tag has been detected and generates the
1356  * appropriates closes if possible/needed.
1357  * If newtag is NULL this mean we are at the end of the resource
1358  * and we should check
1359  */
1360 static void
1361 htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1362 {
1363     while ((newtag != NULL) && (ctxt->name != NULL) &&
1364            (htmlCheckAutoClose(newtag, ctxt->name))) {
1365         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1366             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1367         htmlnamePop(ctxt);
1368     }
1369     if (newtag == NULL) {
1370         htmlAutoCloseOnEnd(ctxt);
1371         return;
1372     }
1373     while ((newtag == NULL) && (ctxt->name != NULL) &&
1374            ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1375             (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1376             (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
1377         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1378             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1379         htmlnamePop(ctxt);
1380     }
1381 }
1382
1383 /**
1384  * htmlAutoCloseTag:
1385  * @doc:  the HTML document
1386  * @name:  The tag name
1387  * @elem:  the HTML element
1388  *
1389  * The HTML DTD allows a tag to implicitly close other tags.
1390  * The list is kept in htmlStartClose array. This function checks
1391  * if the element or one of it's children would autoclose the
1392  * given tag.
1393  *
1394  * Returns 1 if autoclose, 0 otherwise
1395  */
1396 int
1397 htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1398     htmlNodePtr child;
1399
1400     if (elem == NULL) return(1);
1401     if (xmlStrEqual(name, elem->name)) return(0);
1402     if (htmlCheckAutoClose(elem->name, name)) return(1);
1403     child = elem->children;
1404     while (child != NULL) {
1405         if (htmlAutoCloseTag(doc, name, child)) return(1);
1406         child = child->next;
1407     }
1408     return(0);
1409 }
1410
1411 /**
1412  * htmlIsAutoClosed:
1413  * @doc:  the HTML document
1414  * @elem:  the HTML element
1415  *
1416  * The HTML DTD allows a tag to implicitly close other tags.
1417  * The list is kept in htmlStartClose array. This function checks
1418  * if a tag is autoclosed by one of it's child
1419  *
1420  * Returns 1 if autoclosed, 0 otherwise
1421  */
1422 int
1423 htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1424     htmlNodePtr child;
1425
1426     if (elem == NULL) return(1);
1427     child = elem->children;
1428     while (child != NULL) {
1429         if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1430         child = child->next;
1431     }
1432     return(0);
1433 }
1434
1435 /**
1436  * htmlCheckImplied:
1437  * @ctxt:  an HTML parser context
1438  * @newtag:  The new tag name
1439  *
1440  * The HTML DTD allows a tag to exists only implicitly
1441  * called when a new tag has been detected and generates the
1442  * appropriates implicit tags if missing
1443  */
1444 static void
1445 htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1446     int i;
1447
1448     if (ctxt->options & HTML_PARSE_NOIMPLIED)
1449         return;
1450     if (!htmlOmittedDefaultValue)
1451         return;
1452     if (xmlStrEqual(newtag, BAD_CAST"html"))
1453         return;
1454     if (ctxt->nameNr <= 0) {
1455         htmlnamePush(ctxt, BAD_CAST"html");
1456         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1457             ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1458     }
1459     if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1460         return;
1461     if ((ctxt->nameNr <= 1) &&
1462         ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1463          (xmlStrEqual(newtag, BAD_CAST"style")) ||
1464          (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1465          (xmlStrEqual(newtag, BAD_CAST"link")) ||
1466          (xmlStrEqual(newtag, BAD_CAST"title")) ||
1467          (xmlStrEqual(newtag, BAD_CAST"base")))) {
1468         if (ctxt->html >= 3) {
1469             /* we already saw or generated an <head> before */
1470             return;
1471         }
1472         /*
1473          * dropped OBJECT ... i you put it first BODY will be
1474          * assumed !
1475          */
1476         htmlnamePush(ctxt, BAD_CAST"head");
1477         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1478             ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1479     } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1480                (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1481                (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1482         if (ctxt->html >= 10) {
1483             /* we already saw or generated a <body> before */
1484             return;
1485         }
1486         for (i = 0;i < ctxt->nameNr;i++) {
1487             if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1488                 return;
1489             }
1490             if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1491                 return;
1492             }
1493         }
1494
1495         htmlnamePush(ctxt, BAD_CAST"body");
1496         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1497             ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1498     }
1499 }
1500
1501 /**
1502  * htmlCheckParagraph
1503  * @ctxt:  an HTML parser context
1504  *
1505  * Check whether a p element need to be implied before inserting
1506  * characters in the current element.
1507  *
1508  * Returns 1 if a paragraph has been inserted, 0 if not and -1
1509  *         in case of error.
1510  */
1511
1512 static int
1513 htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1514     const xmlChar *tag;
1515     int i;
1516
1517     if (ctxt == NULL)
1518         return(-1);
1519     tag = ctxt->name;
1520     if (tag == NULL) {
1521         htmlAutoClose(ctxt, BAD_CAST"p");
1522         htmlCheckImplied(ctxt, BAD_CAST"p");
1523         htmlnamePush(ctxt, BAD_CAST"p");
1524         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1525             ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1526         return(1);
1527     }
1528     if (!htmlOmittedDefaultValue)
1529         return(0);
1530     for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1531         if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1532             htmlAutoClose(ctxt, BAD_CAST"p");
1533             htmlCheckImplied(ctxt, BAD_CAST"p");
1534             htmlnamePush(ctxt, BAD_CAST"p");
1535             if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1536                 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1537             return(1);
1538         }
1539     }
1540     return(0);
1541 }
1542
1543 /**
1544  * htmlIsScriptAttribute:
1545  * @name:  an attribute name
1546  *
1547  * Check if an attribute is of content type Script
1548  *
1549  * Returns 1 is the attribute is a script 0 otherwise
1550  */
1551 int
1552 htmlIsScriptAttribute(const xmlChar *name) {
1553     unsigned int i;
1554
1555     if (name == NULL)
1556       return(0);
1557     /*
1558      * all script attributes start with 'on'
1559      */
1560     if ((name[0] != 'o') || (name[1] != 'n'))
1561       return(0);
1562     for (i = 0;
1563          i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1564          i++) {
1565         if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1566             return(1);
1567     }
1568     return(0);
1569 }
1570
1571 /************************************************************************
1572  *                                                                      *
1573  *      The list of HTML predefined entities                    *
1574  *                                                                      *
1575  ************************************************************************/
1576
1577
/*
 * html40EntitiesTable:
 *
 * Table of the predefined HTML 4.0 character entities.  Each entry holds
 * the Unicode code point, the entity name (without the surrounding '&'
 * and ';') and a human readable description.
 *
 * The entries are listed by ascending code point.  htmlEntityValueLookup()
 * relies on this: it stops scanning at the first entry whose value is
 * greater than the one searched for.  Keep the table sorted when adding
 * entries.
 */
static const htmlEntityDesc  html40EntitiesTable[] = {
/*
 * the 4 absolute ones, plus apostrophe.
 */
{ 34,   "quot", "quotation mark = APL quote, U+0022 ISOnum" },
{ 38,   "amp",  "ampersand, U+0026 ISOnum" },
{ 39,   "apos", "single quote" },
{ 60,   "lt",   "less-than sign, U+003C ISOnum" },
{ 62,   "gt",   "greater-than sign, U+003E ISOnum" },

/*
 * A bunch still in the 128-255 range.
 * Replacing them really depends on the charset used.
 */
{ 160,  "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
{ 161,  "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
{ 162,  "cent", "cent sign, U+00A2 ISOnum" },
{ 163,  "pound","pound sign, U+00A3 ISOnum" },
{ 164,  "curren","currency sign, U+00A4 ISOnum" },
{ 165,  "yen",  "yen sign = yuan sign, U+00A5 ISOnum" },
{ 166,  "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
{ 167,  "sect", "section sign, U+00A7 ISOnum" },
{ 168,  "uml",  "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
{ 169,  "copy", "copyright sign, U+00A9 ISOnum" },
{ 170,  "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
{ 171,  "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
{ 172,  "not",  "not sign, U+00AC ISOnum" },
{ 173,  "shy",  "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
{ 174,  "reg",  "registered sign = registered trade mark sign, U+00AE ISOnum" },
{ 175,  "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
{ 176,  "deg",  "degree sign, U+00B0 ISOnum" },
{ 177,  "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
{ 178,  "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
{ 179,  "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
{ 180,  "acute","acute accent = spacing acute, U+00B4 ISOdia" },
{ 181,  "micro","micro sign, U+00B5 ISOnum" },
{ 182,  "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
{ 183,  "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
{ 184,  "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
{ 185,  "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
{ 186,  "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
{ 187,  "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
{ 188,  "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
{ 189,  "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
{ 190,  "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
{ 191,  "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
{ 192,  "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
{ 193,  "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
{ 194,  "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
{ 195,  "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
{ 196,  "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
{ 197,  "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
{ 198,  "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
{ 199,  "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
{ 200,  "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
{ 201,  "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
{ 202,  "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
{ 203,  "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
{ 204,  "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
{ 205,  "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
{ 206,  "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
{ 207,  "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
{ 208,  "ETH",  "latin capital letter ETH, U+00D0 ISOlat1" },
{ 209,  "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
{ 210,  "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
{ 211,  "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
{ 212,  "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
{ 213,  "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
{ 214,  "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
{ 215,  "times","multiplication sign, U+00D7 ISOnum" },
{ 216,  "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
{ 217,  "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
{ 218,  "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
{ 219,  "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
{ 220,  "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
{ 221,  "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
{ 222,  "THORN","latin capital letter THORN, U+00DE ISOlat1" },
{ 223,  "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
{ 224,  "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
{ 225,  "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
{ 226,  "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
{ 227,  "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
{ 228,  "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
{ 229,  "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
{ 230,  "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
{ 231,  "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
{ 232,  "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
{ 233,  "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
{ 234,  "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
{ 235,  "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
{ 236,  "igrave","latin small letter i with grave, U+00EC ISOlat1" },
{ 237,  "iacute","latin small letter i with acute, U+00ED ISOlat1" },
{ 238,  "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
{ 239,  "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
{ 240,  "eth",  "latin small letter eth, U+00F0 ISOlat1" },
{ 241,  "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
{ 242,  "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
{ 243,  "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
{ 244,  "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
{ 245,  "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
{ 246,  "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
{ 247,  "divide","division sign, U+00F7 ISOnum" },
{ 248,  "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
{ 249,  "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
{ 250,  "uacute","latin small letter u with acute, U+00FA ISOlat1" },
{ 251,  "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
{ 252,  "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
{ 253,  "yacute","latin small letter y with acute, U+00FD ISOlat1" },
{ 254,  "thorn","latin small letter thorn with, U+00FE ISOlat1" },
{ 255,  "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },

{ 338,  "OElig","latin capital ligature OE, U+0152 ISOlat2" },
{ 339,  "oelig","latin small ligature oe, U+0153 ISOlat2" },
{ 352,  "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
{ 353,  "scaron","latin small letter s with caron, U+0161 ISOlat2" },
{ 376,  "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },

/*
 * Anything below should really be kept as entity references
 */
{ 402,  "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },

{ 710,  "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
{ 732,  "tilde","small tilde, U+02DC ISOdia" },

{ 913,  "Alpha","greek capital letter alpha, U+0391" },
{ 914,  "Beta", "greek capital letter beta, U+0392" },
{ 915,  "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
{ 916,  "Delta","greek capital letter delta, U+0394 ISOgrk3" },
{ 917,  "Epsilon","greek capital letter epsilon, U+0395" },
{ 918,  "Zeta", "greek capital letter zeta, U+0396" },
{ 919,  "Eta",  "greek capital letter eta, U+0397" },
{ 920,  "Theta","greek capital letter theta, U+0398 ISOgrk3" },
{ 921,  "Iota", "greek capital letter iota, U+0399" },
{ 922,  "Kappa","greek capital letter kappa, U+039A" },
{ 923,  "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
{ 924,  "Mu",   "greek capital letter mu, U+039C" },
{ 925,  "Nu",   "greek capital letter nu, U+039D" },
{ 926,  "Xi",   "greek capital letter xi, U+039E ISOgrk3" },
{ 927,  "Omicron","greek capital letter omicron, U+039F" },
{ 928,  "Pi",   "greek capital letter pi, U+03A0 ISOgrk3" },
{ 929,  "Rho",  "greek capital letter rho, U+03A1" },
{ 931,  "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
{ 932,  "Tau",  "greek capital letter tau, U+03A4" },
{ 933,  "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
{ 934,  "Phi",  "greek capital letter phi, U+03A6 ISOgrk3" },
{ 935,  "Chi",  "greek capital letter chi, U+03A7" },
{ 936,  "Psi",  "greek capital letter psi, U+03A8 ISOgrk3" },
{ 937,  "Omega","greek capital letter omega, U+03A9 ISOgrk3" },

{ 945,  "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
{ 946,  "beta", "greek small letter beta, U+03B2 ISOgrk3" },
{ 947,  "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
{ 948,  "delta","greek small letter delta, U+03B4 ISOgrk3" },
{ 949,  "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
{ 950,  "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
{ 951,  "eta",  "greek small letter eta, U+03B7 ISOgrk3" },
{ 952,  "theta","greek small letter theta, U+03B8 ISOgrk3" },
{ 953,  "iota", "greek small letter iota, U+03B9 ISOgrk3" },
{ 954,  "kappa","greek small letter kappa, U+03BA ISOgrk3" },
{ 955,  "lambda","greek small letter lambda, U+03BB ISOgrk3" },
{ 956,  "mu",   "greek small letter mu, U+03BC ISOgrk3" },
{ 957,  "nu",   "greek small letter nu, U+03BD ISOgrk3" },
{ 958,  "xi",   "greek small letter xi, U+03BE ISOgrk3" },
{ 959,  "omicron","greek small letter omicron, U+03BF NEW" },
{ 960,  "pi",   "greek small letter pi, U+03C0 ISOgrk3" },
{ 961,  "rho",  "greek small letter rho, U+03C1 ISOgrk3" },
{ 962,  "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
{ 963,  "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
{ 964,  "tau",  "greek small letter tau, U+03C4 ISOgrk3" },
{ 965,  "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
{ 966,  "phi",  "greek small letter phi, U+03C6 ISOgrk3" },
{ 967,  "chi",  "greek small letter chi, U+03C7 ISOgrk3" },
{ 968,  "psi",  "greek small letter psi, U+03C8 ISOgrk3" },
{ 969,  "omega","greek small letter omega, U+03C9 ISOgrk3" },
{ 977,  "thetasym","greek small letter theta symbol, U+03D1 NEW" },
{ 978,  "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
{ 982,  "piv",  "greek pi symbol, U+03D6 ISOgrk3" },

{ 8194, "ensp", "en space, U+2002 ISOpub" },
{ 8195, "emsp", "em space, U+2003 ISOpub" },
{ 8201, "thinsp","thin space, U+2009 ISOpub" },
{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
{ 8205, "zwj",  "zero width joiner, U+200D NEW RFC 2070" },
{ 8206, "lrm",  "left-to-right mark, U+200E NEW RFC 2070" },
{ 8207, "rlm",  "right-to-left mark, U+200F NEW RFC 2070" },
{ 8211, "ndash","en dash, U+2013 ISOpub" },
{ 8212, "mdash","em dash, U+2014 ISOpub" },
{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
{ 8224, "dagger","dagger, U+2020 ISOpub" },
{ 8225, "Dagger","double dagger, U+2021 ISOpub" },

{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },

{ 8240, "permil","per mille sign, U+2030 ISOtech" },

{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },

{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },

{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
{ 8260, "frasl","fraction slash, U+2044 NEW" },

{ 8364, "euro", "euro sign, U+20AC NEW" },

{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },

{ 8704, "forall","for all, U+2200 ISOtech" },
{ 8706, "part", "partial differential, U+2202 ISOtech" },
{ 8707, "exist","there exists, U+2203 ISOtech" },
{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
{ 8712, "isin", "element of, U+2208 ISOtech" },
{ 8713, "notin","not an element of, U+2209 ISOtech" },
{ 8715, "ni",   "contains as member, U+220B ISOtech" },
{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
{ 8721, "sum",  "n-ary summation, U+2211 ISOamsb" },
{ 8722, "minus","minus sign, U+2212 ISOtech" },
{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
{ 8733, "prop", "proportional to, U+221D ISOtech" },
{ 8734, "infin","infinity, U+221E ISOtech" },
{ 8736, "ang",  "angle, U+2220 ISOamso" },
{ 8743, "and",  "logical and = wedge, U+2227 ISOtech" },
{ 8744, "or",   "logical or = vee, U+2228 ISOtech" },
{ 8745, "cap",  "intersection = cap, U+2229 ISOtech" },
{ 8746, "cup",  "union = cup, U+222A ISOtech" },
{ 8747, "int",  "integral, U+222B ISOtech" },
{ 8756, "there4","therefore, U+2234 ISOtech" },
{ 8764, "sim",  "tilde operator = varies with = similar to, U+223C ISOtech" },
{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
{ 8800, "ne",   "not equal to, U+2260 ISOtech" },
{ 8801, "equiv","identical to, U+2261 ISOtech" },
{ 8804, "le",   "less-than or equal to, U+2264 ISOtech" },
{ 8805, "ge",   "greater-than or equal to, U+2265 ISOtech" },
{ 8834, "sub",  "subset of, U+2282 ISOtech" },
{ 8835, "sup",  "superset of, U+2283 ISOtech" },
{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
{ 8971, "rfloor","right floor, U+230B ISOamsc" },
{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
{ 9674, "loz",  "lozenge, U+25CA ISOpub" },

{ 9824, "spades","black spade suit, U+2660 ISOpub" },
{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
{ 9830, "diams","black diamond suit, U+2666 ISOpub" },

};
1860
1861 /************************************************************************
1862  *                                                                      *
1863  *              Commodity functions to handle entities                  *
1864  *                                                                      *
1865  ************************************************************************/
1866
/*
 * growBuffer(buffer):
 *
 * Macro used to grow the current buffer.  It doubles the variable
 * named <buffer>_size and reallocates <buffer> to that many xmlChar
 * with xmlRealloc().
 *
 * On allocation failure it reports the error through
 * htmlErrMemory(ctxt, ...), frees the old buffer and executes
 * return(NULL) in the function where the macro is expanded.
 *
 * Requirements at the expansion site:
 *   - a variable called `ctxt` must be in scope;
 *   - a variable called <buffer>_size must exist next to <buffer>;
 *   - the enclosing function must be able to return NULL.
 *
 * NOTE(review): the expansion is a bare { ... } block, not a
 * do { ... } while (0), so `growBuffer(x);` directly followed by an
 * `else` in an unbraced `if` would not compile.  The size is doubled
 * without any overflow check.
 */
#define growBuffer(buffer) {                                            \
    xmlChar *tmp;                                                       \
    buffer##_size *= 2;                                                 \
    tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
    if (tmp == NULL) {                                          \
        htmlErrMemory(ctxt, "growing buffer\n");                        \
        xmlFree(buffer);                                                \
        return(NULL);                                                   \
    }                                                                   \
    buffer = tmp;                                                       \
}
1881
1882 /**
1883  * htmlEntityLookup:
1884  * @name: the entity name
1885  *
1886  * Lookup the given entity in EntitiesTable
1887  *
1888  * TODO: the linear scan is really ugly, an hash table is really needed.
1889  *
1890  * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1891  */
1892 const htmlEntityDesc *
1893 htmlEntityLookup(const xmlChar *name) {
1894     unsigned int i;
1895
1896     for (i = 0;i < (sizeof(html40EntitiesTable)/
1897                     sizeof(html40EntitiesTable[0]));i++) {
1898         if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1899             return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1900         }
1901     }
1902     return(NULL);
1903 }
1904
1905 /**
1906  * htmlEntityValueLookup:
1907  * @value: the entity's unicode value
1908  *
1909  * Lookup the given entity in EntitiesTable
1910  *
1911  * TODO: the linear scan is really ugly, an hash table is really needed.
1912  *
1913  * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1914  */
1915 const htmlEntityDesc *
1916 htmlEntityValueLookup(unsigned int value) {
1917     unsigned int i;
1918
1919     for (i = 0;i < (sizeof(html40EntitiesTable)/
1920                     sizeof(html40EntitiesTable[0]));i++) {
1921         if (html40EntitiesTable[i].value >= value) {
1922             if (html40EntitiesTable[i].value > value)
1923                 break;
1924             return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1925         }
1926     }
1927     return(NULL);
1928 }
1929
1930 /**
1931  * UTF8ToHtml:
1932  * @out:  a pointer to an array of bytes to store the result
1933  * @outlen:  the length of @out
1934  * @in:  a pointer to an array of UTF-8 chars
1935  * @inlen:  the length of @in
1936  *
1937  * Take a block of UTF-8 chars in and try to convert it to an ASCII
1938  * plus HTML entities block of chars out.
1939  *
1940  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1941  * The value of @inlen after return is the number of octets consumed
1942  *     as the return value is positive, else unpredictable.
1943  * The value of @outlen after return is the number of octets consumed.
1944  */
1945 int
1946 UTF8ToHtml(unsigned char* out, int *outlen,
1947               const unsigned char* in, int *inlen) {
1948     const unsigned char* processed = in;
1949     const unsigned char* outend;
1950     const unsigned char* outstart = out;
1951     const unsigned char* instart = in;
1952     const unsigned char* inend;
1953     unsigned int c, d;
1954     int trailing;
1955
1956     if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
1957     if (in == NULL) {
1958         /*
1959          * initialization nothing to do
1960          */
1961         *outlen = 0;
1962         *inlen = 0;
1963         return(0);
1964     }
1965     inend = in + (*inlen);
1966     outend = out + (*outlen);
1967     while (in < inend) {
1968         d = *in++;
1969         if      (d < 0x80)  { c= d; trailing= 0; }
1970         else if (d < 0xC0) {
1971             /* trailing byte in leading position */
1972             *outlen = out - outstart;
1973             *inlen = processed - instart;
1974             return(-2);
1975         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
1976         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
1977         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
1978         else {
1979             /* no chance for this in Ascii */
1980             *outlen = out - outstart;
1981             *inlen = processed - instart;
1982             return(-2);
1983         }
1984
1985         if (inend - in < trailing) {
1986             break;
1987         }
1988
1989         for ( ; trailing; trailing--) {
1990             if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1991                 break;
1992             c <<= 6;
1993             c |= d & 0x3F;
1994         }
1995
1996         /* assertion: c is a single UTF-4 value */
1997         if (c < 0x80) {
1998             if (out + 1 >= outend)
1999                 break;
2000             *out++ = c;
2001         } else {
2002             int len;
2003             const htmlEntityDesc * ent;
2004             const char *cp;
2005             char nbuf[16];
2006
2007             /*
2008              * Try to lookup a predefined HTML entity for it
2009              */
2010
2011             ent = htmlEntityValueLookup(c);
2012             if (ent == NULL) {
2013               snprintf(nbuf, sizeof(nbuf), "#%u", c);
2014               cp = nbuf;
2015             }
2016             else
2017               cp = ent->name;
2018             len = strlen(cp);
2019             if (out + 2 + len >= outend)
2020                 break;
2021             *out++ = '&';
2022             memcpy(out, cp, len);
2023             out += len;
2024             *out++ = ';';
2025         }
2026         processed = in;
2027     }
2028     *outlen = out - outstart;
2029     *inlen = processed - instart;
2030     return(0);
2031 }
2032
2033 /**
2034  * htmlEncodeEntities:
2035  * @out:  a pointer to an array of bytes to store the result
2036  * @outlen:  the length of @out
2037  * @in:  a pointer to an array of UTF-8 chars
2038  * @inlen:  the length of @in
2039  * @quoteChar: the quote character to escape (' or ") or zero.
2040  *
2041  * Take a block of UTF-8 chars in and try to convert it to an ASCII
2042  * plus HTML entities block of chars out.
2043  *
2044  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2045  * The value of @inlen after return is the number of octets consumed
2046  *     as the return value is positive, else unpredictable.
2047  * The value of @outlen after return is the number of octets consumed.
2048  */
2049 int
2050 htmlEncodeEntities(unsigned char* out, int *outlen,
2051                    const unsigned char* in, int *inlen, int quoteChar) {
2052     const unsigned char* processed = in;
2053     const unsigned char* outend;
2054     const unsigned char* outstart = out;
2055     const unsigned char* instart = in;
2056     const unsigned char* inend;
2057     unsigned int c, d;
2058     int trailing;
2059
2060     if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2061         return(-1);
2062     outend = out + (*outlen);
2063     inend = in + (*inlen);
2064     while (in < inend) {
2065         d = *in++;
2066         if      (d < 0x80)  { c= d; trailing= 0; }
2067         else if (d < 0xC0) {
2068             /* trailing byte in leading position */
2069             *outlen = out - outstart;
2070             *inlen = processed - instart;
2071             return(-2);
2072         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
2073         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
2074         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
2075         else {
2076             /* no chance for this in Ascii */
2077             *outlen = out - outstart;
2078             *inlen = processed - instart;
2079             return(-2);
2080         }
2081
2082         if (inend - in < trailing)
2083             break;
2084
2085         while (trailing--) {
2086             if (((d= *in++) & 0xC0) != 0x80) {
2087                 *outlen = out - outstart;
2088                 *inlen = processed - instart;
2089                 return(-2);
2090             }
2091             c <<= 6;
2092             c |= d & 0x3F;
2093         }
2094
2095         /* assertion: c is a single UTF-4 value */
2096         if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2097             (c != '&') && (c != '<') && (c != '>')) {
2098             if (out >= outend)
2099                 break;
2100             *out++ = c;
2101         } else {
2102             const htmlEntityDesc * ent;
2103             const char *cp;
2104             char nbuf[16];
2105             int len;
2106
2107             /*
2108              * Try to lookup a predefined HTML entity for it
2109              */
2110             ent = htmlEntityValueLookup(c);
2111             if (ent == NULL) {
2112                 snprintf(nbuf, sizeof(nbuf), "#%u", c);
2113                 cp = nbuf;
2114             }
2115             else
2116                 cp = ent->name;
2117             len = strlen(cp);
2118             if (out + 2 + len > outend)
2119                 break;
2120             *out++ = '&';
2121             memcpy(out, cp, len);
2122             out += len;
2123             *out++ = ';';
2124         }
2125         processed = in;
2126     }
2127     *outlen = out - outstart;
2128     *inlen = processed - instart;
2129     return(0);
2130 }
2131
2132 /************************************************************************
2133  *                                                                      *
2134  *              Commodity functions to handle streams                   *
2135  *                                                                      *
2136  ************************************************************************/
2137
2138 /**
2139  * htmlNewInputStream:
2140  * @ctxt:  an HTML parser context
2141  *
2142  * Create a new input stream structure
2143  * Returns the new input stream or NULL
2144  */
2145 static htmlParserInputPtr
2146 htmlNewInputStream(htmlParserCtxtPtr ctxt) {
2147     htmlParserInputPtr input;
2148
2149     input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
2150     if (input == NULL) {
2151         htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
2152         return(NULL);
2153     }
2154     memset(input, 0, sizeof(htmlParserInput));
2155     input->filename = NULL;
2156     input->directory = NULL;
2157     input->base = NULL;
2158     input->cur = NULL;
2159     input->buf = NULL;
2160     input->line = 1;
2161     input->col = 1;
2162     input->buf = NULL;
2163     input->free = NULL;
2164     input->version = NULL;
2165     input->consumed = 0;
2166     input->length = 0;
2167     return(input);
2168 }
2169
2170
2171 /************************************************************************
2172  *                                                                      *
2173  *              Commodity functions, cleanup needed ?                   *
2174  *                                                                      *
2175  ************************************************************************/
/*
 * All the tags allowing PCDATA in the HTML 4.01 loose DTD, one line per
 * initial letter.
 * NOTE: it might be more appropriate to integrate this information
 * into the html40ElementTable array but I don't want to risk any
 * binary incompatibility
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
2190
2191 /**
2192  * areBlanks:
2193  * @ctxt:  an HTML parser context
2194  * @str:  a xmlChar *
2195  * @len:  the size of @str
2196  *
2197  * Is this a sequence of blank chars that one can ignore ?
2198  *
2199  * Returns 1 if ignorable 0 otherwise.
2200  */
2201
2202 static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2203     unsigned int i;
2204     int j;
2205     xmlNodePtr lastChild;
2206     xmlDtdPtr dtd;
2207
2208     for (j = 0;j < len;j++)
2209         if (!(IS_BLANK_CH(str[j]))) return(0);
2210
2211     if (CUR == 0) return(1);
2212     if (CUR != '<') return(0);
2213     if (ctxt->name == NULL)
2214         return(1);
2215     if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2216         return(1);
2217     if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2218         return(1);
2219
2220     /* Only strip CDATA children of the body tag for strict HTML DTDs */
2221     if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2222         dtd = xmlGetIntSubset(ctxt->myDoc);
2223         if (dtd != NULL && dtd->ExternalID != NULL) {
2224             if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2225                     !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2226                 return(1);
2227         }
2228     }
2229
2230     if (ctxt->node == NULL) return(0);
2231     lastChild = xmlGetLastChild(ctxt->node);
2232     while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2233         lastChild = lastChild->prev;
2234     if (lastChild == NULL) {
2235         if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2236             (ctxt->node->content != NULL)) return(0);
2237         /* keep ws in constructs like ...<b> </b>...
2238            for all tags "b" allowing PCDATA */
2239         for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2240             if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2241                 return(0);
2242             }
2243         }
2244     } else if (xmlNodeIsText(lastChild)) {
2245         return(0);
2246     } else {
2247         /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2248            for all tags "p" allowing PCDATA */
2249         for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2250             if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2251                 return(0);
2252             }
2253         }
2254     }
2255     return(1);
2256 }
2257
2258 /**
2259  * htmlNewDocNoDtD:
2260  * @URI:  URI for the dtd, or NULL
2261  * @ExternalID:  the external ID of the DTD, or NULL
2262  *
2263  * Creates a new HTML document without a DTD node if @URI and @ExternalID
2264  * are NULL
2265  *
2266  * Returns a new document, do not initialize the DTD if not provided
2267  */
2268 htmlDocPtr
2269 htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2270     xmlDocPtr cur;
2271
2272     /*
2273      * Allocate a new document and fill the fields.
2274      */
2275     cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2276     if (cur == NULL) {
2277         htmlErrMemory(NULL, "HTML document creation failed\n");
2278         return(NULL);
2279     }
2280     memset(cur, 0, sizeof(xmlDoc));
2281
2282     cur->type = XML_HTML_DOCUMENT_NODE;
2283     cur->version = NULL;
2284     cur->intSubset = NULL;
2285     cur->doc = cur;
2286     cur->name = NULL;
2287     cur->children = NULL;
2288     cur->extSubset = NULL;
2289     cur->oldNs = NULL;
2290     cur->encoding = NULL;
2291     cur->standalone = 1;
2292     cur->compression = 0;
2293     cur->ids = NULL;
2294     cur->refs = NULL;
2295     cur->_private = NULL;
2296     cur->charset = XML_CHAR_ENCODING_UTF8;
2297     cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
2298     if ((ExternalID != NULL) ||
2299         (URI != NULL))
2300         xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2301     return(cur);
2302 }
2303
2304 /**
2305  * htmlNewDoc:
2306  * @URI:  URI for the dtd, or NULL
2307  * @ExternalID:  the external ID of the DTD, or NULL
2308  *
2309  * Creates a new HTML document
2310  *
2311  * Returns a new document
2312  */
2313 htmlDocPtr
2314 htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2315     if ((URI == NULL) && (ExternalID == NULL))
2316         return(htmlNewDocNoDtD(
2317                     BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2318                     BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2319
2320     return(htmlNewDocNoDtD(URI, ExternalID));
2321 }
2322
2323
2324 /************************************************************************
2325  *                                                                      *
2326  *                      The parser itself                               *
2327  *      Relates to http://www.w3.org/TR/html40                          *
2328  *                                                                      *
2329  ************************************************************************/
2330
2331 /************************************************************************
2332  *                                                                      *
2333  *                      The parser itself                               *
2334  *                                                                      *
2335  ************************************************************************/
2336
2337 static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2338
2339 /**
2340  * htmlParseHTMLName:
2341  * @ctxt:  an HTML parser context
2342  *
2343  * parse an HTML tag or attribute name, note that we convert it to lowercase
2344  * since HTML names are not case-sensitive.
2345  *
2346  * Returns the Tag Name parsed or NULL
2347  */
2348
2349 static const xmlChar *
2350 htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2351     int i = 0;
2352     xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2353
2354     if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2355         (CUR != ':') && (CUR != '.')) return(NULL);
2356
2357     while ((i < HTML_PARSER_BUFFER_SIZE) &&
2358            ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
2359            (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2360            (CUR == '.'))) {
2361         if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2362         else loc[i] = CUR;
2363         i++;
2364
2365         NEXT;
2366     }
2367
2368     return(xmlDictLookup(ctxt->dict, loc, i));
2369 }
2370
2371
2372 /**
2373  * htmlParseHTMLName_nonInvasive:
2374  * @ctxt:  an HTML parser context
2375  *
2376  * parse an HTML tag or attribute name, note that we convert it to lowercase
2377  * since HTML names are not case-sensitive, this doesn't consume the data
2378  * from the stream, it's a look-ahead
2379  *
2380  * Returns the Tag Name parsed or NULL
2381  */
2382
2383 static const xmlChar *
2384 htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2385     int i = 0;
2386     xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2387
2388     if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2389         (NXT(1) != ':')) return(NULL);
2390
2391     while ((i < HTML_PARSER_BUFFER_SIZE) &&
2392            ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2393            (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2394         if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2395         else loc[i] = NXT(1+i);
2396         i++;
2397     }
2398
2399     return(xmlDictLookup(ctxt->dict, loc, i));
2400 }
2401
2402
2403 /**
2404  * htmlParseName:
2405  * @ctxt:  an HTML parser context
2406  *
2407  * parse an HTML name, this routine is case sensitive.
2408  *
2409  * Returns the Name parsed or NULL
2410  */
2411
2412 static const xmlChar *
2413 htmlParseName(htmlParserCtxtPtr ctxt) {
2414     const xmlChar *in;
2415     const xmlChar *ret;
2416     int count = 0;
2417
2418     GROW;
2419
2420     /*
2421      * Accelerator for simple ASCII names
2422      */
2423     in = ctxt->input->cur;
2424     if (((*in >= 0x61) && (*in <= 0x7A)) ||
2425         ((*in >= 0x41) && (*in <= 0x5A)) ||
2426         (*in == '_') || (*in == ':')) {
2427         in++;
2428         while (((*in >= 0x61) && (*in <= 0x7A)) ||
2429                ((*in >= 0x41) && (*in <= 0x5A)) ||
2430                ((*in >= 0x30) && (*in <= 0x39)) ||
2431                (*in == '_') || (*in == '-') ||
2432                (*in == ':') || (*in == '.'))
2433             in++;
2434         if ((*in > 0) && (*in < 0x80)) {
2435             count = in - ctxt->input->cur;
2436             ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2437             ctxt->input->cur = in;
2438             ctxt->nbChars += count;
2439             ctxt->input->col += count;
2440             return(ret);
2441         }
2442     }
2443     return(htmlParseNameComplex(ctxt));
2444 }
2445
2446 static const xmlChar *
2447 htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2448     int len = 0, l;
2449     int c;
2450     int count = 0;
2451
2452     /*
2453      * Handler for more complex cases
2454      */
2455     GROW;
2456     c = CUR_CHAR(l);
2457     if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2458         (!IS_LETTER(c) && (c != '_') &&
2459          (c != ':'))) {
2460         return(NULL);
2461     }
2462
2463     while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2464            ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2465             (c == '.') || (c == '-') ||
2466             (c == '_') || (c == ':') ||
2467             (IS_COMBINING(c)) ||
2468             (IS_EXTENDER(c)))) {
2469         if (count++ > 100) {
2470             count = 0;
2471             GROW;
2472         }
2473         len += l;
2474         NEXTL(l);
2475         c = CUR_CHAR(l);
2476     }
2477     return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
2478 }
2479
2480
/**
 * htmlParseHTMLAttribute:
 * @ctxt:  an HTML parser context
 * @stop:  a char stop value
 *
 * parse an HTML attribute value till the stop (quote), if
 * stop is 0 then it stops at the first blank or at '>'.
 * The stop character itself is not consumed.
 *
 * Character references ("&#...") and known entity references are
 * replaced by the UTF-8 encoding of their value. An '&' that is not
 * followed by a name is copied as is, and a reference to an unknown
 * entity is copied as '&' followed by the name.
 *
 * Returns the attribute value in a buffer allocated with
 * xmlMallocAtomic() and terminated by a 0 byte, or NULL if the initial
 * allocation fails.
 * NOTE(review): growBuffer() is a macro defined elsewhere in this file;
 * it presumably also returns NULL from this function when the buffer
 * cannot be enlarged - confirm.
 */

static xmlChar *
htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
    xmlChar *buffer = NULL;
    int buffer_size = 0;
    xmlChar *out = NULL;
    const xmlChar *name = NULL;
    const xmlChar *cur = NULL;
    const htmlEntityDesc * ent;

    /*
     * allocate a translation buffer.
     */
    buffer_size = HTML_PARSER_BUFFER_SIZE;
    buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
    if (buffer == NULL) {
        htmlErrMemory(ctxt, "buffer allocation failed\n");
        return(NULL);
    }
    out = buffer;

    /*
     * Ok loop until we reach one of the ending chars
     *
     * Every branch below grows the buffer as soon as fewer than 100
     * bytes remain, which leaves room for the few bytes a single
     * iteration can write.
     */
    while ((CUR != 0) && (CUR != stop)) {
        /* Unquoted value: '>' and blanks also end it. */
        if ((stop == 0) && (CUR == '>')) break;
        if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
        if (CUR == '&') {
            if (NXT(1) == '#') {
                /* Character reference: store its value as UTF-8. */
                unsigned int c;
                int bits;

                c = htmlParseCharRef(ctxt);
                /*
                 * Emit the UTF-8 lead byte; bits is the shift of the
                 * first continuation byte (-6 means there is none).
                 */
                if      (c <    0x80)
                        { *out++  = c;                bits= -6; }
                else if (c <   0x800)
                        { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
                else if (c < 0x10000)
                        { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
                else
                        { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }

                /* Continuation bytes, 6 bits at a time. */
                for ( ; bits >= 0; bits-= 6) {
                    *out++  = ((c >> bits) & 0x3F) | 0x80;
                }

                if (out - buffer > buffer_size - 100) {
                        /* keep the write position as an index while growing */
                        int indx = out - buffer;

                        growBuffer(buffer);
                        out = &buffer[indx];
                }
            } else {
                ent = htmlParseEntityRef(ctxt, &name);
                if (name == NULL) {
                    /* No name after '&': keep the ampersand literally. */
                    *out++ = '&';
                    if (out - buffer > buffer_size - 100) {
                        int indx = out - buffer;

                        growBuffer(buffer);
                        out = &buffer[indx];
                    }
                } else if (ent == NULL) {
                    /* No entity descriptor: copy '&' and the name as text. */
                    *out++ = '&';
                    cur = name;
                    while (*cur != 0) {
                        if (out - buffer > buffer_size - 100) {
                            int indx = out - buffer;

                            growBuffer(buffer);
                            out = &buffer[indx];
                        }
                        *out++ = *cur++;
                    }
                } else {
                    /* Known entity: store its value as UTF-8. */
                    unsigned int c;
                    int bits;

                    if (out - buffer > buffer_size - 100) {
                        int indx = out - buffer;

                        growBuffer(buffer);
                        out = &buffer[indx];
                    }
                    c = ent->value;
                    if      (c <    0x80)
                        { *out++  = c;                bits= -6; }
                    else if (c <   0x800)
                        { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
                    else if (c < 0x10000)
                        { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
                    else
                        { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }

                    for ( ; bits >= 0; bits-= 6) {
                        *out++  = ((c >> bits) & 0x3F) | 0x80;
                    }
                }
            }
        } else {
            /* Ordinary character: decode it and store it as UTF-8. */
            unsigned int c;
            int bits, l;

            if (out - buffer > buffer_size - 100) {
                int indx = out - buffer;

                growBuffer(buffer);
                out = &buffer[indx];
            }
            c = CUR_CHAR(l);
            if      (c <    0x80)
                    { *out++  = c;                bits= -6; }
            else if (c <   0x800)
                    { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
            else if (c < 0x10000)
                    { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
            else
                    { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }

            for ( ; bits >= 0; bits-= 6) {
                *out++  = ((c >> bits) & 0x3F) | 0x80;
            }
            NEXT;
        }
    }
    *out = 0;
    return(buffer);
}
2619
2620 /**
2621  * htmlParseEntityRef:
2622  * @ctxt:  an HTML parser context
2623  * @str:  location to store the entity name
2624  *
2625  * parse an HTML ENTITY references
2626  *
2627  * [68] EntityRef ::= '&' Name ';'
2628  *
2629  * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2630  *         if non-NULL *str will have to be freed by the caller.
2631  */
2632 const htmlEntityDesc *
2633 htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2634     const xmlChar *name;
2635     const htmlEntityDesc * ent = NULL;
2636
2637     if (str != NULL) *str = NULL;
2638     if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
2639
2640     if (CUR == '&') {
2641         NEXT;
2642         name = htmlParseName(ctxt);
2643         if (name == NULL) {
2644             htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2645                          "htmlParseEntityRef: no name\n", NULL, NULL);
2646         } else {
2647             GROW;
2648             if (CUR == ';') {
2649                 if (str != NULL)
2650                     *str = name;
2651
2652                 /*
2653                  * Lookup the entity in the table.
2654                  */
2655                 ent = htmlEntityLookup(name);
2656                 if (ent != NULL) /* OK that's ugly !!! */
2657                     NEXT;
2658             } else {
2659                 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2660                              "htmlParseEntityRef: expecting ';'\n",
2661                              NULL, NULL);
2662                 if (str != NULL)
2663                     *str = name;
2664             }
2665         }
2666     }
2667     return(ent);
2668 }
2669
2670 /**
2671  * htmlParseAttValue:
2672  * @ctxt:  an HTML parser context
2673  *
2674  * parse a value for an attribute
2675  * Note: the parser won't do substitution of entities here, this
2676  * will be handled later in xmlStringGetNodeList, unless it was
2677  * asked for ctxt->replaceEntities != 0
2678  *
2679  * Returns the AttValue parsed or NULL.
2680  */
2681
2682 static xmlChar *
2683 htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2684     xmlChar *ret = NULL;
2685
2686     if (CUR == '"') {
2687         NEXT;
2688         ret = htmlParseHTMLAttribute(ctxt, '"');
2689         if (CUR != '"') {
2690             htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2691                          "AttValue: \" expected\n", NULL, NULL);
2692         } else
2693             NEXT;
2694     } else if (CUR == '\'') {
2695         NEXT;
2696         ret = htmlParseHTMLAttribute(ctxt, '\'');
2697         if (CUR != '\'') {
2698             htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2699                          "AttValue: ' expected\n", NULL, NULL);
2700         } else
2701             NEXT;
2702     } else {
2703         /*
2704          * That's an HTMLism, the attribute value may not be quoted
2705          */
2706         ret = htmlParseHTMLAttribute(ctxt, 0);
2707         if (ret == NULL) {
2708             htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2709                          "AttValue: no value found\n", NULL, NULL);
2710         }
2711     }
2712     return(ret);
2713 }
2714
2715 /**
2716  * htmlParseSystemLiteral:
2717  * @ctxt:  an HTML parser context
2718  *
2719  * parse an HTML Literal
2720  *
2721  * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2722  *
2723  * Returns the SystemLiteral parsed or NULL
2724  */
2725
2726 static xmlChar *
2727 htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2728     const xmlChar *q;
2729     xmlChar *ret = NULL;
2730
2731     if (CUR == '"') {
2732         NEXT;
2733         q = CUR_PTR;
2734         while ((IS_CHAR_CH(CUR)) && (CUR != '"'))
2735             NEXT;
2736         if (!IS_CHAR_CH(CUR)) {
2737             htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2738                          "Unfinished SystemLiteral\n", NULL, NULL);
2739         } else {
2740             ret = xmlStrndup(q, CUR_PTR - q);
2741             NEXT;
2742         }
2743     } else if (CUR == '\'') {
2744         NEXT;
2745         q = CUR_PTR;
2746         while ((IS_CHAR_CH(CUR)) && (CUR != '\''))
2747             NEXT;
2748         if (!IS_CHAR_CH(CUR)) {
2749             htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2750                          "Unfinished SystemLiteral\n", NULL, NULL);
2751         } else {
2752             ret = xmlStrndup(q, CUR_PTR - q);
2753             NEXT;
2754         }
2755     } else {
2756         htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2757                      " or ' expected\n", NULL, NULL);
2758     }
2759
2760     return(ret);
2761 }
2762
2763 /**
2764  * htmlParsePubidLiteral:
2765  * @ctxt:  an HTML parser context
2766  *
2767  * parse an HTML public literal
2768  *
2769  * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2770  *
2771  * Returns the PubidLiteral parsed or NULL.
2772  */
2773
2774 static xmlChar *
2775 htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2776     const xmlChar *q;
2777     xmlChar *ret = NULL;
2778     /*
2779      * Name ::= (Letter | '_') (NameChar)*
2780      */
2781     if (CUR == '"') {
2782         NEXT;
2783         q = CUR_PTR;
2784         while (IS_PUBIDCHAR_CH(CUR)) NEXT;
2785         if (CUR != '"') {
2786             htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2787                          "Unfinished PubidLiteral\n", NULL, NULL);
2788         } else {
2789             ret = xmlStrndup(q, CUR_PTR - q);
2790             NEXT;
2791         }
2792     } else if (CUR == '\'') {
2793         NEXT;
2794         q = CUR_PTR;
2795         while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\''))
2796             NEXT;
2797         if (CUR != '\'') {
2798             htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2799                          "Unfinished PubidLiteral\n", NULL, NULL);
2800         } else {
2801             ret = xmlStrndup(q, CUR_PTR - q);
2802             NEXT;
2803         }
2804     } else {
2805         htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2806                      "PubidLiteral \" or ' expected\n", NULL, NULL);
2807     }
2808
2809     return(ret);
2810 }
2811
2812 /**
2813  * htmlParseScript:
2814  * @ctxt:  an HTML parser context
2815  *
2816  * parse the content of an HTML SCRIPT or STYLE element
2817  * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2818  * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2819  * http://www.w3.org/TR/html4/types.html#type-script
2820  * http://www.w3.org/TR/html4/types.html#h-6.15
2821  * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2822  *
2823  * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2824  * element and the value of intrinsic event attributes. User agents must
2825  * not evaluate script data as HTML markup but instead must pass it on as
2826  * data to a script engine.
2827  * NOTES:
2828  * - The content is passed like CDATA
2829  * - the attributes for style and scripting "onXXX" are also described
2830  *   as CDATA but SGML allows entities references in attributes so their
2831  *   processing is identical as other attributes
2832  */
2833 static void
2834 htmlParseScript(htmlParserCtxtPtr ctxt) {
2835     xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2836     int nbchar = 0;
2837     int cur,l;
2838
2839     SHRINK;
2840     cur = CUR_CHAR(l);
2841     while (IS_CHAR_CH(cur)) {
2842         if ((cur == '<') && (NXT(1) == '/')) {
2843             /*
2844              * One should break here, the specification is clear:
2845              * Authors should therefore escape "</" within the content.
2846              * Escape mechanisms are specific to each scripting or
2847              * style sheet language.
2848              *
2849              * In recovery mode, only break if end tag match the
2850              * current tag, effectively ignoring all tags inside the
2851              * script/style block and treating the entire block as
2852              * CDATA.
2853              */
2854             if (ctxt->recovery) {
2855                 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
2856                                    xmlStrlen(ctxt->name)) == 0)
2857                 {
2858                     break; /* while */
2859                 } else {
2860                     htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
2861                                  "Element %s embeds close tag\n",
2862                                  ctxt->name, NULL);
2863                 }
2864             } else {
2865                 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2866                     ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2867                 {
2868                     break; /* while */
2869                 }
2870             }
2871         }
2872         COPY_BUF(l,buf,nbchar,cur);
2873         if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2874             if (ctxt->sax->cdataBlock!= NULL) {
2875                 /*
2876                  * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2877                  */
2878                 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2879             } else if (ctxt->sax->characters != NULL) {
2880                 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2881             }
2882             nbchar = 0;
2883         }
2884         GROW;
2885         NEXTL(l);
2886         cur = CUR_CHAR(l);
2887     }
2888
2889     if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) {
2890         htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2891                         "Invalid char in CDATA 0x%X\n", cur);
2892         NEXT;
2893     }
2894
2895     if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2896         if (ctxt->sax->cdataBlock!= NULL) {
2897             /*
2898              * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2899              */
2900             ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2901         } else if (ctxt->sax->characters != NULL) {
2902             ctxt->sax->characters(ctxt->userData, buf, nbchar);
2903         }
2904     }
2905 }
2906
2907
/**
 * htmlParseCharData:
 * @ctxt:  an HTML parser context
 *
 * parse a CharData section.
 *
 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
 *
 * Characters are accumulated up to the next '<', '&' or end of input
 * and delivered through SAX in chunks: as ignorableWhitespace when
 * areBlanks() accepts the chunk, as characters otherwise (after
 * htmlCheckParagraph()). Note that this routine does not look for
 * ']]>' itself.
 */

static void
htmlParseCharData(htmlParserCtxtPtr ctxt) {
    xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
    int nbchar = 0;
    int cur, l;
    int chunk = 0;

    SHRINK;
    cur = CUR_CHAR(l);
    /*
     * A '<' (resp. '&') ends the run unless ctxt->token currently holds
     * that same character; a 0 character always ends it.
     */
    while (((cur != '<') || (ctxt->token == '<')) &&
           ((cur != '&') || (ctxt->token == '&')) &&
           (cur != 0)) {
        if (!(IS_CHAR(cur))) {
            /* Invalid characters are reported and dropped from the output. */
            htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
                        "Invalid char in CDATA 0x%X\n", cur);
        } else {
            COPY_BUF(l,buf,nbchar,cur);
        }
        if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
            /*
             * Ok the segment is to be consumed as chars.
             */
            if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
                if (areBlanks(ctxt, buf, nbchar)) {
                    if (ctxt->sax->ignorableWhitespace != NULL)
                        ctxt->sax->ignorableWhitespace(ctxt->userData,
                                                       buf, nbchar);
                } else {
                    htmlCheckParagraph(ctxt);
                    if (ctxt->sax->characters != NULL)
                        ctxt->sax->characters(ctxt->userData, buf, nbchar);
                }
            }
            nbchar = 0;
        }
        NEXTL(l);
        chunk++;
        /* Periodically discard consumed input and read more. */
        if (chunk > HTML_PARSER_BUFFER_SIZE) {
            chunk = 0;
            SHRINK;
            GROW;
        }
        cur = CUR_CHAR(l);
        /* On a 0 character, try to get more input before giving up. */
        if (cur == 0) {
            SHRINK;
            GROW;
            cur = CUR_CHAR(l);
        }
    }
    if (nbchar != 0) {
        /* Only the final chunk is 0-terminated before delivery. */
        buf[nbchar] = 0;

        /*
         * Ok the segment is to be consumed as chars.
         */
        if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
            if (areBlanks(ctxt, buf, nbchar)) {
                if (ctxt->sax->ignorableWhitespace != NULL)
                    ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
            } else {
                htmlCheckParagraph(ctxt);
                if (ctxt->sax->characters != NULL)
                    ctxt->sax->characters(ctxt->userData, buf, nbchar);
            }
        }
    } else {
        /*
         * Loop detection: nothing is buffered and the current character
         * is 0, so mark the parser as being at EOF.
         */
        if (cur == 0)
            ctxt->instate = XML_PARSER_EOF;
    }
}
2991
2992 /**
2993  * htmlParseExternalID:
2994  * @ctxt:  an HTML parser context
2995  * @publicID:  a xmlChar** receiving PubidLiteral
2996  *
2997  * Parse an External ID or a Public ID
2998  *
2999  * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
3000  *                   | 'PUBLIC' S PubidLiteral S SystemLiteral
3001  *
3002  * [83] PublicID ::= 'PUBLIC' S PubidLiteral
3003  *
3004  * Returns the function returns SystemLiteral and in the second
3005  *                case publicID receives PubidLiteral, is strict is off
3006  *                it is possible to return NULL and have publicID set.
3007  */
3008
3009 static xmlChar *
3010 htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
3011     xmlChar *URI = NULL;
3012
3013     if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3014          (UPP(2) == 'S') && (UPP(3) == 'T') &&
3015          (UPP(4) == 'E') && (UPP(5) == 'M')) {
3016         SKIP(6);
3017         if (!IS_BLANK_CH(CUR)) {
3018             htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3019                          "Space required after 'SYSTEM'\n", NULL, NULL);
3020         }
3021         SKIP_BLANKS;
3022         URI = htmlParseSystemLiteral(ctxt);
3023         if (URI == NULL) {
3024             htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
3025                          "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
3026         }
3027     } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
3028                (UPP(2) == 'B') && (UPP(3) == 'L') &&
3029                (UPP(4) == 'I') && (UPP(5) == 'C')) {
3030         SKIP(6);
3031         if (!IS_BLANK_CH(CUR)) {
3032             htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3033                          "Space required after 'PUBLIC'\n", NULL, NULL);
3034         }
3035         SKIP_BLANKS;
3036         *publicID = htmlParsePubidLiteral(ctxt);
3037         if (*publicID == NULL) {
3038             htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
3039                          "htmlParseExternalID: PUBLIC, no Public Identifier\n",
3040                          NULL, NULL);
3041         }
3042         SKIP_BLANKS;
3043         if ((CUR == '"') || (CUR == '\'')) {
3044             URI = htmlParseSystemLiteral(ctxt);
3045         }
3046     }
3047     return(URI);
3048 }
3049
3050 /**
3051  * xmlParsePI:
3052  * @ctxt:  an XML parser context
3053  *
3054  * parse an XML Processing Instruction.
3055  *
3056  * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
3057  */
3058 static void
3059 htmlParsePI(htmlParserCtxtPtr ctxt) {
3060     xmlChar *buf = NULL;
3061     int len = 0;
3062     int size = HTML_PARSER_BUFFER_SIZE;
3063     int cur, l;
3064     const xmlChar *target;
3065     xmlParserInputState state;
3066     int count = 0;
3067
3068     if ((RAW == '<') && (NXT(1) == '?')) {
3069         state = ctxt->instate;
3070         ctxt->instate = XML_PARSER_PI;
3071         /*
3072          * this is a Processing Instruction.
3073          */
3074         SKIP(2);
3075         SHRINK;
3076
3077         /*
3078          * Parse the target name and check for special support like
3079          * namespace.
3080          */
3081         target = htmlParseName(ctxt);
3082         if (target != NULL) {
3083             if (RAW == '>') {
3084                 SKIP(1);
3085
3086                 /*
3087                  * SAX: PI detected.
3088                  */
3089                 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3090                     (ctxt->sax->processingInstruction != NULL))
3091                     ctxt->sax->processingInstruction(ctxt->userData,
3092                                                      target, NULL);
3093                 ctxt->instate = state;
3094                 return;
3095             }
3096             buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3097             if (buf == NULL) {
3098                 htmlErrMemory(ctxt, NULL);
3099                 ctxt->instate = state;
3100                 return;
3101             }
3102             cur = CUR;
3103             if (!IS_BLANK(cur)) {
3104                 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3105                           "ParsePI: PI %s space expected\n", target, NULL);
3106             }
3107             SKIP_BLANKS;
3108             cur = CUR_CHAR(l);
3109             while (IS_CHAR(cur) && (cur != '>')) {
3110                 if (len + 5 >= size) {
3111                     xmlChar *tmp;
3112
3113                     size *= 2;
3114                     tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3115                     if (tmp == NULL) {
3116                         htmlErrMemory(ctxt, NULL);
3117                         xmlFree(buf);
3118                         ctxt->instate = state;
3119                         return;
3120                     }
3121                     buf = tmp;
3122                 }
3123                 count++;
3124                 if (count > 50) {
3125                     GROW;
3126                     count = 0;
3127                 }
3128                 COPY_BUF(l,buf,len,cur);
3129                 NEXTL(l);
3130                 cur = CUR_CHAR(l);
3131                 if (cur == 0) {
3132                     SHRINK;
3133                     GROW;
3134                     cur = CUR_CHAR(l);
3135                 }
3136             }
3137             buf[len] = 0;
3138             if (cur != '>') {
3139                 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3140                       "ParsePI: PI %s never end ...\n", target, NULL);
3141             } else {
3142                 SKIP(1);
3143
3144                 /*
3145                  * SAX: PI detected.
3146                  */
3147                 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3148                     (ctxt->sax->processingInstruction != NULL))
3149                     ctxt->sax->processingInstruction(ctxt->userData,
3150                                                      target, buf);
3151             }
3152             xmlFree(buf);
3153         } else {
3154             htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
3155                          "PI is not started correctly", NULL, NULL);
3156         }
3157         ctxt->instate = state;
3158     }
3159 }
3160
3161 /**
3162  * htmlParseComment:
3163  * @ctxt:  an HTML parser context
3164  *
3165  * Parse an XML (SGML) comment <!-- .... -->
3166  *
3167  * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3168  */
3169 static void
3170 htmlParseComment(htmlParserCtxtPtr ctxt) {
3171     xmlChar *buf = NULL;
3172     int len;
3173     int size = HTML_PARSER_BUFFER_SIZE;
3174     int q, ql;
3175     int r, rl;
3176     int cur, l;
3177     xmlParserInputState state;
3178
3179     /*
3180      * Check that there is a comment right here.
3181      */
3182     if ((RAW != '<') || (NXT(1) != '!') ||
3183         (NXT(2) != '-') || (NXT(3) != '-')) return;
3184
3185     state = ctxt->instate;
3186     ctxt->instate = XML_PARSER_COMMENT;
3187     SHRINK;
3188     SKIP(4);
3189     buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3190     if (buf == NULL) {
3191         htmlErrMemory(ctxt, "buffer allocation failed\n");
3192         ctxt->instate = state;
3193         return;
3194     }
3195     q = CUR_CHAR(ql);
3196     NEXTL(ql);
3197     r = CUR_CHAR(rl);
3198     NEXTL(rl);
3199     cur = CUR_CHAR(l);
3200     len = 0;
3201     while (IS_CHAR(cur) &&
3202            ((cur != '>') ||
3203             (r != '-') || (q != '-'))) {
3204         if (len + 5 >= size) {
3205             xmlChar *tmp;
3206
3207             size *= 2;
3208             tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3209             if (tmp == NULL) {
3210                 xmlFree(buf);
3211                 htmlErrMemory(ctxt, "growing buffer failed\n");
3212                 ctxt->instate = state;
3213                 return;
3214             }
3215             buf = tmp;
3216         }
3217         COPY_BUF(ql,buf,len,q);
3218         q = r;
3219         ql = rl;
3220         r = cur;
3221         rl = l;
3222         NEXTL(l);
3223         cur = CUR_CHAR(l);
3224         if (cur == 0) {
3225             SHRINK;
3226             GROW;
3227             cur = CUR_CHAR(l);
3228         }
3229     }
3230     buf[len] = 0;
3231     if (!IS_CHAR(cur)) {
3232         htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3233                      "Comment not terminated \n<!--%.50s\n", buf, NULL);
3234         xmlFree(buf);
3235     } else {
3236         NEXT;
3237         if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3238             (!ctxt->disableSAX))
3239             ctxt->sax->comment(ctxt->userData, buf);
3240         xmlFree(buf);
3241     }
3242     ctxt->instate = state;
3243 }
3244
3245 /**
3246  * htmlParseCharRef:
3247  * @ctxt:  an HTML parser context
3248  *
3249  * parse Reference declarations
3250  *
3251  * [66] CharRef ::= '&#' [0-9]+ ';' |
3252  *                  '&#x' [0-9a-fA-F]+ ';'
3253  *
3254  * Returns the value parsed (as an int)
3255  */
3256 int
3257 htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3258     int val = 0;
3259
3260     if ((ctxt == NULL) || (ctxt->input == NULL)) {
3261         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3262                      "htmlParseCharRef: context error\n",
3263                      NULL, NULL);
3264         return(0);
3265     }
3266     if ((CUR == '&') && (NXT(1) == '#') &&
3267         ((NXT(2) == 'x') || NXT(2) == 'X')) {
3268         SKIP(3);
3269         while (CUR != ';') {
3270             if ((CUR >= '0') && (CUR <= '9'))
3271                 val = val * 16 + (CUR - '0');
3272             else if ((CUR >= 'a') && (CUR <= 'f'))
3273                 val = val * 16 + (CUR - 'a') + 10;
3274             else if ((CUR >= 'A') && (CUR <= 'F'))
3275                 val = val * 16 + (CUR - 'A') + 10;
3276             else {
3277                 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3278                              "htmlParseCharRef: missing semicolumn\n",
3279                              NULL, NULL);
3280                 break;
3281             }
3282             NEXT;
3283         }
3284         if (CUR == ';')
3285             NEXT;
3286     } else if  ((CUR == '&') && (NXT(1) == '#')) {
3287         SKIP(2);
3288         while (CUR != ';') {
3289             if ((CUR >= '0') && (CUR <= '9'))
3290                 val = val * 10 + (CUR - '0');
3291             else {
3292                 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3293                              "htmlParseCharRef: missing semicolumn\n",
3294                              NULL, NULL);
3295                 break;
3296             }
3297             NEXT;
3298         }
3299         if (CUR == ';')
3300             NEXT;
3301     } else {
3302         htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3303                      "htmlParseCharRef: invalid value\n", NULL, NULL);
3304     }
3305     /*
3306      * Check the value IS_CHAR ...
3307      */
3308     if (IS_CHAR(val)) {
3309         return(val);
3310     } else {
3311         htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3312                         "htmlParseCharRef: invalid xmlChar value %d\n",
3313                         val);
3314     }
3315     return(0);
3316 }
3317
3318
3319 /**
3320  * htmlParseDocTypeDecl:
3321  * @ctxt:  an HTML parser context
3322  *
3323  * parse a DOCTYPE declaration
3324  *
3325  * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3326  *                      ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3327  */
3328
3329 static void
3330 htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3331     const xmlChar *name;
3332     xmlChar *ExternalID = NULL;
3333     xmlChar *URI = NULL;
3334
3335     /*
3336      * We know that '<!DOCTYPE' has been detected.
3337      */
3338     SKIP(9);
3339
3340     SKIP_BLANKS;
3341
3342     /*
3343      * Parse the DOCTYPE name.
3344      */
3345     name = htmlParseName(ctxt);
3346     if (name == NULL) {
3347         htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3348                      "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3349                      NULL, NULL);
3350     }
3351     /*
3352      * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3353      */
3354
3355     SKIP_BLANKS;
3356
3357     /*
3358      * Check for SystemID and ExternalID
3359      */
3360     URI = htmlParseExternalID(ctxt, &ExternalID);
3361     SKIP_BLANKS;
3362
3363     /*
3364      * We should be at the end of the DOCTYPE declaration.
3365      */
3366     if (CUR != '>') {
3367         htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3368                      "DOCTYPE improperly terminated\n", NULL, NULL);
3369         /* We shouldn't try to resynchronize ... */
3370     }
3371     NEXT;
3372
3373     /*
3374      * Create or update the document accordingly to the DOCTYPE
3375      */
3376     if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3377         (!ctxt->disableSAX))
3378         ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3379
3380     /*
3381      * Cleanup, since we don't use all those identifiers
3382      */
3383     if (URI != NULL) xmlFree(URI);
3384     if (ExternalID != NULL) xmlFree(ExternalID);
3385 }
3386
3387 /**
3388  * htmlParseAttribute:
3389  * @ctxt:  an HTML parser context
3390  * @value:  a xmlChar ** used to store the value of the attribute
3391  *
3392  * parse an attribute
3393  *
3394  * [41] Attribute ::= Name Eq AttValue
3395  *
3396  * [25] Eq ::= S? '=' S?
3397  *
3398  * With namespace:
3399  *
3400  * [NS 11] Attribute ::= QName Eq AttValue
3401  *
3402  * Also the case QName == xmlns:??? is handled independently as a namespace
3403  * definition.
3404  *
3405  * Returns the attribute name, and the value in *value.
3406  */
3407
3408 static const xmlChar *
3409 htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3410     const xmlChar *name;
3411     xmlChar *val = NULL;
3412
3413     *value = NULL;
3414     name = htmlParseHTMLName(ctxt);
3415     if (name == NULL) {
3416         htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3417                      "error parsing attribute name\n", NULL, NULL);
3418         return(NULL);
3419     }
3420
3421     /*
3422      * read the value
3423      */
3424     SKIP_BLANKS;
3425     if (CUR == '=') {
3426         NEXT;
3427         SKIP_BLANKS;
3428         val = htmlParseAttValue(ctxt);
3429     }
3430
3431     *value = val;
3432     return(name);
3433 }
3434
3435 /**
3436  * htmlCheckEncoding:
3437  * @ctxt:  an HTML parser context
3438  * @attvalue: the attribute value
3439  *
3440  * Checks an http-equiv attribute from a Meta tag to detect
3441  * the encoding
3442  * If a new encoding is detected the parser is switched to decode
3443  * it and pass UTF8
3444  */
3445 static void
3446 htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3447     const xmlChar *encoding;
3448
3449     if ((ctxt == NULL) || (attvalue == NULL))
3450         return;
3451
3452     /* do not change encoding */
3453     if (ctxt->input->encoding != NULL)
3454         return;
3455
3456     encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
3457     if (encoding != NULL) {
3458         encoding += 8;
3459     } else {
3460         encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
3461         if (encoding != NULL)
3462             encoding += 9;
3463     }
3464     if (encoding != NULL) {
3465         xmlCharEncoding enc;
3466         xmlCharEncodingHandlerPtr handler;
3467
3468         while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3469
3470         if (ctxt->input->encoding != NULL)
3471             xmlFree((xmlChar *) ctxt->input->encoding);
3472         ctxt->input->encoding = xmlStrdup(encoding);
3473
3474         enc = xmlParseCharEncoding((const char *) encoding);
3475         /*
3476          * registered set of known encodings
3477          */
3478         if (enc != XML_CHAR_ENCODING_ERROR) {
3479             if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
3480                  (enc == XML_CHAR_ENCODING_UTF16BE) ||
3481                  (enc == XML_CHAR_ENCODING_UCS4LE) ||
3482                  (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3483                 (ctxt->input->buf != NULL) &&
3484                 (ctxt->input->buf->encoder == NULL)) {
3485                 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3486                              "htmlCheckEncoding: wrong encoding meta\n",
3487                              NULL, NULL);
3488             } else {
3489                 xmlSwitchEncoding(ctxt, enc);
3490             }
3491             ctxt->charset = XML_CHAR_ENCODING_UTF8;
3492         } else {
3493             /*
3494              * fallback for unknown encodings
3495              */
3496             handler = xmlFindCharEncodingHandler((const char *) encoding);
3497             if (handler != NULL) {
3498                 xmlSwitchToEncoding(ctxt, handler);
3499                 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3500             } else {
3501                 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
3502             }
3503         }
3504
3505         if ((ctxt->input->buf != NULL) &&
3506             (ctxt->input->buf->encoder != NULL) &&
3507             (ctxt->input->buf->raw != NULL) &&
3508             (ctxt->input->buf->buffer != NULL)) {
3509             int nbchars;
3510             int processed;
3511
3512             /*
3513              * convert as much as possible to the parser reading buffer.
3514              */
3515             processed = ctxt->input->cur - ctxt->input->base;
3516             xmlBufferShrink(ctxt->input->buf->buffer, processed);
3517             nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
3518                                        ctxt->input->buf->buffer,
3519                                        ctxt->input->buf->raw);
3520             if (nbchars < 0) {
3521                 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3522                              "htmlCheckEncoding: encoder error\n",
3523                              NULL, NULL);
3524             }
3525             ctxt->input->base =
3526             ctxt->input->cur = ctxt->input->buf->buffer->content;
3527             ctxt->input->end =
3528                           &ctxt->input->base[ctxt->input->buf->buffer->use];
3529         }
3530     }
3531 }
3532
3533 /**
3534  * htmlCheckMeta:
3535  * @ctxt:  an HTML parser context
3536  * @atts:  the attributes values
3537  *
3538  * Checks an attributes from a Meta tag
3539  */
3540 static void
3541 htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3542     int i;
3543     const xmlChar *att, *value;
3544     int http = 0;
3545     const xmlChar *content = NULL;
3546
3547     if ((ctxt == NULL) || (atts == NULL))
3548         return;
3549
3550     i = 0;
3551     att = atts[i++];
3552     while (att != NULL) {
3553         value = atts[i++];
3554         if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3555          && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3556             http = 1;
3557         else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3558             content = value;
3559         att = atts[i++];
3560     }
3561     if ((http) && (content != NULL))
3562         htmlCheckEncoding(ctxt, content);
3563
3564 }
3565
3566 /**
3567  * htmlParseStartTag:
3568  * @ctxt:  an HTML parser context
3569  *
3570  * parse a start of tag either for rule element or
3571  * EmptyElement. In both case we don't parse the tag closing chars.
3572  *
3573  * [40] STag ::= '<' Name (S Attribute)* S? '>'
3574  *
3575  * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3576  *
3577  * With namespace:
3578  *
3579  * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3580  *
3581  * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3582  *
3583  * Returns 0 in case of success, -1 in case of error and 1 if discarded
3584  */
3585
3586 static int
3587 htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3588     const xmlChar *name;
3589     const xmlChar *attname;
3590     xmlChar *attvalue;
3591     const xmlChar **atts;
3592     int nbatts = 0;
3593     int maxatts;
3594     int meta = 0;
3595     int i;
3596     int discardtag = 0;
3597
3598     if (ctxt->instate == XML_PARSER_EOF)
3599         return(-1);
3600     if ((ctxt == NULL) || (ctxt->input == NULL)) {
3601         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3602                      "htmlParseStartTag: context error\n", NULL, NULL);
3603         return -1;
3604     }
3605     if (CUR != '<') return -1;
3606     NEXT;
3607
3608     atts = ctxt->atts;
3609     maxatts = ctxt->maxatts;
3610
3611     GROW;
3612     name = htmlParseHTMLName(ctxt);
3613     if (name == NULL) {
3614         htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3615                      "htmlParseStartTag: invalid element name\n",
3616                      NULL, NULL);
3617         /* Dump the bogus tag like browsers do */
3618         while ((IS_CHAR_CH(CUR)) && (CUR != '>') &&
3619                (ctxt->instate != XML_PARSER_EOF))
3620             NEXT;
3621         return -1;
3622     }
3623     if (xmlStrEqual(name, BAD_CAST"meta"))
3624         meta = 1;
3625
3626     /*
3627      * Check for auto-closure of HTML elements.
3628      */
3629     htmlAutoClose(ctxt, name);
3630
3631     /*
3632      * Check for implied HTML elements.
3633      */
3634     htmlCheckImplied(ctxt, name);
3635
3636     /*
3637      * Avoid html at any level > 0, head at any level != 1
3638      * or any attempt to recurse body
3639      */
3640     if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3641         htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3642                      "htmlParseStartTag: misplaced <html> tag\n",
3643                      name, NULL);
3644         discardtag = 1;
3645         ctxt->depth++;
3646     }
3647     if ((ctxt->nameNr != 1) &&
3648         (xmlStrEqual(name, BAD_CAST"head"))) {
3649         htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3650                      "htmlParseStartTag: misplaced <head> tag\n",
3651                      name, NULL);
3652         discardtag = 1;
3653         ctxt->depth++;
3654     }
3655     if (xmlStrEqual(name, BAD_CAST"body")) {
3656         int indx;
3657         for (indx = 0;indx < ctxt->nameNr;indx++) {
3658             if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
3659                 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3660                              "htmlParseStartTag: misplaced <body> tag\n",
3661                              name, NULL);
3662                 discardtag = 1;
3663                 ctxt->depth++;
3664             }
3665         }
3666     }
3667
3668     /*
3669      * Now parse the attributes, it ends up with the ending
3670      *
3671      * (S Attribute)* S?
3672      */
3673     SKIP_BLANKS;
3674     while ((IS_CHAR_CH(CUR)) &&
3675            (CUR != '>') &&
3676            ((CUR != '/') || (NXT(1) != '>'))) {
3677         long cons = ctxt->nbChars;
3678
3679         GROW;
3680         attname = htmlParseAttribute(ctxt, &attvalue);
3681         if (attname != NULL) {
3682
3683             /*
3684              * Well formedness requires at most one declaration of an attribute
3685              */
3686             for (i = 0; i < nbatts;i += 2) {
3687                 if (xmlStrEqual(atts[i], attname)) {
3688                     htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3689                                  "Attribute %s redefined\n", attname, NULL);
3690                     if (attvalue != NULL)
3691                         xmlFree(attvalue);
3692                     goto failed;
3693                 }
3694             }
3695
3696             /*
3697              * Add the pair to atts
3698              */
3699             if (atts == NULL) {
3700                 maxatts = 22; /* allow for 10 attrs by default */
3701                 atts = (const xmlChar **)
3702                        xmlMalloc(maxatts * sizeof(xmlChar *));
3703                 if (atts == NULL) {
3704                     htmlErrMemory(ctxt, NULL);
3705                     if (attvalue != NULL)
3706                         xmlFree(attvalue);
3707                     goto failed;
3708                 }
3709                 ctxt->atts = atts;
3710                 ctxt->maxatts = maxatts;
3711             } else if (nbatts + 4 > maxatts) {
3712                 const xmlChar **n;
3713
3714                 maxatts *= 2;
3715                 n = (const xmlChar **) xmlRealloc((void *) atts,
3716                                              maxatts * sizeof(const xmlChar *));
3717                 if (n == NULL) {
3718                     htmlErrMemory(ctxt, NULL);
3719                     if (attvalue != NULL)
3720                         xmlFree(attvalue);
3721                     goto failed;
3722                 }
3723                 atts = n;
3724                 ctxt->atts = atts;
3725                 ctxt->maxatts = maxatts;
3726             }
3727             atts[nbatts++] = attname;
3728             atts[nbatts++] = attvalue;
3729             atts[nbatts] = NULL;
3730             atts[nbatts + 1] = NULL;
3731         }
3732         else {
3733             if (attvalue != NULL)
3734                 xmlFree(attvalue);
3735             /* Dump the bogus attribute string up to the next blank or
3736              * the end of the tag. */
3737             while ((IS_CHAR_CH(CUR)) &&
3738                    !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
3739                    ((CUR != '/') || (NXT(1) != '>')))
3740                 NEXT;
3741         }
3742
3743 failed:
3744         SKIP_BLANKS;
3745         if (cons == ctxt->nbChars) {
3746             htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3747                          "htmlParseStartTag: problem parsing attributes\n",
3748                          NULL, NULL);
3749             break;
3750         }
3751     }
3752
3753     /*
3754      * Handle specific association to the META tag
3755      */
3756     if (meta && (nbatts != 0))
3757         htmlCheckMeta(ctxt, atts);
3758
3759     /*
3760      * SAX: Start of Element !
3761      */
3762     if (!discardtag) {
3763         htmlnamePush(ctxt, name);
3764         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3765             if (nbatts != 0)
3766                 ctxt->sax->startElement(ctxt->userData, name, atts);
3767             else
3768                 ctxt->sax->startElement(ctxt->userData, name, NULL);
3769         }
3770     }
3771
3772     if (atts != NULL) {
3773         for (i = 1;i < nbatts;i += 2) {
3774             if (atts[i] != NULL)
3775                 xmlFree((xmlChar *) atts[i]);
3776         }
3777     }
3778
3779     return(discardtag);
3780 }
3781
3782 /**
3783  * htmlParseEndTag:
3784  * @ctxt:  an HTML parser context
3785  *
3786  * parse an end of tag
3787  *
3788  * [42] ETag ::= '</' Name S? '>'
3789  *
3790  * With namespace
3791  *
3792  * [NS 9] ETag ::= '</' QName S? '>'
3793  *
3794  * Returns 1 if the current level should be closed.
3795  */
3796
3797 static int
3798 htmlParseEndTag(htmlParserCtxtPtr ctxt)
3799 {
3800     const xmlChar *name;
3801     const xmlChar *oldname;
3802     int i, ret;
3803
3804     if ((CUR != '<') || (NXT(1) != '/')) {
3805         htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3806                      "htmlParseEndTag: '</' not found\n", NULL, NULL);
3807         return (0);
3808     }
3809     SKIP(2);
3810
3811     name = htmlParseHTMLName(ctxt);
3812     if (name == NULL)
3813         return (0);
3814     /*
3815      * We should definitely be at the ending "S? '>'" part
3816      */
3817     SKIP_BLANKS;
3818     if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
3819         htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3820                      "End tag : expected '>'\n", NULL, NULL);
3821         if (ctxt->recovery) {
3822             /*
3823              * We're not at the ending > !!
3824              * Error, unless in recover mode where we search forwards
3825              * until we find a >
3826              */
3827             while (CUR != '\0' && CUR != '>') NEXT;
3828             NEXT;
3829         }
3830     } else
3831         NEXT;
3832
3833     /*
3834      * if we ignored misplaced tags in htmlParseStartTag don't pop them
3835      * out now.
3836      */
3837     if ((ctxt->depth > 0) &&
3838         (xmlStrEqual(name, BAD_CAST "html") ||
3839          xmlStrEqual(name, BAD_CAST "body") ||
3840          xmlStrEqual(name, BAD_CAST "head"))) {
3841         ctxt->depth--;
3842         return (0);
3843     }
3844
3845     /*
3846      * If the name read is not one of the element in the parsing stack
3847      * then return, it's just an error.
3848      */
3849     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
3850         if (xmlStrEqual(name, ctxt->nameTab[i]))
3851             break;
3852     }
3853     if (i < 0) {
3854         htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3855                      "Unexpected end tag : %s\n", name, NULL);
3856         return (0);
3857     }
3858
3859
3860     /*
3861      * Check for auto-closure of HTML elements.
3862      */
3863
3864     htmlAutoCloseOnClose(ctxt, name);
3865
3866     /*
3867      * Well formedness constraints, opening and closing must match.
3868      * With the exception that the autoclose may have popped stuff out
3869      * of the stack.
3870      */
3871     if (!xmlStrEqual(name, ctxt->name)) {
3872         if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
3873             htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3874                          "Opening and ending tag mismatch: %s and %s\n",
3875                          name, ctxt->name);
3876         }
3877     }
3878
3879     /*
3880      * SAX: End of Tag
3881      */
3882     oldname = ctxt->name;
3883     if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
3884         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3885             ctxt->sax->endElement(ctxt->userData, name);
3886         htmlnamePop(ctxt);
3887         ret = 1;
3888     } else {
3889         ret = 0;
3890     }
3891
3892     return (ret);
3893 }
3894
3895
3896 /**
3897  * htmlParseReference:
3898  * @ctxt:  an HTML parser context
3899  *
3900  * parse and handle entity references in content,
3901  * this will end-up in a call to character() since this is either a
3902  * CharRef, or a predefined entity.
3903  */
3904 static void
3905 htmlParseReference(htmlParserCtxtPtr ctxt) {
3906     const htmlEntityDesc * ent;
3907     xmlChar out[6];
3908     const xmlChar *name;
3909     if (CUR != '&') return;
3910
3911     if (NXT(1) == '#') {
3912         unsigned int c;
3913         int bits, i = 0;
3914
3915         c = htmlParseCharRef(ctxt);
3916         if (c == 0)
3917             return;
3918
3919         if      (c <    0x80) { out[i++]= c;                bits= -6; }
3920         else if (c <   0x800) { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
3921         else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
3922         else                  { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
3923
3924         for ( ; bits >= 0; bits-= 6) {
3925             out[i++]= ((c >> bits) & 0x3F) | 0x80;
3926         }
3927         out[i] = 0;
3928
3929         htmlCheckParagraph(ctxt);
3930         if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3931             ctxt->sax->characters(ctxt->userData, out, i);
3932     } else {
3933         ent = htmlParseEntityRef(ctxt, &name);
3934         if (name == NULL) {
3935             htmlCheckParagraph(ctxt);
3936             if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3937                 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3938             return;
3939         }
3940         if ((ent == NULL) || !(ent->value > 0)) {
3941             htmlCheckParagraph(ctxt);
3942             if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3943                 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3944                 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3945                 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3946             }
3947         } else {
3948             unsigned int c;
3949             int bits, i = 0;
3950
3951             c = ent->value;
3952             if      (c <    0x80)
3953                     { out[i++]= c;                bits= -6; }
3954             else if (c <   0x800)
3955                     { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
3956             else if (c < 0x10000)
3957                     { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
3958             else
3959                     { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
3960
3961             for ( ; bits >= 0; bits-= 6) {
3962                 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3963             }
3964             out[i] = 0;
3965
3966             htmlCheckParagraph(ctxt);
3967             if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3968                 ctxt->sax->characters(ctxt->userData, out, i);
3969         }
3970     }
3971 }
3972
3973 /**
3974  * htmlParseContent:
3975  * @ctxt:  an HTML parser context
3976  *
3977  * Parse a content: comment, sub-element, reference or text.
3978  * Kept for compatibility with old code
3979  */
3980
3981 static void
3982 htmlParseContent(htmlParserCtxtPtr ctxt) {
3983     xmlChar *currentNode;
3984     int depth;
3985     const xmlChar *name;
3986
3987     currentNode = xmlStrdup(ctxt->name);
3988     depth = ctxt->nameNr;
3989     while (1) {
3990         long cons = ctxt->nbChars;
3991
3992         GROW;
3993
3994         if (ctxt->instate == XML_PARSER_EOF)
3995             break;
3996
3997         /*
3998          * Our tag or one of it's parent or children is ending.
3999          */
4000         if ((CUR == '<') && (NXT(1) == '/')) {
4001             if (htmlParseEndTag(ctxt) &&
4002                 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4003                 if (currentNode != NULL)
4004                     xmlFree(currentNode);
4005                 return;
4006             }
4007             continue; /* while */
4008         }
4009
4010         else if ((CUR == '<') &&
4011                  ((IS_ASCII_LETTER(NXT(1))) ||
4012                   (NXT(1) == '_') || (NXT(1) == ':'))) {
4013             name = htmlParseHTMLName_nonInvasive(ctxt);
4014             if (name == NULL) {
4015                 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4016                          "htmlParseStartTag: invalid element name\n",
4017                          NULL, NULL);
4018                 /* Dump the bogus tag like browsers do */
4019         while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
4020                     NEXT;
4021
4022                 if (currentNode != NULL)
4023                     xmlFree(currentNode);
4024                 return;
4025             }
4026
4027             if (ctxt->name != NULL) {
4028                 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4029                     htmlAutoClose(ctxt, name);
4030                     continue;
4031                 }
4032             }
4033         }
4034
4035         /*
4036          * Has this node been popped out during parsing of
4037          * the next element
4038          */
4039         if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4040             (!xmlStrEqual(currentNode, ctxt->name)))
4041              {
4042             if (currentNode != NULL) xmlFree(currentNode);
4043             return;
4044         }
4045
4046         if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4047             (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4048             /*
4049              * Handle SCRIPT/STYLE separately
4050              */
4051             htmlParseScript(ctxt);
4052         } else {
4053             /*
4054              * Sometimes DOCTYPE arrives in the middle of the document
4055              */
4056             if ((CUR == '<') && (NXT(1) == '!') &&
4057                 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4058                 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4059                 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4060                 (UPP(8) == 'E')) {
4061                 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4062                              "Misplaced DOCTYPE declaration\n",
4063                              BAD_CAST "DOCTYPE" , NULL);
4064                 htmlParseDocTypeDecl(ctxt);
4065             }
4066
4067             /*
4068              * First case :  a comment
4069              */
4070             if ((CUR == '<') && (NXT(1) == '!') &&
4071                 (NXT(2) == '-') && (NXT(3) == '-')) {
4072                 htmlParseComment(ctxt);
4073             }
4074
4075             /*
4076              * Second case : a Processing Instruction.
4077              */
4078             else if ((CUR == '<') && (NXT(1) == '?')) {
4079                 htmlParsePI(ctxt);
4080             }
4081
4082             /*
4083              * Third case :  a sub-element.
4084              */
4085             else if (CUR == '<') {
4086                 htmlParseElement(ctxt);
4087             }
4088
4089             /*
4090              * Fourth case : a reference. If if has not been resolved,
4091              *    parsing returns it's Name, create the node
4092              */
4093             else if (CUR == '&') {
4094                 htmlParseReference(ctxt);
4095             }
4096
4097             /*
4098              * Fifth case : end of the resource
4099              */
4100             else if (CUR == 0) {
4101                 htmlAutoCloseOnEnd(ctxt);
4102                 break;
4103             }
4104
4105             /*
4106              * Last case, text. Note that References are handled directly.
4107              */
4108             else {
4109                 htmlParseCharData(ctxt);
4110             }
4111
4112             if (cons == ctxt->nbChars) {
4113                 if (ctxt->node != NULL) {
4114                     htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4115                                  "detected an error in element content\n",
4116                                  NULL, NULL);
4117                 }
4118                 break;
4119             }
4120         }
4121         GROW;
4122     }
4123     if (currentNode != NULL) xmlFree(currentNode);
4124 }
4125
4126 /**
4127  * htmlParseElement:
4128  * @ctxt:  an HTML parser context
4129  *
4130  * parse an HTML element, this is highly recursive
4131  * this is kept for compatibility with previous code versions
4132  *
4133  * [39] element ::= EmptyElemTag | STag content ETag
4134  *
4135  * [41] Attribute ::= Name Eq AttValue
4136  */
4137
4138 void
4139 htmlParseElement(htmlParserCtxtPtr ctxt) {
4140     const xmlChar *name;
4141     xmlChar *currentNode = NULL;
4142     const htmlElemDesc * info;
4143     htmlParserNodeInfo node_info;
4144     int failed;
4145     int depth;
4146     const xmlChar *oldptr;
4147
4148     if ((ctxt == NULL) || (ctxt->input == NULL)) {
4149         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4150                      "htmlParseElement: context error\n", NULL, NULL);
4151         return;
4152     }
4153
4154     if (ctxt->instate == XML_PARSER_EOF)
4155         return;
4156
4157     /* Capture start position */
4158     if (ctxt->record_info) {
4159         node_info.begin_pos = ctxt->input->consumed +
4160                           (CUR_PTR - ctxt->input->base);
4161         node_info.begin_line = ctxt->input->line;
4162     }
4163
4164     failed = htmlParseStartTag(ctxt);
4165     name = ctxt->name;
4166     if ((failed == -1) || (name == NULL)) {
4167         if (CUR == '>')
4168             NEXT;
4169         return;
4170     }
4171
4172     /*
4173      * Lookup the info for that element.
4174      */
4175     info = htmlTagLookup(name);
4176     if (info == NULL) {
4177         htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4178                      "Tag %s invalid\n", name, NULL);
4179     }
4180
4181     /*
4182      * Check for an Empty Element labeled the XML/SGML way
4183      */
4184     if ((CUR == '/') && (NXT(1) == '>')) {
4185         SKIP(2);
4186         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4187             ctxt->sax->endElement(ctxt->userData, name);
4188         htmlnamePop(ctxt);
4189         return;
4190     }
4191
4192     if (CUR == '>') {
4193         NEXT;
4194     } else {
4195         htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4196                      "Couldn't find end of Start Tag %s\n", name, NULL);
4197
4198         /*
4199          * end of parsing of this node.
4200          */
4201         if (xmlStrEqual(name, ctxt->name)) {
4202             nodePop(ctxt);
4203             htmlnamePop(ctxt);
4204         }
4205
4206         /*
4207          * Capture end position and add node
4208          */
4209         if (ctxt->record_info) {
4210            node_info.end_pos = ctxt->input->consumed +
4211                               (CUR_PTR - ctxt->input->base);
4212            node_info.end_line = ctxt->input->line;
4213            node_info.node = ctxt->node;
4214            xmlParserAddNodeInfo(ctxt, &node_info);
4215         }
4216         return;
4217     }
4218
4219     /*
4220      * Check for an Empty Element from DTD definition
4221      */
4222     if ((info != NULL) && (info->empty)) {
4223         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4224             ctxt->sax->endElement(ctxt->userData, name);
4225         htmlnamePop(ctxt);
4226         return;
4227     }
4228
4229     /*
4230      * Parse the content of the element:
4231      */
4232     currentNode = xmlStrdup(ctxt->name);
4233     depth = ctxt->nameNr;
4234     while (IS_CHAR_CH(CUR)) {
4235         oldptr = ctxt->input->cur;
4236         htmlParseContent(ctxt);
4237         if (oldptr==ctxt->input->cur) break;
4238         if (ctxt->nameNr < depth) break;
4239     }
4240
4241     /*
4242      * Capture end position and add node
4243      */
4244     if ( currentNode != NULL && ctxt->record_info ) {
4245        node_info.end_pos = ctxt->input->consumed +
4246                           (CUR_PTR - ctxt->input->base);
4247        node_info.end_line = ctxt->input->line;
4248        node_info.node = ctxt->node;
4249        xmlParserAddNodeInfo(ctxt, &node_info);
4250     }
4251     if (!IS_CHAR_CH(CUR)) {
4252         htmlAutoCloseOnEnd(ctxt);
4253     }
4254
4255     if (currentNode != NULL)
4256         xmlFree(currentNode);
4257 }
4258
4259 static void
4260 htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
4261     /*
4262      * Capture end position and add node
4263      */
4264     if ( ctxt->node != NULL && ctxt->record_info ) {
4265        ctxt->nodeInfo->end_pos = ctxt->input->consumed +
4266                                 (CUR_PTR - ctxt->input->base);
4267        ctxt->nodeInfo->end_line = ctxt->input->line;
4268        ctxt->nodeInfo->node = ctxt->node;
4269        xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
4270        htmlNodeInfoPop(ctxt);
4271     }
4272     if (!IS_CHAR_CH(CUR)) {
4273        htmlAutoCloseOnEnd(ctxt);
4274     }
4275 }
4276
4277 /**
4278  * htmlParseElementInternal:
4279  * @ctxt:  an HTML parser context
4280  *
4281  * parse an HTML element, new version, non recursive
4282  *
4283  * [39] element ::= EmptyElemTag | STag content ETag
4284  *
4285  * [41] Attribute ::= Name Eq AttValue
4286  */
4287
4288 static void
4289 htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4290     const xmlChar *name;
4291     const htmlElemDesc * info;
4292     htmlParserNodeInfo node_info;
4293     int failed;
4294
4295     if ((ctxt == NULL) || (ctxt->input == NULL)) {
4296         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4297                      "htmlParseElementInternal: context error\n", NULL, NULL);
4298         return;
4299     }
4300
4301     if (ctxt->instate == XML_PARSER_EOF)
4302         return;
4303
4304     /* Capture start position */
4305     if (ctxt->record_info) {
4306         node_info.begin_pos = ctxt->input->consumed +
4307                           (CUR_PTR - ctxt->input->base);
4308         node_info.begin_line = ctxt->input->line;
4309     }
4310
4311     failed = htmlParseStartTag(ctxt);
4312     name = ctxt->name;
4313     if ((failed == -1) || (name == NULL)) {
4314         if (CUR == '>')
4315             NEXT;
4316         return;
4317     }
4318
4319     /*
4320      * Lookup the info for that element.
4321      */
4322     info = htmlTagLookup(name);
4323     if (info == NULL) {
4324         htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4325                      "Tag %s invalid\n", name, NULL);
4326     }
4327
4328     /*
4329      * Check for an Empty Element labeled the XML/SGML way
4330      */
4331     if ((CUR == '/') && (NXT(1) == '>')) {
4332         SKIP(2);
4333         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4334             ctxt->sax->endElement(ctxt->userData, name);
4335         htmlnamePop(ctxt);
4336         return;
4337     }
4338
4339     if (CUR == '>') {
4340         NEXT;
4341     } else {
4342         htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4343                      "Couldn't find end of Start Tag %s\n", name, NULL);
4344
4345         /*
4346          * end of parsing of this node.
4347          */
4348         if (xmlStrEqual(name, ctxt->name)) {
4349             nodePop(ctxt);
4350             htmlnamePop(ctxt);
4351         }
4352
4353         if (ctxt->record_info)
4354             htmlNodeInfoPush(ctxt, &node_info);
4355         htmlParserFinishElementParsing(ctxt);
4356         return;
4357     }
4358
4359     /*
4360      * Check for an Empty Element from DTD definition
4361      */
4362     if ((info != NULL) && (info->empty)) {
4363         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4364             ctxt->sax->endElement(ctxt->userData, name);
4365         htmlnamePop(ctxt);
4366         return;
4367     }
4368
4369     if (ctxt->record_info)
4370         htmlNodeInfoPush(ctxt, &node_info);
4371 }
4372
4373 /**
4374  * htmlParseContentInternal:
4375  * @ctxt:  an HTML parser context
4376  *
4377  * Parse a content: comment, sub-element, reference or text.
4378  * New version for non recursive htmlParseElementInternal
4379  */
4380
4381 static void
4382 htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
4383     xmlChar *currentNode;
4384     int depth;
4385     const xmlChar *name;
4386
4387     currentNode = xmlStrdup(ctxt->name);
4388     depth = ctxt->nameNr;
4389     while (1) {
4390         long cons = ctxt->nbChars;
4391
4392         GROW;
4393
4394         if (ctxt->instate == XML_PARSER_EOF)
4395             break;
4396
4397         /*
4398          * Our tag or one of it's parent or children is ending.
4399          */
4400         if ((CUR == '<') && (NXT(1) == '/')) {
4401             if (htmlParseEndTag(ctxt) &&
4402                 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4403                 if (currentNode != NULL)
4404                     xmlFree(currentNode);
4405
4406                 currentNode = xmlStrdup(ctxt->name);
4407                 depth = ctxt->nameNr;
4408             }
4409             continue; /* while */
4410         }
4411
4412         else if ((CUR == '<') &&
4413                  ((IS_ASCII_LETTER(NXT(1))) ||
4414                   (NXT(1) == '_') || (NXT(1) == ':'))) {
4415             name = htmlParseHTMLName_nonInvasive(ctxt);
4416             if (name == NULL) {
4417                 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4418                          "htmlParseStartTag: invalid element name\n",
4419                          NULL, NULL);
4420                 /* Dump the bogus tag like browsers do */
4421                 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
4422                     NEXT;
4423
4424                 htmlParserFinishElementParsing(ctxt);
4425                 if (currentNode != NULL)
4426                     xmlFree(currentNode);
4427
4428                 currentNode = xmlStrdup(ctxt->name);
4429                 depth = ctxt->nameNr;
4430                 continue;
4431             }
4432
4433             if (ctxt->name != NULL) {
4434                 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4435                     htmlAutoClose(ctxt, name);
4436                     continue;
4437                 }
4438             }
4439         }
4440
4441         /*
4442          * Has this node been popped out during parsing of
4443          * the next element
4444          */
4445         if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4446             (!xmlStrEqual(currentNode, ctxt->name)))
4447              {
4448             htmlParserFinishElementParsing(ctxt);
4449             if (currentNode != NULL) xmlFree(currentNode);
4450
4451             currentNode = xmlStrdup(ctxt->name);
4452             depth = ctxt->nameNr;
4453             continue;
4454         }
4455
4456         if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4457             (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4458             /*
4459              * Handle SCRIPT/STYLE separately
4460              */
4461             htmlParseScript(ctxt);
4462         } else {
4463             /*
4464              * Sometimes DOCTYPE arrives in the middle of the document
4465              */
4466             if ((CUR == '<') && (NXT(1) == '!') &&
4467                 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4468                 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4469                 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4470                 (UPP(8) == 'E')) {
4471                 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4472                              "Misplaced DOCTYPE declaration\n",
4473                              BAD_CAST "DOCTYPE" , NULL);
4474                 htmlParseDocTypeDecl(ctxt);
4475             }
4476
4477             /*
4478              * First case :  a comment
4479              */
4480             if ((CUR == '<') && (NXT(1) == '!') &&
4481                 (NXT(2) == '-') && (NXT(3) == '-')) {
4482                 htmlParseComment(ctxt);
4483             }
4484
4485             /*
4486              * Second case : a Processing Instruction.
4487              */
4488             else if ((CUR == '<') && (NXT(1) == '?')) {
4489                 htmlParsePI(ctxt);
4490             }
4491
4492             /*
4493              * Third case :  a sub-element.
4494              */
4495             else if (CUR == '<') {
4496                 htmlParseElementInternal(ctxt);
4497                 if (currentNode != NULL) xmlFree(currentNode);
4498
4499                 currentNode = xmlStrdup(ctxt->name);
4500                 depth = ctxt->nameNr;
4501             }
4502
4503             /*
4504              * Fourth case : a reference. If if has not been resolved,
4505              *    parsing returns it's Name, create the node
4506              */
4507             else if (CUR == '&') {
4508                 htmlParseReference(ctxt);
4509             }
4510
4511             /*
4512              * Fifth case : end of the resource
4513              */
4514             else if (CUR == 0) {
4515                 htmlAutoCloseOnEnd(ctxt);
4516                 break;
4517             }
4518
4519             /*
4520              * Last case, text. Note that References are handled directly.
4521              */
4522             else {
4523                 htmlParseCharData(ctxt);
4524             }
4525
4526             if (cons == ctxt->nbChars) {
4527                 if (ctxt->node != NULL) {
4528                     htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4529                                  "detected an error in element content\n",
4530                                  NULL, NULL);
4531                 }
4532                 break;
4533             }
4534         }
4535         GROW;
4536     }
4537     if (currentNode != NULL) xmlFree(currentNode);
4538 }
4539
4540 /**
4541  * htmlParseContent:
4542  * @ctxt:  an HTML parser context
4543  *
4544  * Parse a content: comment, sub-element, reference or text.
4545  * This is the entry point when called from parser.c
4546  */
4547
4548 void
4549 __htmlParseContent(void *ctxt) {
4550     if (ctxt != NULL)
4551         htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
4552 }
4553
4554 /**
4555  * htmlParseDocument:
4556  * @ctxt:  an HTML parser context
4557  *
4558  * parse an HTML document (and build a tree if using the standard SAX
4559  * interface).
4560  *
4561  * Returns 0, -1 in case of error. the parser context is augmented
4562  *                as a result of the parsing.
4563  */
4564
4565 int
4566 htmlParseDocument(htmlParserCtxtPtr ctxt) {
4567     xmlChar start[4];
4568     xmlCharEncoding enc;
4569     xmlDtdPtr dtd;
4570
4571     xmlInitParser();
4572
4573     htmlDefaultSAXHandlerInit();
4574
4575     if ((ctxt == NULL) || (ctxt->input == NULL)) {
4576         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4577                      "htmlParseDocument: context error\n", NULL, NULL);
4578         return(XML_ERR_INTERNAL_ERROR);
4579     }
4580     ctxt->html = 1;
4581     ctxt->linenumbers = 1;
4582     GROW;
4583     /*
4584      * SAX: beginning of the document processing.
4585      */
4586     if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4587         ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4588
4589     if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
4590         ((ctxt->input->end - ctxt->input->cur) >= 4)) {
4591         /*
4592          * Get the 4 first bytes and decode the charset
4593          * if enc != XML_CHAR_ENCODING_NONE
4594          * plug some encoding conversion routines.
4595          */
4596         start[0] = RAW;
4597         start[1] = NXT(1);
4598         start[2] = NXT(2);
4599         start[3] = NXT(3);
4600         enc = xmlDetectCharEncoding(&start[0], 4);
4601         if (enc != XML_CHAR_ENCODING_NONE) {
4602             xmlSwitchEncoding(ctxt, enc);
4603         }
4604     }
4605
4606     /*
4607      * Wipe out everything which is before the first '<'
4608      */
4609     SKIP_BLANKS;
4610     if (CUR == 0) {
4611         htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4612                      "Document is empty\n", NULL, NULL);
4613     }
4614
4615     if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4616         ctxt->sax->startDocument(ctxt->userData);
4617
4618
4619     /*
4620      * Parse possible comments and PIs before any content
4621      */
4622     while (((CUR == '<') && (NXT(1) == '!') &&
4623             (NXT(2) == '-') && (NXT(3) == '-')) ||
4624            ((CUR == '<') && (NXT(1) == '?'))) {
4625         htmlParseComment(ctxt);
4626         htmlParsePI(ctxt);
4627         SKIP_BLANKS;
4628     }
4629
4630
4631     /*
4632      * Then possibly doc type declaration(s) and more Misc
4633      * (doctypedecl Misc*)?
4634      */
4635     if ((CUR == '<') && (NXT(1) == '!') &&
4636         (UPP(2) == 'D') && (UPP(3) == 'O') &&
4637         (UPP(4) == 'C') && (UPP(5) == 'T') &&
4638         (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4639         (UPP(8) == 'E')) {
4640         htmlParseDocTypeDecl(ctxt);
4641     }
4642     SKIP_BLANKS;
4643
4644     /*
4645      * Parse possible comments and PIs before any content
4646      */
4647     while (((CUR == '<') && (NXT(1) == '!') &&
4648             (NXT(2) == '-') && (NXT(3) == '-')) ||
4649            ((CUR == '<') && (NXT(1) == '?'))) {
4650         htmlParseComment(ctxt);
4651         htmlParsePI(ctxt);
4652         SKIP_BLANKS;
4653     }
4654
4655     /*
4656      * Time to start parsing the tree itself
4657      */
4658     htmlParseContentInternal(ctxt);
4659
4660     /*
4661      * autoclose
4662      */
4663     if (CUR == 0)
4664         htmlAutoCloseOnEnd(ctxt);
4665
4666
4667     /*
4668      * SAX: end of the document processing.
4669      */
4670     if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4671         ctxt->sax->endDocument(ctxt->userData);
4672
4673     if (ctxt->myDoc != NULL) {
4674         dtd = xmlGetIntSubset(ctxt->myDoc);
4675         if (dtd == NULL)
4676             ctxt->myDoc->intSubset =
4677                 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
4678                     BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4679                     BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4680     }
4681     if (! ctxt->wellFormed) return(-1);
4682     return(0);
4683 }
4684
4685
4686 /************************************************************************
4687  *                                                                      *
4688  *                      Parser contexts handling                        *
4689  *                                                                      *
4690  ************************************************************************/
4691
4692 /**
4693  * htmlInitParserCtxt:
4694  * @ctxt:  an HTML parser context
4695  *
4696  * Initialize a parser context
4697  *
4698  * Returns 0 in case of success and -1 in case of error
4699  */
4700
4701 static int
4702 htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4703 {
4704     htmlSAXHandler *sax;
4705
4706     if (ctxt == NULL) return(-1);
4707     memset(ctxt, 0, sizeof(htmlParserCtxt));
4708
4709     ctxt->dict = xmlDictCreate();
4710     if (ctxt->dict == NULL) {
4711         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4712         return(-1);
4713     }
4714     sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4715     if (sax == NULL) {
4716         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4717         return(-1);
4718     }
4719     else
4720         memset(sax, 0, sizeof(htmlSAXHandler));
4721
4722     /* Allocate the Input stack */
4723     ctxt->inputTab = (htmlParserInputPtr *)
4724                       xmlMalloc(5 * sizeof(htmlParserInputPtr));
4725     if (ctxt->inputTab == NULL) {
4726         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4727         ctxt->inputNr = 0;
4728         ctxt->inputMax = 0;
4729         ctxt->input = NULL;
4730         return(-1);
4731     }
4732     ctxt->inputNr = 0;
4733     ctxt->inputMax = 5;
4734     ctxt->input = NULL;
4735     ctxt->version = NULL;
4736     ctxt->encoding = NULL;
4737     ctxt->standalone = -1;
4738     ctxt->instate = XML_PARSER_START;
4739
4740     /* Allocate the Node stack */
4741     ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4742     if (ctxt->nodeTab == NULL) {
4743         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4744         ctxt->nodeNr = 0;
4745         ctxt->nodeMax = 0;
4746         ctxt->node = NULL;
4747         ctxt->inputNr = 0;
4748         ctxt->inputMax = 0;
4749         ctxt->input = NULL;
4750         return(-1);
4751     }
4752     ctxt->nodeNr = 0;
4753     ctxt->nodeMax = 10;
4754     ctxt->node = NULL;
4755
4756     /* Allocate the Name stack */
4757     ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
4758     if (ctxt->nameTab == NULL) {
4759         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4760         ctxt->nameNr = 0;
4761         ctxt->nameMax = 0;
4762         ctxt->name = NULL;
4763         ctxt->nodeNr = 0;
4764         ctxt->nodeMax = 0;
4765         ctxt->node = NULL;
4766         ctxt->inputNr = 0;
4767         ctxt->inputMax = 0;
4768         ctxt->input = NULL;
4769         return(-1);
4770     }
4771     ctxt->nameNr = 0;
4772     ctxt->nameMax = 10;
4773     ctxt->name = NULL;
4774
4775     ctxt->nodeInfoTab = NULL;
4776     ctxt->nodeInfoNr  = 0;
4777     ctxt->nodeInfoMax = 0;
4778
4779     if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
4780     else {
4781         ctxt->sax = sax;
4782         memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
4783     }
4784     ctxt->userData = ctxt;
4785     ctxt->myDoc = NULL;
4786     ctxt->wellFormed = 1;
4787     ctxt->replaceEntities = 0;
4788     ctxt->linenumbers = xmlLineNumbersDefaultValue;
4789     ctxt->html = 1;
4790     ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
4791     ctxt->vctxt.userData = ctxt;
4792     ctxt->vctxt.error = xmlParserValidityError;
4793     ctxt->vctxt.warning = xmlParserValidityWarning;
4794     ctxt->record_info = 0;
4795     ctxt->validate = 0;
4796     ctxt->nbChars = 0;
4797     ctxt->checkIndex = 0;
4798     ctxt->catalogs = NULL;
4799     xmlInitNodeInfoSeq(&ctxt->node_seq);
4800     return(0);
4801 }
4802
4803 /**
4804  * htmlFreeParserCtxt:
4805  * @ctxt:  an HTML parser context
4806  *
4807  * Free all the memory used by a parser context. However the parsed
4808  * document in ctxt->myDoc is not freed.
4809  */
4810
4811 void
4812 htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4813 {
4814     xmlFreeParserCtxt(ctxt);
4815 }
4816
4817 /**
4818  * htmlNewParserCtxt:
4819  *
4820  * Allocate and initialize a new parser context.
4821  *
4822  * Returns the htmlParserCtxtPtr or NULL in case of allocation error
4823  */
4824
4825 htmlParserCtxtPtr
4826 htmlNewParserCtxt(void)
4827 {
4828     xmlParserCtxtPtr ctxt;
4829
4830     ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4831     if (ctxt == NULL) {
4832         htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
4833         return(NULL);
4834     }
4835     memset(ctxt, 0, sizeof(xmlParserCtxt));
4836     if (htmlInitParserCtxt(ctxt) < 0) {
4837         htmlFreeParserCtxt(ctxt);
4838         return(NULL);
4839     }
4840     return(ctxt);
4841 }
4842
4843 /**
4844  * htmlCreateMemoryParserCtxt:
4845  * @buffer:  a pointer to a char array
4846  * @size:  the size of the array
4847  *
4848  * Create a parser context for an HTML in-memory document.
4849  *
4850  * Returns the new parser context or NULL
4851  */
4852 htmlParserCtxtPtr
4853 htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4854     xmlParserCtxtPtr ctxt;
4855     xmlParserInputPtr input;
4856     xmlParserInputBufferPtr buf;
4857
4858     if (buffer == NULL)
4859         return(NULL);
4860     if (size <= 0)
4861         return(NULL);
4862
4863     ctxt = htmlNewParserCtxt();
4864     if (ctxt == NULL)
4865         return(NULL);
4866
4867     buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
4868     if (buf == NULL) return(NULL);
4869
4870     input = xmlNewInputStream(ctxt);
4871     if (input == NULL) {
4872         xmlFreeParserCtxt(ctxt);
4873         return(NULL);
4874     }
4875
4876     input->filename = NULL;
4877     input->buf = buf;
4878     input->base = input->buf->buffer->content;
4879     input->cur = input->buf->buffer->content;
4880     input->end = &input->buf->buffer->content[input->buf->buffer->use];
4881
4882     inputPush(ctxt, input);
4883     return(ctxt);
4884 }
4885
4886 /**
4887  * htmlCreateDocParserCtxt:
4888  * @cur:  a pointer to an array of xmlChar
4889  * @encoding:  a free form C string describing the HTML document encoding, or NULL
4890  *
4891  * Create a parser context for an HTML document.
4892  *
4893  * TODO: check the need to add encoding handling there
4894  *
4895  * Returns the new parser context or NULL
4896  */
4897 static htmlParserCtxtPtr
4898 htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
4899     int len;
4900     htmlParserCtxtPtr ctxt;
4901
4902     if (cur == NULL)
4903         return(NULL);
4904     len = xmlStrlen(cur);
4905     ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
4906     if (ctxt == NULL)
4907         return(NULL);
4908
4909     if (encoding != NULL) {
4910         xmlCharEncoding enc;
4911         xmlCharEncodingHandlerPtr handler;
4912
4913         if (ctxt->input->encoding != NULL)
4914             xmlFree((xmlChar *) ctxt->input->encoding);
4915         ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
4916
4917         enc = xmlParseCharEncoding(encoding);
4918         /*
4919          * registered set of known encodings
4920          */
4921         if (enc != XML_CHAR_ENCODING_ERROR) {
4922             xmlSwitchEncoding(ctxt, enc);
4923             if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
4924                 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4925                              "Unsupported encoding %s\n",
4926                              (const xmlChar *) encoding, NULL);
4927             }
4928         } else {
4929             /*
4930              * fallback for unknown encodings
4931              */
4932             handler = xmlFindCharEncodingHandler((const char *) encoding);
4933             if (handler != NULL) {
4934                 xmlSwitchToEncoding(ctxt, handler);
4935             } else {
4936                 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4937                              "Unsupported encoding %s\n",
4938                              (const xmlChar *) encoding, NULL);
4939             }
4940         }
4941     }
4942     return(ctxt);
4943 }
4944
4945 #ifdef LIBXML_PUSH_ENABLED
4946 /************************************************************************
4947  *                                                                      *
4948  *      Progressive parsing interfaces                          *
4949  *                                                                      *
4950  ************************************************************************/
4951
4952 /**
4953  * htmlParseLookupSequence:
4954  * @ctxt:  an HTML parser context
4955  * @first:  the first char to lookup
4956  * @next:  the next char to lookup or zero
4957  * @third:  the next char to lookup or zero
 * @iscomment: if non-zero, do not skip over embedded comments while scanning
 * @ignoreattrval: if non-zero, skip the content of quoted attribute values
4959  *
4960  * Try to find if a sequence (first, next, third) or  just (first next) or
4961  * (first) is available in the input stream.
4962  * This function has a side effect of (possibly) incrementing ctxt->checkIndex
4963  * to avoid rescanning sequences of bytes, it DOES change the state of the
4964  * parser, do not use liberally.
4965  * This is basically similar to xmlParseLookupSequence()
4966  *
4967  * Returns the index to the current parsing point if the full sequence
4968  *      is available, -1 otherwise.
4969  */
4970 static int
4971 htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
4972                         xmlChar next, xmlChar third, int iscomment,
4973                         int ignoreattrval)
4974 {
4975     int base, len;
4976     htmlParserInputPtr in;
4977     const xmlChar *buf;
4978     int incomment = 0;
4979     int invalue = 0;
4980     char valdellim = 0x0;
4981
4982     in = ctxt->input;
4983     if (in == NULL)
4984         return (-1);
4985
4986     base = in->cur - in->base;
4987     if (base < 0)
4988         return (-1);
4989
4990     if (ctxt->checkIndex > base)
4991         base = ctxt->checkIndex;
4992
4993     if (in->buf == NULL) {
4994         buf = in->base;
4995         len = in->length;
4996     } else {
4997         buf = in->buf->buffer->content;
4998         len = in->buf->buffer->use;
4999     }
5000
5001     /* take into account the sequence length */
5002     if (third)
5003         len -= 2;
5004     else if (next)
5005         len--;
5006     for (; base < len; base++) {
5007         if ((!incomment) && (base + 4 < len) && (!iscomment)) {
5008             if ((buf[base] == '<') && (buf[base + 1] == '!') &&
5009                 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
5010                 incomment = 1;
5011                 /* do not increment past <! - some people use <!--> */
5012                 base += 2;
5013             }
5014         }
5015         if (ignoreattrval) {
5016             if (buf[base] == '"' || buf[base] == '\'') {
5017                 if (invalue) {
5018                     if (buf[base] == valdellim) {
5019                         invalue = 0;
5020                         continue;
5021                     }
5022                 } else {
5023                     valdellim = buf[base];
5024                     invalue = 1;
5025                     continue;
5026                 }
5027             } else if (invalue) {
5028                 continue;
5029             }
5030         }
5031         if (incomment) {
5032             if (base + 3 > len)
5033                 return (-1);
5034             if ((buf[base] == '-') && (buf[base + 1] == '-') &&
5035                 (buf[base + 2] == '>')) {
5036                 incomment = 0;
5037                 base += 2;
5038             }
5039             continue;
5040         }
5041         if (buf[base] == first) {
5042             if (third != 0) {
5043                 if ((buf[base + 1] != next) || (buf[base + 2] != third))
5044                     continue;
5045             } else if (next != 0) {
5046                 if (buf[base + 1] != next)
5047                     continue;
5048             }
5049             ctxt->checkIndex = 0;
5050 #ifdef DEBUG_PUSH
5051             if (next == 0)
5052                 xmlGenericError(xmlGenericErrorContext,
5053                                 "HPP: lookup '%c' found at %d\n",
5054                                 first, base);
5055             else if (third == 0)
5056                 xmlGenericError(xmlGenericErrorContext,
5057                                 "HPP: lookup '%c%c' found at %d\n",
5058                                 first, next, base);
5059             else
5060                 xmlGenericError(xmlGenericErrorContext,
5061                                 "HPP: lookup '%c%c%c' found at %d\n",
5062                                 first, next, third, base);
5063 #endif
5064             return (base - (in->cur - in->base));
5065         }
5066     }
5067     if ((!incomment) && (!invalue))
5068         ctxt->checkIndex = base;
5069 #ifdef DEBUG_PUSH
5070     if (next == 0)
5071         xmlGenericError(xmlGenericErrorContext,
5072                         "HPP: lookup '%c' failed\n", first);
5073     else if (third == 0)
5074         xmlGenericError(xmlGenericErrorContext,
5075                         "HPP: lookup '%c%c' failed\n", first, next);
5076     else
5077         xmlGenericError(xmlGenericErrorContext,
5078                         "HPP: lookup '%c%c%c' failed\n", first, next,
5079                         third);
5080 #endif
5081     return (-1);
5082 }
5083
5084 /**
5085  * htmlParseLookupChars:
5086  * @ctxt: an HTML parser context
5087  * @stop: Array of chars, which stop the lookup.
5088  * @stopLen: Length of stop-Array
5089  *
5090  * Try to find if any char of the stop-Array is available in the input
5091  * stream.
5092  * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5093  * to avoid rescanning sequences of bytes, it DOES change the state of the
5094  * parser, do not use liberally.
5095  *
5096  * Returns the index to the current parsing point if a stopChar
5097  *      is available, -1 otherwise.
5098  */
5099 static int
5100 htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop,
5101                      int stopLen)
5102 {
5103     int base, len;
5104     htmlParserInputPtr in;
5105     const xmlChar *buf;
5106     int incomment = 0;
5107     int i;
5108
5109     in = ctxt->input;
5110     if (in == NULL)
5111         return (-1);
5112
5113     base = in->cur - in->base;
5114     if (base < 0)
5115         return (-1);
5116
5117     if (ctxt->checkIndex > base)
5118         base = ctxt->checkIndex;
5119
5120     if (in->buf == NULL) {
5121         buf = in->base;
5122         len = in->length;
5123     } else {
5124         buf = in->buf->buffer->content;
5125         len = in->buf->buffer->use;
5126     }
5127
5128     for (; base < len; base++) {
5129         if (!incomment && (base + 4 < len)) {
5130             if ((buf[base] == '<') && (buf[base + 1] == '!') &&
5131                 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
5132                 incomment = 1;
5133                 /* do not increment past <! - some people use <!--> */
5134                 base += 2;
5135             }
5136         }
5137         if (incomment) {
5138             if (base + 3 > len)
5139                 return (-1);
5140             if ((buf[base] == '-') && (buf[base + 1] == '-') &&
5141                 (buf[base + 2] == '>')) {
5142                 incomment = 0;
5143                 base += 2;
5144             }
5145             continue;
5146         }
5147         for (i = 0; i < stopLen; ++i) {
5148             if (buf[base] == stop[i]) {
5149                 ctxt->checkIndex = 0;
5150                 return (base - (in->cur - in->base));
5151             }
5152         }
5153     }
5154     ctxt->checkIndex = base;
5155     return (-1);
5156 }
5157
5158 /**
5159  * htmlParseTryOrFinish:
5160  * @ctxt:  an HTML parser context
5161  * @terminate:  last chunk indicator
5162  *
5163  * Try to progress on parsing
5164  *
5165  * Returns zero if no parsing was possible
5166  */
5167 static int
5168 htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
5169     int ret = 0;
5170     htmlParserInputPtr in;
5171     int avail = 0;
5172     xmlChar cur, next;
5173
5174 #ifdef DEBUG_PUSH
5175     switch (ctxt->instate) {
5176         case XML_PARSER_EOF:
5177             xmlGenericError(xmlGenericErrorContext,
5178                     "HPP: try EOF\n"); break;
5179         case XML_PARSER_START:
5180             xmlGenericError(xmlGenericErrorContext,
5181                     "HPP: try START\n"); break;
5182         case XML_PARSER_MISC:
5183             xmlGenericError(xmlGenericErrorContext,
5184                     "HPP: try MISC\n");break;
5185         case XML_PARSER_COMMENT:
5186             xmlGenericError(xmlGenericErrorContext,
5187                     "HPP: try COMMENT\n");break;
5188         case XML_PARSER_PROLOG:
5189             xmlGenericError(xmlGenericErrorContext,
5190                     "HPP: try PROLOG\n");break;
5191         case XML_PARSER_START_TAG:
5192             xmlGenericError(xmlGenericErrorContext,
5193                     "HPP: try START_TAG\n");break;
5194         case XML_PARSER_CONTENT:
5195             xmlGenericError(xmlGenericErrorContext,
5196                     "HPP: try CONTENT\n");break;
5197         case XML_PARSER_CDATA_SECTION:
5198             xmlGenericError(xmlGenericErrorContext,
5199                     "HPP: try CDATA_SECTION\n");break;
5200         case XML_PARSER_END_TAG:
5201             xmlGenericError(xmlGenericErrorContext,
5202                     "HPP: try END_TAG\n");break;
5203         case XML_PARSER_ENTITY_DECL:
5204             xmlGenericError(xmlGenericErrorContext,
5205                     "HPP: try ENTITY_DECL\n");break;
5206         case XML_PARSER_ENTITY_VALUE:
5207             xmlGenericError(xmlGenericErrorContext,
5208                     "HPP: try ENTITY_VALUE\n");break;
5209         case XML_PARSER_ATTRIBUTE_VALUE:
5210             xmlGenericError(xmlGenericErrorContext,
5211                     "HPP: try ATTRIBUTE_VALUE\n");break;
5212         case XML_PARSER_DTD:
5213             xmlGenericError(xmlGenericErrorContext,
5214                     "HPP: try DTD\n");break;
5215         case XML_PARSER_EPILOG:
5216             xmlGenericError(xmlGenericErrorContext,
5217                     "HPP: try EPILOG\n");break;
5218         case XML_PARSER_PI:
5219             xmlGenericError(xmlGenericErrorContext,
5220                     "HPP: try PI\n");break;
5221         case XML_PARSER_SYSTEM_LITERAL:
5222             xmlGenericError(xmlGenericErrorContext,
5223                     "HPP: try SYSTEM_LITERAL\n");break;
5224     }
5225 #endif
5226
5227     while (1) {
5228
5229         in = ctxt->input;
5230         if (in == NULL) break;
5231         if (in->buf == NULL)
5232             avail = in->length - (in->cur - in->base);
5233         else
5234             avail = in->buf->buffer->use - (in->cur - in->base);
5235         if ((avail == 0) && (terminate)) {
5236             htmlAutoCloseOnEnd(ctxt);
5237             if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5238                 /*
5239                  * SAX: end of the document processing.
5240                  */
5241                 ctxt->instate = XML_PARSER_EOF;
5242                 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5243                     ctxt->sax->endDocument(ctxt->userData);
5244             }
5245         }
5246         if (avail < 1)
5247             goto done;
5248         cur = in->cur[0];
5249         if (cur == 0) {
5250             SKIP(1);
5251             continue;
5252         }
5253
5254         switch (ctxt->instate) {
5255             case XML_PARSER_EOF:
5256                 /*
5257                  * Document parsing is done !
5258                  */
5259                 goto done;
5260             case XML_PARSER_START:
5261                 /*
5262                  * Very first chars read from the document flow.
5263                  */
5264                 cur = in->cur[0];
5265                 if (IS_BLANK_CH(cur)) {
5266                     SKIP_BLANKS;
5267                     if (in->buf == NULL)
5268                         avail = in->length - (in->cur - in->base);
5269                     else
5270                         avail = in->buf->buffer->use - (in->cur - in->base);
5271                 }
5272                 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
5273                     ctxt->sax->setDocumentLocator(ctxt->userData,
5274                                                   &xmlDefaultSAXLocator);
5275                 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
5276                     (!ctxt->disableSAX))
5277                     ctxt->sax->startDocument(ctxt->userData);
5278
5279                 cur = in->cur[0];
5280                 next = in->cur[1];
5281                 if ((cur == '<') && (next == '!') &&
5282                     (UPP(2) == 'D') && (UPP(3) == 'O') &&
5283                     (UPP(4) == 'C') && (UPP(5) == 'T') &&
5284                     (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5285                     (UPP(8) == 'E')) {
5286                     if ((!terminate) &&
5287                         (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5288                         goto done;
5289 #ifdef DEBUG_PUSH
5290                     xmlGenericError(xmlGenericErrorContext,
5291                             "HPP: Parsing internal subset\n");
5292 #endif
5293                     htmlParseDocTypeDecl(ctxt);
5294                     ctxt->instate = XML_PARSER_PROLOG;
5295 #ifdef DEBUG_PUSH
5296                     xmlGenericError(xmlGenericErrorContext,
5297                             "HPP: entering PROLOG\n");
5298 #endif
5299                 } else {
5300                     ctxt->instate = XML_PARSER_MISC;
5301 #ifdef DEBUG_PUSH
5302                     xmlGenericError(xmlGenericErrorContext,
5303                             "HPP: entering MISC\n");
5304 #endif
5305                 }
5306                 break;
5307             case XML_PARSER_MISC:
5308                 SKIP_BLANKS;
5309                 if (in->buf == NULL)
5310                     avail = in->length - (in->cur - in->base);
5311                 else
5312                     avail = in->buf->buffer->use - (in->cur - in->base);
5313                 if (avail < 2)
5314                     goto done;
5315                 cur = in->cur[0];
5316                 next = in->cur[1];
5317                 if ((cur == '<') && (next == '!') &&
5318                     (in->cur[2] == '-') && (in->cur[3] == '-')) {
5319                     if ((!terminate) &&
5320                         (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
5321                         goto done;
5322 #ifdef DEBUG_PUSH
5323                     xmlGenericError(xmlGenericErrorContext,
5324                             "HPP: Parsing Comment\n");
5325 #endif
5326                     htmlParseComment(ctxt);
5327                     ctxt->instate = XML_PARSER_MISC;
5328                 } else if ((cur == '<') && (next == '?')) {
5329                     if ((!terminate) &&
5330                         (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5331                         goto done;
5332 #ifdef DEBUG_PUSH
5333                     xmlGenericError(xmlGenericErrorContext,
5334                             "HPP: Parsing PI\n");
5335 #endif
5336                     htmlParsePI(ctxt);
5337                     ctxt->instate = XML_PARSER_MISC;
5338                 } else if ((cur == '<') && (next == '!') &&
5339                     (UPP(2) == 'D') && (UPP(3) == 'O') &&
5340                     (UPP(4) == 'C') && (UPP(5) == 'T') &&
5341                     (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5342                     (UPP(8) == 'E')) {
5343                     if ((!terminate) &&
5344                         (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5345                         goto done;
5346 #ifdef DEBUG_PUSH
5347                     xmlGenericError(xmlGenericErrorContext,
5348                             "HPP: Parsing internal subset\n");
5349 #endif
5350                     htmlParseDocTypeDecl(ctxt);
5351                     ctxt->instate = XML_PARSER_PROLOG;
5352 #ifdef DEBUG_PUSH
5353                     xmlGenericError(xmlGenericErrorContext,
5354                             "HPP: entering PROLOG\n");
5355 #endif
5356                 } else if ((cur == '<') && (next == '!') &&
5357                            (avail < 9)) {
5358                     goto done;
5359                 } else {
5360                     ctxt->instate = XML_PARSER_START_TAG;
5361 #ifdef DEBUG_PUSH
5362                     xmlGenericError(xmlGenericErrorContext,
5363                             "HPP: entering START_TAG\n");
5364 #endif
5365                 }
5366                 break;
5367             case XML_PARSER_PROLOG:
5368                 SKIP_BLANKS;
5369                 if (in->buf == NULL)
5370                     avail = in->length - (in->cur - in->base);
5371                 else
5372                     avail = in->buf->buffer->use - (in->cur - in->base);
5373                 if (avail < 2)
5374                     goto done;
5375                 cur = in->cur[0];
5376                 next = in->cur[1];
5377                 if ((cur == '<') && (next == '!') &&
5378                     (in->cur[2] == '-') && (in->cur[3] == '-')) {
5379                     if ((!terminate) &&
5380                         (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
5381                         goto done;
5382 #ifdef DEBUG_PUSH
5383                     xmlGenericError(xmlGenericErrorContext,
5384                             "HPP: Parsing Comment\n");
5385 #endif
5386                     htmlParseComment(ctxt);
5387                     ctxt->instate = XML_PARSER_PROLOG;
5388                 } else if ((cur == '<') && (next == '?')) {
5389                     if ((!terminate) &&
5390                         (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5391                         goto done;
5392 #ifdef DEBUG_PUSH
5393                     xmlGenericError(xmlGenericErrorContext,
5394                             "HPP: Parsing PI\n");
5395 #endif
5396                     htmlParsePI(ctxt);
5397                     ctxt->instate = XML_PARSER_PROLOG;
5398                 } else if ((cur == '<') && (next == '!') &&
5399                            (avail < 4)) {
5400                     goto done;
5401                 } else {
5402                     ctxt->instate = XML_PARSER_START_TAG;
5403 #ifdef DEBUG_PUSH
5404                     xmlGenericError(xmlGenericErrorContext,
5405                             "HPP: entering START_TAG\n");
5406 #endif
5407                 }
5408                 break;
5409             case XML_PARSER_EPILOG:
5410                 if (in->buf == NULL)
5411                     avail = in->length - (in->cur - in->base);
5412                 else
5413                     avail = in->buf->buffer->use - (in->cur - in->base);
5414                 if (avail < 1)
5415                     goto done;
5416                 cur = in->cur[0];
5417                 if (IS_BLANK_CH(cur)) {
5418                     htmlParseCharData(ctxt);
5419                     goto done;
5420                 }
5421                 if (avail < 2)
5422                     goto done;
5423                 next = in->cur[1];
5424                 if ((cur == '<') && (next == '!') &&
5425                     (in->cur[2] == '-') && (in->cur[3] == '-')) {
5426                     if ((!terminate) &&
5427                         (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
5428                         goto done;
5429 #ifdef DEBUG_PUSH
5430                     xmlGenericError(xmlGenericErrorContext,
5431                             "HPP: Parsing Comment\n");
5432 #endif
5433                     htmlParseComment(ctxt);
5434                     ctxt->instate = XML_PARSER_EPILOG;
5435                 } else if ((cur == '<') && (next == '?')) {
5436                     if ((!terminate) &&
5437                         (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5438                         goto done;
5439 #ifdef DEBUG_PUSH
5440                     xmlGenericError(xmlGenericErrorContext,
5441                             "HPP: Parsing PI\n");
5442 #endif
5443                     htmlParsePI(ctxt);
5444                     ctxt->instate = XML_PARSER_EPILOG;
5445                 } else if ((cur == '<') && (next == '!') &&
5446                            (avail < 4)) {
5447                     goto done;
5448                 } else {
5449                     ctxt->errNo = XML_ERR_DOCUMENT_END;
5450                     ctxt->wellFormed = 0;
5451                     ctxt->instate = XML_PARSER_EOF;
5452 #ifdef DEBUG_PUSH
5453                     xmlGenericError(xmlGenericErrorContext,
5454                             "HPP: entering EOF\n");
5455 #endif
5456                     if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5457                         ctxt->sax->endDocument(ctxt->userData);
5458                     goto done;
5459                 }
5460                 break;
5461             case XML_PARSER_START_TAG: {
5462                 const xmlChar *name;
5463                 int failed;
5464                 const htmlElemDesc * info;
5465
5466                 if (avail < 2)
5467                     goto done;
5468                 cur = in->cur[0];
5469                 if (cur != '<') {
5470                     ctxt->instate = XML_PARSER_CONTENT;
5471 #ifdef DEBUG_PUSH
5472                     xmlGenericError(xmlGenericErrorContext,
5473                             "HPP: entering CONTENT\n");
5474 #endif
5475                     break;
5476                 }
5477                 if (in->cur[1] == '/') {
5478                     ctxt->instate = XML_PARSER_END_TAG;
5479                     ctxt->checkIndex = 0;
5480 #ifdef DEBUG_PUSH
5481                     xmlGenericError(xmlGenericErrorContext,
5482                             "HPP: entering END_TAG\n");
5483 #endif
5484                     break;
5485                 }
5486                 if ((!terminate) &&
5487                     (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5488                     goto done;
5489
5490                 failed = htmlParseStartTag(ctxt);
5491                 name = ctxt->name;
5492                 if ((failed == -1) ||
5493                     (name == NULL)) {
5494                     if (CUR == '>')
5495                         NEXT;
5496                     break;
5497                 }
5498
5499                 /*
5500                  * Lookup the info for that element.
5501                  */
5502                 info = htmlTagLookup(name);
5503                 if (info == NULL) {
5504                     htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5505                                  "Tag %s invalid\n", name, NULL);
5506                 }
5507
5508                 /*
5509                  * Check for an Empty Element labeled the XML/SGML way
5510                  */
5511                 if ((CUR == '/') && (NXT(1) == '>')) {
5512                     SKIP(2);
5513                     if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5514                         ctxt->sax->endElement(ctxt->userData, name);
5515                     htmlnamePop(ctxt);
5516                     ctxt->instate = XML_PARSER_CONTENT;
5517 #ifdef DEBUG_PUSH
5518                     xmlGenericError(xmlGenericErrorContext,
5519                             "HPP: entering CONTENT\n");
5520 #endif
5521                     break;
5522                 }
5523
5524                 if (CUR == '>') {
5525                     NEXT;
5526                 } else {
5527                     htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5528                                  "Couldn't find end of Start Tag %s\n",
5529                                  name, NULL);
5530
5531                     /*
5532                      * end of parsing of this node.
5533                      */
5534                     if (xmlStrEqual(name, ctxt->name)) {
5535                         nodePop(ctxt);
5536                         htmlnamePop(ctxt);
5537                     }
5538
5539                     ctxt->instate = XML_PARSER_CONTENT;
5540 #ifdef DEBUG_PUSH
5541                     xmlGenericError(xmlGenericErrorContext,
5542                             "HPP: entering CONTENT\n");
5543 #endif
5544                     break;
5545                 }
5546
5547                 /*
5548                  * Check for an Empty Element from DTD definition
5549                  */
5550                 if ((info != NULL) && (info->empty)) {
5551                     if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5552                         ctxt->sax->endElement(ctxt->userData, name);
5553                     htmlnamePop(ctxt);
5554                 }
5555                 ctxt->instate = XML_PARSER_CONTENT;
5556 #ifdef DEBUG_PUSH
5557                 xmlGenericError(xmlGenericErrorContext,
5558                         "HPP: entering CONTENT\n");
5559 #endif
5560                 break;
5561             }
5562             case XML_PARSER_CONTENT: {
5563                 long cons;
5564                 /*
5565                  * Handle preparsed entities and charRef
5566                  */
5567                 if (ctxt->token != 0) {
5568                     xmlChar chr[2] = { 0 , 0 } ;
5569
5570                     chr[0] = (xmlChar) ctxt->token;
5571                     htmlCheckParagraph(ctxt);
5572                     if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5573                         ctxt->sax->characters(ctxt->userData, chr, 1);
5574                     ctxt->token = 0;
5575                     ctxt->checkIndex = 0;
5576                 }
5577                 if ((avail == 1) && (terminate)) {
5578                     cur = in->cur[0];
5579                     if ((cur != '<') && (cur != '&')) {
5580                         if (ctxt->sax != NULL) {
5581                             if (IS_BLANK_CH(cur)) {
5582                                 if (ctxt->sax->ignorableWhitespace != NULL)
5583                                     ctxt->sax->ignorableWhitespace(
5584                                             ctxt->userData, &cur, 1);
5585                             } else {
5586                                 htmlCheckParagraph(ctxt);
5587                                 if (ctxt->sax->characters != NULL)
5588                                     ctxt->sax->characters(
5589                                             ctxt->userData, &cur, 1);
5590                             }
5591                         }
5592                         ctxt->token = 0;
5593                         ctxt->checkIndex = 0;
5594                         in->cur++;
5595                         break;
5596                     }
5597                 }
5598                 if (avail < 2)
5599                     goto done;
5600                 cur = in->cur[0];
5601                 next = in->cur[1];
5602                 cons = ctxt->nbChars;
5603                 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5604                     (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5605                     /*
5606                      * Handle SCRIPT/STYLE separately
5607                      */
5608                     if (!terminate) {
5609                         int idx;
5610                         xmlChar val;
5611
5612                         idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0, 1);
5613                         if (idx < 0)
5614                             goto done;
5615                         val = in->cur[idx + 2];
5616                         if (val == 0) /* bad cut of input */
5617                             goto done;
5618                     }
5619                     htmlParseScript(ctxt);
5620                     if ((cur == '<') && (next == '/')) {
5621                         ctxt->instate = XML_PARSER_END_TAG;
5622                         ctxt->checkIndex = 0;
5623 #ifdef DEBUG_PUSH
5624                         xmlGenericError(xmlGenericErrorContext,
5625                                 "HPP: entering END_TAG\n");
5626 #endif
5627                         break;
5628                     }
5629                 } else {
5630                     /*
5631                      * Sometimes DOCTYPE arrives in the middle of the document
5632                      */
5633                     if ((cur == '<') && (next == '!') &&
5634                         (UPP(2) == 'D') && (UPP(3) == 'O') &&
5635                         (UPP(4) == 'C') && (UPP(5) == 'T') &&
5636                         (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5637                         (UPP(8) == 'E')) {
5638                         if ((!terminate) &&
5639                             (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5640                             goto done;
5641                         htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5642                                      "Misplaced DOCTYPE declaration\n",
5643                                      BAD_CAST "DOCTYPE" , NULL);
5644                         htmlParseDocTypeDecl(ctxt);
5645                     } else if ((cur == '<') && (next == '!') &&
5646                         (in->cur[2] == '-') && (in->cur[3] == '-')) {
5647                         if ((!terminate) &&
5648                             (htmlParseLookupSequence(
5649                                 ctxt, '-', '-', '>', 1, 1) < 0))
5650                             goto done;
5651 #ifdef DEBUG_PUSH
5652                         xmlGenericError(xmlGenericErrorContext,
5653                                 "HPP: Parsing Comment\n");
5654 #endif
5655                         htmlParseComment(ctxt);
5656                         ctxt->instate = XML_PARSER_CONTENT;
5657                     } else if ((cur == '<') && (next == '?')) {
5658                         if ((!terminate) &&
5659                             (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5660                             goto done;
5661 #ifdef DEBUG_PUSH
5662                         xmlGenericError(xmlGenericErrorContext,
5663                                 "HPP: Parsing PI\n");
5664 #endif
5665                         htmlParsePI(ctxt);
5666                         ctxt->instate = XML_PARSER_CONTENT;
5667                     } else if ((cur == '<') && (next == '!') && (avail < 4)) {
5668                         goto done;
5669                     } else if ((cur == '<') && (next == '/')) {
5670                         ctxt->instate = XML_PARSER_END_TAG;
5671                         ctxt->checkIndex = 0;
5672 #ifdef DEBUG_PUSH
5673                         xmlGenericError(xmlGenericErrorContext,
5674                                 "HPP: entering END_TAG\n");
5675 #endif
5676                         break;
5677                     } else if (cur == '<') {
5678                         ctxt->instate = XML_PARSER_START_TAG;
5679                         ctxt->checkIndex = 0;
5680 #ifdef DEBUG_PUSH
5681                         xmlGenericError(xmlGenericErrorContext,
5682                                 "HPP: entering START_TAG\n");
5683 #endif
5684                         break;
5685                     } else if (cur == '&') {
5686                         if ((!terminate) &&
5687                             (htmlParseLookupChars(ctxt,
5688                                                   BAD_CAST "; >/", 4) < 0))
5689                             goto done;
5690 #ifdef DEBUG_PUSH
5691                         xmlGenericError(xmlGenericErrorContext,
5692                                 "HPP: Parsing Reference\n");
5693 #endif
5694                         /* TODO: check generation of subtrees if noent !!! */
5695                         htmlParseReference(ctxt);
5696                     } else {
5697                         /*
5698                          * check that the text sequence is complete
5699                          * before handing out the data to the parser
5700                          * to avoid problems with erroneous end of
5701                          * data detection.
5702                          */
5703                         if ((!terminate) &&
5704                             (htmlParseLookupChars(ctxt, BAD_CAST "<&", 2) < 0))
5705                             goto done;
5706                         ctxt->checkIndex = 0;
5707 #ifdef DEBUG_PUSH
5708                         xmlGenericError(xmlGenericErrorContext,
5709                                 "HPP: Parsing char data\n");
5710 #endif
5711                         htmlParseCharData(ctxt);
5712                     }
5713                 }
5714                 if (cons == ctxt->nbChars) {
5715                     if (ctxt->node != NULL) {
5716                         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5717                                      "detected an error in element content\n",
5718                                      NULL, NULL);
5719                     }
5720                     NEXT;
5721                     break;
5722                 }
5723
5724                 break;
5725             }
5726             case XML_PARSER_END_TAG:
5727                 if (avail < 2)
5728                     goto done;
5729                 if ((!terminate) &&
5730                     (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5731                     goto done;
5732                 htmlParseEndTag(ctxt);
5733                 if (ctxt->nameNr == 0) {
5734                     ctxt->instate = XML_PARSER_EPILOG;
5735                 } else {
5736                     ctxt->instate = XML_PARSER_CONTENT;
5737                 }
5738                 ctxt->checkIndex = 0;
5739 #ifdef DEBUG_PUSH
5740                 xmlGenericError(xmlGenericErrorContext,
5741                         "HPP: entering CONTENT\n");
5742 #endif
5743                 break;
5744             case XML_PARSER_CDATA_SECTION:
5745                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5746                         "HPP: internal error, state == CDATA\n",
5747                              NULL, NULL);
5748                 ctxt->instate = XML_PARSER_CONTENT;
5749                 ctxt->checkIndex = 0;
5750 #ifdef DEBUG_PUSH
5751                 xmlGenericError(xmlGenericErrorContext,
5752                         "HPP: entering CONTENT\n");
5753 #endif
5754                 break;
5755             case XML_PARSER_DTD:
5756                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5757                         "HPP: internal error, state == DTD\n",
5758                              NULL, NULL);
5759                 ctxt->instate = XML_PARSER_CONTENT;
5760                 ctxt->checkIndex = 0;
5761 #ifdef DEBUG_PUSH
5762                 xmlGenericError(xmlGenericErrorContext,
5763                         "HPP: entering CONTENT\n");
5764 #endif
5765                 break;
5766             case XML_PARSER_COMMENT:
5767                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5768                         "HPP: internal error, state == COMMENT\n",
5769                              NULL, NULL);
5770                 ctxt->instate = XML_PARSER_CONTENT;
5771                 ctxt->checkIndex = 0;
5772 #ifdef DEBUG_PUSH
5773                 xmlGenericError(xmlGenericErrorContext,
5774                         "HPP: entering CONTENT\n");
5775 #endif
5776                 break;
5777             case XML_PARSER_PI:
5778                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5779                         "HPP: internal error, state == PI\n",
5780                              NULL, NULL);
5781                 ctxt->instate = XML_PARSER_CONTENT;
5782                 ctxt->checkIndex = 0;
5783 #ifdef DEBUG_PUSH
5784                 xmlGenericError(xmlGenericErrorContext,
5785                         "HPP: entering CONTENT\n");
5786 #endif
5787                 break;
5788             case XML_PARSER_ENTITY_DECL:
5789                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5790                         "HPP: internal error, state == ENTITY_DECL\n",
5791                              NULL, NULL);
5792                 ctxt->instate = XML_PARSER_CONTENT;
5793                 ctxt->checkIndex = 0;
5794 #ifdef DEBUG_PUSH
5795                 xmlGenericError(xmlGenericErrorContext,
5796                         "HPP: entering CONTENT\n");
5797 #endif
5798                 break;
5799             case XML_PARSER_ENTITY_VALUE:
5800                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5801                         "HPP: internal error, state == ENTITY_VALUE\n",
5802                              NULL, NULL);
5803                 ctxt->instate = XML_PARSER_CONTENT;
5804                 ctxt->checkIndex = 0;
5805 #ifdef DEBUG_PUSH
5806                 xmlGenericError(xmlGenericErrorContext,
5807                         "HPP: entering DTD\n");
5808 #endif
5809                 break;
5810             case XML_PARSER_ATTRIBUTE_VALUE:
5811                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5812                         "HPP: internal error, state == ATTRIBUTE_VALUE\n",
5813                              NULL, NULL);
5814                 ctxt->instate = XML_PARSER_START_TAG;
5815                 ctxt->checkIndex = 0;
5816 #ifdef DEBUG_PUSH
5817                 xmlGenericError(xmlGenericErrorContext,
5818                         "HPP: entering START_TAG\n");
5819 #endif
5820                 break;
5821             case XML_PARSER_SYSTEM_LITERAL:
5822                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5823                     "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
5824                              NULL, NULL);
5825                 ctxt->instate = XML_PARSER_CONTENT;
5826                 ctxt->checkIndex = 0;
5827 #ifdef DEBUG_PUSH
5828                 xmlGenericError(xmlGenericErrorContext,
5829                         "HPP: entering CONTENT\n");
5830 #endif
5831                 break;
5832             case XML_PARSER_IGNORE:
5833                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5834                         "HPP: internal error, state == XML_PARSER_IGNORE\n",
5835                              NULL, NULL);
5836                 ctxt->instate = XML_PARSER_CONTENT;
5837                 ctxt->checkIndex = 0;
5838 #ifdef DEBUG_PUSH
5839                 xmlGenericError(xmlGenericErrorContext,
5840                         "HPP: entering CONTENT\n");
5841 #endif
5842                 break;
5843             case XML_PARSER_PUBLIC_LITERAL:
5844                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5845                         "HPP: internal error, state == XML_PARSER_LITERAL\n",
5846                              NULL, NULL);
5847                 ctxt->instate = XML_PARSER_CONTENT;
5848                 ctxt->checkIndex = 0;
5849 #ifdef DEBUG_PUSH
5850                 xmlGenericError(xmlGenericErrorContext,
5851                         "HPP: entering CONTENT\n");
5852 #endif
5853                 break;
5854
5855         }
5856     }
5857 done:
5858     if ((avail == 0) && (terminate)) {
5859         htmlAutoCloseOnEnd(ctxt);
5860         if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5861             /*
5862              * SAX: end of the document processing.
5863              */
5864             ctxt->instate = XML_PARSER_EOF;
5865             if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5866                 ctxt->sax->endDocument(ctxt->userData);
5867         }
5868     }
5869     if ((ctxt->myDoc != NULL) &&
5870         ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
5871          (ctxt->instate == XML_PARSER_EPILOG))) {
5872         xmlDtdPtr dtd;
5873         dtd = xmlGetIntSubset(ctxt->myDoc);
5874         if (dtd == NULL)
5875             ctxt->myDoc->intSubset =
5876                 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
5877                     BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5878                     BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5879     }
5880 #ifdef DEBUG_PUSH
5881     xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
5882 #endif
5883     return(ret);
5884 }
5885
5886 /**
5887  * htmlParseChunk:
5888  * @ctxt:  an HTML parser context
5889  * @chunk:  an char array
5890  * @size:  the size in byte of the chunk
5891  * @terminate:  last chunk indicator
5892  *
5893  * Parse a Chunk of memory
5894  *
5895  * Returns zero if no error, the xmlParserErrors otherwise.
5896  */
5897 int
5898 htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
5899               int terminate) {
5900     if ((ctxt == NULL) || (ctxt->input == NULL)) {
5901         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5902                      "htmlParseChunk: context error\n", NULL, NULL);
5903         return(XML_ERR_INTERNAL_ERROR);
5904     }
5905     if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5906         (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF))  {
5907         int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5908         int cur = ctxt->input->cur - ctxt->input->base;
5909         int res;
5910
5911         res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5912         if (res < 0) {
5913             ctxt->errNo = XML_PARSER_EOF;
5914             ctxt->disableSAX = 1;
5915             return (XML_PARSER_EOF);
5916         }
5917         ctxt->input->base = ctxt->input->buf->buffer->content + base;
5918         ctxt->input->cur = ctxt->input->base + cur;
5919         ctxt->input->end =
5920           &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
5921 #ifdef DEBUG_PUSH
5922         xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5923 #endif
5924
5925 #if 0
5926         if ((terminate) || (ctxt->input->buf->buffer->use > 80))
5927             htmlParseTryOrFinish(ctxt, terminate);
5928 #endif
5929     } else if (ctxt->instate != XML_PARSER_EOF) {
5930         if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
5931             xmlParserInputBufferPtr in = ctxt->input->buf;
5932             if ((in->encoder != NULL) && (in->buffer != NULL) &&
5933                     (in->raw != NULL)) {
5934                 int nbchars;
5935
5936                 nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
5937                 if (nbchars < 0) {
5938                     htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
5939                                  "encoder error\n", NULL, NULL);
5940                     return(XML_ERR_INVALID_ENCODING);
5941                 }
5942             }
5943         }
5944     }
5945     htmlParseTryOrFinish(ctxt, terminate);
5946     if (terminate) {
5947         if ((ctxt->instate != XML_PARSER_EOF) &&
5948             (ctxt->instate != XML_PARSER_EPILOG) &&
5949             (ctxt->instate != XML_PARSER_MISC)) {
5950             ctxt->errNo = XML_ERR_DOCUMENT_END;
5951             ctxt->wellFormed = 0;
5952         }
5953         if (ctxt->instate != XML_PARSER_EOF) {
5954             if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5955                 ctxt->sax->endDocument(ctxt->userData);
5956         }
5957         ctxt->instate = XML_PARSER_EOF;
5958     }
5959     return((xmlParserErrors) ctxt->errNo);
5960 }
5961
5962 /************************************************************************
5963  *                                                                      *
5964  *                      User entry points                               *
5965  *                                                                      *
5966  ************************************************************************/
5967
5968 /**
5969  * htmlCreatePushParserCtxt:
5970  * @sax:  a SAX handler
5971  * @user_data:  The user data returned on SAX callbacks
5972  * @chunk:  a pointer to an array of chars
5973  * @size:  number of chars in the array
5974  * @filename:  an optional file name or URI
5975  * @enc:  an optional encoding
5976  *
5977  * Create a parser context for using the HTML parser in push mode
5978  * The value of @filename is used for fetching external entities
5979  * and error/warning reports.
5980  *
5981  * Returns the new parser context or NULL
5982  */
5983 htmlParserCtxtPtr
5984 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
5985                          const char *chunk, int size, const char *filename,
5986                          xmlCharEncoding enc) {
5987     htmlParserCtxtPtr ctxt;
5988     htmlParserInputPtr inputStream;
5989     xmlParserInputBufferPtr buf;
5990
5991     xmlInitParser();
5992
5993     buf = xmlAllocParserInputBuffer(enc);
5994     if (buf == NULL) return(NULL);
5995
5996     ctxt = htmlNewParserCtxt();
5997     if (ctxt == NULL) {
5998         xmlFreeParserInputBuffer(buf);
5999         return(NULL);
6000     }
6001     if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
6002         ctxt->charset=XML_CHAR_ENCODING_UTF8;
6003     if (sax != NULL) {
6004         if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
6005             xmlFree(ctxt->sax);
6006         ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
6007         if (ctxt->sax == NULL) {
6008             xmlFree(buf);
6009             xmlFree(ctxt);
6010             return(NULL);
6011         }
6012         memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
6013         if (user_data != NULL)
6014             ctxt->userData = user_data;
6015     }
6016     if (filename == NULL) {
6017         ctxt->directory = NULL;
6018     } else {
6019         ctxt->directory = xmlParserGetDirectory(filename);
6020     }
6021
6022     inputStream = htmlNewInputStream(ctxt);
6023     if (inputStream == NULL) {
6024         xmlFreeParserCtxt(ctxt);
6025         xmlFree(buf);
6026         return(NULL);
6027     }
6028
6029     if (filename == NULL)
6030         inputStream->filename = NULL;
6031     else
6032         inputStream->filename = (char *)
6033             xmlCanonicPath((const xmlChar *) filename);
6034     inputStream->buf = buf;
6035     inputStream->base = inputStream->buf->buffer->content;
6036     inputStream->cur = inputStream->buf->buffer->content;
6037     inputStream->end =
6038         &inputStream->buf->buffer->content[inputStream->buf->buffer->use];
6039
6040     inputPush(ctxt, inputStream);
6041
6042     if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6043         (ctxt->input->buf != NULL))  {
6044         int base = ctxt->input->base - ctxt->input->buf->buffer->content;
6045         int cur = ctxt->input->cur - ctxt->input->base;
6046
6047         xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6048
6049         ctxt->input->base = ctxt->input->buf->buffer->content + base;
6050         ctxt->input->cur = ctxt->input->base + cur;
6051         ctxt->input->end =
6052             &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
6053 #ifdef DEBUG_PUSH
6054         xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6055 #endif
6056     }
6057     ctxt->progressive = 1;
6058
6059     return(ctxt);
6060 }
6061 #endif /* LIBXML_PUSH_ENABLED */
6062
6063 /**
6064  * htmlSAXParseDoc:
6065  * @cur:  a pointer to an array of xmlChar
6066  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6067  * @sax:  the SAX handler block
6068  * @userData: if using SAX, this pointer will be provided on callbacks.
6069  *
6070  * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
6071  * to handle parse events. If sax is NULL, fallback to the default DOM
6072  * behavior and return a tree.
6073  *
6074  * Returns the resulting document tree unless SAX is NULL or the document is
6075  *     not well formed.
6076  */
6077
6078 htmlDocPtr
6079 htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
6080     htmlDocPtr ret;
6081     htmlParserCtxtPtr ctxt;
6082
6083     xmlInitParser();
6084
6085     if (cur == NULL) return(NULL);
6086
6087
6088     ctxt = htmlCreateDocParserCtxt(cur, encoding);
6089     if (ctxt == NULL) return(NULL);
6090     if (sax != NULL) {
6091         if (ctxt->sax != NULL) xmlFree (ctxt->sax);
6092         ctxt->sax = sax;
6093         ctxt->userData = userData;
6094     }
6095
6096     htmlParseDocument(ctxt);
6097     ret = ctxt->myDoc;
6098     if (sax != NULL) {
6099         ctxt->sax = NULL;
6100         ctxt->userData = NULL;
6101     }
6102     htmlFreeParserCtxt(ctxt);
6103
6104     return(ret);
6105 }
6106
6107 /**
6108  * htmlParseDoc:
6109  * @cur:  a pointer to an array of xmlChar
6110  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6111  *
6112  * parse an HTML in-memory document and build a tree.
6113  *
6114  * Returns the resulting document tree
6115  */
6116
6117 htmlDocPtr
6118 htmlParseDoc(xmlChar *cur, const char *encoding) {
6119     return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
6120 }
6121
6122
6123 /**
6124  * htmlCreateFileParserCtxt:
6125  * @filename:  the filename
6126  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6127  *
6128  * Create a parser context for a file content.
6129  * Automatic support for ZLIB/Compress compressed document is provided
6130  * by default if found at compile-time.
6131  *
6132  * Returns the new parser context or NULL
6133  */
6134 htmlParserCtxtPtr
6135 htmlCreateFileParserCtxt(const char *filename, const char *encoding)
6136 {
6137     htmlParserCtxtPtr ctxt;
6138     htmlParserInputPtr inputStream;
6139     char *canonicFilename;
6140     /* htmlCharEncoding enc; */
6141     xmlChar *content, *content_line = (xmlChar *) "charset=";
6142
6143     if (filename == NULL)
6144         return(NULL);
6145
6146     ctxt = htmlNewParserCtxt();
6147     if (ctxt == NULL) {
6148         return(NULL);
6149     }
6150     canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
6151     if (canonicFilename == NULL) {
6152 #ifdef LIBXML_SAX1_ENABLED
6153         if (xmlDefaultSAXHandler.error != NULL) {
6154             xmlDefaultSAXHandler.error(NULL, "out of memory\n");
6155         }
6156 #endif
6157         xmlFreeParserCtxt(ctxt);
6158         return(NULL);
6159     }
6160
6161     inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
6162     xmlFree(canonicFilename);
6163     if (inputStream == NULL) {
6164         xmlFreeParserCtxt(ctxt);
6165         return(NULL);
6166     }
6167
6168     inputPush(ctxt, inputStream);
6169
6170     /* set encoding */
6171     if (encoding) {
6172         content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
6173         if (content) {
6174             strcpy ((char *)content, (char *)content_line);
6175             strcat ((char *)content, (char *)encoding);
6176             htmlCheckEncoding (ctxt, content);
6177             xmlFree (content);
6178         }
6179     }
6180
6181     return(ctxt);
6182 }
6183
6184 /**
6185  * htmlSAXParseFile:
6186  * @filename:  the filename
6187  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6188  * @sax:  the SAX handler block
6189  * @userData: if using SAX, this pointer will be provided on callbacks.
6190  *
6191  * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6192  * compressed document is provided by default if found at compile-time.
6193  * It use the given SAX function block to handle the parsing callback.
6194  * If sax is NULL, fallback to the default DOM tree building routines.
6195  *
6196  * Returns the resulting document tree unless SAX is NULL or the document is
6197  *     not well formed.
6198  */
6199
6200 htmlDocPtr
6201 htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
6202                  void *userData) {
6203     htmlDocPtr ret;
6204     htmlParserCtxtPtr ctxt;
6205     htmlSAXHandlerPtr oldsax = NULL;
6206
6207     xmlInitParser();
6208
6209     ctxt = htmlCreateFileParserCtxt(filename, encoding);
6210     if (ctxt == NULL) return(NULL);
6211     if (sax != NULL) {
6212         oldsax = ctxt->sax;
6213         ctxt->sax = sax;
6214         ctxt->userData = userData;
6215     }
6216
6217     htmlParseDocument(ctxt);
6218
6219     ret = ctxt->myDoc;
6220     if (sax != NULL) {
6221         ctxt->sax = oldsax;
6222         ctxt->userData = NULL;
6223     }
6224     htmlFreeParserCtxt(ctxt);
6225
6226     return(ret);
6227 }
6228
6229 /**
6230  * htmlParseFile:
6231  * @filename:  the filename
6232  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6233  *
6234  * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6235  * compressed document is provided by default if found at compile-time.
6236  *
6237  * Returns the resulting document tree
6238  */
6239
6240 htmlDocPtr
6241 htmlParseFile(const char *filename, const char *encoding) {
6242     return(htmlSAXParseFile(filename, encoding, NULL, NULL));
6243 }
6244
6245 /**
6246  * htmlHandleOmittedElem:
6247  * @val:  int 0 or 1
6248  *
6249  * Set and return the previous value for handling HTML omitted tags.
6250  *
6251  * Returns the last value for 0 for no handling, 1 for auto insertion.
6252  */
6253
6254 int
6255 htmlHandleOmittedElem(int val) {
6256     int old = htmlOmittedDefaultValue;
6257
6258     htmlOmittedDefaultValue = val;
6259     return(old);
6260 }
6261
6262 /**
6263  * htmlElementAllowedHere:
6264  * @parent: HTML parent element
6265  * @elt: HTML element
6266  *
6267  * Checks whether an HTML element may be a direct child of a parent element.
6268  * Note - doesn't check for deprecated elements
6269  *
6270  * Returns 1 if allowed; 0 otherwise.
6271  */
6272 int
6273 htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
6274   const char** p ;
6275
6276   if ( ! elt || ! parent || ! parent->subelts )
6277         return 0 ;
6278
6279   for ( p = parent->subelts; *p; ++p )
6280     if ( !xmlStrcmp((const xmlChar *)*p, elt) )
6281       return 1 ;
6282
6283   return 0 ;
6284 }
6285 /**
6286  * htmlElementStatusHere:
6287  * @parent: HTML parent element
6288  * @elt: HTML element
6289  *
6290  * Checks whether an HTML element may be a direct child of a parent element.
6291  * and if so whether it is valid or deprecated.
6292  *
6293  * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6294  */
6295 htmlStatus
6296 htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
6297   if ( ! parent || ! elt )
6298     return HTML_INVALID ;
6299   if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
6300     return HTML_INVALID ;
6301
6302   return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
6303 }
6304 /**
6305  * htmlAttrAllowed:
6306  * @elt: HTML element
6307  * @attr: HTML attribute
6308  * @legacy: whether to allow deprecated attributes
6309  *
6310  * Checks whether an attribute is valid for an element
6311  * Has full knowledge of Required and Deprecated attributes
6312  *
6313  * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6314  */
6315 htmlStatus
6316 htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
6317   const char** p ;
6318
6319   if ( !elt || ! attr )
6320         return HTML_INVALID ;
6321
6322   if ( elt->attrs_req )
6323     for ( p = elt->attrs_req; *p; ++p)
6324       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6325         return HTML_REQUIRED ;
6326
6327   if ( elt->attrs_opt )
6328     for ( p = elt->attrs_opt; *p; ++p)
6329       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6330         return HTML_VALID ;
6331
6332   if ( legacy && elt->attrs_depr )
6333     for ( p = elt->attrs_depr; *p; ++p)
6334       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6335         return HTML_DEPRECATED ;
6336
6337   return HTML_INVALID ;
6338 }
6339 /**
6340  * htmlNodeStatus:
6341  * @node: an htmlNodePtr in a tree
6342  * @legacy: whether to allow deprecated elements (YES is faster here
6343  *      for Element nodes)
6344  *
6345  * Checks whether the tree node is valid.  Experimental (the author
6346  *     only uses the HTML enhancements in a SAX parser)
6347  *
6348  * Return: for Element nodes, a return from htmlElementAllowedHere (if
6349  *      legacy allowed) or htmlElementStatusHere (otherwise).
6350  *      for Attribute nodes, a return from htmlAttrAllowed
6351  *      for other nodes, HTML_NA (no checks performed)
6352  */
6353 htmlStatus
6354 htmlNodeStatus(const htmlNodePtr node, int legacy) {
6355   if ( ! node )
6356     return HTML_INVALID ;
6357
6358   switch ( node->type ) {
6359     case XML_ELEMENT_NODE:
6360       return legacy
6361         ? ( htmlElementAllowedHere (
6362                 htmlTagLookup(node->parent->name) , node->name
6363                 ) ? HTML_VALID : HTML_INVALID )
6364         : htmlElementStatusHere(
6365                 htmlTagLookup(node->parent->name) ,
6366                 htmlTagLookup(node->name) )
6367         ;
6368     case XML_ATTRIBUTE_NODE:
6369       return htmlAttrAllowed(
6370         htmlTagLookup(node->parent->name) , node->name, legacy) ;
6371     default: return HTML_NA ;
6372   }
6373 }
6374 /************************************************************************
6375  *                                                                      *
6376  *      New set (2.6.0) of simpler and more flexible APIs               *
6377  *                                                                      *
6378  ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope (a variable named dict must be visible where the macro
 * is used; a NULL dict means the string is always freed).
 *
 * The expansion is a single do/while(0) statement, so the macro must be
 * followed by a semicolon and can safely be used as the body of an
 * unbraced if/else.
 */
#define DICT_FREE(str)                                              \
        do {                                                        \
            if ((str) && ((!dict) ||                                \
                (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))  \
                xmlFree((char *)(str));                             \
        } while (0)
6390
6391 /**
6392  * htmlCtxtReset:
6393  * @ctxt: an HTML parser context
6394  *
6395  * Reset a parser context
6396  */
6397 void
6398 htmlCtxtReset(htmlParserCtxtPtr ctxt)
6399 {
6400     xmlParserInputPtr input;
6401     xmlDictPtr dict;
6402
6403     if (ctxt == NULL)
6404         return;
6405
6406     xmlInitParser();
6407     dict = ctxt->dict;
6408
6409     while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
6410         xmlFreeInputStream(input);
6411     }
6412     ctxt->inputNr = 0;
6413     ctxt->input = NULL;
6414
6415     ctxt->spaceNr = 0;
6416     if (ctxt->spaceTab != NULL) {
6417         ctxt->spaceTab[0] = -1;
6418         ctxt->space = &ctxt->spaceTab[0];
6419     } else {
6420         ctxt->space = NULL;
6421     }
6422
6423
6424     ctxt->nodeNr = 0;
6425     ctxt->node = NULL;
6426
6427     ctxt->nameNr = 0;
6428     ctxt->name = NULL;
6429
6430     DICT_FREE(ctxt->version);
6431     ctxt->version = NULL;
6432     DICT_FREE(ctxt->encoding);
6433     ctxt->encoding = NULL;
6434     DICT_FREE(ctxt->directory);
6435     ctxt->directory = NULL;
6436     DICT_FREE(ctxt->extSubURI);
6437     ctxt->extSubURI = NULL;
6438     DICT_FREE(ctxt->extSubSystem);
6439     ctxt->extSubSystem = NULL;
6440     if (ctxt->myDoc != NULL)
6441         xmlFreeDoc(ctxt->myDoc);
6442     ctxt->myDoc = NULL;
6443
6444     ctxt->standalone = -1;
6445     ctxt->hasExternalSubset = 0;
6446     ctxt->hasPErefs = 0;
6447     ctxt->html = 1;
6448     ctxt->external = 0;
6449     ctxt->instate = XML_PARSER_START;
6450     ctxt->token = 0;
6451
6452     ctxt->wellFormed = 1;
6453     ctxt->nsWellFormed = 1;
6454     ctxt->valid = 1;
6455     ctxt->vctxt.userData = ctxt;
6456     ctxt->vctxt.error = xmlParserValidityError;
6457     ctxt->vctxt.warning = xmlParserValidityWarning;
6458     ctxt->record_info = 0;
6459     ctxt->nbChars = 0;
6460     ctxt->checkIndex = 0;
6461     ctxt->inSubset = 0;
6462     ctxt->errNo = XML_ERR_OK;
6463     ctxt->depth = 0;
6464     ctxt->charset = XML_CHAR_ENCODING_NONE;
6465     ctxt->catalogs = NULL;
6466     xmlInitNodeInfoSeq(&ctxt->node_seq);
6467
6468     if (ctxt->attsDefault != NULL) {
6469         xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
6470         ctxt->attsDefault = NULL;
6471     }
6472     if (ctxt->attsSpecial != NULL) {
6473         xmlHashFree(ctxt->attsSpecial, NULL);
6474         ctxt->attsSpecial = NULL;
6475     }
6476 }
6477
6478 /**
6479  * htmlCtxtUseOptions:
6480  * @ctxt: an HTML parser context
6481  * @options:  a combination of htmlParserOption(s)
6482  *
6483  * Applies the options to the parser context
6484  *
6485  * Returns 0 in case of success, the set of unknown or unimplemented options
6486  *         in case of error.
6487  */
6488 int
6489 htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6490 {
6491     if (ctxt == NULL)
6492         return(-1);
6493
6494     if (options & HTML_PARSE_NOWARNING) {
6495         ctxt->sax->warning = NULL;
6496         ctxt->vctxt.warning = NULL;
6497         options -= XML_PARSE_NOWARNING;
6498         ctxt->options |= XML_PARSE_NOWARNING;
6499     }
6500     if (options & HTML_PARSE_NOERROR) {
6501         ctxt->sax->error = NULL;
6502         ctxt->vctxt.error = NULL;
6503         ctxt->sax->fatalError = NULL;
6504         options -= XML_PARSE_NOERROR;
6505         ctxt->options |= XML_PARSE_NOERROR;
6506     }
6507     if (options & HTML_PARSE_PEDANTIC) {
6508         ctxt->pedantic = 1;
6509         options -= XML_PARSE_PEDANTIC;
6510         ctxt->options |= XML_PARSE_PEDANTIC;
6511     } else
6512         ctxt->pedantic = 0;
6513     if (options & XML_PARSE_NOBLANKS) {
6514         ctxt->keepBlanks = 0;
6515         ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6516         options -= XML_PARSE_NOBLANKS;
6517         ctxt->options |= XML_PARSE_NOBLANKS;
6518     } else
6519         ctxt->keepBlanks = 1;
6520     if (options & HTML_PARSE_RECOVER) {
6521         ctxt->recovery = 1;
6522         options -= HTML_PARSE_RECOVER;
6523     } else
6524         ctxt->recovery = 0;
6525     if (options & HTML_PARSE_COMPACT) {
6526         ctxt->options |= HTML_PARSE_COMPACT;
6527         options -= HTML_PARSE_COMPACT;
6528     }
6529     if (options & XML_PARSE_HUGE) {
6530         ctxt->options |= XML_PARSE_HUGE;
6531         options -= XML_PARSE_HUGE;
6532     }
6533     ctxt->dictNames = 0;
6534     return (options);
6535 }
6536
6537 /**
6538  * htmlDoRead:
6539  * @ctxt:  an HTML parser context
6540  * @URL:  the base URL to use for the document
6541  * @encoding:  the document encoding, or NULL
6542  * @options:  a combination of htmlParserOption(s)
6543  * @reuse:  keep the context for reuse
6544  *
6545  * Common front-end for the htmlRead functions
6546  *
6547  * Returns the resulting document tree or NULL
6548  */
6549 static htmlDocPtr
6550 htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
6551           int options, int reuse)
6552 {
6553     htmlDocPtr ret;
6554
6555     htmlCtxtUseOptions(ctxt, options);
6556     ctxt->html = 1;
6557     if (encoding != NULL) {
6558         xmlCharEncodingHandlerPtr hdlr;
6559
6560         hdlr = xmlFindCharEncodingHandler(encoding);
6561         if (hdlr != NULL) {
6562             xmlSwitchToEncoding(ctxt, hdlr);
6563             if (ctxt->input->encoding != NULL)
6564               xmlFree((xmlChar *) ctxt->input->encoding);
6565             ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
6566         }
6567     }
6568     if ((URL != NULL) && (ctxt->input != NULL) &&
6569         (ctxt->input->filename == NULL))
6570         ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
6571     htmlParseDocument(ctxt);
6572     ret = ctxt->myDoc;
6573     ctxt->myDoc = NULL;
6574     if (!reuse) {
6575         if ((ctxt->dictNames) &&
6576             (ret != NULL) &&
6577             (ret->dict == ctxt->dict))
6578             ctxt->dict = NULL;
6579         xmlFreeParserCtxt(ctxt);
6580     }
6581     return (ret);
6582 }
6583
6584 /**
6585  * htmlReadDoc:
6586  * @cur:  a pointer to a zero terminated string
6587  * @URL:  the base URL to use for the document
6588  * @encoding:  the document encoding, or NULL
6589  * @options:  a combination of htmlParserOption(s)
6590  *
6591  * parse an XML in-memory document and build a tree.
6592  *
6593  * Returns the resulting document tree
6594  */
6595 htmlDocPtr
6596 htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
6597 {
6598     htmlParserCtxtPtr ctxt;
6599
6600     if (cur == NULL)
6601         return (NULL);
6602
6603     xmlInitParser();
6604     ctxt = htmlCreateDocParserCtxt(cur, NULL);
6605     if (ctxt == NULL)
6606         return (NULL);
6607     return (htmlDoRead(ctxt, URL, encoding, options, 0));
6608 }
6609
6610 /**
6611  * htmlReadFile:
6612  * @filename:  a file or URL
6613  * @encoding:  the document encoding, or NULL
6614  * @options:  a combination of htmlParserOption(s)
6615  *
6616  * parse an XML file from the filesystem or the network.
6617  *
6618  * Returns the resulting document tree
6619  */
6620 htmlDocPtr
6621 htmlReadFile(const char *filename, const char *encoding, int options)
6622 {
6623     htmlParserCtxtPtr ctxt;
6624
6625     xmlInitParser();
6626     ctxt = htmlCreateFileParserCtxt(filename, encoding);
6627     if (ctxt == NULL)
6628         return (NULL);
6629     return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6630 }
6631
6632 /**
6633  * htmlReadMemory:
6634  * @buffer:  a pointer to a char array
6635  * @size:  the size of the array
6636  * @URL:  the base URL to use for the document
6637  * @encoding:  the document encoding, or NULL
6638  * @options:  a combination of htmlParserOption(s)
6639  *
6640  * parse an XML in-memory document and build a tree.
6641  *
6642  * Returns the resulting document tree
6643  */
6644 htmlDocPtr
6645 htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6646 {
6647     htmlParserCtxtPtr ctxt;
6648
6649     xmlInitParser();
6650     ctxt = xmlCreateMemoryParserCtxt(buffer, size);
6651     if (ctxt == NULL)
6652         return (NULL);
6653     htmlDefaultSAXHandlerInit();
6654     if (ctxt->sax != NULL)
6655         memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
6656     return (htmlDoRead(ctxt, URL, encoding, options, 0));
6657 }
6658
6659 /**
6660  * htmlReadFd:
6661  * @fd:  an open file descriptor
6662  * @URL:  the base URL to use for the document
6663  * @encoding:  the document encoding, or NULL
6664  * @options:  a combination of htmlParserOption(s)
6665  *
6666  * parse an XML from a file descriptor and build a tree.
6667  *
6668  * Returns the resulting document tree
6669  */
6670 htmlDocPtr
6671 htmlReadFd(int fd, const char *URL, const char *encoding, int options)
6672 {
6673     htmlParserCtxtPtr ctxt;
6674     xmlParserInputBufferPtr input;
6675     xmlParserInputPtr stream;
6676
6677     if (fd < 0)
6678         return (NULL);
6679
6680     xmlInitParser();
6681     input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6682     if (input == NULL)
6683         return (NULL);
6684     ctxt = xmlNewParserCtxt();
6685     if (ctxt == NULL) {
6686         xmlFreeParserInputBuffer(input);
6687         return (NULL);
6688     }
6689     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6690     if (stream == NULL) {
6691         xmlFreeParserInputBuffer(input);
6692         xmlFreeParserCtxt(ctxt);
6693         return (NULL);
6694     }
6695     inputPush(ctxt, stream);
6696     return (htmlDoRead(ctxt, URL, encoding, options, 0));
6697 }
6698
6699 /**
6700  * htmlReadIO:
6701  * @ioread:  an I/O read function
6702  * @ioclose:  an I/O close function
6703  * @ioctx:  an I/O handler
6704  * @URL:  the base URL to use for the document
6705  * @encoding:  the document encoding, or NULL
6706  * @options:  a combination of htmlParserOption(s)
6707  *
6708  * parse an HTML document from I/O functions and source and build a tree.
6709  *
6710  * Returns the resulting document tree
6711  */
6712 htmlDocPtr
6713 htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6714           void *ioctx, const char *URL, const char *encoding, int options)
6715 {
6716     htmlParserCtxtPtr ctxt;
6717     xmlParserInputBufferPtr input;
6718     xmlParserInputPtr stream;
6719
6720     if (ioread == NULL)
6721         return (NULL);
6722     xmlInitParser();
6723
6724     input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6725                                          XML_CHAR_ENCODING_NONE);
6726     if (input == NULL)
6727         return (NULL);
6728     ctxt = htmlNewParserCtxt();
6729     if (ctxt == NULL) {
6730         xmlFreeParserInputBuffer(input);
6731         return (NULL);
6732     }
6733     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6734     if (stream == NULL) {
6735         xmlFreeParserInputBuffer(input);
6736         xmlFreeParserCtxt(ctxt);
6737         return (NULL);
6738     }
6739     inputPush(ctxt, stream);
6740     return (htmlDoRead(ctxt, URL, encoding, options, 0));
6741 }
6742
6743 /**
6744  * htmlCtxtReadDoc:
6745  * @ctxt:  an HTML parser context
6746  * @cur:  a pointer to a zero terminated string
6747  * @URL:  the base URL to use for the document
6748  * @encoding:  the document encoding, or NULL
6749  * @options:  a combination of htmlParserOption(s)
6750  *
6751  * parse an XML in-memory document and build a tree.
6752  * This reuses the existing @ctxt parser context
6753  *
6754  * Returns the resulting document tree
6755  */
6756 htmlDocPtr
6757 htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
6758                const char *URL, const char *encoding, int options)
6759 {
6760     xmlParserInputPtr stream;
6761
6762     if (cur == NULL)
6763         return (NULL);
6764     if (ctxt == NULL)
6765         return (NULL);
6766
6767     htmlCtxtReset(ctxt);
6768
6769     stream = xmlNewStringInputStream(ctxt, cur);
6770     if (stream == NULL) {
6771         return (NULL);
6772     }
6773     inputPush(ctxt, stream);
6774     return (htmlDoRead(ctxt, URL, encoding, options, 1));
6775 }
6776
6777 /**
6778  * htmlCtxtReadFile:
6779  * @ctxt:  an HTML parser context
6780  * @filename:  a file or URL
6781  * @encoding:  the document encoding, or NULL
6782  * @options:  a combination of htmlParserOption(s)
6783  *
6784  * parse an XML file from the filesystem or the network.
6785  * This reuses the existing @ctxt parser context
6786  *
6787  * Returns the resulting document tree
6788  */
6789 htmlDocPtr
6790 htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6791                 const char *encoding, int options)
6792 {
6793     xmlParserInputPtr stream;
6794
6795     if (filename == NULL)
6796         return (NULL);
6797     if (ctxt == NULL)
6798         return (NULL);
6799
6800     htmlCtxtReset(ctxt);
6801
6802     stream = xmlLoadExternalEntity(filename, NULL, ctxt);
6803     if (stream == NULL) {
6804         return (NULL);
6805     }
6806     inputPush(ctxt, stream);
6807     return (htmlDoRead(ctxt, NULL, encoding, options, 1));
6808 }
6809
6810 /**
6811  * htmlCtxtReadMemory:
6812  * @ctxt:  an HTML parser context
6813  * @buffer:  a pointer to a char array
6814  * @size:  the size of the array
6815  * @URL:  the base URL to use for the document
6816  * @encoding:  the document encoding, or NULL
6817  * @options:  a combination of htmlParserOption(s)
6818  *
6819  * parse an XML in-memory document and build a tree.
6820  * This reuses the existing @ctxt parser context
6821  *
6822  * Returns the resulting document tree
6823  */
6824 htmlDocPtr
6825 htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
6826                   const char *URL, const char *encoding, int options)
6827 {
6828     xmlParserInputBufferPtr input;
6829     xmlParserInputPtr stream;
6830
6831     if (ctxt == NULL)
6832         return (NULL);
6833     if (buffer == NULL)
6834         return (NULL);
6835
6836     htmlCtxtReset(ctxt);
6837
6838     input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
6839     if (input == NULL) {
6840         return(NULL);
6841     }
6842
6843     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6844     if (stream == NULL) {
6845         xmlFreeParserInputBuffer(input);
6846         return(NULL);
6847     }
6848
6849     inputPush(ctxt, stream);
6850     return (htmlDoRead(ctxt, URL, encoding, options, 1));
6851 }
6852
6853 /**
6854  * htmlCtxtReadFd:
6855  * @ctxt:  an HTML parser context
6856  * @fd:  an open file descriptor
6857  * @URL:  the base URL to use for the document
6858  * @encoding:  the document encoding, or NULL
6859  * @options:  a combination of htmlParserOption(s)
6860  *
6861  * parse an XML from a file descriptor and build a tree.
6862  * This reuses the existing @ctxt parser context
6863  *
6864  * Returns the resulting document tree
6865  */
6866 htmlDocPtr
6867 htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
6868               const char *URL, const char *encoding, int options)
6869 {
6870     xmlParserInputBufferPtr input;
6871     xmlParserInputPtr stream;
6872
6873     if (fd < 0)
6874         return (NULL);
6875     if (ctxt == NULL)
6876         return (NULL);
6877
6878     htmlCtxtReset(ctxt);
6879
6880
6881     input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6882     if (input == NULL)
6883         return (NULL);
6884     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6885     if (stream == NULL) {
6886         xmlFreeParserInputBuffer(input);
6887         return (NULL);
6888     }
6889     inputPush(ctxt, stream);
6890     return (htmlDoRead(ctxt, URL, encoding, options, 1));
6891 }
6892
6893 /**
6894  * htmlCtxtReadIO:
6895  * @ctxt:  an HTML parser context
6896  * @ioread:  an I/O read function
6897  * @ioclose:  an I/O close function
6898  * @ioctx:  an I/O handler
6899  * @URL:  the base URL to use for the document
6900  * @encoding:  the document encoding, or NULL
6901  * @options:  a combination of htmlParserOption(s)
6902  *
6903  * parse an HTML document from I/O functions and source and build a tree.
6904  * This reuses the existing @ctxt parser context
6905  *
6906  * Returns the resulting document tree
6907  */
6908 htmlDocPtr
6909 htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
6910               xmlInputCloseCallback ioclose, void *ioctx,
6911               const char *URL,
6912               const char *encoding, int options)
6913 {
6914     xmlParserInputBufferPtr input;
6915     xmlParserInputPtr stream;
6916
6917     if (ioread == NULL)
6918         return (NULL);
6919     if (ctxt == NULL)
6920         return (NULL);
6921
6922     htmlCtxtReset(ctxt);
6923
6924     input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6925                                          XML_CHAR_ENCODING_NONE);
6926     if (input == NULL)
6927         return (NULL);
6928     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6929     if (stream == NULL) {
6930         xmlFreeParserInputBuffer(input);
6931         return (NULL);
6932     }
6933     inputPush(ctxt, stream);
6934     return (htmlDoRead(ctxt, URL, encoding, options, 1));
6935 }
6936
6937 #define bottom_HTMLparser
6938 #include "elfgcchack.h"
6939 #endif /* LIBXML_HTML_ENABLED */