khtml/html/htmltokenizer.cpp

   1 /*
   2     This file is part of the KDE libraries
   3
   4     Copyright (C) 1997 Martin Jones (mjones@kde.org)
   5               (C) 1997 Torben Weis (weis@kde.org)
   6               (C) 1998 Waldo Bastian (bastian@kde.org)
   7               (C) 1999 Lars Knoll (knoll@kde.org)
   8               (C) 1999 Antti Koivisto (koivisto@kde.org)
   9               (C) 2001-2003 Dirk Mueller (mueller@kde.org)
  10               (C) 2004-2008 Apple Computer, Inc.
  11               (C) 2006-2008 Germain Garand (germain@ebooksfrance.org)
  12
  13     This library is free software; you can redistribute it and/or
  14     modify it under the terms of the GNU Library General Public
  15     License as published by the Free Software Foundation; either
  16     version 2 of the License, or (at your option) any later version.
  17
  18     This library is distributed in the hope that it will be useful,
  19     but WITHOUT ANY WARRANTY; without even the implied warranty of
  20     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  21     Library General Public License for more details.
  22
  23     You should have received a copy of the GNU Library General Public License
  24     along with this library; see the file COPYING.LIB.  If not, write to
  25     the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  26     Boston, MA 02110-1301, USA.
  27 */
  28 //----------------------------------------------------------------------------
  29 //
  30 // KDE HTML Widget - Tokenizers
  31
  32 // #define TOKEN_DEBUG 1
  33 //#define TOKEN_DEBUG 2
  34
  35 #include "htmltokenizer.h"
  36 #include "html_documentimpl.h"
  37 #include "htmlparser.h"
  38 #include "dtd.h"
  39
  40 #include <misc/loader.h>
  41 #include <misc/htmlhashes.h>
  42
  43 #include <khtmlview.h>
  44 #include <khtml_part.h>
  45 #include <xml/dom_docimpl.h>
  46 #include <css/csshelper.h>
  47 #include <ecma/kjs_proxy.h>
  48 #include <kcharsets.h>
  49 #include <kglobal.h>
  50 #include <ctype.h>
  51 #include <assert.h>
  52 #include <QtCore/QVariant>
  53 #include <kdebug.h>
  54 #include <stdlib.h>
  55
  56 #include <config.h>
  57
  58 #include "kentities.c"
  59 #include "htmlprospectivetokenizer.h"
  60
  61 #define PROSPECTIVE_TOKENIZER_ENABLED 1
  62
  63 using namespace khtml;
  64
  65 static const QChar commentStart [] = { '<','!','-','-', QChar::Null };
  66 static const char doctypeStart [] = "<!doctype";
  67 static const char publicStart [] = "public";
  68 static const char systemStart [] = "system";
  69
  70 static const char scriptEnd [] = "</script";
  71 static const char xmpEnd [] = "</xmp";
  72 static const char styleEnd [] =  "</style";
  73 static const char textareaEnd [] = "</textarea";
  74 static const char titleEnd [] = "</title";
  75
  76 #ifndef NDEBUG
  77 static const int sTokenizerChunkSize = 2048;
  78 static const int sTokenizerFastYeldDelay = 220;
  79 static const int sTokenizerYeldDelay = 650;
  80 #else
  81 static const int sTokenizerChunkSize = 4096;
  82 static const int sTokenizerFastYeldDelay = 180;
  83 static const int sTokenizerYeldDelay = 450;
  84 #endif
  85
  86 #define KHTML_ALLOC_QCHAR_VEC( N ) (QChar*) malloc( sizeof(QChar)*( N ) )
  87 #define KHTML_REALLOC_QCHAR_VEC(P, N ) (QChar*) realloc(P, sizeof(QChar)*( N ))
  88 #define KHTML_DELETE_QCHAR_VEC( P ) free((char*)( P ))
  89
  90 // Full support for MS Windows extensions to Latin-1.
  91 // Technically these extensions should only be activated for pages
  92 // marked "windows-1252" or "cp1252", but
  93 // in the standard Microsoft way, these extensions infect hundreds of thousands
  94 // of web pages.  Note that people with non-latin-1 Microsoft extensions
  95 // are SOL.
  96 //
  97 // See: http://www.microsoft.com/globaldev/reference/WinCP.asp
  98 //      http://www.bbsinc.com/iso8859.html
  99 //      http://www.obviously.com/
 100 //
 101 // There may be better equivalents
 102 #if 0
 103 #define fixUpChar(x)
 104 #else
 105 #define fixUpChar(x) \
 106             switch ((x).unicode()) \
 107             { \
 108             case 0x80: (x) = 0x20ac; break; \
 109             case 0x82: (x) = 0x201a;    break; \
 110             case 0x83: (x) = 0x0192; break; \
 111             case 0x84: (x) = 0x201e;    break; \
 112             case 0x85: (x) = 0x2026; break; \
 113             case 0x86: (x) = 0x2020; break; \
 114             case 0x87: (x) = 0x2021; break; \
 115             case 0x88: (x) = 0x02C6; break; \
 116             case 0x89: (x) = 0x2030; break; \
 117             case 0x8A: (x) = 0x0160; break; \
 118             case 0x8b: (x) = 0x2039;    break; \
 119             case 0x8C: (x) = 0x0152; break; \
 120             case 0x8E: (x) = 0x017D; break; \
 121             case 0x91: (x) = 0x2018;   break; \
 122             case 0x92: (x) = 0x2019;   break; \
 123             case 0x93: (x) = 0x201C;    break; \
 124             case 0x94: (x) = 0X201D;    break; \
 125             case 0x95: (x) = 0x2022;    break; \
 126             case 0x96: (x) = 0x2013;    break; \
 127             case 0x97: (x) = 0x2014;    break; \
 128             case 0x98: (x) = 0x02DC;    break; \
 129             case 0x99: (x) = 0x2122; break; \
 130             case 0x9A: (x) = 0x0161; break; \
 131             case 0x9b: (x) = 0x203A;    break; \
 132             case 0x9C: (x) = 0x0153; break; \
 133             case 0x9E: (x) = 0x017E; break; \
 134             case 0x9F: (x) = 0x0178; break; \
 135             default: break; \
 136             }
 137 #endif
 138 // ----------------------------------------------------------------------------
 139
 140 HTMLTokenizer::HTMLTokenizer(DOM::DocumentImpl *_doc, KHTMLView *_view)
 141 {
 142     view = _view;
 143     buffer = 0;
 144     scriptCode = 0;
 145     scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
 146     charsets = KGlobal::charsets();
 147     parser = new KHTMLParser(_view, _doc);
 148     m_executingScript = 0;
 149     m_autoCloseTimer = 0;
 150     m_tokenizerYeldDelay = sTokenizerFastYeldDelay;
 151     m_yeldTimer = 0;
 152     m_prospectiveTokenizer = 0;
 153     onHold = false;
 154
 155     reset();
 156 }
 157
 158 HTMLTokenizer::HTMLTokenizer(DOM::DocumentImpl *_doc, DOM::DocumentFragmentImpl *i)
 159 {
 160     view = 0;
 161     buffer = 0;
 162     scriptCode = 0;
 163     scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
 164     charsets = KGlobal::charsets();
 165     parser = new KHTMLParser( i, _doc );
 166     m_executingScript = 0;
 167     m_autoCloseTimer = 0;
 168     m_tokenizerYeldDelay = sTokenizerFastYeldDelay;
 169     m_yeldTimer = 0;
 170     m_prospectiveTokenizer = 0;
 171     onHold = false;
 172
 173     reset();
 174 }
 175
 176 void HTMLTokenizer::setNormalYeldDelay()
 177 {
 178     m_tokenizerYeldDelay = sTokenizerYeldDelay;
 179 }
 180
 181 void HTMLTokenizer::reset()
 182 {
 183     assert(m_executingScript == 0);
 184     Q_ASSERT(onHold == false);
 185     m_abort = false;
 186
 187     while (!cachedScript.isEmpty())
 188         cachedScript.dequeue()->deref(this);
 189
 190     if ( buffer )
 191         KHTML_DELETE_QCHAR_VEC(buffer);
 192     buffer = dest = 0;
 193     size = 0;
 194
 195     if ( scriptCode )
 196         KHTML_DELETE_QCHAR_VEC(scriptCode);
 197     scriptCode = 0;
 198     scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
 199
 200     if (m_autoCloseTimer > 0) {
 201         killTimer(m_autoCloseTimer);
 202         m_autoCloseTimer = 0;
 203     }
 204
 205     if (m_yeldTimer > 0) {
 206         killTimer(m_yeldTimer);
 207         m_yeldTimer = 0;
 208     }
 209     currToken.reset();
 210     doctypeToken.reset();
 211 }
 212
 213 void HTMLTokenizer::begin()
 214 {
 215     m_executingScript = 0;
 216     onHold = false;
 217     reset();
 218     size = 254;
 219     buffer = KHTML_ALLOC_QCHAR_VEC( 255 );
 220     dest = buffer;
 221     tag = NoTag;
 222     pending = NonePending;
 223     discard = NoneDiscard;
 224     pre = false;
 225     prePos = 0;
 226     plaintext = false;
 227     xmp = false;
 228     processingInstruction = false;
 229     script = false;
 230     escaped = false;
 231     style = false;
 232     skipLF = false;
 233     select = false;
 234     comment = false;
 235     doctype = false;
 236     doctypeComment = NoDoctypeComment;
 237     doctypeAllowComment = false;
 238     server = false;
 239     textarea = false;
 240     title = false;
 241     startTag = false;
 242     tquote = NoQuote;
 243     searchCount = 0;
 244     doctypeSearchCount = 0;
 245     doctypeSecondarySearchCount = 0;
 246     Entity = NoEntity;
 247     noMoreData = false;
 248     brokenComments = false;
 249     brokenServer = false;
 250     brokenScript = false;
 251     lineno = 0;
 252     scriptStartLineno = 0;
 253     tagStartLineno = 0;
 254 }
 255
 256 void HTMLTokenizer::processListing(TokenizerString list)
 257 {
 258     bool old_pre = pre;
 259
 260     // This function adds the listing 'list' as
 261     // preformatted text-tokens to the token-collection
 262     // thereby converting TABs.
 263     if(!style) pre = true;
 264     prePos = 0;
 265
 266     while ( !list.isEmpty() )
 267     {
 268         checkBuffer(3*TAB_SIZE);
 269
 270         if (skipLF && ( list->unicode() != '\n' ))
 271         {
 272             skipLF = false;
 273         }
 274
 275         if (skipLF)
 276         {
 277             skipLF = false;
 278             ++list;
 279         }
 280         else if (( list->unicode() == '\n' ) || ( list->unicode() == '\r' ))
 281         {
 282             if (discard == LFDiscard)
 283             {
 284                 // Ignore this LF
 285                 discard = NoneDiscard; // We have discarded 1 LF
 286             }
 287             else
 288             {
 289                 // Process this LF
 290                 if (pending)
 291                     addPending();
 292
 293                 // we used to do it not at all and we want to have
 294                 // it fixed for textarea. So here we are
 295                 if ( textarea ) {
 296                     prePos++;
 297                     *dest++ = *list;
 298                 } else
 299                     pending = LFPending;
 300             }
 301             /* Check for MS-DOS CRLF sequence */
 302             if (list->unicode() == '\r')
 303             {
 304                 skipLF = true;
 305             }
 306             ++list;
 307         }
 308         else if (( list->unicode() == ' ' ) || ( list->unicode() == '\t'))
 309         {
 310             if (pending)
 311                 addPending();
 312             if (*list == ' ')
 313                 pending = SpacePending;
 314             else
 315                 pending = TabPending;
 316
 317             ++list;
 318         }
 319         else
 320         {
 321             discard = NoneDiscard;
 322             if (pending)
 323                 addPending();
 324
 325             prePos++;
 326             *dest++ = *list;
 327             ++list;
 328         }
 329
 330     }
 331
 332     if ((pending == SpacePending) || (pending == TabPending))
 333         addPending();
 334     else
 335         pending = NonePending;
 336
 337     prePos = 0;
 338     pre = old_pre;
 339 }
 340
 341 void HTMLTokenizer::parseSpecial(TokenizerString &src)
 342 {
 343     assert( textarea || title || !Entity );
 344     assert( !tag );
 345     assert( xmp+textarea+title+style+script == 1 );
 346     if (script)
 347         scriptStartLineno = lineno+src.lineCount();
 348
 349     if ( comment ) parseComment( src );
 350
 351     while ( !src.isEmpty() ) {
 352         checkScriptBuffer();
 353         unsigned char ch = src->toLatin1();
 354         if ( !scriptCodeResync && !brokenComments && !textarea && !xmp && ch == '-' && scriptCodeSize >= 3 && !src.escaped() && QString::fromRawData( scriptCode+scriptCodeSize-3, 3 ) == "<!-" ) {
 355             comment = true;
 356             scriptCode[ scriptCodeSize++ ] = ch;
 357             ++src;
 358             parseComment( src );
 359             continue;
 360         }
 361         if ( scriptCodeResync && !tquote && ( ch == '>' ) ) {
 362             ++src;
 363             scriptCodeSize = scriptCodeResync-1;
 364             scriptCodeResync = 0;
 365             scriptCode[ scriptCodeSize ] = scriptCode[ scriptCodeSize + 1 ] = 0;
 366             if ( script )
 367                 scriptHandler();
 368             else {
 369                 processListing(TokenizerString(scriptCode, scriptCodeSize));
 370                 processToken();
 371                 if ( style )         { currToken.tid = ID_STYLE + ID_CLOSE_TAG; }
 372                 else if ( textarea ) { currToken.tid = ID_TEXTAREA + ID_CLOSE_TAG; }
 373                 else if ( title ) { currToken.tid = ID_TITLE + ID_CLOSE_TAG; }
 374                 else if ( xmp )  { currToken.tid = ID_XMP + ID_CLOSE_TAG; }
 375                 processToken();
 376                 script = style = textarea = title = xmp = false;
 377                 tquote = NoQuote;
 378                 scriptCodeSize = scriptCodeResync = 0;
 379             }
 380             return;
 381         }
 382         // possible end of tagname, lets check.
 383         if ( !scriptCodeResync && !escaped && !src.escaped() && ( ch == '>' || ch == '/' || ch <= ' ' ) && ch &&
 384              scriptCodeSize >= searchStopperLen &&
 385              !QString::fromRawData( scriptCode+scriptCodeSize-searchStopperLen, searchStopperLen ).indexOf( searchStopper, 0, Qt::CaseInsensitive )) {
 386             scriptCodeResync = scriptCodeSize-searchStopperLen+1;
 387             tquote = NoQuote;
 388             continue;
 389         }
 390         if ( scriptCodeResync && !escaped ) {
 391             if(ch == '\"')
 392                 tquote = (tquote == NoQuote) ? DoubleQuote : ((tquote == SingleQuote) ? SingleQuote : NoQuote);
 393             else if(ch == '\'')
 394                 tquote = (tquote == NoQuote) ? SingleQuote : (tquote == DoubleQuote) ? DoubleQuote : NoQuote;
 395             else if (tquote != NoQuote && (ch == '\r' || ch == '\n'))
 396                 tquote = NoQuote;
 397         }
 398         escaped = ( !escaped && ch == '\\' );
 399         if (!scriptCodeResync && (textarea||title) && !src.escaped() && ch == '&') {
 400             QChar *scriptCodeDest = scriptCode+scriptCodeSize;
 401             ++src;
 402             parseEntity(src,scriptCodeDest,true);
 403             scriptCodeSize = scriptCodeDest-scriptCode;
 404         }
 405         else {
 406             scriptCode[ scriptCodeSize++ ] = *src;
 407             ++src;
 408         }
 409     }
 410 }
 411
 412 void HTMLTokenizer::scriptHandler()
 413 {
 414     QString currentScriptSrc = scriptSrc;
 415     scriptSrc.clear();
 416
 417     processListing(TokenizerString(scriptCode, scriptCodeSize));
 418     QString exScript( buffer, dest-buffer );
 419
 420     processToken();
 421     currToken.tid = ID_SCRIPT + ID_CLOSE_TAG;
 422     processToken();
 423
 424     // Scripts following a frameset element should not be executed or even loaded in the case of extern scripts.
 425     bool followingFrameset = (parser->doc()->body() && parser->doc()->body()->id() == ID_FRAMESET);
 426     bool effectiveScript = !parser->skipMode() && !followingFrameset;
 427     bool deferredScript = false;
 428
 429     if ( effectiveScript ) {
 430         CachedScript* cs = 0;
 431
 432         // forget what we just got, load from src url instead
 433         if ( !currentScriptSrc.isEmpty() && javascript &&
 434              (cs = parser->doc()->docLoader()->requestScript(currentScriptSrc, scriptSrcCharset) )) {
 435             cachedScript.enqueue(cs);
 436         }
 437
 438         if (cs) {
 439             pendingQueue.push(src);
 440             int scriptCount = cachedScript.count();
 441             setSrc(TokenizerString());
 442             scriptCodeSize = scriptCodeResync = 0;
 443             cs->ref(this);
 444             if (cachedScript.count() == scriptCount)
 445                 deferredScript = true;
 446         }
 447         else if (currentScriptSrc.isEmpty() && view && javascript ) {
 448             pendingQueue.push(src);
 449             setSrc(TokenizerString());
 450             scriptCodeSize = scriptCodeResync = 0;
 451             scriptExecution( exScript, QString(), tagStartLineno /*scriptStartLineno*/ );
 452         } else {
 453             // script was filtered or disallowed
 454             effectiveScript = false;
 455         }
 456     }
 457
 458     script = false;
 459     scriptCodeSize = scriptCodeResync = 0;
 460
 461     if ( !effectiveScript )
 462         return;
 463
 464     if ( !m_executingScript && cachedScript.isEmpty() ) {
 465         src.append(pendingQueue.pop());
 466     } else if ( cachedScript.isEmpty() ) {
 467         write( pendingQueue.pop(), false );
 468     } else if ( !deferredScript && pendingQueue.count() > 1) {
 469         TokenizerString t = pendingQueue.pop();
 470         pendingQueue.top().prepend( t );
 471     }
 472 #if PROSPECTIVE_TOKENIZER_ENABLED
 473     if (!cachedScript.isEmpty() && !m_executingScript) {
 474         if (!m_prospectiveTokenizer)
 475             m_prospectiveTokenizer = new ProspectiveTokenizer(parser->docPtr());
 476         if (!m_prospectiveTokenizer->inProgress() && !pendingQueue.isEmpty()) {
 477             m_prospectiveTokenizer->begin();
 478             m_prospectiveTokenizer->write(pendingQueue.top());
 479         }
 480     }
 481 #endif
 482
 483 }
 484
 485 void HTMLTokenizer::scriptExecution( const QString& str, const QString& scriptURL,
 486                                      int baseLine)
 487 {
 488     bool oldscript = script;
 489     m_executingScript++;
 490     script = false;
 491     QString url;
 492     if (scriptURL.isNull() && view)
 493       url = static_cast<DocumentImpl*>(view->part()->document().handle())->URL().url();
 494     else
 495       url = scriptURL;
 496
 497     if (view)
 498         view->part()->executeScript(url,baseLine,Node(),str);
 499     m_executingScript--;
 500     script = oldscript;
 501 }
 502
 503 void HTMLTokenizer::parseComment(TokenizerString &src)
 504 {
 505     checkScriptBuffer(src.length());
 506     while ( src.length() ) {
 507         scriptCode[ scriptCodeSize++ ] = *src;
 508
 509 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
 510         qDebug("comment is now: *%s*", src.toString().left(16).toLatin1().constData());
 511 #endif
 512
 513         if (src->unicode() == '>')
 514         {
 515             bool handleBrokenComments =  brokenComments && !( script || style );
 516             bool scriptEnd=false;
 517             if ( scriptCodeSize > 2 && scriptCode[scriptCodeSize-3] == '-' &&
 518                   scriptCode[scriptCodeSize-2] == '-' )
 519             {
 520                 scriptEnd=true;
 521             }
 522
 523             if (handleBrokenComments || scriptEnd ){
 524                 ++src;
 525                 if ( !( title || script || xmp || textarea || style) ) {
 526                     checkScriptBuffer();
 527                     scriptCode[ scriptCodeSize ] = 0;
 528                     scriptCode[ scriptCodeSize + 1 ] = 0;
 529                     currToken.tid = ID_COMMENT;
 530                     processListing(TokenizerString(scriptCode, scriptCodeSize - 3));
 531                     processToken();
 532                     currToken.tid = ID_COMMENT + ID_CLOSE_TAG;
 533                     processToken();
 534                     scriptCodeSize = 0;
 535                 }
 536                 comment = false;
 537                 return; // Finished parsing comment
 538             }
 539         }
 540         ++src;
 541     }
 542 }
 543
 544 void HTMLTokenizer::parseDoctypeComment(TokenizerString &src)
 545 {
 546     while (!src.isEmpty()) {
 547         QChar c = *src;
 548         switch (doctypeComment) {
 549             case DoctypeCommentHalfBegin: {
 550                 if (c != '-') {
 551                     // Ooops, it's not comment
 552                     doctypeComment = DoctypeCommentBogus;
 553                     return;
 554                 } else {
 555                     // Doctype comment begins
 556                     doctypeComment = DoctypeComment;
 557                     ++src;
 558                 }
 559                 break;
 560             }
 561             case DoctypeComment: {
 562                 if (c == '-') {
 563                     // Perhaps this is end of comment
 564                     doctypeComment = DoctypeCommentHalfEnd;
 565                     ++src;
 566                 } else {
 567                     // Keep scanning for '--'
 568                     ++src;
 569                 }
 570                 break;
 571             }
 572             case DoctypeCommentHalfEnd: {
 573                 if (c == '-') {
 574                     // Doctype comment ends
 575                     doctypeComment = DoctypeCommentEnd;
 576                     return;
 577                 } else {
 578                     // It's not '--'
 579                     ++src;
 580                     doctypeComment = DoctypeComment;
 581                 }
 582                 break;
 583             }
 584             default: {
 585                 assert(!"Undefined doctype comment state");
 586                 break;
 587             }
 588         }
 589     }
 590 }
 591
 592 void HTMLTokenizer::parseDoctype(TokenizerString &src)
 593 {
 594     while (!src.isEmpty() && doctype) {
 595         QChar c;
 596         bool isWhitespace = false;
 597         int dontAdvance = 0;
 598         if (doctypeComment == DoctypeCommentEnd) {
 599             doctypeComment = NoDoctypeComment;
 600             isWhitespace = true;
 601         } else if (doctypeComment == DoctypeCommentBogus) {
 602             doctypeComment = NoDoctypeComment;
 603             c = '-';
 604             dontAdvance++;
 605         } else {
 606             c = *src;
 607             if (doctypeAllowComment) {
 608                 if (!doctypeComment && c == '-') {
 609                     doctypeComment = DoctypeCommentHalfBegin;
 610                     ++src;
 611                 }
 612                 if (doctypeComment) {
 613                     parseDoctypeComment(src);
 614                     continue;
 615                 }
 616                 isWhitespace = c == '\r' || c == '\n' || c == '\t' || c == ' ';
 617             }
 618         }
 619
 620         switch (doctypeToken.state) {
 621             case DoctypeBegin: {
 622                 doctypeToken.state = DoctypeBeforeName;
 623                 if (isWhitespace) {
 624                     // nothing
 625                 }
 626                 break;
 627             }
 628             case DoctypeBeforeName: {
 629                 if (c == '>') {
 630                     // Malformed. Just exit.
 631                     doctype = false;
 632                 } else if (isWhitespace) {
 633                     // nothing
 634                 } else {
 635                     dontAdvance++;
 636                     doctypeToken.state = DoctypeName;
 637                 }
 638                 break;
 639             }
 640             case DoctypeName: {
 641                 if (c == '>') {
 642                     // Valid doctype. Emit it.
 643                     doctype = false;
 644                     processDoctypeToken();
 645                 } else if (isWhitespace) {
 646                     doctypeSearchCount = 0; // Used now to scan for PUBLIC
 647                     doctypeSecondarySearchCount = 0; // Used now to scan for SYSTEM
 648                     doctypeToken.state = DoctypeAfterName;
 649                 } else {
 650                     doctypeToken.name.append(c);
 651                 }
 652                 break;
 653             }
 654             case DoctypeAfterName: {
 655                 if (c == '>') {
 656                     // Valid doctype. Emit it.
 657                     doctype = false;
 658                     processDoctypeToken();
 659                 } else if (c == '[') {
 660                     if(doctypeSearchCount > 0 || doctypeSecondarySearchCount > 0) { // is there any public/system indicator before?
 661                         doctypeSearchCount = doctypeSecondarySearchCount = 0;
 662                         doctypeToken.state = DoctypeBogus;
 663                     }
 664                     // Found internal subset
 665                     doctypeToken.state = DoctypeInternalSubset;
 666                     doctypeAllowComment = false;
 667                 } else if (!isWhitespace) {
 668                     if (c.toLower() == publicStart[doctypeSearchCount]) {
 669                         doctypeSearchCount++;
 670                         if(doctypeSearchCount == 6)
 671                             // Found 'PUBLIC' sequence
 672                             doctypeToken.state = DoctypeBeforePublicID;
 673                     } else if (doctypeSearchCount > 0) {
 674                         doctypeSearchCount = 0;
 675                         doctypeToken.state = DoctypeBogus;
 676                     } else if (c.toLower() == systemStart[doctypeSecondarySearchCount]) {
 677                         doctypeSecondarySearchCount++;
 678                         if(doctypeSecondarySearchCount == 6)
 679                             // Found 'SYSTEM' sequence
 680                             doctypeToken.state = DoctypeBeforeSystemID;
 681                     } else {
 682                         doctypeSecondarySearchCount = 0;
 683                         doctypeToken.state = DoctypeBogus;
 684                     }
 685                 } else {
 686                     // Whitespace keeps us in the after name state
 687                 }
 688                 break;
 689             }
 690             case DoctypeBeforePublicID: {
 691                 if (c == '\"' || c == '\'') {
 692                     tquote = c == '\"' ? DoubleQuote : SingleQuote;
 693                     doctypeToken.state = DoctypePublicID;
 694                     doctypeAllowComment = false;
 695                 } else if (c == '>') {
 696                     // Considered bogus. Don't process the doctype.
 697                     doctype = false;
 698                 } else if (isWhitespace) {
 699                     // nothing
 700                 } else
 701                     doctypeToken.state = DoctypeBogus;
 702                 break;
 703             }
 704             case DoctypePublicID: {
 705                 if ((c == '\"' && tquote == DoubleQuote) || (c == '\'' && tquote == SingleQuote)) {
 706                     doctypeToken.state = DoctypeAfterPublicID;
 707                     doctypeAllowComment = true;
 708                 } else if (c == '>') {
 709                     // Considered bogus. Don't process the doctype.
 710                     doctype = false;
 711                 } else {
 712                     doctypeToken.publicID.append(c);
 713                 }
 714                 break;
 715             }
 716             case DoctypeAfterPublicID: {
 717                 if (c == '\"' || c == '\'') {
 718                     tquote = c == '\"' ? DoubleQuote : SingleQuote;
 719                     doctypeToken.state = DoctypeSystemID;
 720                 } else if (c == '>') {
 721                     // Valid doctype. Emit it now.
 722                     doctype = false;
 723                     processDoctypeToken();
 724                 } else if (isWhitespace) {
 725                     // nothing
 726                 } else if (c == '[') {
 727                     // Found internal subset
 728                     doctypeToken.state = DoctypeInternalSubset;
 729                     doctypeAllowComment = false;
 730                 } else
 731                     doctypeToken.state = DoctypeBogus;
 732                 break;
 733             }
 734             case DoctypeBeforeSystemID: {
 735                 if (c == '\"' || c == '\'') {
 736                     tquote = c == '\"' ? DoubleQuote : SingleQuote;
 737                     doctypeToken.state = DoctypeSystemID;
 738                     doctypeAllowComment = false;
 739                 } else if (c == '>') {
 740                     // Considered bogus. Don't process the doctype.
 741                     doctype = false;
 742                 } else if (isWhitespace) {
 743                     // nothing
 744                 } else
 745                     doctypeToken.state = DoctypeBogus;
 746                 break;
 747             }
 748             case DoctypeSystemID: {
 749                 if ((c == '\"' && tquote == DoubleQuote) || (c == '\'' && tquote == SingleQuote)) {
 750                     doctypeToken.state = DoctypeAfterSystemID;
 751                     doctypeAllowComment = true;
 752                 } else if (c == '>') {
 753                     // Considered bogus. Don't process the doctype.
 754                     doctype = false;
 755                 } else {
 756                     doctypeToken.systemID.append(c);
 757                 }
 758                 break;
 759             }
 760             case DoctypeAfterSystemID: {
 761                 if (c == '>') {
 762                     // Valid doctype. Emit it now.
 763                     doctype = false;
 764                     processDoctypeToken();
 765                 } else if (isWhitespace) {
 766                     // nothing
 767                 } else if (c == '[') {
 768                     // Found internal subset
 769                     doctypeToken.state = DoctypeInternalSubset;
 770                     doctypeAllowComment = false;
 771                 } else {
 772                     doctypeToken.state = DoctypeBogus;
 773                 }
 774                 break;
 775             }
 776             case DoctypeInternalSubset: {
 777                 if(c == ']') {
 778                     // Done
 779                     doctypeToken.state = DoctypeAfterInternalSubset;
 780                     doctypeAllowComment = true;
 781                 } else {
 782                     doctypeToken.internalSubset.append(c);
 783                 }
 784                 break;
 785             }
 786             case DoctypeAfterInternalSubset: {
 787                 if (c == '>') {
 788                     // Valid doctype. Emit it now.
 789                     doctype = false;
 790                     processDoctypeToken();
 791                 } else if (isWhitespace) {
 792                     // nothing
 793                 } else
 794                     doctypeToken.state = DoctypeBogus;
 795                 break;
 796             }
 797             case DoctypeBogus: {
 798                 if (c == '>') {
 799                     // Done with the bogus doctype.
 800                     doctype = false;
 801                 } else {
 802                     // Just keep scanning for '>'
 803                 }
 804                 break;
 805             }
 806             default:
 807                 break;
 808         }
 809         if (!dontAdvance)
 810             ++src;
 811         else if (dontAdvance == 1)
 812             continue;
 813         else // double dontAdvance++, do workaround
 814             doctypeComment = DoctypeCommentBogus;
 815     }
 816 }
 817
 818 void HTMLTokenizer::parseServer(TokenizerString &src)
 819 {
 820     // Buffers a server include into scriptCode until its closing "%>" shows up.
 821     checkScriptBuffer(src.length());
 822     for ( ; !src.isEmpty(); ++src ) {
 823         scriptCode[ scriptCodeSize++ ] = *src;
 824         const bool afterPercent = scriptCodeSize > 1 && scriptCode[scriptCodeSize-2] == '%';
 825         if ( src->unicode() != '>' || !afterPercent )
 826             continue;
 827         ++src; // step over the '>' as well
 828         server = false;
 829         scriptCodeSize = 0;
 830         return; // Finished parsing server include
 831     }
 832 }
 833
 834 void HTMLTokenizer::parseProcessingInstruction(TokenizerString &src)
 835 {
 836     // Skips over a processing instruction, tracking quotes to find its end.
 837     for ( char prev = 0; !src.isEmpty(); ++src )
 838     {
 839         const unsigned char cur = src->toLatin1();
 840         switch ( cur ) {
 841         case '\'': // leave single-quoted state, or enter it otherwise
 842             tquote = tquote == SingleQuote ? NoQuote : SingleQuote;
 843             break;
 844         case '\"': // leave double-quoted state, or enter it otherwise
 845             tquote = tquote == DoubleQuote ? NoQuote : DoubleQuote;
 846             break;
 847         case '>':
 848             // The terminator is "?>"; since some sites leave out the '?',
 849             // an unquoted '>' ends the instruction as well (IE compatible).
 850             if ( !tquote || prev == '?' ) {
 851                 processingInstruction = false;
 852                 ++src;
 853                 discard = LFDiscard;
 854                 return;
 855             }
 856             break;
 857         }
 858         prev = cur;
 859     }
 860 }
 861
 862 void HTMLTokenizer::parseText(TokenizerString &src)
 863 {
 864     // Copies plain text to dest, turning "\r", "\r\n" and "\n" into one '\n'.
 865     for ( ; !src.isEmpty(); ++src )
 866     {
 867         // make sure the output buffer can take another character
 868         checkBuffer();
 869
 870         // a Latin-1 view suffices: only ASCII characters are compared below
 871         const unsigned char cur = src->toLatin1();
 872
 873         if ( skipLF )
 874         {
 875             // skipLF is raised below after a '\r' (which was emitted as '\n'
 876             // already); a '\n' right behind it is dropped without output
 877             skipLF = false;
 878             if ( cur == '\n' )
 879                 continue;
 880         }
 881
 882         switch ( cur ) {
 883         case '\r':
 884             skipLF = true;
 885             *dest++ = '\n';
 886             break;
 887         case '\n':
 888             *dest++ = '\n';
 889             break;
 890         default:
 891             *dest++ = *src;
 892             break;
 893         }
 894     }
 895 }
 896
 897
 898 void HTMLTokenizer::parseEntity(TokenizerString &src, QChar *&dest, bool start)
 899 {
         // Decodes one character reference ("&name;", "&#123;", "&#x1F;") from src.
         // The '&' itself is consumed by the caller before this is called (see the
         // QuotedValue and Value states of parseTag).
         //
         // start == true resets the entity state; with start == false the state left
         // in Entity / cBuffer / EntityChar by an earlier call is kept, presumably so
         // that a reference split across input chunks can be continued.
         //
         // A recognised reference is handed back through src.push() as the decoded
         // character; an unrecognised one is copied to dest as literal text,
         // including the '&'.
 900     if( start )
 901     {
 902         cBufferPos = 0;
 903         entityLen = 0;
 904         Entity = SearchEntity;
 905     }
 906
 907     while( !src.isEmpty() )
 908     {
 909         ushort cc = src->unicode();
 910         switch(Entity) {
 911         case NoEntity:
 912             return;
 913
 914             break;
 915         case SearchEntity:
                 // '#' introduces a numeric reference; anything else is treated as
                 // the start of an entity name (and is not consumed here).
 916             if(cc == '#') {
 917                 cBuffer[cBufferPos++] = cc;
 918                 ++src;
 919                 Entity = NumericSearch;
 920             }
 921             else
 922                 Entity = EntityName;
 923
 924             break;
 925
 926         case NumericSearch:
                 // After "&#": 'x'/'X' selects hexadecimal, a digit selects decimal,
                 // anything else ends the reference right away.
 927             if(cc == 'x' || cc == 'X') {
 928                 cBuffer[cBufferPos++] = cc;
 929                 ++src;
 930                 Entity = Hexadecimal;
 931             }
 932             else if(cc >= '0' && cc <= '9')
 933                 Entity = Decimal;
 934             else
 935                 Entity = SearchSemicolon;
 936
 937             break;
 938
 939         case Hexadecimal:
 940         {
                 // Accumulate at most 8 hex digits from the input that is currently
                 // available. A non-zero row() (high byte) means the character is
                 // outside Latin-1 and therefore cannot be a hex digit.
 941             int uc = EntityChar.unicode();
 942             int ll = qMin<uint>(src.length(), 8);
 943             while(ll--) {
 944                 QChar csrc(src->toLower());
 945                 cc = csrc.cell();
 946
 947                 if(csrc.row() || !((cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f'))) {
 948                     break;
 949                 }
 950                 uc = uc*16 + (cc - ( cc < 'a' ? '0' : 'a' - 10));
 951                 cBuffer[cBufferPos++] = cc;
 952                 ++src;
 953             }
 954             EntityChar = QChar(uc);
                 // NOTE(review): unlike the Decimal state, this moves on to
                 // SearchSemicolon even when the loop stopped only because the
                 // available input ran out - confirm that this is intended.
 955             Entity = SearchSemicolon;
 956             break;
 957         }
 958         case Decimal:
 959         {
                 // Accumulate decimal digits. cBuffer is capped at 9 characters for
                 // this reference (the leading '#' included); the state stays
                 // Decimal if src runs out before a non-digit or the cap is hit.
 960             int uc = EntityChar.unicode();
 961             int ll = qMin(src.length(), 9-cBufferPos);
 962             while(ll--) {
 963                 cc = src->cell();
 964
 965                 if(src->row() || !(cc >= '0' && cc <= '9')) {
 966                     Entity = SearchSemicolon;
 967                     break;
 968                 }
 969
 970                 uc = uc * 10 + (cc - '0');
 971                 cBuffer[cBufferPos++] = cc;
 972                 ++src;
 973             }
 974             EntityChar = QChar(uc);
 975             if(cBufferPos == 9)  Entity = SearchSemicolon;
 976             break;
 977         }
 978         case EntityName:
 979         {
                 // Collect up to 9 ASCII letters/digits of the entity name.
 980             int ll = qMin(src.length(), 9-cBufferPos);
 981             while(ll--) {
 982                 QChar csrc = *src;
 983                 cc = csrc.cell();
 984
 985                 if(csrc.row() || !((cc >= 'a' && cc <= 'z') ||
 986                                    (cc >= '0' && cc <= '9') || (cc >= 'A' && cc <= 'Z'))) {
 987                     Entity = SearchSemicolon;
 988                     break;
 989                 }
 990
 991                 cBuffer[cBufferPos++] = cc;
 992                 ++src;
 993
 994                 // be IE compatible and interpret even unterminated entities
 995                 // outside tags. like "foo &nbspstuff bla".
                     // Every prefix collected so far is looked up; the last one that
                     // names an entity with code < 256 is remembered in EntityChar,
                     // and entityLen records how many buffered characters it covers.
 996                 if ( tag == NoTag ) {
 997                     const entity* e = kde_findEntity(cBuffer, cBufferPos);
 998                     if ( e && e->code < 256 ) {
 999                         EntityChar = e->code;
1000                         entityLen = cBufferPos;
1001                     }
1002                 }
1003             }
1004             if(cBufferPos == 9) Entity = SearchSemicolon;
1005             if(Entity == SearchSemicolon) {
                     // The name is complete (terminator seen or cap reached): look up
                     // the whole buffered name. Names of a single character are not
                     // looked up here.
1006                 if(cBufferPos > 1) {
1007                     const entity *e = kde_findEntity(cBuffer, cBufferPos);
1008                     // IE only accepts unterminated entities < 256,
1009                     // Gecko accepts them all, but only outside tags
1010                     if(e && ( tag == NoTag || e->code < 256 || *src == ';' )) {
1011                         EntityChar = e->code;
1012                         entityLen = cBufferPos;
1013                     }
1014                 }
1015             }
1016             break;
1017         }
1018         case SearchSemicolon:
                 // Final state: emit the result and reset the entity machinery.
1019 #ifdef TOKEN_DEBUG
1020             kDebug( 6036 ) << "ENTITY " << EntityChar.unicode();
1021 #endif
1022             fixUpChar(EntityChar);
1023
                 // the terminating ';' is optional; swallow it when present
1024             if (*src == ';')
1025                     ++src;
1026
1027             if ( !EntityChar.isNull() ) {
1028                 checkBuffer();
                     // Characters buffered beyond the matched name (entityLen) are
                     // not part of the entity; put them back in front of src.
1029                 if (entityLen > 0 && entityLen < cBufferPos) {
1030                     int rem = cBufferPos - entityLen;
1031                     src.prepend( TokenizerString(QString::fromAscii(cBuffer+entityLen, rem)) );
1032                 }
1033                 src.push( EntityChar );
1034             } else {
1035 #ifdef TOKEN_DEBUG
1036                 kDebug( 6036 ) << "unknown entity!";
1037 #endif
1038                 checkBuffer(10);
1039                 // ignore the sequence, add it to the buffer as plaintext
1040                 *dest++ = '&';
1041                 for(unsigned int i = 0; i < cBufferPos; i++)
1042                     dest[i] = cBuffer[i];
1043                 dest += cBufferPos;
                     // in <pre> the column counter advances by the copied text; the
                     // +1 accounts for the '&'
1044                 if (pre)
1045                     prePos += cBufferPos+1;
1046             }
1047
1048             Entity = NoEntity;
1049             EntityChar = QChar::Null;
1050             return;
1051         };
1052     }
1053 }
1054
1055 void HTMLTokenizer::parseTag(TokenizerString &src)
1056 {
         // Tokenizes the inside of a markup tag as a state machine over the member
         // `tag`: TagName, SearchAttribute, AttributeName, SearchEqual, SearchValue,
         // QuotedValue / Value and finally SearchEnd, which emits the token.
         // All progress is kept in members, and every state just breaks out when src
         // runs dry, so a tag split across input chunks can presumably be continued
         // by a later call. The function returns when the tag is finished
         // (tag == NoTag), when a comment or doctype takes over, or when src is empty.
         //
         // On entry no character reference may be half-parsed (see the assert).
1057     assert(!Entity );
1058     checkScriptBuffer( src.length() );
1059
1060     while ( !src.isEmpty() )
1061     {
1062         checkBuffer();
1063 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1064         uint l = 0;
1065         while(l < src.length() && (src.toString()[l]).toLatin1().constData() != '>')
1066             l++;
1067         qDebug("src is now: *%s*, tquote: %d",
1068                src.toString().left(l).toLatin1().constData(), tquote);
1069 #endif
1070         switch(tag) {
1071         case NoTag:
1072             return;
1073         case TagName:
1074         {
1075 #if defined(TOKEN_DEBUG) &&  TOKEN_DEBUG > 1
1076             qDebug("TagName");
1077 #endif
                 // While searchCount > 0 the input is still being matched, one
                 // character per iteration, against commentStart ("<!--").
1078             if (searchCount > 0)
1079             {
1080                 if (*src == commentStart[searchCount])
1081                 {
1082                     searchCount++;
1083                     if (searchCount == 2)
1084                         doctypeSearchCount++; // A '!' is also part of doctype, so we are moving through that still as well
1085                     else
1086                         doctypeSearchCount = 0;
1087
1088                     if (searchCount == 4)
1089                     {
1090 #ifdef TOKEN_DEBUG
1091                         kDebug( 6036 ) << "Found comment";
1092 #endif
1093                         // Found '<!--' sequence
1094                         ++src;
1095                         dest = buffer; // ignore the previous part of this tag
1096                         tag = NoTag;
1097
1098                         comment = true;
1099                         parseComment(src);
1100                         return; // Finished parsing tag!
1101                     }
1102                     // cuts off the high byte, which is okay
1103                     cBuffer[cBufferPos++] = src->cell();
1104                     ++src;
1105                     break;
1106                 }
1107                 else
1108                     searchCount = 0; // Stop looking for '<!--' sequence
1109             }
1110
                 // Same idea for the doctype opener: compare case-insensitively
                 // against doctypeStart until all 9 characters have matched.
1111             if (doctypeSearchCount > 0) {
1112                 if((*src).toLower() == doctypeStart[doctypeSearchCount]) {
1113                     doctypeSearchCount++;
1114                     cBuffer[cBufferPos++] = src->cell();
1115                     ++src;
1116                     if(doctypeSearchCount == 9) {
1117                         // Found '<!DOCTYPE' sequence
1118                         tag = NoTag;
1119                         doctypeAllowComment = true;
1120                         doctypeComment = NoDoctypeComment;
1121                         doctypeToken.reset();
1122                         doctype = true;
1123
1124                         parseDoctype(src);
1125                         return;
1126                     }
1127                     break;
1128                 } else
1129                     doctypeSearchCount = 0; // Stop looking for '<!DOCTYPE' sequence
1130             }
1131
                 // Collect the tag name into cBuffer until whitespace/control
                 // (<= ' ') or '>' is met, or until cBuffer holds CBUFLEN characters.
1132             bool finish = false;
1133             unsigned int ll = qMin(src.length(), CBUFLEN-cBufferPos);
1134             while(ll--) {
1135                 ushort curchar = src->unicode();
1136                 if(curchar <= ' ' || curchar == '>' ) {
1137                     finish = true;
1138                     break;
1139                 }
1140                 // this is a nasty performance trick. will work for the A-Z
1141                 // characters, but not for others. if it contains one,
1142                 // we fail anyway
1143                 char cc = curchar;
1144                 cBuffer[cBufferPos++] = cc | 0x20;
1145                 ++src;
1146             }
1147
1148             // Disadvantage: we add the possible rest of the tag
1149             // as attribute names. ### judge if this causes problems
1150             if(finish || CBUFLEN == cBufferPos) {
1151                 bool beginTag;
1152                 char* ptr = cBuffer;
1153                 unsigned int len = cBufferPos;
                     // assumes cBuffer has room for CBUFLEN+1 chars, since
                     // cBufferPos may equal CBUFLEN here - TODO confirm
1154                 cBuffer[cBufferPos] = '\0';
1155                 if ((cBufferPos > 0) && (*ptr == '/'))
1156                 {
1157                     // End Tag
1158                     beginTag = false;
1159                     ptr++;
1160                     len--;
1161                 }
1162                 else
1163                     // Start Tag
1164                     beginTag = true;
1165                 // Accept empty xml tags like <br/>
1166                 if(len > 1 && ptr[len-1] == '/' ) {
1167                     ptr[--len] = '\0';
1168                     // if it's like <br/> and not like <input/ value=foo>, take it as flat
1169                     if (*src == '>')
1170                         currToken.flat = true;
1171                 }
1172
                     // tagID was just set to 0, so the branch below is always taken.
                     // NOTE(review): 'doc' is not used anywhere in this block.
1173                 uint tagID = 0;
1174                 if (!tagID) {
1175                     DOMString tagName(ptr);
1176                     DocumentImpl *doc = parser->docPtr();
1177                     if (Element::khtmlValidQualifiedName(tagName)) {
1178                         safeLocalName = LocalName::fromString(tagName.lower());
1179                         tagID = safeLocalName.id();
1180                     }
                         // NOTE(review): this debug line prints "Unknown tag" whether
                         // or not the lookup above produced an id.
1181 #ifdef TOKEN_DEBUG
1182                     QByteArray tmp(ptr, len+1);
1183                     kDebug( 6036 ) << "Unknown tag: \"" << tmp.data() << "\"";
1184 #endif
1185                 }
1186                 if (tagID) {
1187 #ifdef TOKEN_DEBUG
1188                     QByteArray tmp(ptr, len+1);
1189                     kDebug( 6036 ) << "found tag id=" << tagID << ": " << tmp.data();
1190 #endif
                         // closing tags are encoded as id + ID_CLOSE_TAG
1191                     currToken.tid = beginTag ? tagID : tagID + ID_CLOSE_TAG;
1192                 }
1193                 dest = buffer;
1194                 tag = SearchAttribute;
1195                 cBufferPos = 0;
1196             }
1197             break;
1198         }
1199         case SearchAttribute:
1200         {
1201 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1202                 qDebug("SearchAttribute");
1203 #endif
                 // Skip whitespace/control characters (<= ' '). The first other
                 // character decides the next state: '<' or '>' ends the tag, a
                 // quote that follows whitespace starts a value without a name
                 // (attribute id 0, empty attrName), anything else starts a name.
1204             bool atespace = false;
1205             ushort curchar;
1206             while(!src.isEmpty()) {
1207                 curchar = src->unicode();
1208                 if(curchar > ' ') {
1209                     if(curchar == '<' || curchar == '>')
1210                         tag = SearchEnd;
1211                     else if(atespace && (curchar == '\'' || curchar == '"'))
1212                     {
1213                         tag = SearchValue;
1214                         *dest++ = 0;
1215                         attrName.clear();
1216                     }
1217                     else
1218                         tag = AttributeName;
1219
1220                     cBufferPos = 0;
1221                     break;
1222                 }
1223                 atespace = true;
1224                 ++src;
1225             }
1226             break;
1227         }
1228         case AttributeName:
1229         {
1230 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1231                 qDebug("AttributeName");
1232 #endif
                 // Collect the attribute name (ASCII upper case folded to lower
                 // case) into cBuffer until whitespace/control, '=' or '>' is met.
1233             ushort curchar;
1234             int ll = qMin(src.length(), CBUFLEN-cBufferPos);
1235
1236             while(ll--) {
1237                 curchar = src->unicode();
                     // cheap pre-filter: every terminator tested below is <= '>'
1238                 if(curchar <= '>') {
1239                     if(curchar <= ' ' || curchar == '=' || curchar == '>') {
1240                         unsigned int a;
1241                         cBuffer[cBufferPos] = '\0';
1242                         a = LocalName::fromString(DOMString(cBuffer).lower()).id();
                             // ids beyond ATTR_LAST_ATTR are treated as unknown (0)
1243                         if (a > ATTR_LAST_ATTR)
1244                             a = 0;
1245
1246                         if ( !a ) {
1247                             // did we just get /> or e.g checked/>
1248                             if (curchar == '>' && cBufferPos >=1 && cBuffer[cBufferPos-1] == '/') {
1249                                 currToken.flat = true;
                                     // retry the lookup without the trailing '/', then
                                     // put the '/' back into cBuffer
1250                                 cBuffer[cBufferPos - 1] = '\0';
1251                                 if (cBufferPos>1)
1252                                     a = LocalName::fromString(DOMString(cBuffer).lower()).id();
1253                                 if (a > ATTR_LAST_ATTR)
1254                                     a = 0;
1255                                 cBuffer[cBufferPos - 1] = '/';
1256                             }
                                 // unknown attributes are carried by name in attrName
1257                             if (!a)
1258                                 attrName = QLatin1String(QByteArray(cBuffer, cBufferPos+1).data());
1259                         }
1260
                             // buffer[0] receives the attribute id (0 = unknown); the
                             // value states append the value text after it and build
                             // the value from buffer+1 onwards.
1261                         dest = buffer;
1262                         *dest++ = a;
1263 #ifdef TOKEN_DEBUG
1264                         if (!a || (cBufferPos && *cBuffer == '!'))
1265                             kDebug( 6036 ) << "Unknown attribute: *" << QByteArray(cBuffer, cBufferPos+1).data() << "*";
1266                         else
1267                             kDebug( 6036 ) << "Known attribute: " << QByteArray(cBuffer, cBufferPos+1).data();
1268 #endif
1269
1270                         tag = SearchEqual;
1271                         break;
1272                     }
1273                 }
1274                 cBuffer[cBufferPos++] =
1275                      (  curchar >= 'A' && curchar <= 'Z' ) ? curchar | 0x20 : curchar;
1276                 ++src;
1277             }
                 // cBuffer is full: give up on the id lookup and treat what was
                 // collected as an unknown attribute name.
1278             if ( cBufferPos == CBUFLEN ) {
1279                 cBuffer[cBufferPos] = '\0';
1280                 attrName = QLatin1String(QByteArray(cBuffer, cBufferPos+1).data());
1281                 dest = buffer;
1282                 *dest++ = 0;
1283                 tag = SearchEqual;
1284             }
1285             break;
1286         }
1287         case SearchEqual:
1288         {
1289 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1290                 qDebug("SearchEqual");
1291 #endif
                 // Skip whitespace after the attribute name. '=' leads to the value;
                 // a quote that follows whitespace starts a new, nameless value;
                 // anything else means the attribute has no value and is added
                 // with an empty string.
1292             ushort curchar;
1293             bool atespace = false;
1294             while(!src.isEmpty()) {
1295                 curchar = src->unicode();
1296                 if(curchar > ' ') {
1297                     if(curchar == '=') {
1298 #ifdef TOKEN_DEBUG
1299                         kDebug(6036) << "found equal";
1300 #endif
1301                         tag = SearchValue;
1302                         ++src;
1303                     }
1304                     else if(atespace && (curchar == '\'' || curchar == '"'))
1305                     {
1306                         tag = SearchValue;
1307                         *dest++ = 0;
1308                         attrName.clear();
1309                     }
1310                     else {
1311                         DOMString v("");
1312                         currToken.addAttribute(parser->docPtr(), buffer, attrName, v);
1313                         dest = buffer;
1314                         tag = SearchAttribute;
1315                     }
1316                     break;
1317                 }
1318                 atespace = true;
1319                 ++src;
1320             }
1321             break;
1322         }
1323         case SearchValue:
1324         {
                 // Skip whitespace before the value; an opening quote selects
                 // QuotedValue (and is consumed), anything else selects Value.
1325             ushort curchar;
1326             while(!src.isEmpty()) {
1327                 curchar = src->unicode();
1328                 if(curchar > ' ') {
1329                     if(( curchar == '\'' || curchar == '\"' )) {
1330                         tquote = curchar == '\"' ? DoubleQuote : SingleQuote;
1331                         tag = QuotedValue;
1332                         ++src;
1333                     } else
1334                         tag = Value;
1335
1336                     break;
1337                 }
1338                 ++src;
1339             }
1340             break;
1341         }
1342         case QuotedValue:
1343         {
1344 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1345                 qDebug("QuotedValue");
1346 #endif
                 // Copy the value into the buffer until the matching quote.
1347             ushort curchar;
1348             while(!src.isEmpty()) {
1349                 checkBuffer();
1350
1351                 curchar = src->unicode();
                     // cheap pre-filter: '&', '"' and '\'' are all <= '\''
1352                 if(curchar <= '\'' && !src.escaped()) {
1353                     // ### attributes like '&{blaa....};' are supposed to be treated as jscript.
1354                     if ( curchar == '&' )
1355                     {
1356                         ++src;
1357                         parseEntity(src, dest, true);
1358                         break;
1359                     }
1360                     else if ( (tquote == SingleQuote && curchar == '\'') ||
1361                               (tquote == DoubleQuote && curchar == '\"') )
1362                     {
1363                         // some <input type=hidden> rely on trailing spaces. argh
1364                         while(dest > buffer+1 && (*(dest-1) == '\n' || *(dest-1) == '\r'))
1365                             dest--; // remove trailing newlines
                             // the value text starts at buffer+1; buffer[0] holds the
                             // attribute id
1366                         DOMString v(buffer+1, dest-buffer-1);
1367                         currToken.addAttribute(parser->docPtr(), buffer, attrName, v);
1368
1369                         dest = buffer;
1370                         tag = SearchAttribute;
1371                         tquote = NoQuote;
1372                         ++src;
1373                         break;
1374                     }
1375                 }
1376                 *dest++ = *src;
1377                 ++src;
1378             }
1379             break;
1380         }
1381         case Value:
1382         {
1383 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1384             qDebug("Value");
1385 #endif
                 // Unquoted value: copy until whitespace/control or '>'. The
                 // terminating character is left in src for SearchAttribute.
1386             ushort curchar;
1387             while(!src.isEmpty()) {
1388                 checkBuffer();
1389                 curchar = src->unicode();
1390                 if(curchar <= '>' && !src.escaped()) {
1391                     // parse Entities
1392                     if ( curchar == '&' )
1393                     {
1394                         ++src;
1395                         parseEntity(src, dest, true);
1396                         break;
1397                     }
1398                     // no quotes. Every space means end of value
1399                     // '/' does not delimit in IE!
1400                     if ( curchar <= ' ' || curchar == '>' )
1401                     {
1402                         DOMString v(buffer+1, dest-buffer-1);
1403                         currToken.addAttribute(parser->docPtr(), buffer, attrName, v);
1404                         dest = buffer;
1405                         tag = SearchAttribute;
1406                         break;
1407                     }
1408                 }
1409
1410                 *dest++ = *src;
1411                 ++src;
1412             }
1413             break;
1414         }
1415         case SearchEnd:
1416         {
1417 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1418                 qDebug("SearchEnd");
1419 #endif
                 // Skip ahead to the closing '>' (or to a '<' that starts the next
                 // tag); a '/' met on the way marks the token as flat.
1420             while(!src.isEmpty()) {
1421                 if(*src == '<' || *src == '>')
1422                     break;
1423
1424                 if (*src == '/')
1425                     currToken.flat = true;
1426
1427                 ++src;
1428             }
                 // NOTE(review): the two *src tests are evaluated only when src is
                 // empty - confirm that reading *src on an empty TokenizerString is
                 // safe, or whether src.isEmpty() alone was meant.
1429             if(src.isEmpty() && *src != '<' && *src != '>') break;
1430
1431             searchCount = 0; // Stop looking for '<!--' sequence
1432             tag = NoTag;
1433             tquote = NoQuote;
1434             if ( *src == '>' )
1435                 ++src;
1436
1437             if ( !currToken.tid ) //stop if tag is unknown
1438                 return;
1439
1440             uint tagID = currToken.tid;
1441 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 0
1442             kDebug( 6036 ) << "appending Tag: " << tagID;
1443 #endif
1444             // If the tag requires an end tag it cannot be flat,
1445             // unless we are using the HTML parser to parse XHTML
1446             // The only exception is SCRIPT and priority 0 tokens.
1447             if (tagID < ID_CLOSE_TAG && tagID != ID_SCRIPT &&
1448                 DOM::endTagRequirement(tagID) == DOM::REQUIRED &&
1449                 parser->doc()->htmlMode() != DocumentImpl::XHtml)
1450                 currToken.flat = false;
1451
                 // only a non-flat opening tag counts as the beginning of an element
1452             bool beginTag = !currToken.flat && (tagID < ID_CLOSE_TAG);
1453
                 // From here on tagID is the plain element id: ID_CLOSE_TAG is
                 // subtracted from closing-tag ids.
1454             if(tagID >= ID_CLOSE_TAG)
1455                 tagID -= ID_CLOSE_TAG;
1456             else if ( !brokenScript && tagID == ID_SCRIPT ) {
                     // Opening <script>: pick up src/charset and decide from the
                     // type (or, failing that, language) attribute whether the
                     // content is JavaScript.
1457                 DOMStringImpl* a = 0;
1458                 bool foundTypeAttribute = false;
1459                 scriptSrc.clear(); scriptSrcCharset.clear();
1460                 if ( currToken.attrs && /* potentially have a ATTR_SRC ? */
1461                      view &&  /* are we a regular tokenizer or just for innerHTML ? */
1462                      parser->doc()->view()->part()->jScriptEnabled() /* jscript allowed at all? */
1463                     ) {
1464                     if ( ( a = currToken.attrs->getValue( ATTR_SRC ) ) )
1465                         scriptSrc = parser->doc()->completeURL(khtml::parseURL( DOMString(a) ).string() );
1466                     if ( ( a = currToken.attrs->getValue( ATTR_CHARSET ) ) )
1467                         scriptSrcCharset = DOMString(a).string().trimmed();
1468                     if ( scriptSrcCharset.isEmpty() && view)
1469                         scriptSrcCharset = parser->doc()->view()->part()->encoding();
1470                     /* Check type before language, since language is deprecated */
1471                     if ((a = currToken.attrs->getValue(ATTR_TYPE)) != 0 && !DOMString(a).string().isEmpty())
1472                         foundTypeAttribute = true;
1473                     else
1474                         a = currToken.attrs->getValue(ATTR_LANGUAGE);
1475                 }
                     // JavaScript is assumed unless an attribute says otherwise
1476                 javascript = true;
1477
1478                 if( foundTypeAttribute ) {
1479                     /*
1480                         Mozilla 1.5 doesn't accept the text/javascript1.x formats, but WinIE 6 does.
1481                         Mozilla 1.5 doesn't accept text/jscript, text/ecmascript, and text/livescript, but WinIE 6 does.
1482                         Mozilla 1.5 accepts application/x-javascript, WinIE 6 doesn't.
1483                         Mozilla 1.5 allows leading and trailing whitespace, but WinIE 6 doesn't.
1484                         Mozilla 1.5 and WinIE 6 both accept the empty string, but neither accept a whitespace-only string.
1485                         We want to accept all the values that either of these browsers accept, but not other values.
1486                      */
1487                     QString type = DOMString(a).string().trimmed().toLower();
1488                     if( type.compare("text/javascript") != 0 &&
1489                         type.compare("text/javascript1.0") != 0 &&
1490                         type.compare("text/javascript1.1") != 0 &&
1491                         type.compare("text/javascript1.2") != 0 &&
1492                         type.compare("text/javascript1.3") != 0 &&
1493                         type.compare("text/javascript1.4") != 0 &&
1494                         type.compare("text/javascript1.5") != 0 &&
1495                         type.compare("text/jscript") != 0 &&
1496                         type.compare("text/ecmascript") != 0 &&
1497                         type.compare("text/livescript") != 0 &&
1498                         type.compare("application/x-javascript") != 0 &&
1499                         type.compare("application/x-ecmascript") != 0 &&
1500                         type.compare("application/javascript") != 0 &&
1501                         type.compare("application/ecmascript") != 0 )
1502                         javascript = false;
1503                 } else if( a ) {
1504                     /*
1505                      Mozilla 1.5 doesn't accept jscript or ecmascript, but WinIE 6 does.
1506                      Mozilla 1.5 accepts javascript1.0, javascript1.4, and javascript1.5, but WinIE 6 accepts only 1.1 - 1.3.
1507                      Neither Mozilla 1.5 nor WinIE 6 accept leading or trailing whitespace.
1508                      We want to accept all the values that either of these browsers accept, but not other values.
1509                      */
1510                     QString lang = DOMString(a).string();
1511                     lang = lang.toLower();
1512                     if( lang.compare("") != 0 &&
1513                         lang.compare("javascript") != 0 &&
1514                         lang.compare("javascript1.0") != 0 &&
1515                         lang.compare("javascript1.1") != 0 &&
1516                         lang.compare("javascript1.2") != 0 &&
1517                         lang.compare("javascript1.3") != 0 &&
1518                         lang.compare("javascript1.4") != 0 &&
1519                         lang.compare("javascript1.5") != 0 &&
1520                         lang.compare("ecmascript") != 0 &&
1521                         lang.compare("livescript") != 0 &&
1522                         lang.compare("jscript") )
1523                         javascript = false;
1524                 }
1525             }
1526
1527             processToken();
1528
1529             if ( parser->selectMode() && beginTag)
1530                 discard = AllDiscard;
1531
                 // Per-element follow-up. For script, style, textarea, title and xmp
                 // an opening tag sets the matching end-tag stopper and hands the
                 // following input to parseSpecial().
1532             switch( tagID ) {
1533             case ID_LISTING:
1534             case ID_PRE:
1535                 pre = beginTag;
1536                 if (beginTag)
1537                     discard = LFDiscard;
1538                 prePos = 0;
1539                 break;
1540             case ID_BR:
1541                 prePos = 0;
1542                 break;
1543             case ID_SCRIPT:
1544                 if (beginTag) {
1545                     searchStopper = scriptEnd;
1546                     searchStopperLen = 8;
1547                     script = true;
1548                     parseSpecial(src);
1549                 }
                     // NOTE(review): ID_CLOSE_TAG was already subtracted from
                     // closing-tag ids above, so this test looks always true here -
                     // confirm that a closing </script> is meant to reach
                     // scriptHandler() as well.
1550                 else if (tagID < ID_CLOSE_TAG) // Handle <script src="foo"/>
1551                     scriptHandler();
1552                 break;
1553             case ID_STYLE:
1554                 if (beginTag) {
1555                     searchStopper = styleEnd;
1556                     searchStopperLen = 7;
1557                     style = true;
1558                     parseSpecial(src);
1559                 }
1560                 break;
1561             case ID_TEXTAREA:
1562                 if(beginTag) {
1563                     searchStopper = textareaEnd;
1564                     searchStopperLen = 10;
1565                     textarea = true;
1566                     discard = NoneDiscard;
1567                     parseSpecial(src);
1568                 }
1569                 break;
1570             case ID_TITLE:
1571                 if (beginTag) {
1572                     searchStopper = titleEnd;
1573                     searchStopperLen = 7;
1574                     title = true;
1575                     parseSpecial(src);
1576                 }
1577                 break;
1578             case ID_XMP:
1579                 if (beginTag) {
1580                     searchStopper = xmpEnd;
1581                     searchStopperLen = 5;
1582                     xmp = true;
1583                     parseSpecial(src);
1584                 }
1585                 break;
1586             case ID_SELECT:
1587                 select = beginTag;
1588                 break;
1589             case ID_PLAINTEXT:
1590                 plaintext = beginTag;
1591                 break;
1592             }
1593             return; // Finished parsing tag!
1594         }
1595         } // end switch
1596     }
1597     return;
1598 }
1599
1600 void HTMLTokenizer::addPending()
1601 {
1602     // Writes out the whitespace remembered in 'pending', then clears it.
1603     const bool plainSelect = select && !comment && !script;
1604
1605     if ( plainSelect ) {
1606         // with 'select' set (and no comment/script) it is always one blank
1607         *dest++ = ' ';
1608     } else if ( pending == LFPending ) {
1609         *dest++ = QLatin1Char('\n');
1610         prePos = 0;
1611     } else if ( pending == SpacePending ) {
1612         *dest++ = QLatin1Char(' ');
1613         ++prePos;
1614     } else if ( pending == TabPending ) {
1615         // Columns up to the next tab stop. <textarea> and script keep the
1616         // tab itself; elsewhere it is expanded to that many blanks.
1617         const int toTabStop = TAB_SIZE - ( prePos % TAB_SIZE );
1618         if ( textarea || script )
1619             *dest++ = QLatin1Char('\t');
1620         else
1621             for ( int i = 0; i < toTabStop; ++i )
1622                 *dest++ = QLatin1Char(' ');
1623         prePos += toTabStop;
1624     } else if ( pending == NonePending ) {
1625         assert(0); // nothing pending: should not be reached
1626     }
1627
1628     pending = NonePending;
1629 }
1630
1631 inline bool HTMLTokenizer::continueProcessing(int& processedCount)
1632 {
1633     // We don't want to be checking elapsed time with every character, so we only check after we've
1634     // processed a certain number of characters.
1635     if (!m_executingScript && processedCount > sTokenizerChunkSize && cachedScript.isEmpty()) {
1636         processedCount = 0;
1637         if ( m_time.elapsed() > m_tokenizerYeldDelay) {
1638             m_yeldTimer = startTimer(0);
1639             m_tokenizerYeldDelay = sTokenizerFastYeldDelay;
1640             return false;
1641         }
1642     }
1643     processedCount++;
1644     return true;
1645 }
1646
1647 void HTMLTokenizer::write( const TokenizerString &str, bool appendData )
1648 {
1649 #ifdef TOKEN_DEBUG
1650     kDebug( 6036 ) << this << " Tokenizer::write(\"" << str.toString() << "\"," << appendData << ")";
1651 #endif
1652
1653     if ( !buffer )
1654         return;
1655
1656     if ( ( m_executingScript && appendData ) || cachedScript.count() ) {
1657         // don't parse; we will do this later
1658         if (pendingQueue.isEmpty())
1659             pendingQueue.push(str);
1660         else if (appendData)
1661             pendingQueue.bottom().append(str);
1662         else
1663             pendingQueue.top().append(str);
1664 #if PROSPECTIVE_TOKENIZER_ENABLED
1665         if (m_prospectiveTokenizer && m_prospectiveTokenizer->inProgress() && appendData)
1666             m_prospectiveTokenizer->write(str);
1667 #endif
1668         return;
1669     }
1670
1671 #if PROSPECTIVE_TOKENIZER_ENABLED
1672     if (m_prospectiveTokenizer && m_prospectiveTokenizer->inProgress() && appendData)
1673         m_prospectiveTokenizer->end();
1674 #endif
1675
1676     if ( onHold ) {
1677         src.append(str);
1678         return;
1679     }
1680
1681     if (!src.isEmpty())
1682         src.append(str);
1683     else
1684         setSrc(str);
1685
1686     // Once a timer is set, it has control of when the tokenizer continues.
1687     if (m_yeldTimer > 0)
1688         return;
1689
1690     int processedCount = 0;
1691     m_time.start();
1692
1693     while ( !src.isEmpty() )
1694     {
1695         if ( m_abort || !continueProcessing(processedCount) )
1696             break;
1697         // do we need to enlarge the buffer?
1698         checkBuffer();
1699
1700         ushort cc = src->unicode();
1701
1702         if (skipLF && (cc != '\n'))
1703             skipLF = false;
1704
1705         if (skipLF) {
1706             skipLF = false;
1707             ++src;
1708         }
1709         else if ( Entity )
1710             parseEntity( src, dest );
1711         else if ( plaintext )
1712             parseText( src );
1713         else if (script)
1714             parseSpecial(src);
1715         else if (style)
1716             parseSpecial(src);
1717         else if (xmp)
1718             parseSpecial(src);
1719         else if (textarea)
1720             parseSpecial(src);
1721         else if (title)
1722             parseSpecial(src);
1723         else if (comment)
1724             parseComment(src);
1725         else if (doctypeComment && doctypeComment != DoctypeCommentEnd && doctypeComment != DoctypeCommentBogus)
1726             parseDoctypeComment(src);
1727         else if (doctype)
1728             parseDoctype(src);
1729         else if (server)
1730             parseServer(src);
1731         else if (processingInstruction)
1732             parseProcessingInstruction(src);
1733         else if (tag)
1734             parseTag(src);
1735         else if ( startTag )
1736         {
1737             startTag = false;
1738             bool endTag = false;
1739
1740             switch(cc) {
1741             case '/':
1742                 endTag = true;
1743                 break;
1744             case '!':
1745             {
1746                 // <!-- comment --> or <!DOCTYPE ...>
1747                 searchCount = 1; // Look for '<!--' sequence to start comment...
1748                 doctypeSearchCount = 1; // ... or for '<!DOCTYPE' sequence to start doctype
1749                 break;
1750             }
1751             case '?':
1752             {
1753                 // xml processing instruction
1754                 processingInstruction = true;
1755                 tquote = NoQuote;
1756                 parseProcessingInstruction(src);
1757                 continue;
1758
1759                 break;
1760             }
1761             case '%':
1762                 if (!brokenServer) {
1763                     // <% server stuff, handle as comment %>
1764                     server = true;
1765                     tquote = NoQuote;
1766                     parseServer(src);
1767                     continue;
1768                 }
1769                 // else fall through
1770             default:
1771             {
1772                 if( ((cc >= 'a') && (cc <= 'z')) || ((cc >= 'A') && (cc <= 'Z')))
1773                 {
1774                     // Start of a Start-Tag
1775                 }
1776                 else
1777                 {
1778                     // Invalid tag
1779                     // Add as is
1780                     if (pending)
1781                         addPending();
1782                     *dest = '<';
1783                     dest++;
1784                     continue;
1785                 }
1786             }
1787             }; // end case
1788
1789             // According to SGML any LF immediately after a starttag, or
1790             // immediately before an endtag should be ignored.
1791             // ### Gecko and MSIE though only ignores LF immediately after
1792             // starttags and only for PRE elements -- asj (28/06-2005)
1793             if ( pending )
1794                 if (!select)
1795                     addPending();
1796                 else
1797                     pending = NonePending;
1798
1799             // Cancel unused discards
1800             discard = NoneDiscard;
1801             // if (!endTag) discard = LFDiscard;
1802
1803             processToken();
1804
1805             cBufferPos = 0;
1806             tag = TagName;
1807             parseTag(src);
1808         }
1809         else if ( cc == '&' && !src.escaped())
1810         {
1811             ++src;
1812             if ( pending )
1813                 addPending();
1814             discard = NoneDiscard;
1815             parseEntity(src, dest, true);
1816         }
1817         else if ( cc == '<' && !src.escaped())
1818         {
1819             tagStartLineno = lineno+src.lineCount();
1820             ++src;
1821             discard = NoneDiscard;
1822             startTag = true;
1823         }
1824         else if (( cc == '\n' ) || ( cc == '\r' ))
1825         {
1826             if (discard == SpaceDiscard)
1827                 discard = NoneDiscard;
1828
1829             if (discard == LFDiscard) {
1830                 // Ignore one LF
1831                 discard = NoneDiscard;
1832             }
1833             else if (discard == AllDiscard)
1834             {
1835                 // Ignore
1836             }
1837             else
1838             {
1839                 if (select && !script) {
1840                     pending = LFPending;
1841                 } else {
1842                     if (pending)
1843                         addPending();
1844                     pending = LFPending;
1845                 }
1846             }
1847
1848             /* Check for MS-DOS CRLF sequence */
1849             if (cc == '\r')
1850             {
1851                 skipLF = true;
1852             }
1853             ++src;
1854         }
1855         else if (( cc == ' ' ) || ( cc == '\t' ))
1856         {
1857             if(discard == LFDiscard)
1858                 discard = NoneDiscard;
1859
1860             if(discard == SpaceDiscard) {
1861                 // Ignore one space
1862                 discard = NoneDiscard;
1863             }
1864             else if(discard == AllDiscard)
1865             {
1866                 // Ignore
1867             }
1868             else {
1869                 if (select && !script) {
1870                     if (!pending)
1871                         pending = SpacePending;
1872                 } else {
1873                     if (pending)
1874                         addPending();
1875                     if (cc == ' ')
1876                         pending = SpacePending;
1877                     else
1878                         pending = TabPending;
1879                 }
1880             }
1881
1882             ++src;
1883         }
1884         else
1885         {
1886             if (pending)
1887                 addPending();
1888
1889             discard = NoneDiscard;
1890             if ( pre )
1891             {
1892                 prePos++;
1893             }
1894             *dest = *src;
1895             fixUpChar( *dest );
1896             ++dest;
1897             ++src;
1898         }
1899     }
1900
1901     if (noMoreData && cachedScript.isEmpty() && !m_executingScript && m_yeldTimer<=0)
1902         end(); // this actually causes us to be deleted
1903 }
1904
1905 void HTMLTokenizer::timerEvent( QTimerEvent *e )
1906 {
1907     if ( e->timerId() == m_yeldTimer ) {
1908         killTimer(m_yeldTimer);
1909         m_yeldTimer = 0;
1910         write( TokenizerString(), true );
1911     } else if ( e->timerId() == m_autoCloseTimer && cachedScript.isEmpty() ) {
1912          finish();
1913     }
1914 }
1915
1916 void HTMLTokenizer::setAutoClose( bool b ) {
1917     killTimer( m_autoCloseTimer );
1918     m_autoCloseTimer = 0;
1919     if ( b )
1920         m_autoCloseTimer = startTimer(100);
1921 }
1922
1923 void HTMLTokenizer::end()
1924 {
1925     if ( buffer == 0 ) {
1926         emit finishedParsing();
1927         return;
1928     }
1929
1930     // parseTag is using the buffer for different matters
1931     if ( !tag )
1932         processToken();
1933
1934     if(buffer)
1935         KHTML_DELETE_QCHAR_VEC(buffer);
1936
1937     if(scriptCode)
1938         KHTML_DELETE_QCHAR_VEC(scriptCode);
1939
1940     scriptCode = 0;
1941     scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
1942     buffer = 0;
1943     emit finishedParsing();
1944 }
1945
1946 void HTMLTokenizer::finish()
1947 {
1948     if ( m_autoCloseTimer ) {
1949         killTimer( m_autoCloseTimer );
1950         m_autoCloseTimer = 0;
1951     }
1952     // do this as long as we don't find matching comment ends
1953     while((title || script || comment || server) && scriptCode && scriptCodeSize)
1954     {
1955         // we've found an unmatched comment start
1956         if (comment)
1957             brokenComments = true;
1958         else if (server)
1959             brokenServer = true;
1960         else if (script)
1961             brokenScript = true;
1962
1963         checkScriptBuffer();
1964         scriptCode[ scriptCodeSize ] = 0;
1965         scriptCode[ scriptCodeSize + 1 ] = 0;
1966         int pos;
1967         QString food;
1968         if (title || style || script)
1969             food.setUnicode(scriptCode, scriptCodeSize);
1970         else if (server) {
1971             food = "<";
1972             food += QString(scriptCode, scriptCodeSize);
1973         }
1974         else {
1975             pos = QString::fromRawData(scriptCode, scriptCodeSize).indexOf('>');
1976             food.setUnicode(scriptCode+pos+1, scriptCodeSize-pos-1); // deep copy
1977         }
1978         KHTML_DELETE_QCHAR_VEC(scriptCode);
1979         scriptCode = 0;
1980         scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
1981         if (script)
1982             scriptHandler();
1983
1984         comment = title = server = script = false;
1985         if ( !food.isEmpty() )
1986             write(food, true);
1987     }
1988     // this indicates we will not receive any more data... but if we are waiting on
1989     // an external script to load, we can't finish parsing until that is done
1990     noMoreData = true;
1991     if (cachedScript.isEmpty() && !m_executingScript && !onHold && m_yeldTimer <= 0)
1992         end(); // this actually causes us to be deleted
1993 }
1994
1995 void HTMLTokenizer::processToken()
1996 {
1997     KJSProxy *jsProxy = view ? view->part()->jScript() : 0L;
1998     if (jsProxy)
1999         jsProxy->setEventHandlerLineno(tagStartLineno);
2000     if ( dest > buffer )
2001     {
2002 #if 0
2003         if(currToken.tid) {
2004             qDebug( "unexpected token id: %d, str: *%s*", currToken.tid,QString::fromRawData( buffer, dest-buffer ).toLatin1().constData() );
2005             assert(0);
2006         }
2007
2008 #endif
2009         currToken.text = new DOMStringImpl( buffer, dest - buffer );
2010         currToken.text->ref();
2011         if (currToken.tid != ID_COMMENT)
2012             currToken.tid = ID_TEXT;
2013     }
2014     else if(!currToken.tid) {
2015         currToken.reset();
2016         if (jsProxy)
2017             jsProxy->setEventHandlerLineno(lineno+src.lineCount());
2018         return;
2019     }
2020
2021     dest = buffer;
2022
2023 #ifdef TOKEN_DEBUG
2024     QString name = QString( getTagName(currToken.tid) );
2025     QString text;
2026     if(currToken.text)
2027         text = QString::fromRawData(currToken.text->s, currToken.text->l);
2028
2029     kDebug( 6036 ) << "Token --> " << name << "   id = " << currToken.tid;
2030     if (currToken.flat)
2031         kDebug( 6036 ) << "Token is FLAT!";
2032     if(!text.isNull())
2033         kDebug( 6036 ) << "text: \"" << text << "\"";
2034     unsigned long l = currToken.attrs ? currToken.attrs->length() : 0;
2035     if(l) {
2036         kDebug( 6036 ) << "Attributes: " << l;
2037         for (unsigned long i = 0; i < l; ++i) {
2038             NodeImpl::Id tid = currToken.attrs->idAt(i);
2039             DOMString value = currToken.attrs->valueAt(i);
2040             kDebug( 6036 ) << "    " << tid << " " << parser->doc()->document()->getName(NodeImpl::AttributeId, tid).string()
2041                             << "=\"" << value.string() << "\"" << endl;
2042         }
2043     }
2044     kDebug( 6036 );
2045 #endif
2046
2047     // In some cases, parseToken() can cause javascript code to be executed
2048     // (for example, when setting an attribute that causes an event handler
2049     // to be created). So we need to protect against re-entrancy into the parser
2050     m_executingScript++;
2051
2052     // pass the token over to the parser, the parser DOES NOT delete the token
2053     parser->parseToken(&currToken);
2054
2055     m_executingScript--;
2056
2057     if ( currToken.flat && currToken.tid != ID_TEXT && !parser->noSpaces() )
2058         discard = NoneDiscard;
2059
2060     currToken.reset();
2061     if (jsProxy)
2062         jsProxy->setEventHandlerLineno(0);
2063 }
2064
2065 void HTMLTokenizer::processDoctypeToken()
2066 {
2067     // kDebug( 6036 ) << "Process DoctypeToken (name: " << doctypeToken.name << ", publicID: " << doctypeToken.publicID << ", systemID: " << doctypeToken.systemID;
2068     doctypeToken.publicID = doctypeToken.publicID.simplified();
2069     doctypeToken.systemID = doctypeToken.systemID.simplified();
2070     parser->parseDoctypeToken(&doctypeToken);
2071 }
2072
2073
2074 HTMLTokenizer::~HTMLTokenizer()
2075 {
2076     reset();
2077     delete m_prospectiveTokenizer;
2078     delete parser;
2079 }
2080
2081
2082 void HTMLTokenizer::enlargeBuffer(int len)
2083 {
2084     int newsize = qMax(size*2, size+len);
2085     int oldoffs = (dest - buffer);
2086
2087     buffer = KHTML_REALLOC_QCHAR_VEC(buffer, newsize);
2088     dest = buffer + oldoffs;
2089     size = newsize;
2090 }
2091
2092 void HTMLTokenizer::enlargeScriptBuffer(int len)
2093 {
2094     int newsize = qMax(scriptCodeMaxSize*2, scriptCodeMaxSize+len);
2095     scriptCode = KHTML_REALLOC_QCHAR_VEC(scriptCode, newsize);
2096     scriptCodeMaxSize = newsize;
2097 }
2098
2099 void HTMLTokenizer::notifyFinished(CachedObject* /*finishedObj*/)
2100 {
2101     assert(!cachedScript.isEmpty());
2102     bool done = false;
2103     while (!done && cachedScript.head()->isLoaded()) {
2104
2105         kDebug( 6036 ) << "Finished loading an external script";
2106
2107         CachedScript* cs = cachedScript.dequeue();
2108         DOMString scriptSource = cs->script();
2109 #ifdef TOKEN_DEBUG
2110         kDebug( 6036 ) << "External script is:" << endl << scriptSource.string();
2111 #endif
2112         setSrc(TokenizerString());
2113
2114         // make sure we forget about the script before we execute the new one
2115         // infinite recursion might happen otherwise
2116         QString cachedScriptUrl( cs->url().string() );
2117         cs->deref(this);
2118
2119         scriptExecution( scriptSource.string(), cachedScriptUrl );
2120
2121         done = cachedScript.isEmpty();
2122
2123         // 'script' is true when we are called synchronously from
2124         // scriptHandler(). In that case scriptHandler() will take care
2125         // of 'scriptOutput'.
2126         if ( !script ) {
2127             while (pendingQueue.count() > 1) {
2128                TokenizerString t = pendingQueue.pop();
2129                pendingQueue.top().prepend( t );
2130             }
2131             if (done) {
2132                 write(pendingQueue.pop(), false);
2133             }
2134             // we might be deleted at this point, do not
2135             // access any members.
2136         }
2137     }
2138 }
2139
2140 bool HTMLTokenizer::isWaitingForScripts() const
2141 {
2142     return cachedScript.count();
2143 }
2144
2145 bool HTMLTokenizer::isExecutingScript() const
2146 {
2147     return (m_executingScript > 0);
2148 }
2149
2150 void HTMLTokenizer::setSrc(const TokenizerString& source)
2151 {
2152     lineno += src.lineCount();
2153     src = source;
2154     src.resetLineCount();
2155 }
2156
2157 void HTMLTokenizer::setOnHold(bool _onHold)
2158 {
2159     if (onHold == _onHold) return;
2160     onHold = _onHold;
2161 }
2162