fix logic
[personal-kdelibs.git] / khtml / html / htmltokenizer.cpp
blob4a2293decd9fbdbfeffcc7e98d359ba8f8bacd42
1 /*
2 This file is part of the KDE libraries
4 Copyright (C) 1997 Martin Jones (mjones@kde.org)
5 (C) 1997 Torben Weis (weis@kde.org)
6 (C) 1998 Waldo Bastian (bastian@kde.org)
7 (C) 1999 Lars Knoll (knoll@kde.org)
8 (C) 1999 Antti Koivisto (koivisto@kde.org)
9 (C) 2001-2003 Dirk Mueller (mueller@kde.org)
10 (C) 2004-2008 Apple Computer, Inc.
11 (C) 2006-2008 Germain Garand (germain@ebooksfrance.org)
13 This library is free software; you can redistribute it and/or
14 modify it under the terms of the GNU Library General Public
15 License as published by the Free Software Foundation; either
16 version 2 of the License, or (at your option) any later version.
18 This library is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 Library General Public License for more details.
23 You should have received a copy of the GNU Library General Public License
24 along with this library; see the file COPYING.LIB. If not, write to
25 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
26 Boston, MA 02110-1301, USA.
28 //----------------------------------------------------------------------------
30 // KDE HTML Widget - Tokenizers
32 // #define TOKEN_DEBUG 1
33 //#define TOKEN_DEBUG 2
35 #include "htmltokenizer.h"
36 #include "html_documentimpl.h"
37 #include "htmlparser.h"
38 #include "dtd.h"
40 #include <misc/loader.h>
41 #include <misc/htmlhashes.h>
43 #include <khtmlview.h>
44 #include <khtml_part.h>
45 #include <xml/dom_docimpl.h>
46 #include <css/csshelper.h>
47 #include <ecma/kjs_proxy.h>
48 #include <kcharsets.h>
49 #include <kglobal.h>
50 #include <ctype.h>
51 #include <assert.h>
52 #include <QtCore/QVariant>
53 #include <kdebug.h>
54 #include <stdlib.h>
56 #include <config.h>
58 #include "kentities.c"
59 #include "htmlprospectivetokenizer.h"
61 #define PROSPECTIVE_TOKENIZER_ENABLED 1
63 using namespace khtml;
65 static const QChar commentStart [] = { '<','!','-','-', QChar::Null };
// Lower-case keywords matched case-insensitively while scanning a doctype.
// The parsers hard-code the lengths of these strings: 9 for "<!doctype"
// and 6 for "public" / "system". Keep those literals in sync if a string
// here ever changes.
static const char doctypeStart [] = "<!doctype";
static const char publicStart [] = "public";
static const char systemStart [] = "system";

// End-tag prefixes for the elements whose content is tokenized as raw text.
static const char scriptEnd [] = "</script";
static const char xmpEnd [] = "</xmp";
static const char styleEnd [] = "</style";
static const char textareaEnd [] = "</textarea";
static const char titleEnd [] = "</title";
// Tokenizer tuning constants. Builds with assertions enabled (NDEBUG not
// defined) use a smaller chunk size and longer yield delays than release
// builds.
// sTokenizerFastYeldDelay is the initial value of m_tokenizerYeldDelay;
// sTokenizerYeldDelay is installed by setNormalYeldDelay().
// NOTE(review): the delays are presumably milliseconds and the chunk size a
// character count - confirm against the code that consumes them.
#ifndef NDEBUG
static const int sTokenizerChunkSize = 2048;
static const int sTokenizerFastYeldDelay = 220;
static const int sTokenizerYeldDelay = 650;
#else
static const int sTokenizerChunkSize = 4096;
static const int sTokenizerFastYeldDelay = 180;
static const int sTokenizerYeldDelay = 450;
#endif
// Raw QChar buffer helpers built on malloc/realloc/free. The memory is not
// constructed or destructed; callers treat it as plain storage.
//   KHTML_ALLOC_QCHAR_VEC(N)      - allocate room for N QChars
//   KHTML_REALLOC_QCHAR_VEC(P, N) - resize buffer P to hold N QChars
//   KHTML_DELETE_QCHAR_VEC(P)     - release a buffer obtained from the above
// None of these check for allocation failure; the result may be null.
#define KHTML_ALLOC_QCHAR_VEC( N ) (QChar*) malloc( sizeof(QChar)*( N ) )
#define KHTML_REALLOC_QCHAR_VEC(P, N ) (QChar*) realloc(( P ), sizeof(QChar)*( N ))
#define KHTML_DELETE_QCHAR_VEC( P ) free((char*)( P ))
// Full support for the MS Windows extensions to Latin-1.
// Technically these extensions should only be activated for pages
// marked "windows-1252" or "cp1252", but, in the standard Microsoft way,
// these extensions infect hundreds of thousands of web pages. Note that
// people with non-Latin-1 Microsoft extensions are out of luck.
//
// See: http://www.microsoft.com/globaldev/reference/WinCP.asp
//      http://www.bbsinc.com/iso8859.html
//      http://www.obviously.com/
//
// There may be better equivalents.
// fixUpChar(x): remaps a character in the 0x80-0x9F range (where Windows
// code page 1252 places printable characters) to the corresponding Unicode
// code point, in place. Values with no mapping here (0x81, 0x8D, 0x8F,
// 0x90, 0x9D and everything outside the table) are left untouched.
//
// x must provide unicode() and be assignable from an integer code point.
// It is evaluated more than once, so pass an lvalue without side effects.
// The macro expands to a bare switch statement; follow it with ';'.
#if 0
#define fixUpChar(x)
#else
#define fixUpChar(x) \
    switch ((x).unicode()) \
    { \
    case 0x80: (x) = 0x20ac; break; \
    case 0x82: (x) = 0x201a; break; \
    case 0x83: (x) = 0x0192; break; \
    case 0x84: (x) = 0x201e; break; \
    case 0x85: (x) = 0x2026; break; \
    case 0x86: (x) = 0x2020; break; \
    case 0x87: (x) = 0x2021; break; \
    case 0x88: (x) = 0x02C6; break; \
    case 0x89: (x) = 0x2030; break; \
    case 0x8A: (x) = 0x0160; break; \
    case 0x8b: (x) = 0x2039; break; \
    case 0x8C: (x) = 0x0152; break; \
    case 0x8E: (x) = 0x017D; break; \
    case 0x91: (x) = 0x2018; break; \
    case 0x92: (x) = 0x2019; break; \
    case 0x93: (x) = 0x201C; break; \
    case 0x94: (x) = 0x201D; break; \
    case 0x95: (x) = 0x2022; break; \
    case 0x96: (x) = 0x2013; break; \
    case 0x97: (x) = 0x2014; break; \
    case 0x98: (x) = 0x02DC; break; \
    case 0x99: (x) = 0x2122; break; \
    case 0x9A: (x) = 0x0161; break; \
    case 0x9b: (x) = 0x203A; break; \
    case 0x9C: (x) = 0x0153; break; \
    case 0x9E: (x) = 0x017E; break; \
    case 0x9F: (x) = 0x0178; break; \
    default: break; \
    }
#endif
138 // ----------------------------------------------------------------------------
140 HTMLTokenizer::HTMLTokenizer(DOM::DocumentImpl *_doc, KHTMLView *_view)
142 view = _view;
143 buffer = 0;
144 scriptCode = 0;
145 scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
146 charsets = KGlobal::charsets();
147 parser = new KHTMLParser(_view, _doc);
148 m_executingScript = 0;
149 m_autoCloseTimer = 0;
150 m_tokenizerYeldDelay = sTokenizerFastYeldDelay;
151 m_yeldTimer = 0;
152 m_prospectiveTokenizer = 0;
153 onHold = false;
155 reset();
158 HTMLTokenizer::HTMLTokenizer(DOM::DocumentImpl *_doc, DOM::DocumentFragmentImpl *i)
160 view = 0;
161 buffer = 0;
162 scriptCode = 0;
163 scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
164 charsets = KGlobal::charsets();
165 parser = new KHTMLParser( i, _doc );
166 m_executingScript = 0;
167 m_autoCloseTimer = 0;
168 m_tokenizerYeldDelay = sTokenizerFastYeldDelay;
169 m_yeldTimer = 0;
170 m_prospectiveTokenizer = 0;
171 onHold = false;
173 reset();
176 void HTMLTokenizer::setNormalYeldDelay()
178 m_tokenizerYeldDelay = sTokenizerYeldDelay;
181 void HTMLTokenizer::reset()
183 assert(m_executingScript == 0);
184 Q_ASSERT(onHold == false);
185 m_abort = false;
187 while (!cachedScript.isEmpty())
188 cachedScript.dequeue()->deref(this);
190 if ( buffer )
191 KHTML_DELETE_QCHAR_VEC(buffer);
192 buffer = dest = 0;
193 size = 0;
195 if ( scriptCode )
196 KHTML_DELETE_QCHAR_VEC(scriptCode);
197 scriptCode = 0;
198 scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
200 if (m_autoCloseTimer > 0) {
201 killTimer(m_autoCloseTimer);
202 m_autoCloseTimer = 0;
205 if (m_yeldTimer > 0) {
206 killTimer(m_yeldTimer);
207 m_yeldTimer = 0;
209 currToken.reset();
210 doctypeToken.reset();
213 void HTMLTokenizer::begin()
215 m_executingScript = 0;
216 onHold = false;
217 reset();
218 size = 254;
219 buffer = KHTML_ALLOC_QCHAR_VEC( 255 );
220 dest = buffer;
221 tag = NoTag;
222 pending = NonePending;
223 discard = NoneDiscard;
224 pre = false;
225 prePos = 0;
226 plaintext = false;
227 xmp = false;
228 processingInstruction = false;
229 script = false;
230 escaped = false;
231 style = false;
232 skipLF = false;
233 select = false;
234 comment = false;
235 doctype = false;
236 doctypeComment = NoDoctypeComment;
237 doctypeAllowComment = false;
238 server = false;
239 textarea = false;
240 title = false;
241 startTag = false;
242 tquote = NoQuote;
243 searchCount = 0;
244 doctypeSearchCount = 0;
245 doctypeSecondarySearchCount = 0;
246 Entity = NoEntity;
247 noMoreData = false;
248 brokenComments = false;
249 brokenServer = false;
250 brokenScript = false;
251 lineno = 0;
252 scriptStartLineno = 0;
253 tagStartLineno = 0;
256 void HTMLTokenizer::processListing(TokenizerString list)
258 bool old_pre = pre;
260 // This function adds the listing 'list' as
261 // preformatted text-tokens to the token-collection
262 // thereby converting TABs.
263 if(!style) pre = true;
264 prePos = 0;
266 while ( !list.isEmpty() )
268 checkBuffer(3*TAB_SIZE);
270 if (skipLF && ( list->unicode() != '\n' ))
272 skipLF = false;
275 if (skipLF)
277 skipLF = false;
278 ++list;
280 else if (( list->unicode() == '\n' ) || ( list->unicode() == '\r' ))
282 if (discard == LFDiscard)
284 // Ignore this LF
285 discard = NoneDiscard; // We have discarded 1 LF
287 else
289 // Process this LF
290 if (pending)
291 addPending();
293 // we used to do it not at all and we want to have
294 // it fixed for textarea. So here we are
295 if ( textarea ) {
296 prePos++;
297 *dest++ = *list;
298 } else
299 pending = LFPending;
301 /* Check for MS-DOS CRLF sequence */
302 if (list->unicode() == '\r')
304 skipLF = true;
306 ++list;
308 else if (( list->unicode() == ' ' ) || ( list->unicode() == '\t'))
310 if (pending)
311 addPending();
312 if (*list == ' ')
313 pending = SpacePending;
314 else
315 pending = TabPending;
317 ++list;
319 else
321 discard = NoneDiscard;
322 if (pending)
323 addPending();
325 prePos++;
326 *dest++ = *list;
327 ++list;
332 if ((pending == SpacePending) || (pending == TabPending))
333 addPending();
334 else
335 pending = NonePending;
337 prePos = 0;
338 pre = old_pre;
341 void HTMLTokenizer::parseSpecial(TokenizerString &src)
343 assert( textarea || title || !Entity );
344 assert( !tag );
345 assert( xmp+textarea+title+style+script == 1 );
346 if (script)
347 scriptStartLineno = lineno+src.lineCount();
349 if ( comment ) parseComment( src );
351 while ( !src.isEmpty() ) {
352 checkScriptBuffer();
353 unsigned char ch = src->toLatin1();
354 if ( !scriptCodeResync && !brokenComments && !textarea && !xmp && ch == '-' && scriptCodeSize >= 3 && !src.escaped() && QString::fromRawData( scriptCode+scriptCodeSize-3, 3 ) == "<!-" ) {
355 comment = true;
356 scriptCode[ scriptCodeSize++ ] = ch;
357 ++src;
358 parseComment( src );
359 continue;
361 if ( scriptCodeResync && !tquote && ( ch == '>' ) ) {
362 ++src;
363 scriptCodeSize = scriptCodeResync-1;
364 scriptCodeResync = 0;
365 scriptCode[ scriptCodeSize ] = scriptCode[ scriptCodeSize + 1 ] = 0;
366 if ( script )
367 scriptHandler();
368 else {
369 processListing(TokenizerString(scriptCode, scriptCodeSize));
370 processToken();
371 if ( style ) { currToken.tid = ID_STYLE + ID_CLOSE_TAG; }
372 else if ( textarea ) { currToken.tid = ID_TEXTAREA + ID_CLOSE_TAG; }
373 else if ( title ) { currToken.tid = ID_TITLE + ID_CLOSE_TAG; }
374 else if ( xmp ) { currToken.tid = ID_XMP + ID_CLOSE_TAG; }
375 processToken();
376 script = style = textarea = title = xmp = false;
377 tquote = NoQuote;
378 scriptCodeSize = scriptCodeResync = 0;
380 return;
382 // possible end of tagname, lets check.
383 if ( !scriptCodeResync && !escaped && !src.escaped() && ( ch == '>' || ch == '/' || ch <= ' ' ) && ch &&
384 scriptCodeSize >= searchStopperLen &&
385 !QString::fromRawData( scriptCode+scriptCodeSize-searchStopperLen, searchStopperLen ).indexOf( searchStopper, 0, Qt::CaseInsensitive )) {
386 scriptCodeResync = scriptCodeSize-searchStopperLen+1;
387 tquote = NoQuote;
388 continue;
390 if ( scriptCodeResync && !escaped ) {
391 if(ch == '\"')
392 tquote = (tquote == NoQuote) ? DoubleQuote : ((tquote == SingleQuote) ? SingleQuote : NoQuote);
393 else if(ch == '\'')
394 tquote = (tquote == NoQuote) ? SingleQuote : (tquote == DoubleQuote) ? DoubleQuote : NoQuote;
395 else if (tquote != NoQuote && (ch == '\r' || ch == '\n'))
396 tquote = NoQuote;
398 escaped = ( !escaped && ch == '\\' );
399 if (!scriptCodeResync && (textarea||title) && !src.escaped() && ch == '&') {
400 QChar *scriptCodeDest = scriptCode+scriptCodeSize;
401 ++src;
402 parseEntity(src,scriptCodeDest,true);
403 scriptCodeSize = scriptCodeDest-scriptCode;
405 else {
406 scriptCode[ scriptCodeSize++ ] = *src;
407 ++src;
412 void HTMLTokenizer::scriptHandler()
414 QString currentScriptSrc = scriptSrc;
415 scriptSrc.clear();
417 processListing(TokenizerString(scriptCode, scriptCodeSize));
418 QString exScript( buffer, dest-buffer );
420 processToken();
421 currToken.tid = ID_SCRIPT + ID_CLOSE_TAG;
422 processToken();
424 // Scripts following a frameset element should not be executed or even loaded in the case of extern scripts.
425 bool followingFrameset = (parser->doc()->body() && parser->doc()->body()->id() == ID_FRAMESET);
426 bool effectiveScript = !parser->skipMode() && !followingFrameset;
427 bool deferredScript = false;
429 if ( effectiveScript ) {
430 CachedScript* cs = 0;
432 // forget what we just got, load from src url instead
433 if ( !currentScriptSrc.isEmpty() && javascript &&
434 (cs = parser->doc()->docLoader()->requestScript(currentScriptSrc, scriptSrcCharset) )) {
435 cachedScript.enqueue(cs);
438 if (cs) {
439 pendingQueue.push(src);
440 int scriptCount = cachedScript.count();
441 setSrc(TokenizerString());
442 scriptCodeSize = scriptCodeResync = 0;
443 cs->ref(this);
444 if (cachedScript.count() == scriptCount)
445 deferredScript = true;
447 else if (currentScriptSrc.isEmpty() && view && javascript ) {
448 pendingQueue.push(src);
449 setSrc(TokenizerString());
450 scriptCodeSize = scriptCodeResync = 0;
451 scriptExecution( exScript, QString(), tagStartLineno /*scriptStartLineno*/ );
452 } else {
453 // script was filtered or disallowed
454 effectiveScript = false;
458 script = false;
459 scriptCodeSize = scriptCodeResync = 0;
461 if ( !effectiveScript )
462 return;
464 if ( !m_executingScript && cachedScript.isEmpty() ) {
465 src.append(pendingQueue.pop());
466 } else if ( cachedScript.isEmpty() ) {
467 write( pendingQueue.pop(), false );
468 } else if ( !deferredScript && pendingQueue.count() > 1) {
469 TokenizerString t = pendingQueue.pop();
470 pendingQueue.top().prepend( t );
472 #if PROSPECTIVE_TOKENIZER_ENABLED
473 if (!cachedScript.isEmpty() && !m_executingScript) {
474 if (!m_prospectiveTokenizer)
475 m_prospectiveTokenizer = new ProspectiveTokenizer(parser->docPtr());
476 if (!m_prospectiveTokenizer->inProgress() && !pendingQueue.isEmpty()) {
477 m_prospectiveTokenizer->begin();
478 m_prospectiveTokenizer->write(pendingQueue.top());
481 #endif
485 void HTMLTokenizer::scriptExecution( const QString& str, const QString& scriptURL,
486 int baseLine)
488 bool oldscript = script;
489 m_executingScript++;
490 script = false;
491 QString url;
492 if (scriptURL.isNull() && view)
493 url = static_cast<DocumentImpl*>(view->part()->document().handle())->URL().url();
494 else
495 url = scriptURL;
497 if (view)
498 view->part()->executeScript(url,baseLine,Node(),str);
499 m_executingScript--;
500 script = oldscript;
503 void HTMLTokenizer::parseComment(TokenizerString &src)
505 checkScriptBuffer(src.length());
506 while ( src.length() ) {
507 scriptCode[ scriptCodeSize++ ] = *src;
509 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
510 qDebug("comment is now: *%s*", src.toString().left(16).toLatin1().constData());
511 #endif
513 if (src->unicode() == '>')
515 bool handleBrokenComments = brokenComments && !( script || style );
516 bool scriptEnd=false;
517 if ( scriptCodeSize > 2 && scriptCode[scriptCodeSize-3] == '-' &&
518 scriptCode[scriptCodeSize-2] == '-' )
520 scriptEnd=true;
523 if (handleBrokenComments || scriptEnd ){
524 ++src;
525 if ( !( title || script || xmp || textarea || style) ) {
526 checkScriptBuffer();
527 scriptCode[ scriptCodeSize ] = 0;
528 scriptCode[ scriptCodeSize + 1 ] = 0;
529 currToken.tid = ID_COMMENT;
530 processListing(TokenizerString(scriptCode, scriptCodeSize - 3));
531 processToken();
532 currToken.tid = ID_COMMENT + ID_CLOSE_TAG;
533 processToken();
534 scriptCodeSize = 0;
536 comment = false;
537 return; // Finished parsing comment
540 ++src;
544 void HTMLTokenizer::parseDoctypeComment(TokenizerString &src)
546 while (!src.isEmpty()) {
547 QChar c = *src;
548 switch (doctypeComment) {
549 case DoctypeCommentHalfBegin: {
550 if (c != '-') {
551 // Ooops, it's not comment
552 doctypeComment = DoctypeCommentBogus;
553 return;
554 } else {
555 // Doctype comment begins
556 doctypeComment = DoctypeComment;
557 ++src;
559 break;
561 case DoctypeComment: {
562 if (c == '-') {
563 // Perhaps this is end of comment
564 doctypeComment = DoctypeCommentHalfEnd;
565 ++src;
566 } else {
567 // Keep scanning for '--'
568 ++src;
570 break;
572 case DoctypeCommentHalfEnd: {
573 if (c == '-') {
574 // Doctype comment ends
575 doctypeComment = DoctypeCommentEnd;
576 return;
577 } else {
578 // It's not '--'
579 ++src;
580 doctypeComment = DoctypeComment;
582 break;
584 default: {
585 assert(!"Undefined doctype comment state");
586 break;
592 void HTMLTokenizer::parseDoctype(TokenizerString &src)
594 while (!src.isEmpty() && doctype) {
595 QChar c;
596 bool isWhitespace = false;
597 int dontAdvance = 0;
598 if (doctypeComment == DoctypeCommentEnd) {
599 doctypeComment = NoDoctypeComment;
600 isWhitespace = true;
601 } else if (doctypeComment == DoctypeCommentBogus) {
602 doctypeComment = NoDoctypeComment;
603 c = '-';
604 dontAdvance++;
605 } else {
606 c = *src;
607 if (doctypeAllowComment) {
608 if (!doctypeComment && c == '-') {
609 doctypeComment = DoctypeCommentHalfBegin;
610 ++src;
612 if (doctypeComment) {
613 parseDoctypeComment(src);
614 continue;
616 isWhitespace = c == '\r' || c == '\n' || c == '\t' || c == ' ';
620 switch (doctypeToken.state) {
621 case DoctypeBegin: {
622 doctypeToken.state = DoctypeBeforeName;
623 if (isWhitespace) {
624 // nothing
626 break;
628 case DoctypeBeforeName: {
629 if (c == '>') {
630 // Malformed. Just exit.
631 doctype = false;
632 } else if (isWhitespace) {
633 // nothing
634 } else {
635 dontAdvance++;
636 doctypeToken.state = DoctypeName;
638 break;
640 case DoctypeName: {
641 if (c == '>') {
642 // Valid doctype. Emit it.
643 doctype = false;
644 processDoctypeToken();
645 } else if (isWhitespace) {
646 doctypeSearchCount = 0; // Used now to scan for PUBLIC
647 doctypeSecondarySearchCount = 0; // Used now to scan for SYSTEM
648 doctypeToken.state = DoctypeAfterName;
649 } else {
650 doctypeToken.name.append(c);
652 break;
654 case DoctypeAfterName: {
655 if (c == '>') {
656 // Valid doctype. Emit it.
657 doctype = false;
658 processDoctypeToken();
659 } else if (c == '[') {
660 if(doctypeSearchCount > 0 || doctypeSecondarySearchCount > 0) { // is there any public/system indicator before?
661 doctypeSearchCount = doctypeSecondarySearchCount = 0;
662 doctypeToken.state = DoctypeBogus;
664 // Found internal subset
665 doctypeToken.state = DoctypeInternalSubset;
666 doctypeAllowComment = false;
667 } else if (!isWhitespace) {
668 if (c.toLower() == publicStart[doctypeSearchCount]) {
669 doctypeSearchCount++;
670 if(doctypeSearchCount == 6)
671 // Found 'PUBLIC' sequence
672 doctypeToken.state = DoctypeBeforePublicID;
673 } else if (doctypeSearchCount > 0) {
674 doctypeSearchCount = 0;
675 doctypeToken.state = DoctypeBogus;
676 } else if (c.toLower() == systemStart[doctypeSecondarySearchCount]) {
677 doctypeSecondarySearchCount++;
678 if(doctypeSecondarySearchCount == 6)
679 // Found 'SYSTEM' sequence
680 doctypeToken.state = DoctypeBeforeSystemID;
681 } else {
682 doctypeSecondarySearchCount = 0;
683 doctypeToken.state = DoctypeBogus;
685 } else {
686 // Whitespace keeps us in the after name state
688 break;
690 case DoctypeBeforePublicID: {
691 if (c == '\"' || c == '\'') {
692 tquote = c == '\"' ? DoubleQuote : SingleQuote;
693 doctypeToken.state = DoctypePublicID;
694 doctypeAllowComment = false;
695 } else if (c == '>') {
696 // Considered bogus. Don't process the doctype.
697 doctype = false;
698 } else if (isWhitespace) {
699 // nothing
700 } else
701 doctypeToken.state = DoctypeBogus;
702 break;
704 case DoctypePublicID: {
705 if ((c == '\"' && tquote == DoubleQuote) || (c == '\'' && tquote == SingleQuote)) {
706 doctypeToken.state = DoctypeAfterPublicID;
707 doctypeAllowComment = true;
708 } else if (c == '>') {
709 // Considered bogus. Don't process the doctype.
710 doctype = false;
711 } else {
712 doctypeToken.publicID.append(c);
714 break;
716 case DoctypeAfterPublicID: {
717 if (c == '\"' || c == '\'') {
718 tquote = c == '\"' ? DoubleQuote : SingleQuote;
719 doctypeToken.state = DoctypeSystemID;
720 } else if (c == '>') {
721 // Valid doctype. Emit it now.
722 doctype = false;
723 processDoctypeToken();
724 } else if (isWhitespace) {
725 // nothing
726 } else if (c == '[') {
727 // Found internal subset
728 doctypeToken.state = DoctypeInternalSubset;
729 doctypeAllowComment = false;
730 } else
731 doctypeToken.state = DoctypeBogus;
732 break;
734 case DoctypeBeforeSystemID: {
735 if (c == '\"' || c == '\'') {
736 tquote = c == '\"' ? DoubleQuote : SingleQuote;
737 doctypeToken.state = DoctypeSystemID;
738 doctypeAllowComment = false;
739 } else if (c == '>') {
740 // Considered bogus. Don't process the doctype.
741 doctype = false;
742 } else if (isWhitespace) {
743 // nothing
744 } else
745 doctypeToken.state = DoctypeBogus;
746 break;
748 case DoctypeSystemID: {
749 if ((c == '\"' && tquote == DoubleQuote) || (c == '\'' && tquote == SingleQuote)) {
750 doctypeToken.state = DoctypeAfterSystemID;
751 doctypeAllowComment = true;
752 } else if (c == '>') {
753 // Considered bogus. Don't process the doctype.
754 doctype = false;
755 } else {
756 doctypeToken.systemID.append(c);
758 break;
760 case DoctypeAfterSystemID: {
761 if (c == '>') {
762 // Valid doctype. Emit it now.
763 doctype = false;
764 processDoctypeToken();
765 } else if (isWhitespace) {
766 // nothing
767 } else if (c == '[') {
768 // Found internal subset
769 doctypeToken.state = DoctypeInternalSubset;
770 doctypeAllowComment = false;
771 } else {
772 doctypeToken.state = DoctypeBogus;
774 break;
776 case DoctypeInternalSubset: {
777 if(c == ']') {
778 // Done
779 doctypeToken.state = DoctypeAfterInternalSubset;
780 doctypeAllowComment = true;
781 } else {
782 doctypeToken.internalSubset.append(c);
784 break;
786 case DoctypeAfterInternalSubset: {
787 if (c == '>') {
788 // Valid doctype. Emit it now.
789 doctype = false;
790 processDoctypeToken();
791 } else if (isWhitespace) {
792 // nothing
793 } else
794 doctypeToken.state = DoctypeBogus;
795 break;
797 case DoctypeBogus: {
798 if (c == '>') {
799 // Done with the bogus doctype.
800 doctype = false;
801 } else {
802 // Just keep scanning for '>'
804 break;
806 default:
807 break;
809 if (!dontAdvance)
810 ++src;
811 else if (dontAdvance == 1)
812 continue;
813 else // double dontAdvance++, do workaround
814 doctypeComment = DoctypeCommentBogus;
818 void HTMLTokenizer::parseServer(TokenizerString &src)
820 checkScriptBuffer(src.length());
821 while ( !src.isEmpty() ) {
822 scriptCode[ scriptCodeSize++ ] = *src;
823 if (src->unicode() == '>' &&
824 scriptCodeSize > 1 && scriptCode[scriptCodeSize-2] == '%') {
825 ++src;
826 server = false;
827 scriptCodeSize = 0;
828 return; // Finished parsing server include
830 ++src;
834 void HTMLTokenizer::parseProcessingInstruction(TokenizerString &src)
836 char oldchar = 0;
837 while ( !src.isEmpty() )
839 unsigned char chbegin = src->toLatin1();
840 if(chbegin == '\'') {
841 tquote = tquote == SingleQuote ? NoQuote : SingleQuote;
843 else if(chbegin == '\"') {
844 tquote = tquote == DoubleQuote ? NoQuote : DoubleQuote;
846 // Look for '?>'
847 // some crappy sites omit the "?" before it, so
848 // we look for an unquoted '>' instead. (IE compatible)
849 else if ( chbegin == '>' && ( !tquote || oldchar == '?' ) )
851 // We got a '?>' sequence
852 processingInstruction = false;
853 ++src;
854 discard=LFDiscard;
855 return; // Finished parsing comment!
857 ++src;
858 oldchar = chbegin;
862 void HTMLTokenizer::parseText(TokenizerString &src)
864 while ( !src.isEmpty() )
866 // do we need to enlarge the buffer?
867 checkBuffer();
869 // ascii is okay because we only do ascii comparisons
870 unsigned char chbegin = src->toLatin1();
872 if (skipLF && ( chbegin != '\n' ))
874 skipLF = false;
877 if (skipLF)
879 skipLF = false;
880 ++src;
882 else if (( chbegin == '\n' ) || ( chbegin == '\r' ))
884 if (chbegin == '\r')
885 skipLF = true;
887 *dest++ = '\n';
888 ++src;
890 else {
891 *dest++ = *src;
892 ++src;
898 void HTMLTokenizer::parseEntity(TokenizerString &src, QChar *&dest, bool start)
900 if( start )
902 cBufferPos = 0;
903 entityLen = 0;
904 Entity = SearchEntity;
907 while( !src.isEmpty() )
909 ushort cc = src->unicode();
910 switch(Entity) {
911 case NoEntity:
912 return;
914 break;
915 case SearchEntity:
916 if(cc == '#') {
917 cBuffer[cBufferPos++] = cc;
918 ++src;
919 Entity = NumericSearch;
921 else
922 Entity = EntityName;
924 break;
926 case NumericSearch:
927 if(cc == 'x' || cc == 'X') {
928 cBuffer[cBufferPos++] = cc;
929 ++src;
930 Entity = Hexadecimal;
932 else if(cc >= '0' && cc <= '9')
933 Entity = Decimal;
934 else
935 Entity = SearchSemicolon;
937 break;
939 case Hexadecimal:
941 int uc = EntityChar.unicode();
942 int ll = qMin<uint>(src.length(), 8);
943 while(ll--) {
944 QChar csrc(src->toLower());
945 cc = csrc.cell();
947 if(csrc.row() || !((cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f'))) {
948 break;
950 uc = uc*16 + (cc - ( cc < 'a' ? '0' : 'a' - 10));
951 cBuffer[cBufferPos++] = cc;
952 ++src;
954 EntityChar = QChar(uc);
955 Entity = SearchSemicolon;
956 break;
958 case Decimal:
960 int uc = EntityChar.unicode();
961 int ll = qMin(src.length(), 9-cBufferPos);
962 while(ll--) {
963 cc = src->cell();
965 if(src->row() || !(cc >= '0' && cc <= '9')) {
966 Entity = SearchSemicolon;
967 break;
970 uc = uc * 10 + (cc - '0');
971 cBuffer[cBufferPos++] = cc;
972 ++src;
974 EntityChar = QChar(uc);
975 if(cBufferPos == 9) Entity = SearchSemicolon;
976 break;
978 case EntityName:
980 int ll = qMin(src.length(), 9-cBufferPos);
981 while(ll--) {
982 QChar csrc = *src;
983 cc = csrc.cell();
985 if(csrc.row() || !((cc >= 'a' && cc <= 'z') ||
986 (cc >= '0' && cc <= '9') || (cc >= 'A' && cc <= 'Z'))) {
987 Entity = SearchSemicolon;
988 break;
991 cBuffer[cBufferPos++] = cc;
992 ++src;
994 // be IE compatible and interpret even unterminated entities
995 // outside tags. like "foo &nbspstuff bla".
996 if ( tag == NoTag ) {
997 const entity* e = kde_findEntity(cBuffer, cBufferPos);
998 if ( e && e->code < 256 ) {
999 EntityChar = e->code;
1000 entityLen = cBufferPos;
1004 if(cBufferPos == 9) Entity = SearchSemicolon;
1005 if(Entity == SearchSemicolon) {
1006 if(cBufferPos > 1) {
1007 const entity *e = kde_findEntity(cBuffer, cBufferPos);
1008 // IE only accepts unterminated entities < 256,
1009 // Gecko accepts them all, but only outside tags
1010 if(e && ( tag == NoTag || e->code < 256 || *src == ';' )) {
1011 EntityChar = e->code;
1012 entityLen = cBufferPos;
1016 break;
1018 case SearchSemicolon:
1019 #ifdef TOKEN_DEBUG
1020 kDebug( 6036 ) << "ENTITY " << EntityChar.unicode();
1021 #endif
1022 fixUpChar(EntityChar);
1024 if (*src == ';')
1025 ++src;
1027 if ( !EntityChar.isNull() ) {
1028 checkBuffer();
1029 if (entityLen > 0 && entityLen < cBufferPos) {
1030 int rem = cBufferPos - entityLen;
1031 src.prepend( TokenizerString(QString::fromAscii(cBuffer+entityLen, rem)) );
1033 src.push( EntityChar );
1034 } else {
1035 #ifdef TOKEN_DEBUG
1036 kDebug( 6036 ) << "unknown entity!";
1037 #endif
1038 checkBuffer(10);
1039 // ignore the sequence, add it to the buffer as plaintext
1040 *dest++ = '&';
1041 for(unsigned int i = 0; i < cBufferPos; i++)
1042 dest[i] = cBuffer[i];
1043 dest += cBufferPos;
1044 if (pre)
1045 prePos += cBufferPos+1;
1048 Entity = NoEntity;
1049 EntityChar = QChar::Null;
1050 return;
1055 void HTMLTokenizer::parseTag(TokenizerString &src)
1057 assert(!Entity );
1058 checkScriptBuffer( src.length() );
1060 while ( !src.isEmpty() )
1062 checkBuffer();
1063 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1064 uint l = 0;
1065 while(l < src.length() && (src.toString()[l]).toLatin1().constData() != '>')
1066 l++;
1067 qDebug("src is now: *%s*, tquote: %d",
1068 src.toString().left(l).toLatin1().constData(), tquote);
1069 #endif
1070 switch(tag) {
1071 case NoTag:
1072 return;
1073 case TagName:
1075 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1076 qDebug("TagName");
1077 #endif
1078 if (searchCount > 0)
1080 if (*src == commentStart[searchCount])
1082 searchCount++;
1083 if (searchCount == 2)
1084 doctypeSearchCount++; // A '!' is also part of doctype, so we are moving through that still as well
1085 else
1086 doctypeSearchCount = 0;
1088 if (searchCount == 4)
1090 #ifdef TOKEN_DEBUG
1091 kDebug( 6036 ) << "Found comment";
1092 #endif
1093 // Found '<!--' sequence
1094 ++src;
1095 dest = buffer; // ignore the previous part of this tag
1096 tag = NoTag;
1098 comment = true;
1099 parseComment(src);
1100 return; // Finished parsing tag!
1102 // cuts of high part, is okay
1103 cBuffer[cBufferPos++] = src->cell();
1104 ++src;
1105 break;
1107 else
1108 searchCount = 0; // Stop looking for '<!--' sequence
1111 if (doctypeSearchCount > 0) {
1112 if((*src).toLower() == doctypeStart[doctypeSearchCount]) {
1113 doctypeSearchCount++;
1114 cBuffer[cBufferPos++] = src->cell();
1115 ++src;
1116 if(doctypeSearchCount == 9) {
1117 // Found '<!DOCTYPE' sequence
1118 tag = NoTag;
1119 doctypeAllowComment = true;
1120 doctypeComment = NoDoctypeComment;
1121 doctypeToken.reset();
1122 doctype = true;
1124 parseDoctype(src);
1125 return;
1127 break;
1128 } else
1129 doctypeSearchCount = 0; // Stop looking for '<!DOCTYPE' sequence
1132 bool finish = false;
1133 unsigned int ll = qMin(src.length(), CBUFLEN-cBufferPos);
1134 while(ll--) {
1135 ushort curchar = src->unicode();
1136 if(curchar <= ' ' || curchar == '>' ) {
1137 finish = true;
1138 break;
1140 // this is a nasty performance trick. will work for the A-Z
1141 // characters, but not for others. if it contains one,
1142 // we fail anyway
1143 char cc = curchar;
1144 cBuffer[cBufferPos++] = cc | 0x20;
1145 ++src;
1148 // Disadvantage: we add the possible rest of the tag
1149 // as attribute names. ### judge if this causes problems
1150 if(finish || CBUFLEN == cBufferPos) {
1151 bool beginTag;
1152 char* ptr = cBuffer;
1153 unsigned int len = cBufferPos;
1154 cBuffer[cBufferPos] = '\0';
1155 if ((cBufferPos > 0) && (*ptr == '/'))
1157 // End Tag
1158 beginTag = false;
1159 ptr++;
1160 len--;
1162 else
1163 // Start Tag
1164 beginTag = true;
1165 // Accept empty xml tags like <br/>
1166 if(len > 1 && ptr[len-1] == '/' ) {
1167 ptr[--len] = '\0';
1168 // if its like <br/> and not like <input/ value=foo>, take it as flat
1169 if (*src == '>')
1170 currToken.flat = true;
1173 uint tagID = 0;
1174 if (!tagID) {
1175 DOMString tagName(ptr);
1176 DocumentImpl *doc = parser->docPtr();
1177 if (Element::khtmlValidQualifiedName(tagName)) {
1178 safeLocalName = LocalName::fromString(tagName.lower());
1179 tagID = safeLocalName.id();
1181 #ifdef TOKEN_DEBUG
1182 QByteArray tmp(ptr, len+1);
1183 kDebug( 6036 ) << "Unknown tag: \"" << tmp.data() << "\"";
1184 #endif
1186 if (tagID) {
1187 #ifdef TOKEN_DEBUG
1188 QByteArray tmp(ptr, len+1);
1189 kDebug( 6036 ) << "found tag id=" << tagID << ": " << tmp.data();
1190 #endif
1191 currToken.tid = beginTag ? tagID : tagID + ID_CLOSE_TAG;
1193 dest = buffer;
1194 tag = SearchAttribute;
1195 cBufferPos = 0;
1197 break;
1199 case SearchAttribute:
1201 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1202 qDebug("SearchAttribute");
1203 #endif
1204 bool atespace = false;
1205 ushort curchar;
1206 while(!src.isEmpty()) {
1207 curchar = src->unicode();
1208 if(curchar > ' ') {
1209 if(curchar == '<' || curchar == '>')
1210 tag = SearchEnd;
1211 else if(atespace && (curchar == '\'' || curchar == '"'))
1213 tag = SearchValue;
1214 *dest++ = 0;
1215 attrName.clear();
1217 else
1218 tag = AttributeName;
1220 cBufferPos = 0;
1221 break;
1223 atespace = true;
1224 ++src;
1226 break;
1228 case AttributeName:
1230 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1231 qDebug("AttributeName");
1232 #endif
1233 ushort curchar;
1234 int ll = qMin(src.length(), CBUFLEN-cBufferPos);
1236 while(ll--) {
1237 curchar = src->unicode();
1238 if(curchar <= '>') {
1239 if(curchar <= ' ' || curchar == '=' || curchar == '>') {
1240 unsigned int a;
1241 cBuffer[cBufferPos] = '\0';
1242 a = LocalName::fromString(DOMString(cBuffer).lower()).id();
1243 if (a > ATTR_LAST_ATTR)
1244 a = 0;
1246 if ( !a ) {
1247 // did we just get /> or e.g checked/>
1248 if (curchar == '>' && cBufferPos >=1 && cBuffer[cBufferPos-1] == '/') {
1249 currToken.flat = true;
1250 cBuffer[cBufferPos - 1] = '\0';
1251 if (cBufferPos>1)
1252 a = LocalName::fromString(DOMString(cBuffer).lower()).id();
1253 if (a > ATTR_LAST_ATTR)
1254 a = 0;
1255 cBuffer[cBufferPos - 1] = '/';
1257 if (!a)
1258 attrName = QLatin1String(QByteArray(cBuffer, cBufferPos+1).data());
1261 dest = buffer;
1262 *dest++ = a;
1263 #ifdef TOKEN_DEBUG
1264 if (!a || (cBufferPos && *cBuffer == '!'))
1265 kDebug( 6036 ) << "Unknown attribute: *" << QByteArray(cBuffer, cBufferPos+1).data() << "*";
1266 else
1267 kDebug( 6036 ) << "Known attribute: " << QByteArray(cBuffer, cBufferPos+1).data();
1268 #endif
1270 tag = SearchEqual;
1271 break;
1274 cBuffer[cBufferPos++] =
1275 ( curchar >= 'A' && curchar <= 'Z' ) ? curchar | 0x20 : curchar;
1276 ++src;
1278 if ( cBufferPos == CBUFLEN ) {
1279 cBuffer[cBufferPos] = '\0';
1280 attrName = QLatin1String(QByteArray(cBuffer, cBufferPos+1).data());
1281 dest = buffer;
1282 *dest++ = 0;
1283 tag = SearchEqual;
1285 break;
1287 case SearchEqual:
1289 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1290 qDebug("SearchEqual");
1291 #endif
1292 ushort curchar;
1293 bool atespace = false;
1294 while(!src.isEmpty()) {
1295 curchar = src->unicode();
1296 if(curchar > ' ') {
1297 if(curchar == '=') {
1298 #ifdef TOKEN_DEBUG
1299 kDebug(6036) << "found equal";
1300 #endif
1301 tag = SearchValue;
1302 ++src;
1304 else if(atespace && (curchar == '\'' || curchar == '"'))
1306 tag = SearchValue;
1307 *dest++ = 0;
1308 attrName.clear();
1310 else {
1311 DOMString v("");
1312 currToken.addAttribute(parser->docPtr(), buffer, attrName, v);
1313 dest = buffer;
1314 tag = SearchAttribute;
1316 break;
1318 atespace = true;
1319 ++src;
1321 break;
1323 case SearchValue:
1325 ushort curchar;
1326 while(!src.isEmpty()) {
1327 curchar = src->unicode();
1328 if(curchar > ' ') {
1329 if(( curchar == '\'' || curchar == '\"' )) {
1330 tquote = curchar == '\"' ? DoubleQuote : SingleQuote;
1331 tag = QuotedValue;
1332 ++src;
1333 } else
1334 tag = Value;
1336 break;
1338 ++src;
1340 break;
1342 case QuotedValue:
1344 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1345 qDebug("QuotedValue");
1346 #endif
1347 ushort curchar;
1348 while(!src.isEmpty()) {
1349 checkBuffer();
1351 curchar = src->unicode();
1352 if(curchar <= '\'' && !src.escaped()) {
1353 // ### attributes like '&{blaa....};' are supposed to be treated as jscript.
1354 if ( curchar == '&' )
1356 ++src;
1357 parseEntity(src, dest, true);
1358 break;
1360 else if ( (tquote == SingleQuote && curchar == '\'') ||
1361 (tquote == DoubleQuote && curchar == '\"') )
1363 // some <input type=hidden> rely on trailing spaces. argh
1364 while(dest > buffer+1 && (*(dest-1) == '\n' || *(dest-1) == '\r'))
1365 dest--; // remove trailing newlines
1366 DOMString v(buffer+1, dest-buffer-1);
1367 currToken.addAttribute(parser->docPtr(), buffer, attrName, v);
1369 dest = buffer;
1370 tag = SearchAttribute;
1371 tquote = NoQuote;
1372 ++src;
1373 break;
1376 *dest++ = *src;
1377 ++src;
1379 break;
1381 case Value:
1383 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1384 qDebug("Value");
1385 #endif
1386 ushort curchar;
1387 while(!src.isEmpty()) {
1388 checkBuffer();
1389 curchar = src->unicode();
1390 if(curchar <= '>' && !src.escaped()) {
1391 // parse Entities
1392 if ( curchar == '&' )
1394 ++src;
1395 parseEntity(src, dest, true);
1396 break;
1398 // no quotes. Every space means end of value
1399 // '/' does not delimit in IE!
1400 if ( curchar <= ' ' || curchar == '>' )
1402 DOMString v(buffer+1, dest-buffer-1);
1403 currToken.addAttribute(parser->docPtr(), buffer, attrName, v);
1404 dest = buffer;
1405 tag = SearchAttribute;
1406 break;
1410 *dest++ = *src;
1411 ++src;
1413 break;
1415 case SearchEnd:
1417 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1418 qDebug("SearchEnd");
1419 #endif
1420 while(!src.isEmpty()) {
1421 if(*src == '<' || *src == '>')
1422 break;
1424 if (*src == '/')
1425 currToken.flat = true;
1427 ++src;
1429 if(src.isEmpty() && *src != '<' && *src != '>') break;
1431 searchCount = 0; // Stop looking for '<!--' sequence
1432 tag = NoTag;
1433 tquote = NoQuote;
1434 if ( *src == '>' )
1435 ++src;
1437 if ( !currToken.tid ) //stop if tag is unknown
1438 return;
1440 uint tagID = currToken.tid;
1441 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 0
1442 kDebug( 6036 ) << "appending Tag: " << tagID;
1443 #endif
1444 // If the tag requires an end tag it cannot be flat,
1445 // unless we are using the HTML parser to parse XHTML
1446 // The only exception is SCRIPT and priority 0 tokens.
1447 if (tagID < ID_CLOSE_TAG && tagID != ID_SCRIPT &&
1448 DOM::endTagRequirement(tagID) == DOM::REQUIRED &&
1449 parser->doc()->htmlMode() != DocumentImpl::XHtml)
1450 currToken.flat = false;
1452 bool beginTag = !currToken.flat && (tagID < ID_CLOSE_TAG);
1454 if(tagID >= ID_CLOSE_TAG)
1455 tagID -= ID_CLOSE_TAG;
1456 else if ( !brokenScript && tagID == ID_SCRIPT ) {
1457 DOMStringImpl* a = 0;
1458 bool foundTypeAttribute = false;
1459 scriptSrc.clear(); scriptSrcCharset.clear();
1460 if ( currToken.attrs && /* potentially have a ATTR_SRC ? */
1461 view && /* are we a regular tokenizer or just for innerHTML ? */
1462 parser->doc()->view()->part()->jScriptEnabled() /* jscript allowed at all? */
1464 if ( ( a = currToken.attrs->getValue( ATTR_SRC ) ) )
1465 scriptSrc = parser->doc()->completeURL(khtml::parseURL( DOMString(a) ).string() );
1466 if ( ( a = currToken.attrs->getValue( ATTR_CHARSET ) ) )
1467 scriptSrcCharset = DOMString(a).string().trimmed();
1468 if ( scriptSrcCharset.isEmpty() && view)
1469 scriptSrcCharset = parser->doc()->view()->part()->encoding();
1470 /* Check type before language, since language is deprecated */
1471 if ((a = currToken.attrs->getValue(ATTR_TYPE)) != 0 && !DOMString(a).string().isEmpty())
1472 foundTypeAttribute = true;
1473 else
1474 a = currToken.attrs->getValue(ATTR_LANGUAGE);
1476 javascript = true;
1478 if( foundTypeAttribute ) {
1480 Mozilla 1.5 doesn't accept the text/javascript1.x formats, but WinIE 6 does.
1481 Mozilla 1.5 doesn't accept text/jscript, text/ecmascript, and text/livescript, but WinIE 6 does.
1482 Mozilla 1.5 accepts application/x-javascript, WinIE 6 doesn't.
1483 Mozilla 1.5 allows leading and trailing whitespace, but WinIE 6 doesn't.
1484 Mozilla 1.5 and WinIE 6 both accept the empty string, but neither accept a whitespace-only string.
1485 We want to accept all the values that either of these browsers accept, but not other values.
1487 QString type = DOMString(a).string().trimmed().toLower();
1488 if( type.compare("text/javascript") != 0 &&
1489 type.compare("text/javascript1.0") != 0 &&
1490 type.compare("text/javascript1.1") != 0 &&
1491 type.compare("text/javascript1.2") != 0 &&
1492 type.compare("text/javascript1.3") != 0 &&
1493 type.compare("text/javascript1.4") != 0 &&
1494 type.compare("text/javascript1.5") != 0 &&
1495 type.compare("text/jscript") != 0 &&
1496 type.compare("text/ecmascript") != 0 &&
1497 type.compare("text/livescript") != 0 &&
1498 type.compare("application/x-javascript") != 0 &&
1499 type.compare("application/x-ecmascript") != 0 &&
1500 type.compare("application/javascript") != 0 &&
1501 type.compare("application/ecmascript") != 0 )
1502 javascript = false;
1503 } else if( a ) {
1505 Mozilla 1.5 doesn't accept jscript or ecmascript, but WinIE 6 does.
1506 Mozilla 1.5 accepts javascript1.0, javascript1.4, and javascript1.5, but WinIE 6 accepts only 1.1 - 1.3.
1507 Neither Mozilla 1.5 nor WinIE 6 accept leading or trailing whitespace.
1508 We want to accept all the values that either of these browsers accept, but not other values.
1510 QString lang = DOMString(a).string();
1511 lang = lang.toLower();
1512 if( lang.compare("") != 0 &&
1513 lang.compare("javascript") != 0 &&
1514 lang.compare("javascript1.0") != 0 &&
1515 lang.compare("javascript1.1") != 0 &&
1516 lang.compare("javascript1.2") != 0 &&
1517 lang.compare("javascript1.3") != 0 &&
1518 lang.compare("javascript1.4") != 0 &&
1519 lang.compare("javascript1.5") != 0 &&
1520 lang.compare("ecmascript") != 0 &&
1521 lang.compare("livescript") != 0 &&
1522 lang.compare("jscript") )
1523 javascript = false;
1527 processToken();
1529 if ( parser->selectMode() && beginTag)
1530 discard = AllDiscard;
1532 switch( tagID ) {
1533 case ID_LISTING:
1534 case ID_PRE:
1535 pre = beginTag;
1536 if (beginTag)
1537 discard = LFDiscard;
1538 prePos = 0;
1539 break;
1540 case ID_BR:
1541 prePos = 0;
1542 break;
1543 case ID_SCRIPT:
1544 if (beginTag) {
1545 searchStopper = scriptEnd;
1546 searchStopperLen = 8;
1547 script = true;
1548 parseSpecial(src);
1550 else if (tagID < ID_CLOSE_TAG) // Handle <script src="foo"/>
1551 scriptHandler();
1552 break;
1553 case ID_STYLE:
1554 if (beginTag) {
1555 searchStopper = styleEnd;
1556 searchStopperLen = 7;
1557 style = true;
1558 parseSpecial(src);
1560 break;
1561 case ID_TEXTAREA:
1562 if(beginTag) {
1563 searchStopper = textareaEnd;
1564 searchStopperLen = 10;
1565 textarea = true;
1566 discard = NoneDiscard;
1567 parseSpecial(src);
1569 break;
1570 case ID_TITLE:
1571 if (beginTag) {
1572 searchStopper = titleEnd;
1573 searchStopperLen = 7;
1574 title = true;
1575 parseSpecial(src);
1577 break;
1578 case ID_XMP:
1579 if (beginTag) {
1580 searchStopper = xmpEnd;
1581 searchStopperLen = 5;
1582 xmp = true;
1583 parseSpecial(src);
1585 break;
1586 case ID_SELECT:
1587 select = beginTag;
1588 break;
1589 case ID_PLAINTEXT:
1590 plaintext = beginTag;
1591 break;
1593 return; // Finished parsing tag!
1595 } // end switch
1597 return;
1600 void HTMLTokenizer::addPending()
1602 if ( select && !(comment || script))
1604 *dest++ = ' ';
1606 else
1608 switch(pending) {
1609 case LFPending: *dest++ = QLatin1Char('\n'); prePos = 0; break;
1610 case SpacePending: *dest++ = QLatin1Char(' '); ++prePos; break;
1611 case TabPending: {
1612 // Don't expand tabs inside <textarea> or script
1613 int p = TAB_SIZE - ( prePos % TAB_SIZE );
1614 if (textarea || script) {
1615 *dest++ = QLatin1Char('\t');
1616 } else {
1617 for ( int x = 0; x < p; x++ )
1618 *dest++ = QLatin1Char(' ');
1620 prePos += p;
1621 break;
1623 case NonePending:
1624 assert(0);
1628 pending = NonePending;
1631 inline bool HTMLTokenizer::continueProcessing(int& processedCount)
1633 // We don't want to be checking elapsed time with every character, so we only check after we've
1634 // processed a certain number of characters.
1635 if (!m_executingScript && processedCount > sTokenizerChunkSize && cachedScript.isEmpty()) {
1636 processedCount = 0;
1637 if ( m_time.elapsed() > m_tokenizerYeldDelay) {
1638 m_yeldTimer = startTimer(0);
1639 m_tokenizerYeldDelay = sTokenizerFastYeldDelay;
1640 return false;
1643 processedCount++;
1644 return true;
// Appends 'str' to the tokenizer input and tokenizes as much of the buffered
// input as possible before returning.
//
// str        - the input to add.
// appendData - when a script is executing, input written with appendData set
//              is queued instead of parsed; it also selects which end of the
//              pending queue receives the data (see below).
//
// The function returns early, without tokenizing, when there is no output
// buffer, when input has to be queued behind a script, when the tokenizer is
// on hold, or when a yield timer is already running.
1647 void HTMLTokenizer::write( const TokenizerString &str, bool appendData )
1649 #ifdef TOKEN_DEBUG
1650 kDebug( 6036 ) << this << " Tokenizer::write(\"" << str.toString() << "\"," << appendData << ")";
1651 #endif
// Without an output buffer there is nothing to tokenize into.
1653 if ( !buffer )
1654 return;
// A script is executing (and this is appended data) or external scripts are
// still queued: remember the input in pendingQueue rather than parsing it.
1656 if ( ( m_executingScript && appendData ) || cachedScript.count() ) {
1657 // don't parse; we will do this later
1658 if (pendingQueue.isEmpty())
1659 pendingQueue.push(str);
1660 else if (appendData)
1661 pendingQueue.bottom().append(str);
1662 else
1663 pendingQueue.top().append(str);
1664 #if PROSPECTIVE_TOKENIZER_ENABLED
// Appended data is also forwarded to the prospective tokenizer while it is
// in progress.
1665 if (m_prospectiveTokenizer && m_prospectiveTokenizer->inProgress() && appendData)
1666 m_prospectiveTokenizer->write(str);
1667 #endif
1668 return;
1671 #if PROSPECTIVE_TOKENIZER_ENABLED
// Normal parsing resumes with appended data, so a prospective tokenizer that
// is still in progress is ended.
1672 if (m_prospectiveTokenizer && m_prospectiveTokenizer->inProgress() && appendData)
1673 m_prospectiveTokenizer->end();
1674 #endif
// While on hold the input is only accumulated in 'src'.
1676 if ( onHold ) {
1677 src.append(str);
1678 return;
// Append to leftover input if there is any; otherwise 'str' becomes the new
// source via setSrc().
1681 if (!src.isEmpty())
1682 src.append(str);
1683 else
1684 setSrc(str);
1686 // Once a timer is set, it has control of when the tokenizer continues.
1687 if (m_yeldTimer > 0)
1688 return;
// Start the clock that continueProcessing() checks against the yield delay.
1690 int processedCount = 0;
1691 m_time.start();
// Main loop: each iteration looks at the current character and hands 'src' to
// the handler matching the current tokenizer state. The order of the checks
// below determines which state wins when several flags are set.
1693 while ( !src.isEmpty() )
1695 if ( m_abort || !continueProcessing(processedCount) )
1696 break;
1697 // do we need to enlarge the buffer?
1698 checkBuffer();
1700 ushort cc = src->unicode();
// skipLF is set after a '\r' (see the newline branch below): a directly
// following '\n' is swallowed, any other character cancels the flag.
1702 if (skipLF && (cc != '\n'))
1703 skipLF = false;
1705 if (skipLF) {
1706 skipLF = false;
1707 ++src;
1709 else if ( Entity )
1710 parseEntity( src, dest );
1711 else if ( plaintext )
1712 parseText( src );
// script, style, xmp, textarea and title content all go through
// parseSpecial().
1713 else if (script)
1714 parseSpecial(src);
1715 else if (style)
1716 parseSpecial(src);
1717 else if (xmp)
1718 parseSpecial(src);
1719 else if (textarea)
1720 parseSpecial(src);
1721 else if (title)
1722 parseSpecial(src);
1723 else if (comment)
1724 parseComment(src);
1725 else if (doctypeComment && doctypeComment != DoctypeCommentEnd && doctypeComment != DoctypeCommentBogus)
1726 parseDoctypeComment(src);
1727 else if (doctype)
1728 parseDoctype(src);
1729 else if (server)
1730 parseServer(src);
1731 else if (processingInstruction)
1732 parseProcessingInstruction(src);
1733 else if (tag)
1734 parseTag(src);
// The previous character was an unescaped '<': 'cc' decides what kind of
// markup (if any) starts here.
1735 else if ( startTag )
1737 startTag = false;
1738 bool endTag = false;
1740 switch(cc) {
1741 case '/':
1742 endTag = true;
1743 break;
1744 case '!':
1746 // <!-- comment --> or <!DOCTYPE ...>
1747 searchCount = 1; // Look for '<!--' sequence to start comment...
1748 doctypeSearchCount = 1; // ... or for '<!DOCTYPE' sequence to start doctype
1749 break;
1751 case '?':
1753 // xml processing instruction
1754 processingInstruction = true;
1755 tquote = NoQuote;
1756 parseProcessingInstruction(src);
1757 continue;
1759 break;
1761 case '%':
// "<%" starts a server-side section unless brokenServer is set; in that case
// control falls through to the default handling.
1762 if (!brokenServer) {
1763 // <% server stuff, handle as comment %>
1764 server = true;
1765 tquote = NoQuote;
1766 parseServer(src);
1767 continue;
1769 // else fall through
1770 default:
1772 if( ((cc >= 'a') && (cc <= 'z')) || ((cc >= 'A') && (cc <= 'Z')))
1774 // Start of a Start-Tag
1776 else
1778 // Invalid tag
1779 // Add as is
// The '<' is emitted as literal text; the current character is handled by the
// next loop iteration.
1780 if (pending)
1781 addPending();
1782 *dest = '<';
1783 dest++;
1784 continue;
1787 }; // end case
1789 // According to SGML any LF immediately after a starttag, or
1790 // immediately before an endtag should be ignored.
1791 // ### Gecko and MSIE though only ignores LF immediately after
1792 // starttags and only for PRE elements -- asj (28/06-2005)
1793 if ( pending )
1794 if (!select)
1795 addPending();
1796 else
1797 pending = NonePending;
1799 // Cancel unused discards
1800 discard = NoneDiscard;
1801 // if (!endTag) discard = LFDiscard;
// Flush the text collected so far as a token, then start parsing the tag name.
1803 processToken();
1805 cBufferPos = 0;
1806 tag = TagName;
1807 parseTag(src);
// Unescaped '&' in text: start of an entity.
1809 else if ( cc == '&' && !src.escaped())
1811 ++src;
1812 if ( pending )
1813 addPending();
1814 discard = NoneDiscard;
1815 parseEntity(src, dest, true);
// Unescaped '<': remember the line it is on and let the next iteration decide
// what kind of markup follows.
1817 else if ( cc == '<' && !src.escaped())
1819 tagStartLineno = lineno+src.lineCount();
1820 ++src;
1821 discard = NoneDiscard;
1822 startTag = true;
// Line breaks: depending on 'discard' the break is dropped or recorded as a
// pending LF.
1824 else if (( cc == '\n' ) || ( cc == '\r' ))
1826 if (discard == SpaceDiscard)
1827 discard = NoneDiscard;
1829 if (discard == LFDiscard) {
1830 // Ignore one LF
1831 discard = NoneDiscard;
1833 else if (discard == AllDiscard)
1835 // Ignore
1837 else
1839 if (select && !script) {
1840 pending = LFPending;
1841 } else {
1842 if (pending)
1843 addPending();
1844 pending = LFPending;
1848 /* Check for MS-DOS CRLF sequence */
1849 if (cc == '\r')
1851 skipLF = true;
1853 ++src;
// Spaces and tabs: depending on 'discard' the character is dropped or
// recorded as pending whitespace.
1855 else if (( cc == ' ' ) || ( cc == '\t' ))
1857 if(discard == LFDiscard)
1858 discard = NoneDiscard;
1860 if(discard == SpaceDiscard) {
1861 // Ignore one space
1862 discard = NoneDiscard;
1864 else if(discard == AllDiscard)
1866 // Ignore
1868 else {
1869 if (select && !script) {
1870 if (!pending)
1871 pending = SpacePending;
1872 } else {
1873 if (pending)
1874 addPending();
1875 if (cc == ' ')
1876 pending = SpacePending;
1877 else
1878 pending = TabPending;
1882 ++src;
// Any other character: flush pending whitespace and copy the character to the
// output buffer.
1884 else
1886 if (pending)
1887 addPending();
1889 discard = NoneDiscard;
1890 if ( pre )
1892 prePos++;
1894 *dest = *src;
1895 fixUpChar( *dest );
1896 ++dest;
1897 ++src;
// All input has been announced as delivered and nothing (queued script,
// running script, yield timer) is outstanding: finish up.
1901 if (noMoreData && cachedScript.isEmpty() && !m_executingScript && m_yeldTimer<=0)
1902 end(); // this actually causes us to be deleted
1905 void HTMLTokenizer::timerEvent( QTimerEvent *e )
1907 if ( e->timerId() == m_yeldTimer ) {
1908 killTimer(m_yeldTimer);
1909 m_yeldTimer = 0;
1910 write( TokenizerString(), true );
1911 } else if ( e->timerId() == m_autoCloseTimer && cachedScript.isEmpty() ) {
1912 finish();
1916 void HTMLTokenizer::setAutoClose( bool b ) {
1917 killTimer( m_autoCloseTimer );
1918 m_autoCloseTimer = 0;
1919 if ( b )
1920 m_autoCloseTimer = startTimer(100);
1923 void HTMLTokenizer::end()
1925 if ( buffer == 0 ) {
1926 emit finishedParsing();
1927 return;
1930 // parseTag is using the buffer for different matters
1931 if ( !tag )
1932 processToken();
1934 if(buffer)
1935 KHTML_DELETE_QCHAR_VEC(buffer);
1937 if(scriptCode)
1938 KHTML_DELETE_QCHAR_VEC(scriptCode);
1940 scriptCode = 0;
1941 scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
1942 buffer = 0;
1943 emit finishedParsing();
// Called when no more input will arrive. Stops the auto-close timer,
// re-feeds any text that is still held in scriptCode because a title,
// script, comment or server section was never terminated, marks the
// tokenizer as having received all data, and calls end() unless something is
// still outstanding (queued external script, executing script, on hold, or a
// yield timer).
1946 void HTMLTokenizer::finish()
1948 if ( m_autoCloseTimer ) {
1949 killTimer( m_autoCloseTimer );
1950 m_autoCloseTimer = 0;
1952 // do this as long as we don't find matching comment ends
1953 while((title || script || comment || server) && scriptCode && scriptCodeSize)
1955 // we've found an unmatched comment start
// Remember which kind of section was left open; the broken* flags are
// consulted elsewhere in this file (e.g. brokenServer when "<%" is seen).
1956 if (comment)
1957 brokenComments = true;
1958 else if (server)
1959 brokenServer = true;
1960 else if (script)
1961 brokenScript = true;
// Two terminating zero characters are written after the collected text.
1963 checkScriptBuffer();
1964 scriptCode[ scriptCodeSize ] = 0;
1965 scriptCode[ scriptCodeSize + 1 ] = 0;
1966 int pos;
// 'food' receives a copy of the text that will be fed back into write().
1967 QString food;
1968 if (title || style || script)
1969 food.setUnicode(scriptCode, scriptCodeSize);
1970 else if (server) {
// For a server section a "<" is put in front of the collected text.
1971 food = "<";
1972 food += QString(scriptCode, scriptCodeSize);
1974 else {
// Comment case: only the text after the first '>' is kept.
1975 pos = QString::fromRawData(scriptCode, scriptCodeSize).indexOf('>');
1976 food.setUnicode(scriptCode+pos+1, scriptCodeSize-pos-1); // deep copy
// The script buffer is released before the text is re-fed.
1978 KHTML_DELETE_QCHAR_VEC(scriptCode);
1979 scriptCode = 0;
1980 scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
1981 if (script)
1982 scriptHandler();
// Leave all the special states before tokenizing the recovered text again.
1984 comment = title = server = script = false;
1985 if ( !food.isEmpty() )
1986 write(food, true);
1988 // this indicates we will not receive any more data... but if we are waiting on
1989 // an external script to load, we can't finish parsing until that is done
1990 noMoreData = true;
1991 if (cachedScript.isEmpty() && !m_executingScript && !onHold && m_yeldTimer <= 0)
1992 end(); // this actually causes us to be deleted
1995 void HTMLTokenizer::processToken()
1997 KJSProxy *jsProxy = view ? view->part()->jScript() : 0L;
1998 if (jsProxy)
1999 jsProxy->setEventHandlerLineno(tagStartLineno);
2000 if ( dest > buffer )
2002 #if 0
2003 if(currToken.tid) {
2004 qDebug( "unexpected token id: %d, str: *%s*", currToken.tid,QString::fromRawData( buffer, dest-buffer ).toLatin1().constData() );
2005 assert(0);
2008 #endif
2009 currToken.text = new DOMStringImpl( buffer, dest - buffer );
2010 currToken.text->ref();
2011 if (currToken.tid != ID_COMMENT)
2012 currToken.tid = ID_TEXT;
2014 else if(!currToken.tid) {
2015 currToken.reset();
2016 if (jsProxy)
2017 jsProxy->setEventHandlerLineno(lineno+src.lineCount());
2018 return;
2021 dest = buffer;
2023 #ifdef TOKEN_DEBUG
2024 QString name = QString( getTagName(currToken.tid) );
2025 QString text;
2026 if(currToken.text)
2027 text = QString::fromRawData(currToken.text->s, currToken.text->l);
2029 kDebug( 6036 ) << "Token --> " << name << " id = " << currToken.tid;
2030 if (currToken.flat)
2031 kDebug( 6036 ) << "Token is FLAT!";
2032 if(!text.isNull())
2033 kDebug( 6036 ) << "text: \"" << text << "\"";
2034 unsigned long l = currToken.attrs ? currToken.attrs->length() : 0;
2035 if(l) {
2036 kDebug( 6036 ) << "Attributes: " << l;
2037 for (unsigned long i = 0; i < l; ++i) {
2038 NodeImpl::Id tid = currToken.attrs->idAt(i);
2039 DOMString value = currToken.attrs->valueAt(i);
2040 kDebug( 6036 ) << " " << tid << " " << parser->doc()->document()->getName(NodeImpl::AttributeId, tid).string()
2041 << "=\"" << value.string() << "\"" << endl;
2044 kDebug( 6036 );
2045 #endif
2047 // In some cases, parseToken() can cause javascript code to be executed
2048 // (for example, when setting an attribute that causes an event handler
2049 // to be created). So we need to protect against re-entrancy into the parser
2050 m_executingScript++;
2052 // pass the token over to the parser, the parser DOES NOT delete the token
2053 parser->parseToken(&currToken);
2055 m_executingScript--;
2057 if ( currToken.flat && currToken.tid != ID_TEXT && !parser->noSpaces() )
2058 discard = NoneDiscard;
2060 currToken.reset();
2061 if (jsProxy)
2062 jsProxy->setEventHandlerLineno(0);
2065 void HTMLTokenizer::processDoctypeToken()
2067 // kDebug( 6036 ) << "Process DoctypeToken (name: " << doctypeToken.name << ", publicID: " << doctypeToken.publicID << ", systemID: " << doctypeToken.systemID;
2068 doctypeToken.publicID = doctypeToken.publicID.simplified();
2069 doctypeToken.systemID = doctypeToken.systemID.simplified();
2070 parser->parseDoctypeToken(&doctypeToken);
2074 HTMLTokenizer::~HTMLTokenizer()
2076 reset();
2077 delete m_prospectiveTokenizer;
2078 delete parser;
2082 void HTMLTokenizer::enlargeBuffer(int len)
2084 int newsize = qMax(size*2, size+len);
2085 int oldoffs = (dest - buffer);
2087 buffer = KHTML_REALLOC_QCHAR_VEC(buffer, newsize);
2088 dest = buffer + oldoffs;
2089 size = newsize;
2092 void HTMLTokenizer::enlargeScriptBuffer(int len)
2094 int newsize = qMax(scriptCodeMaxSize*2, scriptCodeMaxSize+len);
2095 scriptCode = KHTML_REALLOC_QCHAR_VEC(scriptCode, newsize);
2096 scriptCodeMaxSize = newsize;
// Callback invoked when a cached object has finished loading (the argument
// itself is unused). Executes, in queue order, every external script at the
// head of cachedScript that is already loaded, and once the queue is empty
// resumes tokenizing with the input that was deferred in pendingQueue.
2099 void HTMLTokenizer::notifyFinished(CachedObject* /*finishedObj*/)
// There must be at least one queued script when this is called.
2101 assert(!cachedScript.isEmpty());
2102 bool done = false;
// Stop at the first script that is not loaded yet.
2103 while (!done && cachedScript.head()->isLoaded()) {
2105 kDebug( 6036 ) << "Finished loading an external script";
2107 CachedScript* cs = cachedScript.dequeue();
2108 DOMString scriptSource = cs->script();
2109 #ifdef TOKEN_DEBUG
2110 kDebug( 6036 ) << "External script is:" << endl << scriptSource.string();
2111 #endif
// Replace the current source with an empty one before running the script.
2112 setSrc(TokenizerString());
2114 // make sure we forget about the script before we execute the new one
2115 // infinite recursion might happen otherwise
// The URL is copied first because 'cs' is dereferenced before execution.
2116 QString cachedScriptUrl( cs->url().string() );
2117 cs->deref(this);
2119 scriptExecution( scriptSource.string(), cachedScriptUrl );
// 'done' is a local, so the loop condition can be evaluated without touching
// members once the last script has run.
2121 done = cachedScript.isEmpty();
2123 // 'script' is true when we are called synchronously from
2124 // scriptHandler(). In that case scriptHandler() will take care
2125 // of 'scriptOutput'.
2126 if ( !script ) {
// Collapse the pending queue into a single entry by prepending each popped
// entry to the one below it.
2127 while (pendingQueue.count() > 1) {
2128 TokenizerString t = pendingQueue.pop();
2129 pendingQueue.top().prepend( t );
2131 if (done) {
2132 write(pendingQueue.pop(), false);
2134 // we might be deleted at this point, do not
2135 // access any members.
2140 bool HTMLTokenizer::isWaitingForScripts() const
2142 return cachedScript.count();
2145 bool HTMLTokenizer::isExecutingScript() const
2147 return (m_executingScript > 0);
2150 void HTMLTokenizer::setSrc(const TokenizerString& source)
2152 lineno += src.lineCount();
2153 src = source;
2154 src.resetLineCount();
2157 void HTMLTokenizer::setOnHold(bool _onHold)
2159 if (onHold == _onHold) return;
2160 onHold = _onHold;