2 This file is part of the KDE libraries
4 Copyright (C) 1997 Martin Jones (mjones@kde.org)
5 (C) 1997 Torben Weis (weis@kde.org)
6 (C) 1998 Waldo Bastian (bastian@kde.org)
7 (C) 1999 Lars Knoll (knoll@kde.org)
8 (C) 1999 Antti Koivisto (koivisto@kde.org)
9 (C) 2001-2003 Dirk Mueller (mueller@kde.org)
10 (C) 2004-2008 Apple Computer, Inc.
11 (C) 2006-2008 Germain Garand (germain@ebooksfrance.org)
13 This library is free software; you can redistribute it and/or
14 modify it under the terms of the GNU Library General Public
15 License as published by the Free Software Foundation; either
16 version 2 of the License, or (at your option) any later version.
18 This library is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 Library General Public License for more details.
23 You should have received a copy of the GNU Library General Public License
24 along with this library; see the file COPYING.LIB. If not, write to
25 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
26 Boston, MA 02110-1301, USA.
28 //----------------------------------------------------------------------------
30 // KDE HTML Widget - Tokenizers
32 // #define TOKEN_DEBUG 1
33 //#define TOKEN_DEBUG 2
35 #include "htmltokenizer.h"
36 #include "html_documentimpl.h"
37 #include "htmlparser.h"
40 #include <misc/loader.h>
41 #include <misc/htmlhashes.h>
43 #include <khtmlview.h>
44 #include <khtml_part.h>
45 #include <xml/dom_docimpl.h>
46 #include <css/csshelper.h>
47 #include <ecma/kjs_proxy.h>
48 #include <kcharsets.h>
52 #include <QtCore/QVariant>
58 #include "kentities.c"
59 #include "htmlprospectivetokenizer.h"
61 #define PROSPECTIVE_TOKENIZER_ENABLED 1
63 using namespace khtml
;
// Literal prefixes the tokenizer matches input against. The doctype-related
// strings are lower-case; the code that uses them lower-cases each input
// character before comparing.
65 static const QChar commentStart
[] = { '<','!','-','-', QChar::Null
};
66 static const char doctypeStart
[] = "<!doctype";
67 static const char publicStart
[] = "public";
68 static const char systemStart
[] = "system";
// Closing-tag prefixes. One of these is assigned to searchStopper while the
// content of the matching element is being scanned.
70 static const char scriptEnd
[] = "</script";
71 static const char xmpEnd
[] = "</xmp";
72 static const char styleEnd
[] = "</style";
73 static const char textareaEnd
[] = "</textarea";
74 static const char titleEnd
[] = "</title";
// Tuning constants for chunked processing and yielding.
// NOTE(review): each of the three constants below appears twice in this
// listing with different values; presumably a preprocessor conditional that
// is not visible here selects one set -- confirm against the full file.
77 static const int sTokenizerChunkSize
= 2048;
78 static const int sTokenizerFastYeldDelay
= 220;
79 static const int sTokenizerYeldDelay
= 650;
81 static const int sTokenizerChunkSize
= 4096;
82 static const int sTokenizerFastYeldDelay
= 180;
83 static const int sTokenizerYeldDelay
= 450;
// malloc/realloc/free wrappers for QChar arrays. N is a number of QChars
// (the macros multiply by sizeof(QChar)), not a byte count.
86 #define KHTML_ALLOC_QCHAR_VEC( N ) (QChar*) malloc( sizeof(QChar)*( N ) )
87 #define KHTML_REALLOC_QCHAR_VEC(P, N ) (QChar*) realloc(P, sizeof(QChar)*( N ))
88 #define KHTML_DELETE_QCHAR_VEC( P ) free((char*)( P ))
90 // Full support for MS Windows extensions to Latin-1.
91 // Technically these extensions should only be activated for pages
92 // marked "windows-1252" or "cp1252", but
93 // in the standard Microsoft way, these extensions infect hundreds of thousands
94 // of web pages. Note that people with non-latin-1 Microsoft extensions
97 // See: http://www.microsoft.com/globaldev/reference/WinCP.asp
98 // http://www.bbsinc.com/iso8859.html
99 // http://www.obviously.com/
101 // There may be better equivalents
//
// fixUpChar(x): switches on x.unicode() and, for the code points listed
// below (all in the range 0x80-0x9F), overwrites x with the replacement
// code point given in the matching case. x must be an assignable value with
// a unicode() accessor; it is evaluated more than once.
// NOTE(review): 0x81, 0x8D, 0x8F, 0x90 and 0x9D have no case in the visible
// part of the macro; the default branch, if any, is not visible here.
105 #define fixUpChar(x) \
106 switch ((x).unicode()) \
108 case 0x80: (x) = 0x20ac; break; \
109 case 0x82: (x) = 0x201a; break; \
110 case 0x83: (x) = 0x0192; break; \
111 case 0x84: (x) = 0x201e; break; \
112 case 0x85: (x) = 0x2026; break; \
113 case 0x86: (x) = 0x2020; break; \
114 case 0x87: (x) = 0x2021; break; \
115 case 0x88: (x) = 0x02C6; break; \
116 case 0x89: (x) = 0x2030; break; \
117 case 0x8A: (x) = 0x0160; break; \
118 case 0x8b: (x) = 0x2039; break; \
119 case 0x8C: (x) = 0x0152; break; \
120 case 0x8E: (x) = 0x017D; break; \
121 case 0x91: (x) = 0x2018; break; \
122 case 0x92: (x) = 0x2019; break; \
123 case 0x93: (x) = 0x201C; break; \
124 case 0x94: (x) = 0X201D; break; \
125 case 0x95: (x) = 0x2022; break; \
126 case 0x96: (x) = 0x2013; break; \
127 case 0x97: (x) = 0x2014; break; \
128 case 0x98: (x) = 0x02DC; break; \
129 case 0x99: (x) = 0x2122; break; \
130 case 0x9A: (x) = 0x0161; break; \
131 case 0x9b: (x) = 0x203A; break; \
132 case 0x9C: (x) = 0x0153; break; \
133 case 0x9E: (x) = 0x017E; break; \
134 case 0x9F: (x) = 0x0178; break; \
138 // ----------------------------------------------------------------------------
// Constructs a tokenizer for a document that is displayed in a view.
// Visible initialisation: the script-code buffer bookkeeping is zeroed, the
// global charset helper is fetched, a KHTMLParser is created from
// (_view, _doc), no script is executing, no auto-close timer is running, the
// tokenizer starts with the fast yield delay, and no prospective tokenizer
// exists yet.
140 HTMLTokenizer::HTMLTokenizer(DOM::DocumentImpl
*_doc
, KHTMLView
*_view
)
145 scriptCodeSize
= scriptCodeMaxSize
= scriptCodeResync
= 0;
146 charsets
= KGlobal::charsets();
// The parser is allocated here with new; its release is not visible in this
// block.
147 parser
= new KHTMLParser(_view
, _doc
);
148 m_executingScript
= 0;
149 m_autoCloseTimer
= 0;
150 m_tokenizerYeldDelay
= sTokenizerFastYeldDelay
;
152 m_prospectiveTokenizer
= 0;
// Constructs a tokenizer whose parser is created from a document fragment
// (i) and its document (_doc) rather than from a view. The remaining visible
// initialisation is the same as in the view-based constructor: zeroed
// script-code bookkeeping, global charsets, no executing script, no
// auto-close timer, fast yield delay, no prospective tokenizer.
158 HTMLTokenizer::HTMLTokenizer(DOM::DocumentImpl
*_doc
, DOM::DocumentFragmentImpl
*i
)
163 scriptCodeSize
= scriptCodeMaxSize
= scriptCodeResync
= 0;
164 charsets
= KGlobal::charsets();
165 parser
= new KHTMLParser( i
, _doc
);
166 m_executingScript
= 0;
167 m_autoCloseTimer
= 0;
168 m_tokenizerYeldDelay
= sTokenizerFastYeldDelay
;
170 m_prospectiveTokenizer
= 0;
// Switches the tokenizer from the fast yield delay set by the constructors
// to the normal one (sTokenizerYeldDelay).
176 void HTMLTokenizer::setNormalYeldDelay()
178 m_tokenizerYeldDelay
= sTokenizerYeldDelay
;
// Releases the tokenizer's working state. Must not be called while a script
// is executing or while the tokenizer is on hold (both are asserted).
// Visible effects: every queued cached script is dereferenced, the output
// buffer and the script-code buffer are freed, the script-code bookkeeping
// is zeroed, running timers are killed and the doctype token is reset.
181 void HTMLTokenizer::reset()
183 assert(m_executingScript
== 0);
184 Q_ASSERT(onHold
== false);
// Drop our reference on every script still waiting in the queue.
187 while (!cachedScript
.isEmpty())
188 cachedScript
.dequeue()->deref(this);
191 KHTML_DELETE_QCHAR_VEC(buffer
);
196 KHTML_DELETE_QCHAR_VEC(scriptCode
);
198 scriptCodeSize
= scriptCodeMaxSize
= scriptCodeResync
= 0;
200 if (m_autoCloseTimer
> 0) {
201 killTimer(m_autoCloseTimer
);
202 m_autoCloseTimer
= 0;
205 if (m_yeldTimer
> 0) {
206 killTimer(m_yeldTimer
);
210 doctypeToken
.reset();
// Puts the tokenizer into its initial state before input is written to it.
// Visible effects: no script executing, a fresh output buffer of 255 QChars,
// nothing pending or being discarded, and the processing-instruction,
// doctype and "broken input" workaround flags cleared.
213 void HTMLTokenizer::begin()
215 m_executingScript
= 0;
// Initial output buffer: room for 255 QChars.
219 buffer
= KHTML_ALLOC_QCHAR_VEC( 255 );
222 pending
= NonePending
;
223 discard
= NoneDiscard
;
228 processingInstruction
= false;
236 doctypeComment
= NoDoctypeComment
;
237 doctypeAllowComment
= false;
244 doctypeSearchCount
= 0;
245 doctypeSecondarySearchCount
= 0;
248 brokenComments
= false;
249 brokenServer
= false;
250 brokenScript
= false;
252 scriptStartLineno
= 0;
// Consumes `list` (taken by value) character by character and emits it as
// preformatted text, tracking pending spaces, tabs and line feeds.
// When not inside <style>, the `pre` flag is switched on first.
256 void HTMLTokenizer::processListing(TokenizerString list
)
260 // This function adds the listing 'list' as
261 // preformatted text-tokens to the token-collection
262 // thereby converting TABs.
263 if(!style
) pre
= true;
266 while ( !list
.isEmpty() )
// Make sure the output buffer has room before handling the next character.
268 checkBuffer(3*TAB_SIZE
);
270 if (skipLF
&& ( list
->unicode() != '\n' ))
280 else if (( list
->unicode() == '\n' ) || ( list
->unicode() == '\r' ))
282 if (discard
== LFDiscard
)
285 discard
= NoneDiscard
; // We have discarded 1 LF
293 // we used to do it not at all and we want to have
294 // it fixed for textarea. So here we are
301 /* Check for MS-DOS CRLF sequence */
302 if (list
->unicode() == '\r')
308 else if (( list
->unicode() == ' ' ) || ( list
->unicode() == '\t'))
313 pending
= SpacePending
;
315 pending
= TabPending
;
321 discard
= NoneDiscard
;
// After the loop: a space or tab that is still pending is cleared here.
332 if ((pending
== SpacePending
) || (pending
== TabPending
))
335 pending
= NonePending
;
// Scans the raw content of a "special" element. Exactly one of xmp,
// textarea, title, style or script must be set on entry (asserted below).
// Characters are accumulated in scriptCode until the closing-tag prefix
// stored in searchStopper has been seen and the tag is closed by '>'.
// The accumulated text is then passed to processListing() and the matching
// close-tag id is stored in currToken.
// Entities are only expanded inside <textarea> and <title>.
341 void HTMLTokenizer::parseSpecial(TokenizerString
&src
)
343 assert( textarea
|| title
|| !Entity
);
345 assert( xmp
+textarea
+title
+style
+script
== 1 );
347 scriptStartLineno
= lineno
+src
.lineCount();
349 if ( comment
) parseComment( src
);
351 while ( !src
.isEmpty() ) {
353 unsigned char ch
= src
->toLatin1();
// An unescaped "<!--" (the final '-' is the current character) outside
// textarea/xmp is detected here.
354 if ( !scriptCodeResync
&& !brokenComments
&& !textarea
&& !xmp
&& ch
== '-' && scriptCodeSize
>= 3 && !src
.escaped() && QString::fromRawData( scriptCode
+scriptCodeSize
-3, 3 ) == "<!-" ) {
356 scriptCode
[ scriptCodeSize
++ ] = ch
;
// The closing tag was recognised earlier (scriptCodeResync set) and its
// terminating '>' has now arrived outside of quotes: cut the buffer back to
// where the closing tag started.
361 if ( scriptCodeResync
&& !tquote
&& ( ch
== '>' ) ) {
363 scriptCodeSize
= scriptCodeResync
-1;
364 scriptCodeResync
= 0;
365 scriptCode
[ scriptCodeSize
] = scriptCode
[ scriptCodeSize
+ 1 ] = 0;
369 processListing(TokenizerString(scriptCode
, scriptCodeSize
));
371 if ( style
) { currToken
.tid
= ID_STYLE
+ ID_CLOSE_TAG
; }
372 else if ( textarea
) { currToken
.tid
= ID_TEXTAREA
+ ID_CLOSE_TAG
; }
373 else if ( title
) { currToken
.tid
= ID_TITLE
+ ID_CLOSE_TAG
; }
374 else if ( xmp
) { currToken
.tid
= ID_XMP
+ ID_CLOSE_TAG
; }
376 script
= style
= textarea
= title
= xmp
= false;
378 scriptCodeSize
= scriptCodeResync
= 0;
382 // possible end of tagname, let's check.
// indexOf() returning 0 means the tail of the buffer matches searchStopper
// (case-insensitively); scriptCodeResync then records where it starts, plus 1.
383 if ( !scriptCodeResync
&& !escaped
&& !src
.escaped() && ( ch
== '>' || ch
== '/' || ch
<= ' ' ) && ch
&&
384 scriptCodeSize
>= searchStopperLen
&&
385 !QString::fromRawData( scriptCode
+scriptCodeSize
-searchStopperLen
, searchStopperLen
).indexOf( searchStopper
, 0, Qt::CaseInsensitive
)) {
386 scriptCodeResync
= scriptCodeSize
-searchStopperLen
+1;
// While inside the closing tag, track quote state so that a quoted '>' does
// not end it.
390 if ( scriptCodeResync
&& !escaped
) {
392 tquote
= (tquote
== NoQuote
) ? DoubleQuote
: ((tquote
== SingleQuote
) ? SingleQuote
: NoQuote
);
394 tquote
= (tquote
== NoQuote
) ? SingleQuote
: (tquote
== DoubleQuote
) ? DoubleQuote
: NoQuote
;
395 else if (tquote
!= NoQuote
&& (ch
== '\r' || ch
== '\n'))
// A backslash escapes the next character unless it is itself escaped.
398 escaped
= ( !escaped
&& ch
== '\\' );
399 if (!scriptCodeResync
&& (textarea
||title
) && !src
.escaped() && ch
== '&') {
400 QChar
*scriptCodeDest
= scriptCode
+scriptCodeSize
;
402 parseEntity(src
,scriptCodeDest
,true);
403 scriptCodeSize
= scriptCodeDest
-scriptCode
;
406 scriptCode
[ scriptCodeSize
++ ] = *src
;
// Handles a completed <script> element. The collected script text is run
// through processListing() and captured as exScript, and a </script> close
// token is prepared. Then, unless the parser is in skip mode or the document
// body is a frameset, the script is either requested from its src URL and
// queued in cachedScript, or (for an inline script with a view and
// javascript enabled) executed directly. Remaining input is parked on
// pendingQueue while that happens and is restored or re-queued afterwards.
412 void HTMLTokenizer::scriptHandler()
// Copy scriptSrc into a local before the work below.
414 QString currentScriptSrc
= scriptSrc
;
417 processListing(TokenizerString(scriptCode
, scriptCodeSize
));
418 QString
exScript( buffer
, dest
-buffer
);
421 currToken
.tid
= ID_SCRIPT
+ ID_CLOSE_TAG
;
424 // Scripts following a frameset element should not be executed or even loaded in the case of extern scripts.
425 bool followingFrameset
= (parser
->doc()->body() && parser
->doc()->body()->id() == ID_FRAMESET
);
426 bool effectiveScript
= !parser
->skipMode() && !followingFrameset
;
427 bool deferredScript
= false;
429 if ( effectiveScript
) {
430 CachedScript
* cs
= 0;
432 // forget what we just got, load from src url instead
433 if ( !currentScriptSrc
.isEmpty() && javascript
&&
434 (cs
= parser
->doc()->docLoader()->requestScript(currentScriptSrc
, scriptSrcCharset
) )) {
435 cachedScript
.enqueue(cs
);
// Park the remaining input and continue with an empty source.
439 pendingQueue
.push(src
);
440 int scriptCount
= cachedScript
.count();
441 setSrc(TokenizerString());
442 scriptCodeSize
= scriptCodeResync
= 0;
// If the queue length is unchanged since scriptCount was taken, the script
// is marked as deferred.
444 if (cachedScript
.count() == scriptCount
)
445 deferredScript
= true;
447 else if (currentScriptSrc
.isEmpty() && view
&& javascript
) {
448 pendingQueue
.push(src
);
449 setSrc(TokenizerString());
450 scriptCodeSize
= scriptCodeResync
= 0;
451 scriptExecution( exScript
, QString(), tagStartLineno
/*scriptStartLineno*/ );
453 // script was filtered or disallowed
454 effectiveScript
= false;
459 scriptCodeSize
= scriptCodeResync
= 0;
461 if ( !effectiveScript
)
// Restore the parked input: append it to src, write it back, or merge it
// into the next pending entry.
464 if ( !m_executingScript
&& cachedScript
.isEmpty() ) {
465 src
.append(pendingQueue
.pop());
466 } else if ( cachedScript
.isEmpty() ) {
467 write( pendingQueue
.pop(), false );
468 } else if ( !deferredScript
&& pendingQueue
.count() > 1) {
469 TokenizerString t
= pendingQueue
.pop();
470 pendingQueue
.top().prepend( t
);
// While external scripts are still outstanding, feed the parked input to a
// ProspectiveTokenizer (created on first use).
472 #if PROSPECTIVE_TOKENIZER_ENABLED
473 if (!cachedScript
.isEmpty() && !m_executingScript
) {
474 if (!m_prospectiveTokenizer
)
475 m_prospectiveTokenizer
= new ProspectiveTokenizer(parser
->docPtr());
476 if (!m_prospectiveTokenizer
->inProgress() && !pendingQueue
.isEmpty()) {
477 m_prospectiveTokenizer
->begin();
478 m_prospectiveTokenizer
->write(pendingQueue
.top());
// Executes the script text `str` through the view's part.
// str       - script source to run.
// scriptURL - URL reported for the script; when it is null and a view
//             exists, the URL of the part's document is used instead.
// The remaining parameter(s) are on a line that is not visible here
// (baseLine is passed on to executeScript below).
// The current value of the `script` flag is saved in oldscript on entry.
485 void HTMLTokenizer::scriptExecution( const QString
& str
, const QString
& scriptURL
,
488 bool oldscript
= script
;
492 if (scriptURL
.isNull() && view
)
493 url
= static_cast<DocumentImpl
*>(view
->part()->document().handle())->URL().url();
498 view
->part()->executeScript(url
,baseLine
,Node(),str
);
// Consumes an HTML comment from src, copying each character into
// scriptCode. At every '>' it checks whether the two preceding characters
// are "--"; in broken-comment mode (outside script/style) a '>' is handled
// without that check. When the comment is not nested inside
// title/script/xmp/textarea/style, its text is emitted between an
// ID_COMMENT token and its close token.
503 void HTMLTokenizer::parseComment(TokenizerString
&src
)
// Reserve room for the whole remaining input up front.
505 checkScriptBuffer(src
.length());
506 while ( src
.length() ) {
507 scriptCode
[ scriptCodeSize
++ ] = *src
;
509 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
510 qDebug("comment is now: *%s*", src
.toString().left(16).toLatin1().constData());
513 if (src
->unicode() == '>')
515 bool handleBrokenComments
= brokenComments
&& !( script
|| style
);
// Local flag; it shadows the file-level scriptEnd string within this scope.
516 bool scriptEnd
=false;
517 if ( scriptCodeSize
> 2 && scriptCode
[scriptCodeSize
-3] == '-' &&
518 scriptCode
[scriptCodeSize
-2] == '-' )
523 if (handleBrokenComments
|| scriptEnd
){
525 if ( !( title
|| script
|| xmp
|| textarea
|| style
) ) {
527 scriptCode
[ scriptCodeSize
] = 0;
528 scriptCode
[ scriptCodeSize
+ 1 ] = 0;
529 currToken
.tid
= ID_COMMENT
;
// The last three characters are left out of the emitted text.
530 processListing(TokenizerString(scriptCode
, scriptCodeSize
- 3));
532 currToken
.tid
= ID_COMMENT
+ ID_CLOSE_TAG
;
537 return; // Finished parsing comment
// Advances the doctypeComment state machine over src. States visible here:
// DoctypeCommentHalfBegin -> DoctypeComment (or DoctypeCommentBogus when
// the comment opener is not completed), DoctypeComment ->
// DoctypeCommentHalfEnd, DoctypeCommentHalfEnd -> DoctypeCommentEnd (or
// back to DoctypeComment). Any other state trips the assertion at the end.
544 void HTMLTokenizer::parseDoctypeComment(TokenizerString
&src
)
546 while (!src
.isEmpty()) {
548 switch (doctypeComment
) {
549 case DoctypeCommentHalfBegin
: {
551 // Oops, it's not a comment
552 doctypeComment
= DoctypeCommentBogus
;
555 // Doctype comment begins
556 doctypeComment
= DoctypeComment
;
561 case DoctypeComment
: {
563 // Perhaps this is the end of the comment
564 doctypeComment
= DoctypeCommentHalfEnd
;
567 // Keep scanning for '--'
572 case DoctypeCommentHalfEnd
: {
574 // Doctype comment ends
575 doctypeComment
= DoctypeCommentEnd
;
580 doctypeComment
= DoctypeComment
;
585 assert(!"Undefined doctype comment state");
// Parses a <!DOCTYPE ...> declaration from src, driven by
// doctypeToken.state. The name, public identifier, system identifier and
// internal subset are appended to doctypeToken as they are read, and
// processDoctypeToken() is called at the points marked "Valid doctype".
// Input that does not fit moves the state to DoctypeBogus.
// Comments inside the declaration are delegated to parseDoctypeComment(),
// but only while doctypeAllowComment is set; it is cleared inside quoted
// identifiers and the internal subset.
592 void HTMLTokenizer::parseDoctype(TokenizerString
&src
)
594 while (!src
.isEmpty() && doctype
) {
596 bool isWhitespace
= false;
// A comment that just ended, or that turned out to be bogus, returns the
// comment state to "none" before the character is interpreted.
598 if (doctypeComment
== DoctypeCommentEnd
) {
599 doctypeComment
= NoDoctypeComment
;
601 } else if (doctypeComment
== DoctypeCommentBogus
) {
602 doctypeComment
= NoDoctypeComment
;
607 if (doctypeAllowComment
) {
608 if (!doctypeComment
&& c
== '-') {
609 doctypeComment
= DoctypeCommentHalfBegin
;
612 if (doctypeComment
) {
613 parseDoctypeComment(src
);
616 isWhitespace
= c
== '\r' || c
== '\n' || c
== '\t' || c
== ' ';
620 switch (doctypeToken
.state
) {
622 doctypeToken
.state
= DoctypeBeforeName
;
628 case DoctypeBeforeName
: {
630 // Malformed. Just exit.
632 } else if (isWhitespace
) {
636 doctypeToken
.state
= DoctypeName
;
642 // Valid doctype. Emit it.
644 processDoctypeToken();
645 } else if (isWhitespace
) {
646 doctypeSearchCount
= 0; // Used now to scan for PUBLIC
647 doctypeSecondarySearchCount
= 0; // Used now to scan for SYSTEM
648 doctypeToken
.state
= DoctypeAfterName
;
650 doctypeToken
.name
.append(c
);
654 case DoctypeAfterName
: {
656 // Valid doctype. Emit it.
658 processDoctypeToken();
659 } else if (c
== '[') {
660 if(doctypeSearchCount
> 0 || doctypeSecondarySearchCount
> 0) { // is there any public/system indicator before?
661 doctypeSearchCount
= doctypeSecondarySearchCount
= 0;
662 doctypeToken
.state
= DoctypeBogus
;
664 // Found internal subset
665 doctypeToken
.state
= DoctypeInternalSubset
;
666 doctypeAllowComment
= false;
667 } else if (!isWhitespace
) {
// Match the keyword one character at a time, case-insensitively; 6 is the
// length of both "public" and "system".
668 if (c
.toLower() == publicStart
[doctypeSearchCount
]) {
669 doctypeSearchCount
++;
670 if(doctypeSearchCount
== 6)
671 // Found 'PUBLIC' sequence
672 doctypeToken
.state
= DoctypeBeforePublicID
;
673 } else if (doctypeSearchCount
> 0) {
674 doctypeSearchCount
= 0;
675 doctypeToken
.state
= DoctypeBogus
;
676 } else if (c
.toLower() == systemStart
[doctypeSecondarySearchCount
]) {
677 doctypeSecondarySearchCount
++;
678 if(doctypeSecondarySearchCount
== 6)
679 // Found 'SYSTEM' sequence
680 doctypeToken
.state
= DoctypeBeforeSystemID
;
682 doctypeSecondarySearchCount
= 0;
683 doctypeToken
.state
= DoctypeBogus
;
686 // Whitespace keeps us in the after name state
690 case DoctypeBeforePublicID
: {
691 if (c
== '\"' || c
== '\'') {
// Remember which quote opened the identifier so that only the same kind
// closes it.
692 tquote
= c
== '\"' ? DoubleQuote
: SingleQuote
;
693 doctypeToken
.state
= DoctypePublicID
;
694 doctypeAllowComment
= false;
695 } else if (c
== '>') {
696 // Considered bogus. Don't process the doctype.
698 } else if (isWhitespace
) {
701 doctypeToken
.state
= DoctypeBogus
;
704 case DoctypePublicID
: {
705 if ((c
== '\"' && tquote
== DoubleQuote
) || (c
== '\'' && tquote
== SingleQuote
)) {
706 doctypeToken
.state
= DoctypeAfterPublicID
;
707 doctypeAllowComment
= true;
708 } else if (c
== '>') {
709 // Considered bogus. Don't process the doctype.
712 doctypeToken
.publicID
.append(c
);
716 case DoctypeAfterPublicID
: {
717 if (c
== '\"' || c
== '\'') {
718 tquote
= c
== '\"' ? DoubleQuote
: SingleQuote
;
719 doctypeToken
.state
= DoctypeSystemID
;
720 } else if (c
== '>') {
721 // Valid doctype. Emit it now.
723 processDoctypeToken();
724 } else if (isWhitespace
) {
726 } else if (c
== '[') {
727 // Found internal subset
728 doctypeToken
.state
= DoctypeInternalSubset
;
729 doctypeAllowComment
= false;
731 doctypeToken
.state
= DoctypeBogus
;
734 case DoctypeBeforeSystemID
: {
735 if (c
== '\"' || c
== '\'') {
736 tquote
= c
== '\"' ? DoubleQuote
: SingleQuote
;
737 doctypeToken
.state
= DoctypeSystemID
;
738 doctypeAllowComment
= false;
739 } else if (c
== '>') {
740 // Considered bogus. Don't process the doctype.
742 } else if (isWhitespace
) {
745 doctypeToken
.state
= DoctypeBogus
;
748 case DoctypeSystemID
: {
749 if ((c
== '\"' && tquote
== DoubleQuote
) || (c
== '\'' && tquote
== SingleQuote
)) {
750 doctypeToken
.state
= DoctypeAfterSystemID
;
751 doctypeAllowComment
= true;
752 } else if (c
== '>') {
753 // Considered bogus. Don't process the doctype.
756 doctypeToken
.systemID
.append(c
);
760 case DoctypeAfterSystemID
: {
762 // Valid doctype. Emit it now.
764 processDoctypeToken();
765 } else if (isWhitespace
) {
767 } else if (c
== '[') {
768 // Found internal subset
769 doctypeToken
.state
= DoctypeInternalSubset
;
770 doctypeAllowComment
= false;
772 doctypeToken
.state
= DoctypeBogus
;
776 case DoctypeInternalSubset
: {
779 doctypeToken
.state
= DoctypeAfterInternalSubset
;
780 doctypeAllowComment
= true;
782 doctypeToken
.internalSubset
.append(c
);
786 case DoctypeAfterInternalSubset
: {
788 // Valid doctype. Emit it now.
790 processDoctypeToken();
791 } else if (isWhitespace
) {
794 doctypeToken
.state
= DoctypeBogus
;
799 // Done with the bogus doctype.
802 // Just keep scanning for '>'
811 else if (dontAdvance
== 1)
813 else // double dontAdvance++, do workaround
814 doctypeComment
= DoctypeCommentBogus
;
// Consumes a server-side include block from src, copying characters into
// scriptCode until a '>' that directly follows a '%' is seen.
818 void HTMLTokenizer::parseServer(TokenizerString
&src
)
820 checkScriptBuffer(src
.length());
821 while ( !src
.isEmpty() ) {
822 scriptCode
[ scriptCodeSize
++ ] = *src
;
// The current character is already in the buffer, so index
// scriptCodeSize-2 is the character before it.
823 if (src
->unicode() == '>' &&
824 scriptCodeSize
> 1 && scriptCode
[scriptCodeSize
-2] == '%') {
828 return; // Finished parsing server include
// Skips over a processing instruction ("<? ... ?>") in src. Single and
// double quotes toggle tquote; the instruction ends at a '>' that is either
// outside quotes or directly preceded by '?'.
834 void HTMLTokenizer::parseProcessingInstruction(TokenizerString
&src
)
837 while ( !src
.isEmpty() )
839 unsigned char chbegin
= src
->toLatin1();
840 if(chbegin
== '\'') {
841 tquote
= tquote
== SingleQuote
? NoQuote
: SingleQuote
;
843 else if(chbegin
== '\"') {
844 tquote
= tquote
== DoubleQuote
? NoQuote
: DoubleQuote
;
847 // some crappy sites omit the "?" before it, so
848 // we look for an unquoted '>' instead. (IE compatible)
849 else if ( chbegin
== '>' && ( !tquote
|| oldchar
== '?' ) )
851 // We got a '?>' sequence
852 processingInstruction
= false;
855 return; // Finished parsing processing instruction!
// Consumes plain text from src. Only the line-feed handling is visible in
// this listing: a pending skipLF is resolved first, then '\n' and '\r' are
// treated as line breaks.
862 void HTMLTokenizer::parseText(TokenizerString
&src
)
864 while ( !src
.isEmpty() )
866 // do we need to enlarge the buffer?
869 // ascii is okay because we only do ascii comparisons
870 unsigned char chbegin
= src
->toLatin1();
872 if (skipLF
&& ( chbegin
!= '\n' ))
882 else if (( chbegin
== '\n' ) || ( chbegin
== '\r' ))
// Parses a character reference from src.
// src   - input; consumed as the reference is read. A resolved character is
//         pushed back onto src, and any characters read beyond a matched
//         named entity are prepended to src again.
// dest  - output cursor; when no entity is recognised, the collected
//         characters are copied there as plain text.
// start - true when called at the beginning of a reference (callers in this
//         file pass true when they see '&').
// The characters of the reference are collected in cBuffer. Visible states
// of `Entity`: SearchEntity, NumericSearch, Hexadecimal, SearchSemicolon
// (plus decimal and named-entity handling whose case labels are not visible
// in this listing). Named entities are looked up with kde_findEntity().
898 void HTMLTokenizer::parseEntity(TokenizerString
&src
, QChar
*&dest
, bool start
)
904 Entity
= SearchEntity
;
907 while( !src
.isEmpty() )
909 ushort cc
= src
->unicode();
917 cBuffer
[cBufferPos
++] = cc
;
919 Entity
= NumericSearch
;
// 'x' or 'X' selects hexadecimal notation.
927 if(cc
== 'x' || cc
== 'X') {
928 cBuffer
[cBufferPos
++] = cc
;
930 Entity
= Hexadecimal
;
932 else if(cc
>= '0' && cc
<= '9')
935 Entity
= SearchSemicolon
;
// Hexadecimal digits: at most 8 characters are examined per call.
941 int uc
= EntityChar
.unicode();
942 int ll
= qMin
<uint
>(src
.length(), 8);
944 QChar
csrc(src
->toLower());
947 if(csrc
.row() || !((cc
>= '0' && cc
<= '9') || (cc
>= 'a' && cc
<= 'f'))) {
950 uc
= uc
*16 + (cc
- ( cc
< 'a' ? '0' : 'a' - 10));
951 cBuffer
[cBufferPos
++] = cc
;
954 EntityChar
= QChar(uc
);
955 Entity
= SearchSemicolon
;
// Decimal digits: collection stops once cBuffer holds 9 characters.
960 int uc
= EntityChar
.unicode();
961 int ll
= qMin(src
.length(), 9-cBufferPos
);
965 if(src
->row() || !(cc
>= '0' && cc
<= '9')) {
966 Entity
= SearchSemicolon
;
970 uc
= uc
* 10 + (cc
- '0');
971 cBuffer
[cBufferPos
++] = cc
;
974 EntityChar
= QChar(uc
);
975 if(cBufferPos
== 9) Entity
= SearchSemicolon
;
// Named entity: collect ASCII letters and digits, up to 9 characters.
980 int ll
= qMin(src
.length(), 9-cBufferPos
);
985 if(csrc
.row() || !((cc
>= 'a' && cc
<= 'z') ||
986 (cc
>= '0' && cc
<= '9') || (cc
>= 'A' && cc
<= 'Z'))) {
987 Entity
= SearchSemicolon
;
991 cBuffer
[cBufferPos
++] = cc
;
994 // be IE compatible and interpret even unterminated entities
995 // outside tags. like "foo &nbsp stuff bla".
996 if ( tag
== NoTag
) {
997 const entity
* e
= kde_findEntity(cBuffer
, cBufferPos
);
998 if ( e
&& e
->code
< 256 ) {
999 EntityChar
= e
->code
;
1000 entityLen
= cBufferPos
;
1004 if(cBufferPos
== 9) Entity
= SearchSemicolon
;
1005 if(Entity
== SearchSemicolon
) {
1006 if(cBufferPos
> 1) {
1007 const entity
*e
= kde_findEntity(cBuffer
, cBufferPos
);
1008 // IE only accepts unterminated entities < 256,
1009 // Gecko accepts them all, but only outside tags
1010 if(e
&& ( tag
== NoTag
|| e
->code
< 256 || *src
== ';' )) {
1011 EntityChar
= e
->code
;
1012 entityLen
= cBufferPos
;
1018 case SearchSemicolon
:
1020 kDebug( 6036 ) << "ENTITY " << EntityChar
.unicode();
// Apply the fixUpChar remapping to the resolved character.
1022 fixUpChar(EntityChar
);
1027 if ( !EntityChar
.isNull() ) {
// A shorter entity matched than the number of characters read: give the
// surplus characters back to the input.
1029 if (entityLen
> 0 && entityLen
< cBufferPos
) {
1030 int rem
= cBufferPos
- entityLen
;
1031 src
.prepend( TokenizerString(QString::fromAscii(cBuffer
+entityLen
, rem
)) );
1033 src
.push( EntityChar
);
1036 kDebug( 6036 ) << "unknown entity!";
1039 // ignore the sequence, add it to the buffer as plaintext
1041 for(unsigned int i
= 0; i
< cBufferPos
; i
++)
1042 dest
[i
] = cBuffer
[i
];
1045 prePos
+= cBufferPos
+1;
1049 EntityChar
= QChar::Null
;
// Parses one tag from src, driven by the `tag` state. States visible in
// this listing (from the assignments and debug strings): tag-name
// collection, SearchAttribute, AttributeName, SearchEqual, QuotedValue, an
// unquoted-value state and SearchEnd. While the tag name is collected the
// input is also matched against "<!--" (comment) and "<!doctype".
// Attributes are added to currToken as they complete. Once the tag is
// finished, the token id decides which special-content mode comes next
// (script, style, textarea, title, xmp, plaintext) and searchStopper is set
// to the matching closing-tag prefix.
1055 void HTMLTokenizer::parseTag(TokenizerString
&src
)
1058 checkScriptBuffer( src
.length() );
1060 while ( !src
.isEmpty() )
1063 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1065 while(l
< src
.length() && (src
.toString()[l
]).toLatin1().constData() != '>')
1067 qDebug("src is now: *%s*, tquote: %d",
1068 src
.toString().left(l
).toLatin1().constData(), tquote
);
1075 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
// searchCount counts how many characters of "<!--" have matched so far.
1078 if (searchCount
> 0)
1080 if (*src
== commentStart
[searchCount
])
1083 if (searchCount
== 2)
1084 doctypeSearchCount
++; // A '!' is also part of doctype, so we are moving through that still as well
1086 doctypeSearchCount
= 0;
1088 if (searchCount
== 4)
1091 kDebug( 6036 ) << "Found comment";
1093 // Found '<!--' sequence
1095 dest
= buffer
; // ignore the previous part of this tag
1100 return; // Finished parsing tag!
1102 // cuts off high part, is okay
1103 cBuffer
[cBufferPos
++] = src
->cell();
1108 searchCount
= 0; // Stop looking for '<!--' sequence
// doctypeSearchCount counts matched characters of "<!doctype" (9 in total),
// compared case-insensitively.
1111 if (doctypeSearchCount
> 0) {
1112 if((*src
).toLower() == doctypeStart
[doctypeSearchCount
]) {
1113 doctypeSearchCount
++;
1114 cBuffer
[cBufferPos
++] = src
->cell();
1116 if(doctypeSearchCount
== 9) {
1117 // Found '<!DOCTYPE' sequence
1119 doctypeAllowComment
= true;
1120 doctypeComment
= NoDoctypeComment
;
1121 doctypeToken
.reset();
1129 doctypeSearchCount
= 0; // Stop looking for '<!DOCTYPE' sequence
1132 bool finish
= false;
1133 unsigned int ll
= qMin(src
.length(), CBUFLEN
-cBufferPos
);
1135 ushort curchar
= src
->unicode();
1136 if(curchar
<= ' ' || curchar
== '>' ) {
1140 // this is a nasty performance trick. will work for the A-Z
1141 // characters, but not for others. if it contains one,
1144 cBuffer
[cBufferPos
++] = cc
| 0x20;
1148 // Disadvantage: we add the possible rest of the tag
1149 // as attribute names. ### judge if this causes problems
1150 if(finish
|| CBUFLEN
== cBufferPos
) {
1152 char* ptr
= cBuffer
;
1153 unsigned int len
= cBufferPos
;
1154 cBuffer
[cBufferPos
] = '\0';
1155 if ((cBufferPos
> 0) && (*ptr
== '/'))
1165 // Accept empty xml tags like <br/>
1166 if(len
> 1 && ptr
[len
-1] == '/' ) {
1168 // if it's like <br/> and not like <input/ value=foo>, take it as flat
1170 currToken
.flat
= true;
1175 DOMString
tagName(ptr
);
1176 DocumentImpl
*doc
= parser
->docPtr();
1177 if (Element::khtmlValidQualifiedName(tagName
)) {
1178 safeLocalName
= LocalName::fromString(tagName
.lower());
1179 tagID
= safeLocalName
.id();
1182 QByteArray
tmp(ptr
, len
+1);
1183 kDebug( 6036 ) << "Unknown tag: \"" << tmp
.data() << "\"";
1188 QByteArray
tmp(ptr
, len
+1);
1189 kDebug( 6036 ) << "found tag id=" << tagID
<< ": " << tmp
.data();
// Closing tags are encoded as the tag id plus ID_CLOSE_TAG.
1191 currToken
.tid
= beginTag
? tagID
: tagID
+ ID_CLOSE_TAG
;
1194 tag
= SearchAttribute
;
1199 case SearchAttribute
:
1201 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1202 qDebug("SearchAttribute");
1204 bool atespace
= false;
1206 while(!src
.isEmpty()) {
1207 curchar
= src
->unicode();
1209 if(curchar
== '<' || curchar
== '>')
1211 else if(atespace
&& (curchar
== '\'' || curchar
== '"'))
1218 tag
= AttributeName
;
1230 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1231 qDebug("AttributeName");
1234 int ll
= qMin(src
.length(), CBUFLEN
-cBufferPos
);
1237 curchar
= src
->unicode();
1238 if(curchar
<= '>') {
1239 if(curchar
<= ' ' || curchar
== '=' || curchar
== '>') {
1241 cBuffer
[cBufferPos
] = '\0';
1242 a
= LocalName::fromString(DOMString(cBuffer
).lower()).id();
1243 if (a
> ATTR_LAST_ATTR
)
1247 // did we just get /> or e.g checked/>
1248 if (curchar
== '>' && cBufferPos
>=1 && cBuffer
[cBufferPos
-1] == '/') {
1249 currToken
.flat
= true;
// Temporarily cut the trailing '/' to look up the attribute name; it is
// restored below.
1250 cBuffer
[cBufferPos
- 1] = '\0';
1252 a
= LocalName::fromString(DOMString(cBuffer
).lower()).id();
1253 if (a
> ATTR_LAST_ATTR
)
1255 cBuffer
[cBufferPos
- 1] = '/';
1258 attrName
= QLatin1String(QByteArray(cBuffer
, cBufferPos
+1).data());
1264 if (!a
|| (cBufferPos
&& *cBuffer
== '!'))
1265 kDebug( 6036 ) << "Unknown attribute: *" << QByteArray(cBuffer
, cBufferPos
+1).data() << "*";
1267 kDebug( 6036 ) << "Known attribute: " << QByteArray(cBuffer
, cBufferPos
+1).data();
// Attribute names are stored with ASCII upper-case letters lower-cased.
1274 cBuffer
[cBufferPos
++] =
1275 ( curchar
>= 'A' && curchar
<= 'Z' ) ? curchar
| 0x20 : curchar
;
1278 if ( cBufferPos
== CBUFLEN
) {
1279 cBuffer
[cBufferPos
] = '\0';
1280 attrName
= QLatin1String(QByteArray(cBuffer
, cBufferPos
+1).data());
1289 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1290 qDebug("SearchEqual");
1293 bool atespace
= false;
1294 while(!src
.isEmpty()) {
1295 curchar
= src
->unicode();
1297 if(curchar
== '=') {
1299 kDebug(6036) << "found equal";
1304 else if(atespace
&& (curchar
== '\'' || curchar
== '"'))
1312 currToken
.addAttribute(parser
->docPtr(), buffer
, attrName
, v
);
1314 tag
= SearchAttribute
;
1326 while(!src
.isEmpty()) {
1327 curchar
= src
->unicode();
1329 if(( curchar
== '\'' || curchar
== '\"' )) {
1330 tquote
= curchar
== '\"' ? DoubleQuote
: SingleQuote
;
1344 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1345 qDebug("QuotedValue");
1348 while(!src
.isEmpty()) {
1351 curchar
= src
->unicode();
// Every character tested below ('&', '"', '\'') is <= '\'', so this is a
// cheap pre-filter.
1352 if(curchar
<= '\'' && !src
.escaped()) {
1353 // ### attributes like '&{blaa....};' are supposed to be treated as jscript.
1354 if ( curchar
== '&' )
1357 parseEntity(src
, dest
, true);
1360 else if ( (tquote
== SingleQuote
&& curchar
== '\'') ||
1361 (tquote
== DoubleQuote
&& curchar
== '\"') )
1363 // some <input type=hidden> rely on trailing spaces. argh
1364 while(dest
> buffer
+1 && (*(dest
-1) == '\n' || *(dest
-1) == '\r'))
1365 dest
--; // remove trailing newlines
// The value is read from buffer+1, i.e. the first QChar of buffer is left
// out.
1366 DOMString
v(buffer
+1, dest
-buffer
-1);
1367 currToken
.addAttribute(parser
->docPtr(), buffer
, attrName
, v
);
1370 tag
= SearchAttribute
;
1383 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1387 while(!src
.isEmpty()) {
1389 curchar
= src
->unicode();
1390 if(curchar
<= '>' && !src
.escaped()) {
1392 if ( curchar
== '&' )
1395 parseEntity(src
, dest
, true);
1398 // no quotes. Every space means end of value
1399 // '/' does not delimit in IE!
1400 if ( curchar
<= ' ' || curchar
== '>' )
1402 DOMString
v(buffer
+1, dest
-buffer
-1);
1403 currToken
.addAttribute(parser
->docPtr(), buffer
, attrName
, v
);
1405 tag
= SearchAttribute
;
1417 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1418 qDebug("SearchEnd");
1420 while(!src
.isEmpty()) {
1421 if(*src
== '<' || *src
== '>')
1425 currToken
.flat
= true;
1429 if(src
.isEmpty() && *src
!= '<' && *src
!= '>') break;
1431 searchCount
= 0; // Stop looking for '<!--' sequence
1437 if ( !currToken
.tid
) //stop if tag is unknown
1440 uint tagID
= currToken
.tid
;
1441 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 0
1442 kDebug( 6036 ) << "appending Tag: " << tagID
;
1444 // If the tag requires an end tag it cannot be flat,
1445 // unless we are using the HTML parser to parse XHTML
1446 // The only exception is SCRIPT and priority 0 tokens.
1447 if (tagID
< ID_CLOSE_TAG
&& tagID
!= ID_SCRIPT
&&
1448 DOM::endTagRequirement(tagID
) == DOM::REQUIRED
&&
1449 parser
->doc()->htmlMode() != DocumentImpl::XHtml
)
1450 currToken
.flat
= false;
1452 bool beginTag
= !currToken
.flat
&& (tagID
< ID_CLOSE_TAG
);
1454 if(tagID
>= ID_CLOSE_TAG
)
1455 tagID
-= ID_CLOSE_TAG
;
1456 else if ( !brokenScript
&& tagID
== ID_SCRIPT
) {
1457 DOMStringImpl
* a
= 0;
1458 bool foundTypeAttribute
= false;
1459 scriptSrc
.clear(); scriptSrcCharset
.clear();
1460 if ( currToken
.attrs
&& /* potentially have a ATTR_SRC ? */
1461 view
&& /* are we a regular tokenizer or just for innerHTML ? */
1462 parser
->doc()->view()->part()->jScriptEnabled() /* jscript allowed at all? */
1464 if ( ( a
= currToken
.attrs
->getValue( ATTR_SRC
) ) )
1465 scriptSrc
= parser
->doc()->completeURL(khtml::parseURL( DOMString(a
) ).string() );
1466 if ( ( a
= currToken
.attrs
->getValue( ATTR_CHARSET
) ) )
1467 scriptSrcCharset
= DOMString(a
).string().trimmed();
1468 if ( scriptSrcCharset
.isEmpty() && view
)
1469 scriptSrcCharset
= parser
->doc()->view()->part()->encoding();
1470 /* Check type before language, since language is deprecated */
1471 if ((a
= currToken
.attrs
->getValue(ATTR_TYPE
)) != 0 && !DOMString(a
).string().isEmpty())
1472 foundTypeAttribute
= true;
1474 a
= currToken
.attrs
->getValue(ATTR_LANGUAGE
);
1478 if( foundTypeAttribute
) {
1480 Mozilla 1.5 doesn't accept the text/javascript1.x formats, but WinIE 6 does.
1481 Mozilla 1.5 doesn't accept text/jscript, text/ecmascript, and text/livescript, but WinIE 6 does.
1482 Mozilla 1.5 accepts application/x-javascript, WinIE 6 doesn't.
1483 Mozilla 1.5 allows leading and trailing whitespace, but WinIE 6 doesn't.
1484 Mozilla 1.5 and WinIE 6 both accept the empty string, but neither accept a whitespace-only string.
1485 We want to accept all the values that either of these browsers accept, but not other values.
1487 QString type
= DOMString(a
).string().trimmed().toLower();
1488 if( type
.compare("text/javascript") != 0 &&
1489 type
.compare("text/javascript1.0") != 0 &&
1490 type
.compare("text/javascript1.1") != 0 &&
1491 type
.compare("text/javascript1.2") != 0 &&
1492 type
.compare("text/javascript1.3") != 0 &&
1493 type
.compare("text/javascript1.4") != 0 &&
1494 type
.compare("text/javascript1.5") != 0 &&
1495 type
.compare("text/jscript") != 0 &&
1496 type
.compare("text/ecmascript") != 0 &&
1497 type
.compare("text/livescript") != 0 &&
1498 type
.compare("application/x-javascript") != 0 &&
1499 type
.compare("application/x-ecmascript") != 0 &&
1500 type
.compare("application/javascript") != 0 &&
1501 type
.compare("application/ecmascript") != 0 )
1505 Mozilla 1.5 doesn't accept jscript or ecmascript, but WinIE 6 does.
1506 Mozilla 1.5 accepts javascript1.0, javascript1.4, and javascript1.5, but WinIE 6 accepts only 1.1 - 1.3.
1507 Neither Mozilla 1.5 nor WinIE 6 accept leading or trailing whitespace.
1508 We want to accept all the values that either of these browsers accept, but not other values.
1510 QString lang
= DOMString(a
).string();
1511 lang
= lang
.toLower();
1512 if( lang
.compare("") != 0 &&
1513 lang
.compare("javascript") != 0 &&
1514 lang
.compare("javascript1.0") != 0 &&
1515 lang
.compare("javascript1.1") != 0 &&
1516 lang
.compare("javascript1.2") != 0 &&
1517 lang
.compare("javascript1.3") != 0 &&
1518 lang
.compare("javascript1.4") != 0 &&
1519 lang
.compare("javascript1.5") != 0 &&
1520 lang
.compare("ecmascript") != 0 &&
1521 lang
.compare("livescript") != 0 &&
1522 lang
.compare("jscript") )
1529 if ( parser
->selectMode() && beginTag
)
1530 discard
= AllDiscard
;
1537 discard
= LFDiscard
;
1545 searchStopper
= scriptEnd
;
1546 searchStopperLen
= 8;
1550 else if (tagID
< ID_CLOSE_TAG
) // Handle <script src="foo"/>
1555 searchStopper
= styleEnd
;
1556 searchStopperLen
= 7;
1563 searchStopper
= textareaEnd
;
1564 searchStopperLen
= 10;
1566 discard
= NoneDiscard
;
1572 searchStopper
= titleEnd
;
1573 searchStopperLen
= 7;
1580 searchStopper
= xmpEnd
;
1581 searchStopperLen
= 5;
1590 plaintext
= beginTag
;
1593 return; // Finished parsing tag!
// Writes the whitespace recorded in `pending` to the output cursor `dest`
// and then clears `pending` back to NonePending.
// Visible cases: LFPending emits '\n' and resets the column counter prePos
// to 0; SpacePending emits ' ' and advances prePos; the tab handling either
// emits a literal '\t' (inside <textarea> or script) or pads with spaces.
// NOTE(review): the switch header and some branches are not visible in this
// listing - confirm against the full file before changing control flow.
1600 void HTMLTokenizer::addPending()
1602 if ( select
&& !(comment
|| script
))
1609 case LFPending
: *dest
++ = QLatin1Char('\n'); prePos
= 0; break;
1610 case SpacePending
: *dest
++ = QLatin1Char(' '); ++prePos
; break;
1612 // Don't expand tabs inside <textarea> or script
// p = number of columns from prePos up to the next multiple of TAB_SIZE.
1613 int p
= TAB_SIZE
- ( prePos
% TAB_SIZE
);
1614 if (textarea
|| script
) {
1615 *dest
++ = QLatin1Char('\t');
// Expanded form of the tab: p spaces.
1617 for ( int x
= 0; x
< p
; x
++ )
1618 *dest
++ = QLatin1Char(' ');
// Whatever was pending has been handled; nothing is queued any more.
1628 pending
= NonePending
;
// Decides whether the tokenizer loop may keep running or should yield.
// @param processedCount running count of processed characters, passed by
//        reference by the caller's loop.
// The elapsed-time check only happens when no script is executing, more than
// sTokenizerChunkSize characters have been processed and no external script
// is queued. If m_time has run past m_tokenizerYeldDelay, a zero-interval
// timer is started (id kept in m_yeldTimer) and the delay is switched to
// sTokenizerFastYeldDelay.
// NOTE(review): the return statement and any update of processedCount are
// not visible in this listing - confirm the exact return contract.
1631 inline bool HTMLTokenizer::continueProcessing(int& processedCount
)
1633 // We don't want to be checking elapsed time with every character, so we only check after we've
1634 // processed a certain number of characters.
1635 if (!m_executingScript
&& processedCount
> sTokenizerChunkSize
&& cachedScript
.isEmpty()) {
1637 if ( m_time
.elapsed() > m_tokenizerYeldDelay
) {
1638 m_yeldTimer
= startTimer(0);
1639 m_tokenizerYeldDelay
= sTokenizerFastYeldDelay
;
// Feeds a chunk of input to the tokenizer.
// @param str        the input to tokenize.
// @param appendData selects how deferred input is queued (bottom vs. top of
//                   pendingQueue) and whether the prospective tokenizer is
//                   fed. Callers visible in this file pass true from
//                   timerEvent() and false from notifyFinished().
// Outline of what is visible here:
//  1. If a script is executing (with appendData) or external scripts are
//     queued, the input is stored in pendingQueue instead of being parsed.
//  2. If a yield timer is armed, the timer decides when work resumes.
//  3. Otherwise src is consumed one character at a time, dispatching on the
//     current state (entity, plaintext, doctype comment, processing
//     instruction, start of tag, plain text / whitespace handling).
//  4. When all data has arrived and nothing is outstanding, end() is called.
// NOTE(review): many original lines (braces, returns, several branches) are
// not visible in this listing - confirm against the full file.
1647 void HTMLTokenizer::write( const TokenizerString
&str
, bool appendData
)
1650 kDebug( 6036 ) << this << " Tokenizer::write(\"" << str
.toString() << "\"," << appendData
<< ")";
// Deferral path: tokenizing now would interleave with a running or
// still-loading script, so the data is queued for later.
1656 if ( ( m_executingScript
&& appendData
) || cachedScript
.count() ) {
1657 // don't parse; we will do this later
1658 if (pendingQueue
.isEmpty())
1659 pendingQueue
.push(str
);
1660 else if (appendData
)
1661 pendingQueue
.bottom().append(str
);
1663 pendingQueue
.top().append(str
);
// The prospective tokenizer only sees appended data, and only while it is
// in progress.
1664 #if PROSPECTIVE_TOKENIZER_ENABLED
1665 if (m_prospectiveTokenizer
&& m_prospectiveTokenizer
->inProgress() && appendData
)
1666 m_prospectiveTokenizer
->write(str
);
1671 #if PROSPECTIVE_TOKENIZER_ENABLED
1672 if (m_prospectiveTokenizer
&& m_prospectiveTokenizer
->inProgress() && appendData
)
1673 m_prospectiveTokenizer
->end();
1686 // Once a timer is set, it has control of when the tokenizer continues.
1687 if (m_yeldTimer
> 0)
1690 int processedCount
= 0;
// Main loop: one iteration per input character until src is exhausted, the
// tokenizer is aborted, or continueProcessing() reports that it must yield.
1693 while ( !src
.isEmpty() )
1695 if ( m_abort
|| !continueProcessing(processedCount
) )
1697 // do we need to enlarge the buffer?
1700 ushort cc
= src
->unicode();
1702 if (skipLF
&& (cc
!= '\n'))
1710 parseEntity( src
, dest
);
1711 else if ( plaintext
)
1725 else if (doctypeComment
&& doctypeComment
!= DoctypeCommentEnd
&& doctypeComment
!= DoctypeCommentBogus
)
1726 parseDoctypeComment(src
);
1731 else if (processingInstruction
)
1732 parseProcessingInstruction(src
);
// A '<' was seen earlier (startTag set); classify what follows it.
1735 else if ( startTag
)
1738 bool endTag
= false;
1746 // <!-- comment --> or <!DOCTYPE ...>
1747 searchCount
= 1; // Look for '<!--' sequence to start comment...
1748 doctypeSearchCount
= 1; // ... or for '<!DOCTYPE' sequence to start doctype
1753 // xml processing instruction
1754 processingInstruction
= true;
1756 parseProcessingInstruction(src
);
1762 if (!brokenServer
) {
1763 // <% server stuff, handle as comment %>
1769 // else fall through
1772 if( ((cc
>= 'a') && (cc
<= 'z')) || ((cc
>= 'A') && (cc
<= 'Z')))
1774 // Start of a Start-Tag
1789 // According to SGML any LF immediately after a starttag, or
1790 // immediately before an endtag should be ignored.
1791 // ### Gecko and MSIE though only ignores LF immediately after
1792 // starttags and only for PRE elements -- asj (28/06-2005)
1797 pending
= NonePending
;
1799 // Cancel unused discards
1800 discard
= NoneDiscard
;
1801 // if (!endTag) discard = LFDiscard;
// Unescaped '&' starts an entity.
1809 else if ( cc
== '&' && !src
.escaped())
1814 discard
= NoneDiscard
;
1815 parseEntity(src
, dest
, true);
// Unescaped '<' starts a tag; remember the line it started on.
1817 else if ( cc
== '<' && !src
.escaped())
1819 tagStartLineno
= lineno
+src
.lineCount();
1821 discard
= NoneDiscard
;
// Line breaks: either swallowed according to `discard` or recorded as a
// pending LF.
1824 else if (( cc
== '\n' ) || ( cc
== '\r' ))
1826 if (discard
== SpaceDiscard
)
1827 discard
= NoneDiscard
;
1829 if (discard
== LFDiscard
) {
1831 discard
= NoneDiscard
;
1833 else if (discard
== AllDiscard
)
1839 if (select
&& !script
) {
1840 pending
= LFPending
;
1844 pending
= LFPending
;
1848 /* Check for MS-DOS CRLF sequence */
// Spaces and tabs: same scheme as line breaks, recorded as a pending space
// or tab.
1855 else if (( cc
== ' ' ) || ( cc
== '\t' ))
1857 if(discard
== LFDiscard
)
1858 discard
= NoneDiscard
;
1860 if(discard
== SpaceDiscard
) {
1862 discard
= NoneDiscard
;
1864 else if(discard
== AllDiscard
)
1869 if (select
&& !script
) {
1871 pending
= SpacePending
;
1876 pending
= SpacePending
;
1878 pending
= TabPending
;
1889 discard
= NoneDiscard
;
// Nothing left to wait for: no more data, no queued or running scripts and
// no yield timer armed.
1901 if (noMoreData
&& cachedScript
.isEmpty() && !m_executingScript
&& m_yeldTimer
<=0)
1902 end(); // this actually causes us to be deleted
// Timer callback for the tokenizer's two timers.
// @param e the timer event; its timerId() selects the branch.
// Yield timer: the timer is killed and tokenizing resumes through write()
// with an empty string and appendData == true.
// Auto-close timer: only acted on while no external script is queued.
// NOTE(review): the body of the auto-close branch and any reset of
// m_yeldTimer are not visible in this listing - confirm.
1905 void HTMLTokenizer::timerEvent( QTimerEvent
*e
)
1907 if ( e
->timerId() == m_yeldTimer
) {
1908 killTimer(m_yeldTimer
);
1910 write( TokenizerString(), true );
1911 } else if ( e
->timerId() == m_autoCloseTimer
&& cachedScript
.isEmpty() ) {
// Stops the current auto-close timer, clears its id, and starts a new
// timer with a 100 ms interval.
// @param b requested auto-close state.
// NOTE(review): the condition on `b` that presumably guards the restart is
// not visible in this listing - confirm.
1916 void HTMLTokenizer::setAutoClose( bool b
) {
1917 killTimer( m_autoCloseTimer
);
1918 m_autoCloseTimer
= 0;
1920 m_autoCloseTimer
= startTimer(100);
// Finishes tokenizing and announces it with the finishedParsing() signal.
// If `buffer` is already null the signal is emitted in that first branch.
// Otherwise the token buffer and the scriptCode vector are released via
// KHTML_DELETE_QCHAR_VEC and the script-code bookkeeping is zeroed before
// the signal is emitted.
// Callers in this file note that calling end() can lead to this object
// being deleted.
// NOTE(review): the early return after the first emit and the lines between
// the two frees are not visible in this listing - confirm.
1923 void HTMLTokenizer::end()
1925 if ( buffer
== 0 ) {
1926 emit
finishedParsing();
1930 // parseTag is using the buffer for different matters
1935 KHTML_DELETE_QCHAR_VEC(buffer
);
1938 KHTML_DELETE_QCHAR_VEC(scriptCode
);
1941 scriptCodeSize
= scriptCodeMaxSize
= scriptCodeResync
= 0;
1943 emit
finishedParsing();
// Called when no further input will arrive (see the comment near the end).
// Steps visible here:
//  1. Stop the auto-close timer if one is running.
//  2. While an unterminated title/script/comment/server section still has
//     text buffered in scriptCode, mark the construct as broken
//     (brokenComments / brokenServer / brokenScript), copy the buffered text
//     into `food`, free scriptCode and clear the section flags.
//  3. Call end() unless external scripts are queued, a script is executing,
//     the tokenizer is on hold, or a yield timer is armed.
// NOTE(review): the branch structure choosing between the broken* flags and
// between the `food` assignments, and what is done with a non-empty `food`,
// are not visible in this listing - confirm.
1946 void HTMLTokenizer::finish()
1948 if ( m_autoCloseTimer
) {
1949 killTimer( m_autoCloseTimer
);
1950 m_autoCloseTimer
= 0;
1952 // do this as long as we don't find matching comment ends
1953 while((title
|| script
|| comment
|| server
) && scriptCode
&& scriptCodeSize
)
1955 // we've found an unmatched comment start
1957 brokenComments
= true;
1959 brokenServer
= true;
1961 brokenScript
= true;
// Two zero elements are written directly after the buffered text.
1963 checkScriptBuffer();
1964 scriptCode
[ scriptCodeSize
] = 0;
1965 scriptCode
[ scriptCodeSize
+ 1 ] = 0;
1968 if (title
|| style
|| script
)
1969 food
.setUnicode(scriptCode
, scriptCodeSize
);
1972 food
+= QString(scriptCode
, scriptCodeSize
);
// Keep only the text that follows the first '>' in the buffered data.
1975 pos
= QString::fromRawData(scriptCode
, scriptCodeSize
).indexOf('>');
1976 food
.setUnicode(scriptCode
+pos
+1, scriptCodeSize
-pos
-1); // deep copy
1978 KHTML_DELETE_QCHAR_VEC(scriptCode
);
1980 scriptCodeSize
= scriptCodeMaxSize
= scriptCodeResync
= 0;
1984 comment
= title
= server
= script
= false;
1985 if ( !food
.isEmpty() )
1988 // this indicates we will not receive any more data... but if we are waiting on
1989 // an external script to load, we can't finish parsing until that is done
1991 if (cachedScript
.isEmpty() && !m_executingScript
&& !onHold
&& m_yeldTimer
<= 0)
1992 end(); // this actually causes us to be deleted
// Hands the current token (currToken) to the parser.
// Steps visible here:
//  - Look up the JS proxy through the view (null when there is no view) and
//    tell it the line number to attribute event handlers to.
//  - If text has accumulated in the token buffer (dest > buffer), wrap it in
//    a new DOMStringImpl, take a reference, and set the token id to ID_TEXT
//    unless the token is a comment.
//  - Optionally dump the token and its attributes via kDebug.
//  - Call parser->parseToken() with m_executingScript raised around it, to
//    guard against re-entrancy (see the comment above that call).
//  - Reset the proxy's event-handler line number to 0 afterwards.
// NOTE(review): null checks on jsProxy, the TOKEN_DEBUG preprocessor guards
// and the token cleanup are not visible in this listing - confirm.
1995 void HTMLTokenizer::processToken()
1997 KJSProxy
*jsProxy
= view
? view
->part()->jScript() : 0L;
1999 jsProxy
->setEventHandlerLineno(tagStartLineno
);
2000 if ( dest
> buffer
)
2004 qDebug( "unexpected token id: %d, str: *%s*", currToken
.tid
,QString::fromRawData( buffer
, dest
-buffer
).toLatin1().constData() );
// Buffered characters become the token's text.
2009 currToken
.text
= new DOMStringImpl( buffer
, dest
- buffer
);
2010 currToken
.text
->ref();
2011 if (currToken
.tid
!= ID_COMMENT
)
2012 currToken
.tid
= ID_TEXT
;
2014 else if(!currToken
.tid
) {
2017 jsProxy
->setEventHandlerLineno(lineno
+src
.lineCount());
// Debug dump of the token: name, id, text and attributes.
2024 QString name
= QString( getTagName(currToken
.tid
) );
2027 text
= QString::fromRawData(currToken
.text
->s
, currToken
.text
->l
);
2029 kDebug( 6036 ) << "Token --> " << name
<< " id = " << currToken
.tid
;
2031 kDebug( 6036 ) << "Token is FLAT!";
2033 kDebug( 6036 ) << "text: \"" << text
<< "\"";
2034 unsigned long l
= currToken
.attrs
? currToken
.attrs
->length() : 0;
2036 kDebug( 6036 ) << "Attributes: " << l
;
2037 for (unsigned long i
= 0; i
< l
; ++i
) {
2038 NodeImpl::Id tid
= currToken
.attrs
->idAt(i
);
2039 DOMString value
= currToken
.attrs
->valueAt(i
);
2040 kDebug( 6036 ) << " " << tid
<< " " << parser
->doc()->document()->getName(NodeImpl::AttributeId
, tid
).string()
2041 << "=\"" << value
.string() << "\"" << endl
;
2047 // In some cases, parseToken() can cause javascript code to be executed
2048 // (for example, when setting an attribute that causes an event handler
2049 // to be created). So we need to protect against re-entrancy into the parser
2050 m_executingScript
++;
2052 // pass the token over to the parser, the parser DOES NOT delete the token
2053 parser
->parseToken(&currToken
);
2055 m_executingScript
--;
// A flat non-text token cancels any whitespace discard mode, unless the
// parser reports noSpaces().
2057 if ( currToken
.flat
&& currToken
.tid
!= ID_TEXT
&& !parser
->noSpaces() )
2058 discard
= NoneDiscard
;
2062 jsProxy
->setEventHandlerLineno(0);
2065 void HTMLTokenizer::processDoctypeToken()
2067 // kDebug( 6036 ) << "Process DoctypeToken (name: " << doctypeToken.name << ", publicID: " << doctypeToken.publicID << ", systemID: " << doctypeToken.systemID;
2068 doctypeToken
.publicID
= doctypeToken
.publicID
.simplified();
2069 doctypeToken
.systemID
= doctypeToken
.systemID
.simplified();
2070 parser
->parseDoctypeToken(&doctypeToken
);
// Destructor: deletes m_prospectiveTokenizer.
// NOTE(review): at least one original line before the delete is not visible
// in this listing - confirm what other cleanup runs here.
2074 HTMLTokenizer::~HTMLTokenizer()
2077 delete m_prospectiveTokenizer
;
// Grows the token buffer so that at least `len` more elements fit.
// @param len minimum number of additional elements required.
// The new capacity is the larger of double the current size and size + len.
// Reallocation may move the storage, so the write cursor `dest` is re-based
// on the new buffer using its old offset.
// NOTE(review): no update of `size` to the new capacity is visible in this
// listing - confirm it happens, otherwise later growth uses a stale size.
2082 void HTMLTokenizer::enlargeBuffer(int len
)
2084 int newsize
= qMax(size
*2, size
+len
);
// Remember where dest pointed, relative to the start of the buffer.
2085 int oldoffs
= (dest
- buffer
);
2087 buffer
= KHTML_REALLOC_QCHAR_VEC(buffer
, newsize
);
2088 dest
= buffer
+ oldoffs
;
2092 void HTMLTokenizer::enlargeScriptBuffer(int len
)
2094 int newsize
= qMax(scriptCodeMaxSize
*2, scriptCodeMaxSize
+len
);
2095 scriptCode
= KHTML_REALLOC_QCHAR_VEC(scriptCode
, newsize
);
2096 scriptCodeMaxSize
= newsize
;
// Notification that a cached object has finished loading; the argument is
// unused.
// Steps visible here:
//  - Assert that at least one external script is queued.
//  - While the script at the head of cachedScript is loaded: dequeue it,
//    reset the tokenizer source to an empty string, and run its source
//    through scriptExecution() together with the script's URL. The loop
//    stops once the queue is empty.
//  - Merge stacked pendingQueue entries into one (each popped entry is
//    prepended to the one below it) and feed the result back through
//    write(..., false).
// After that write() the object may already have been deleted (see the
// closing comment), so no members may be touched.
// NOTE(review): the declaration of `done`, the release of the cached script
// and the guards around the pendingQueue handling are not visible in this
// listing - confirm.
2099 void HTMLTokenizer::notifyFinished(CachedObject
* /*finishedObj*/)
2101 assert(!cachedScript
.isEmpty());
2103 while (!done
&& cachedScript
.head()->isLoaded()) {
2105 kDebug( 6036 ) << "Finished loading an external script";
2107 CachedScript
* cs
= cachedScript
.dequeue();
2108 DOMString scriptSource
= cs
->script();
2110 kDebug( 6036 ) << "External script is:" << endl
<< scriptSource
.string();
2112 setSrc(TokenizerString());
2114 // make sure we forget about the script before we execute the new one
2115 // infinite recursion might happen otherwise
2116 QString
cachedScriptUrl( cs
->url().string() );
2119 scriptExecution( scriptSource
.string(), cachedScriptUrl
);
2121 done
= cachedScript
.isEmpty();
2123 // 'script' is true when we are called synchronously from
2124 // scriptHandler(). In that case scriptHandler() will take care
2125 // of 'scriptOutput'.
// Collapse the stack of deferred input into a single entry.
2127 while (pendingQueue
.count() > 1) {
2128 TokenizerString t
= pendingQueue
.pop();
2129 pendingQueue
.top().prepend( t
);
2132 write(pendingQueue
.pop(), false);
2134 // we might be deleted at this point, do not
2135 // access any members.
2140 bool HTMLTokenizer::isWaitingForScripts() const
2142 return cachedScript
.count();
2145 bool HTMLTokenizer::isExecutingScript() const
2147 return (m_executingScript
> 0);
// Replaces the tokenizer's input source.
// @param source the new input.
// Before the switch, the lines counted in the current src are folded into
// the running `lineno` total; afterwards src's own line counter is reset.
// NOTE(review): the assignment of `source` to src is not visible in this
// listing - confirm it sits between these two statements.
2150 void HTMLTokenizer::setSrc(const TokenizerString
& source
)
2152 lineno
+= src
.lineCount();
2154 src
.resetLineCount();
2157 void HTMLTokenizer::setOnHold(bool _onHold
)
2159 if (onHold
== _onHold
) return;