2 * Copyright (C) 2008 Apple Inc. All Rights Reserved.
3 * (C) 2008 Germain Garand <germain@ebooksfrance.org>
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 #include "htmlprospectivetokenizer.h"
32 #include "html_headimpl.h"
33 #include "html_documentimpl.h"
34 #include "htmlparser.h"
37 #include <misc/loader.h>
38 #include <misc/htmlhashes.h>
39 #include <khtmlview.h>
40 #include <khtml_part.h>
41 #include <xml/dom_docimpl.h>
42 #include <css/csshelper.h>
43 #include <ecma/kjs_proxy.h>
47 #include <QtCore/QVariant>
52 // The main tokenizer includes this too so we are getting two copies of the data. However, this way the code gets inlined.
53 #include "kentities.c"
55 // Not inlined for non-GCC compilers
// Forward declaration of the entity lookup routine used by consumeEntity().
// It takes a character buffer plus its length and yields a pointer to an
// `entity` record. NOTE(review): the definition presumably comes from the
// included kentities.c - confirm.
60 const struct entity
*kde_findEntity (register const char *str
, register unsigned int len
);
63 #define PRELOAD_DEBUG 0
// Split a code point above 0xFFFF into its two UTF-16 code units.
// U16_TRAIL keeps the low 10 bits and tags them with 0xdc00.
// U16_LEAD drops the low 10 bits and adds 0xd7c0, i.e. 0xd800 - (0x10000 >> 10).
65 #define U16_TRAIL(sup) (ushort)(((sup)&0x3ff)|0xdc00)
66 #define U16_LEAD(sup) (ushort)(((sup)>>10)+0xd7c0)
68 using namespace khtml
;
// Constructs the preload scanner for a document.
// Only one member initializer is visible in this view: m_attributeValue is
// constructed with the argument 255. NOTE(review): `doc` is presumably stored
// in m_document - the initializer is not visible here, confirm.
70 ProspectiveTokenizer::ProspectiveTokenizer(DOM::DocumentImpl
* doc
)
74 , m_attributeValue(255)
// Debug trace: logs the document pointer and the document's pretty-printed URL.
81 kDebug() << "CREATING PRELOAD SCANNER FOR" << m_document
<< m_document
->URL().prettyUrl();
// Destructor. The visible statements only write diagnostics to stderr: the
// document pointer and the accumulated m_timeUsed value.
// NOTE(review): these are presumably guarded by PRELOAD_DEBUG - the guard is
// not visible in this view.
85 ProspectiveTokenizer::~ProspectiveTokenizer()
88 fprintf(stderr
, "DELETING PRELOAD SCANNER FOR %p\n", m_document
);
89 fprintf(stderr
, "TOTAL TIME USED %dms\n", m_timeUsed
);
// Starts a scan. Asserts that no scan is currently in progress; the remaining
// statements of this function are not visible in this view.
93 void ProspectiveTokenizer::begin()
95 assert(!m_inProgress
);
// Finishes a scan. Asserts that a scan is in progress, then clears the
// m_inProgress flag.
100 void ProspectiveTokenizer::end()
102 assert(m_inProgress
);
103 m_inProgress
= false;
// Returns the scanner to its starting state. The visible statements set the
// content model to PCDATA, empty the attribute name/value and last-start-tag
// buffers, zero the last-start-tag id, clear the "link is a stylesheet" flag,
// reset the recent-characters ring buffer, and put the CSS scanner back into
// CSSInitial with an empty rule value. Other resets may exist on lines that
// are not visible in this view.
106 void ProspectiveTokenizer::reset()
112 m_contentModel
= PCDATA
;
117 m_attributeName
.clear();
118 m_attributeValue
.clear();
119 m_lastStartTag
.clear();
120 m_lastStartTagId
= 0;
123 m_linkIsStyleSheet
= false;
// Ring buffer of recently seen characters: rewind the index and zero the storage.
124 m_lastCharacterIndex
= 0;
125 clearLastCharacters();
127 m_cssState
= CSSInitial
;
129 m_cssRuleValue
.clear();
// Feeds a chunk of input to the scanner. The only statement visible in this
// view adds t.elapsed() to m_timeUsed; the declaration of `t` and the code
// that consumes `source` are on lines not visible here.
132 void ProspectiveTokenizer::write(const TokenizerString
& source
)
142 m_timeUsed
+= t
.elapsed();
// Returns true when `c` is a space, line feed, carriage return or tab.
146 static inline bool isWhitespace(QChar c
)
148 return c
== ' ' || c
== '\n' || c
== '\r' || c
== '\t';
// Zero-fills the whole m_lastCharacters buffer
// (lastCharactersBufferSize entries of sizeof(QChar) bytes each).
151 inline void ProspectiveTokenizer::clearLastCharacters()
153 memset(m_lastCharacters
, 0, lastCharactersBufferSize
* sizeof(QChar
));
// Records `c` as the most recent character: advances the write index by one,
// wrapping at lastCharactersBufferSize, and stores `c` at the new index.
156 inline void ProspectiveTokenizer::rememberCharacter(QChar c
)
158 m_lastCharacterIndex
= (m_lastCharacterIndex
+ 1) % lastCharactersBufferSize
;
159 m_lastCharacters
[m_lastCharacterIndex
] = c
;
// Tests whether the most recently remembered characters equal the `count`
// characters in `chars`. Comparison starts at the newest entry
// (m_lastCharacterIndex) against chars[count - 1].
// NOTE(review): the surrounding loop, the decrements and the return
// statements are not visible in this view.
162 inline bool ProspectiveTokenizer::lastCharactersMatch(const char* chars
, unsigned count
) const
164 unsigned pos
= m_lastCharacterIndex
;
166 if (chars
[count
- 1] != m_lastCharacters
[pos
])
// Wrap-around: position is moved to the buffer size
// (presumably right before a decrement - not visible here).
170 pos
= lastCharactersBufferSize
;
// Validates a numeric character reference value. The visible condition
// singles out 0, anything above 0x10FFFF, and the range 0xD800..0xDFFF.
// NOTE(review): the values returned for each case are on lines not visible
// in this view.
176 static inline unsigned legalEntityFor(unsigned value
)
178 // FIXME There is a table for more exceptions in the HTML5 specification.
179 if (value
== 0 || value
> 0x10FFFF || (value
>= 0xD800 && value
<= 0xDFFF))
184 unsigned ProspectiveTokenizer::consumeEntity(TokenizerString
& source
, bool& notEnoughCharacters
)
194 EntityState entityState
= Initial
;
196 QVector
<QChar
> seenChars(10);
197 QVector
<char> entityName(10);
199 while (!source
.isEmpty()) {
200 seenChars
.append(*source
);
201 ushort cc
= source
->unicode();
202 switch (entityState
) {
204 if (isWhitespace(cc
) || cc
== '<' || cc
== '&')
207 entityState
= NumberType
;
208 else if ((cc
>= 'a' && cc
<= 'z') || (cc
>= 'A' && cc
<= 'Z')) {
209 entityName
.append(cc
);
215 if (cc
== 'x' || cc
== 'X')
216 entityState
= MaybeHex
;
217 else if (cc
>= '0' && cc
<= '9') {
218 entityState
= Decimal
;
226 if (cc
>= '0' && cc
<= '9')
228 else if (cc
>= 'a' && cc
<= 'f')
229 result
= 10 + cc
- 'a';
230 else if (cc
>= 'A' && cc
<= 'F')
231 result
= 10 + cc
- 'A';
233 source
.push(seenChars
[1]);
240 if (cc
>= '0' && cc
<= '9')
241 result
= result
* 16 + cc
- '0';
242 else if (cc
>= 'a' && cc
<= 'f')
243 result
= result
* 16 + 10 + cc
- 'a';
244 else if (cc
>= 'A' && cc
<= 'F')
245 result
= result
* 16 + 10 + cc
- 'A';
246 else if (cc
== ';') {
248 return legalEntityFor(result
);
250 return legalEntityFor(result
);
253 if (cc
>= '0' && cc
<= '9')
254 result
= result
* 10 + cc
- '0';
255 else if (cc
== ';') {
257 return legalEntityFor(result
);
259 return legalEntityFor(result
);
262 // This is the attribute only version, generic version matches somewhat differently
263 while (entityName
.size() <= 8) {
265 const entity
* e
= kde_findEntity(entityName
.data(), entityName
.size());
272 if (!(cc
>= 'a' && cc
<= 'z') && !(cc
>= 'A' && cc
<= 'Z') && !(cc
>= '0' && cc
<= '9')) {
273 const entity
* e
= kde_findEntity(entityName
.data(), entityName
.size());
278 entityName
.append(cc
);
280 if (source
.isEmpty())
281 goto outOfCharacters
;
282 cc
= source
->unicode();
283 seenChars
.append(cc
);
285 if (seenChars
.size() == 2)
286 source
.push(seenChars
[0]);
287 else if (seenChars
.size() == 3) {
288 source
.push(seenChars
[1]);
289 source
.push(seenChars
[0]);
291 source
.prepend(TokenizerString(QString(seenChars
.data(), seenChars
.size() - 1)));
297 notEnoughCharacters
= true;
298 source
.prepend(TokenizerString(QString(seenChars
.data(), seenChars
.size())));
// Main scanning loop. Appends `source` to the pending input (m_source) and
// runs a simplified HTML5 tokenizer state machine over it, one character
// (`cc`) at a time, until m_source is empty. Must only be called while a scan
// is in progress (asserted).
//
// While scanning, the tag name is collected (lower-cased) in m_tagName and
// attribute names/values in m_attributeName / m_attributeValue; entities
// inside attribute values are resolved through consumeEntity().
//
// NOTE(review): many lines (most `case` labels, `break`/`continue`
// statements, emit calls and braces) are not visible in this view, so the
// comments below only describe statements that are present.
302 void ProspectiveTokenizer::tokenize(const TokenizerString
& source
)
304 assert(m_inProgress
);
306 m_source
.append(source
);
308 // This is a simplified HTML5 Tokenizer
309 // http://www.whatwg.org/specs/web-apps/current-work/#tokenisation0
310 while (!m_source
.isEmpty()) {
311 ushort cc
= m_source
->unicode();
// Keep a short history of characters so "<!--" and "-->" can be recognised.
315 rememberCharacter(cc
);
317 if (m_contentModel
== PCDATA
|| m_contentModel
== RCDATA
) {
318 m_state
= EntityData
;
321 } else if (cc
== '-') {
322 if ((m_contentModel
== RCDATA
|| m_contentModel
== CDATA
) && !m_escape
) {
323 if (lastCharactersMatch("<!--", 4))
326 } else if (cc
== '<') {
327 if (m_contentModel
== PCDATA
|| ((m_contentModel
== RCDATA
|| m_contentModel
== CDATA
) && !m_escape
)) {
331 } else if (cc
== '>') {
332 if ((m_contentModel
== RCDATA
|| m_contentModel
== CDATA
) && m_escape
) {
333 if (lastCharactersMatch("-->", 3))
339 if (m_source
.isEmpty())
341 cc
= m_source
->unicode();
345 // should try to consume the entity but we only care about entities in attributes
349 if (m_contentModel
== RCDATA
|| m_contentModel
== CDATA
) {
351 m_state
= CloseTagOpen
;
356 } else if (m_contentModel
== PCDATA
) {
358 m_state
= MarkupDeclarationOpen
;
360 m_state
= CloseTagOpen
;
// Tag names are stored lower-cased: adding 0x20 maps 'A'..'Z' to 'a'..'z'.
361 else if (cc
>= 'A' && cc
<= 'Z') {
363 m_tagName
.append(cc
+ 0x20);
366 } else if (cc
>= 'a' && cc
<= 'z') {
368 m_tagName
.append(cc
);
371 } else if (cc
== '>') {
373 } else if (cc
== '?') {
374 m_state
= BogusComment
;
// In RCDATA/CDATA content, the upcoming characters are compared
// (lower-cased) against the last start tag name plus one following character.
382 if (m_contentModel
== RCDATA
|| m_contentModel
== CDATA
) {
383 if (!m_lastStartTag
.size()) {
387 if ((unsigned)m_source
.length() < m_lastStartTag
.size() + 1)
389 QVector
<QChar
> tmpString
;
392 for (unsigned n
= 0; n
< m_lastStartTag
.size() + 1; n
++) {
393 tmpChar
= m_source
->toLower();
394 if (n
< m_lastStartTag
.size() && tmpChar
!= m_lastStartTag
[n
])
396 tmpString
.append(tmpChar
);
// The look-ahead characters are put back in front of the remaining input.
399 m_source
.prepend(TokenizerString(QString(tmpString
.data(), tmpString
.size())));
400 if (!match
|| (!isWhitespace(tmpChar
) && tmpChar
!= '>' && tmpChar
!= '/')) {
405 if (cc
>= 'A' && cc
<= 'Z') {
407 m_tagName
.append(cc
+ 0x20);
410 } else if (cc
>= 'a' && cc
<= 'z') {
412 m_tagName
.append(cc
);
415 } else if (cc
== '>') {
418 m_state
= BogusComment
;
422 if (isWhitespace(cc
)) {
423 m_state
= BeforeAttributeName
;
432 m_state
= BeforeAttributeName
;
435 if (cc
>= 'A' && cc
<= 'Z')
436 m_tagName
.append(cc
+ 0x20);
438 m_tagName
.append(cc
);
440 if (m_source
.isEmpty())
442 cc
= m_source
->unicode();
445 case BeforeAttributeName
:
446 if (isWhitespace(cc
))
448 else if (cc
== '>') {
// A new attribute starts: both buffers are emptied before the first
// (lower-cased) name character is stored.
451 } else if (cc
>= 'A' && cc
<= 'Z') {
452 m_attributeName
.clear();
453 m_attributeValue
.clear();
454 m_attributeName
.append(cc
+ 0x20);
455 m_state
= AttributeName
;
456 } else if (cc
== '/')
459 m_attributeName
.clear();
460 m_attributeValue
.clear();
461 m_attributeName
.append(cc
);
462 m_state
= AttributeName
;
467 if (isWhitespace(cc
)) {
468 m_state
= AfterAttributeName
;
472 m_state
= BeforeAttributeValue
;
481 m_state
= BeforeAttributeName
;
484 if (cc
>= 'A' && cc
<= 'Z')
485 m_attributeName
.append(cc
+ 0x20);
487 m_attributeName
.append(cc
);
489 if (m_source
.isEmpty())
491 cc
= m_source
->unicode();
494 case AfterAttributeName
:
495 if (isWhitespace(cc
))
498 m_state
= BeforeAttributeValue
;
499 else if (cc
== '>') {
502 } else if (cc
>= 'A' && cc
<= 'Z') {
503 m_attributeName
.clear();
504 m_attributeValue
.clear();
505 m_attributeName
.append(cc
+ 0x20);
506 m_state
= AttributeName
;
507 } else if (cc
== '/')
508 m_state
= BeforeAttributeName
;
510 m_attributeName
.clear();
511 m_attributeValue
.clear();
512 m_attributeName
.append(cc
);
513 m_state
= AttributeName
;
516 case BeforeAttributeValue
:
517 if (isWhitespace(cc
))
520 m_state
= AttributeValueDoubleQuoted
;
521 else if (cc
== '&') {
522 m_state
= AttributeValueUnquoted
;
524 } else if (cc
== '\'')
525 m_state
= AttributeValueSingleQuoted
;
526 else if (cc
== '>') {
530 m_attributeValue
.append(cc
);
531 m_state
= AttributeValueUnquoted
;
534 case AttributeValueDoubleQuoted
:
538 m_state
= BeforeAttributeName
;
// The current state is remembered so scanning can resume in it once the
// entity has been handled.
542 m_stateBeforeEntityInAttributeValue
= m_state
;
543 m_state
= EntityInAttributeValue
;
546 m_attributeValue
.append(cc
);
548 if (m_source
.isEmpty())
550 cc
= m_source
->unicode();
553 case AttributeValueSingleQuoted
:
557 m_state
= BeforeAttributeName
;
561 m_stateBeforeEntityInAttributeValue
= m_state
;
562 m_state
= EntityInAttributeValue
;
565 m_attributeValue
.append(cc
);
567 if (m_source
.isEmpty())
569 cc
= m_source
->unicode();
572 case AttributeValueUnquoted
:
574 if (isWhitespace(cc
)) {
576 m_state
= BeforeAttributeName
;
580 m_stateBeforeEntityInAttributeValue
= m_state
;
581 m_state
= EntityInAttributeValue
;
590 m_attributeValue
.append(cc
);
592 if (m_source
.isEmpty())
594 cc
= m_source
->unicode();
597 case EntityInAttributeValue
:
599 bool notEnoughCharacters
= false;
600 unsigned entity
= consumeEntity(m_source
, notEnoughCharacters
);
601 if (notEnoughCharacters
)
// Values above 0xFFFF are appended as two UTF-16 code units.
603 if (entity
> 0xFFFF) {
604 m_attributeValue
.append(U16_LEAD(entity
));
605 m_attributeValue
.append(U16_TRAIL(entity
));
607 m_attributeValue
.append(entity
);
609 m_attributeValue
.append('&');
611 m_state
= m_stateBeforeEntityInAttributeValue
;
620 if (m_source
.isEmpty())
622 cc
= m_source
->unicode();
625 case MarkupDeclarationOpen
: {
627 if (m_source
.length() < 2)
630 cc
= m_source
->unicode();
632 m_state
= CommentStart
;
634 m_state
= BogusComment
;
637 // If we cared about the DOCTYPE we would test to enter those states here
639 m_state
= BogusComment
;
646 m_state
= CommentStartDash
;
652 case CommentStartDash
:
654 m_state
= CommentEnd
;
663 m_state
= CommentEndDash
;
667 if (m_source
.isEmpty())
669 cc
= m_source
->unicode();
674 m_state
= CommentEnd
;
// Examines the attribute that was just tokenized and records what matters
// for preloading:
//  - a `src` value becomes m_urlToLoad (only if no URL has been recorded yet);
//  - an `href` value becomes m_urlToLoad (same condition);
//  - a `rel` value decides m_linkIsStyleSheet: it must mention "stylesheet"
//    and must not mention "alternate" or "icon".
//
// Fixes:
//  - The attribute id was resolved from m_tagName (a copy of the tag lookup),
//    so it could never equal ATTR_SRC / ATTR_HREF / ATTR_REL. It is now
//    resolved from m_attributeName.
//  - The `rel` test was a case-sensitive match on "styleSheet", which does
//    not match the usual rel="stylesheet". All three keyword tests are now
//    case-insensitive.
//
// NOTE(review): the lines that select between the src and href/rel handling
// (presumably a switch on `tag`) are not visible in this view; the surviving
// lines are kept verbatim.
691 void ProspectiveTokenizer::processAttribute()
693 LocalName tagLocal
= LocalName::fromString(DOMString(m_tagName
.data(), m_tagName
.size()).lower());
694 uint tag
= tagLocal
.id();
695 LocalName attrLocal
= LocalName::fromString(DOMString(m_attributeName
.data(), m_attributeName
.size()).lower());
696 uint attribute
= attrLocal
.id();
698 const DOMString value
= DOMString(m_attributeValue
.data(), m_attributeValue
.size()); // ####
// Only the first URL found on a tag is kept.
703 if (attribute
== ATTR_SRC
&& m_urlToLoad
.isEmpty())
704 m_urlToLoad
= parseURL(value
);
707 if (attribute
== ATTR_HREF
&& m_urlToLoad
.isEmpty())
708 m_urlToLoad
= parseURL(value
);
709 else if (attribute
== ATTR_REL
) {
710 QString val
= value
.string();
711 m_linkIsStyleSheet
= val
.contains("stylesheet", Qt::CaseInsensitive) && !val
.contains("alternate", Qt::CaseInsensitive) && !val
.contains("icon", Qt::CaseInsensitive);
// Handles one character of text content. The visible condition selects
// characters seen in CDATA content whose last start tag was <style>.
// NOTE(review): the guarded statement is not visible in this view
// (presumably a call to tokenizeCSS(c) - confirm).
718 inline void ProspectiveTokenizer::emitCharacter(QChar c
)
720 if (m_contentModel
== CDATA
&& m_lastStartTagId
== ID_STYLE
)
// Minimal CSS scanner driven one character at a time through m_cssState.
// It steps between comment states, rule-name states and rule-value states,
// collecting the rule value in m_cssRuleValue, and falls back to CSSInitial
// otherwise.
// NOTE(review): most `case` labels and the character tests guarding each
// transition are not visible in this view.
724 inline void ProspectiveTokenizer::tokenizeCSS(QChar c
)
726 // We are just interested in @import rules, no need for real tokenization here
727 // Searching for other types of resources is probably low payoff
728 switch (m_cssState
) {
731 m_cssState
= CSSRuleStart
;
733 m_cssState
= CSSMaybeComment
;
735 case CSSMaybeComment
:
737 m_cssState
= CSSComment
;
739 m_cssState
= CSSInitial
;
743 m_cssState
= CSSMaybeCommentEnd
;
745 case CSSMaybeCommentEnd
:
747 m_cssState
= CSSInitial
;
751 m_cssState
= CSSComment
;
// A rule name must begin with an ASCII letter; the value buffer is emptied
// when a new rule starts.
754 if ((c
>= 'A' && c
<= 'Z') || (c
>= 'a' && c
<= 'z')) {
756 m_cssRuleValue
.clear();
758 m_cssState
= CSSRule
;
760 m_cssState
= CSSInitial
;
764 m_cssState
= CSSAfterRule
;
766 m_cssState
= CSSInitial
;
774 m_cssState
= CSSInitial
;
776 m_cssState
= CSSRuleValue
;
777 m_cssRuleValue
.append(c
);
782 m_cssState
= CSSAferRuleValue
;
785 m_cssState
= CSSInitial
;
787 m_cssRuleValue
.append(c
);
789 case CSSAferRuleValue
:
794 m_cssState
= CSSInitial
;
797 m_cssState
= CSSInitial
;
// Handles a completed tag.
// The visible statements do the following:
//  - reset the content model to PCDATA, the CSS state to CSSInitial and the
//    recent-characters buffer (NOTE(review): presumably the close-tag path -
//    the guarding condition is not visible);
//  - record the tag as the last start tag (name and id);
//  - choose the content model for the text that follows (RCDATA, CDATA,
//    PLAINTEXT or PCDATA; the per-tag conditions are not visible);
//  - if a URL was collected in m_urlToLoad, request it through the document's
//    loader as a script, an image, or (for <link> flagged as a stylesheet) a
//    style sheet, and register the result as a preload;
//  - finally clear m_urlToLoad and m_linkIsStyleSheet.
803 void ProspectiveTokenizer::emitTag()
806 m_contentModel
= PCDATA
;
807 m_cssState
= CSSInitial
;
808 clearLastCharacters();
812 LocalName tagLocal
= LocalName::fromString(DOMString(m_tagName
.data(), m_tagName
.size()));
813 uint tag
= tagLocal
.id();
814 m_lastStartTagId
= tag
;
815 m_lastStartTag
= m_tagName
;
820 m_contentModel
= RCDATA
;
828 m_contentModel
= CDATA
;
831 // we wouldn't be here if scripts were disabled
832 m_contentModel
= CDATA
;
835 m_contentModel
= PLAINTEXT
;
838 m_contentModel
= PCDATA
;
// Nothing to preload for this tag.
841 if (m_urlToLoad
.isEmpty()) {
842 m_linkIsStyleSheet
= false;
847 if (tag
== ID_SCRIPT
)
848 o
= m_document
->docLoader()->requestScript( m_urlToLoad
, m_document
->part()->encoding() );
849 else if (tag
== ID_IMAGE
|| tag
== ID_IMG
)
850 o
= m_document
->docLoader()->requestImage( m_urlToLoad
);
851 else if (tag
== ID_LINK
&& m_linkIsStyleSheet
)
852 o
= m_document
->docLoader()->requestStyleSheet( m_urlToLoad
, m_document
->part()->encoding() );
855 m_document
->docLoader()->registerPreload( o
);
// Per-tag preload state is cleared for the next tag.
857 m_urlToLoad
= QString();
858 m_linkIsStyleSheet
= false;
// Handles a completed CSS at-rule. When the rule name is "import"
// (compared case-insensitively) and a value was collected, the value is run
// through parseURL() and the resulting style sheet is requested from the
// document's loader and registered as a preload. The collected rule value is
// cleared afterwards.
//
// Fix: the request used m_urlToLoad, which belongs to HTML tag attribute
// handling and is cleared by emitTag(), while the freshly parsed `url` was
// never used. The request now uses `url`.
//
// NOTE(review): at least one line between the parseURL() call and the request
// is not visible in this view (presumably an emptiness check on `url`).
861 void ProspectiveTokenizer::emitCSSRule()
863 QString
rule(m_cssRule
.data(), m_cssRule
.size());
864 if (rule
.toLower() == "import" && !m_cssRuleValue
.isEmpty()) {
865 DOMString value
= DOMString(m_cssRuleValue
.data(), m_cssRuleValue
.size());
866 DOMString url
= parseURL(value
);
868 m_document
->docLoader()->registerPreload( m_document
->docLoader()->requestStyleSheet( url
, m_document
->part()->encoding() ) ); // #### charset
871 m_cssRuleValue
.clear();