2 * Copyright (C) 2008 Apple Inc. All Rights Reserved.
3 * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/
4 * Copyright (C) 2010 Google, Inc. All Rights Reserved.
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
15 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
16 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
19 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 #include "core/html/parser/HTMLTokenizer.h"
31 #include "core/HTMLNames.h"
32 #include "core/HTMLTokenizerNames.h"
33 #include "core/html/parser/HTMLEntityParser.h"
34 #include "core/html/parser/HTMLParserIdioms.h"
35 #include "core/html/parser/HTMLTreeBuilder.h"
36 #include "core/xml/parser/MarkupTokenizerInlines.h"
37 #include "platform/NotImplemented.h"
38 #include "wtf/ASCIICType.h"
39 #include "wtf/text/AtomicString.h"
40 #include "wtf/text/Unicode.h"
42 // Please don't use DEFINE_STATIC_LOCAL in this file. The HTMLTokenizer is used
43 // from multiple threads and DEFINE_STATIC_LOCAL isn't threadsafe.
44 #undef DEFINE_STATIC_LOCAL
48 using namespace HTMLNames
;
50 // This has to go in a .cpp file, as the linker doesn't like it being included more than once.
51 // We don't have an HTMLToken.cpp though, so this is the next best place.
52 QualifiedName
AtomicHTMLToken::nameForAttribute(const HTMLToken::Attribute
& attribute
) const
54 return QualifiedName(nullAtom
, AtomicString(attribute
.name
), nullAtom
);
57 bool AtomicHTMLToken::usesName() const
59 return m_type
== HTMLToken::StartTag
|| m_type
== HTMLToken::EndTag
|| m_type
== HTMLToken::DOCTYPE
;
62 bool AtomicHTMLToken::usesAttributes() const
64 return m_type
== HTMLToken::StartTag
|| m_type
== HTMLToken::EndTag
;
67 static inline UChar
toLowerCase(UChar cc
)
69 ASSERT(isASCIIUpper(cc
));
70 const int lowerCaseOffset
= 0x20;
71 return cc
+ lowerCaseOffset
;
// Compares the contents of an inline LChar buffer against a String.
// The buffer's size is checked against string.length() first; the final
// comparison goes through equal() on string.impl() and the buffer's data,
// over vector.size() elements.
// NOTE(review): the statements between the length check and the final
// comparison are not visible here; presumably a length mismatch yields
// false -- confirm against the full file.
74 static inline bool vectorEqualsString(const Vector
<LChar
, 32>& vector
, const String
& string
)
76 if (vector
.size() != string
.length())
82 return equal(string
.impl(), vector
.data(), vector
.size());
// Classifies a tokenizer state by whether it is one of the end-tag
// "open"/"name" states of RCDATA, RAWTEXT, script data or escaped script
// data. nextToken() consults this before turning a non-empty
// m_bufferedEndTagName into an end tag.
// NOTE(review): the return statements are not visible here; presumably the
// listed states yield true and every other state false -- confirm.
85 static inline bool isEndTagBufferingState(HTMLTokenizer::State state
)
88 case HTMLTokenizer::RCDATAEndTagOpenState
:
89 case HTMLTokenizer::RCDATAEndTagNameState
:
90 case HTMLTokenizer::RAWTEXTEndTagOpenState
:
91 case HTMLTokenizer::RAWTEXTEndTagNameState
:
92 case HTMLTokenizer::ScriptDataEndTagOpenState
:
93 case HTMLTokenizer::ScriptDataEndTagNameState
:
94 case HTMLTokenizer::ScriptDataEscapedEndTagOpenState
:
95 case HTMLTokenizer::ScriptDataEscapedEndTagNameState
:
// Shorthand wrappers around the generic BEGIN_STATE / RECONSUME_IN /
// ADVANCE_TO / SWITCH_TO macros (presumably provided by
// MarkupTokenizerInlines.h). Each one supplies HTMLTokenizer as the first
// argument so the state machine only has to name the state.
102 #define HTML_BEGIN_STATE(stateName) BEGIN_STATE(HTMLTokenizer, stateName)
103 #define HTML_RECONSUME_IN(stateName) RECONSUME_IN(HTMLTokenizer, stateName)
104 #define HTML_ADVANCE_TO(stateName) ADVANCE_TO(HTMLTokenizer, stateName)
105 #define HTML_SWITCH_TO(stateName) SWITCH_TO(HTMLTokenizer, stateName)
// Constructs a tokenizer. The input stream preprocessor member is
// initialised with a pointer back to this tokenizer.
// NOTE(review): the remainder of the initializer list and the constructor
// body are not visible here, so how `options` is consumed cannot be
// confirmed from this view.
107 HTMLTokenizer::HTMLTokenizer(const HTMLParserOptions
& options
)
108 : m_inputStreamPreprocessor(this)
// Destructor. NOTE(review): no body statements are visible here; presumably
// it is empty -- confirm.
114 HTMLTokenizer::~HTMLTokenizer()
// Puts the tokenizer back into its starting configuration: the state machine
// returns to DataState, forced null-character replacement and CDATA support
// are switched off, and the additional allowed character is cleared to '\0'.
118 void HTMLTokenizer::reset()
120 m_state
= HTMLTokenizer::DataState
;
122 m_forceNullCharacterReplacement
= false;
123 m_shouldAllowCDATA
= false;
124 m_additionalAllowedCharacter
= '\0';
// Tries to decode a character reference from `source` with
// consumeHTMLEntity() and feeds the outcome into the character buffer.
// Two outcomes are visible below: a literal '&' is buffered after asserting
// that nothing was decoded, and each decoded code unit is buffered in order.
// Callers in nextToken() treat a false return as "stop for now" and return
// haveBufferedCharacterToken().
// NOTE(review): the branch conditions choosing between those outcomes and
// the return statements are not visible here; presumably they key on
// `success` and `notEnoughCharacters` -- confirm.
127 inline bool HTMLTokenizer::processEntity(SegmentedString
& source
)
129 bool notEnoughCharacters
= false;
130 DecodedHTMLEntity decodedEntity
;
131 bool success
= consumeHTMLEntity(source
, decodedEntity
, notEnoughCharacters
);
132 if (notEnoughCharacters
)
135 ASSERT(decodedEntity
.isEmpty());
136 bufferCharacter('&');
138 for (unsigned i
= 0; i
< decodedEntity
.length
; ++i
)
139 bufferCharacter(decodedEntity
.data
[i
]);
// Consumes one character from `source` (updating the line number) and turns
// the buffered end-tag name into an end tag on the current token, then clears
// m_bufferedEndTagName, m_appropriateEndTagName and m_temporaryBuffer.
// On entry the current token must be either a Character token or
// Uninitialized (asserted).
// NOTE(review): the statement guarded by the Character-token check and the
// return values are not visible here; FLUSH_AND_ADVANCE_TO branches on the
// result -- confirm what true/false mean against the full file.
144 bool HTMLTokenizer::flushBufferedEndTag(SegmentedString
& source
)
146 ASSERT(m_token
->type() == HTMLToken::Character
|| m_token
->type() == HTMLToken::Uninitialized
);
147 source
.advanceAndUpdateLineNumber();
148 if (m_token
->type() == HTMLToken::Character
)
150 m_token
->beginEndTag(m_bufferedEndTagName
);
151 m_bufferedEndTagName
.clear();
152 m_appropriateEndTagName
.clear();
153 m_temporaryBuffer
.clear();
// Used by the *EndTagName states once an appropriate end tag has been
// recognised: records the named state in m_state, calls
// flushBufferedEndTag(source) and branches on its result. The visible
// continuation lines return haveBufferedCharacterToken() when the source is
// empty or the preprocessor cannot peek, and otherwise load the next input
// character into `cc`. Expects `source` and `cc` to be in scope at the
// expansion site.
// NOTE(review): several continuation lines of this macro are not visible
// here -- confirm the full control flow against the complete file.
157 #define FLUSH_AND_ADVANCE_TO(stateName) \
159 m_state = HTMLTokenizer::stateName; \
160 if (flushBufferedEndTag(source)) \
162 if (source.isEmpty() \
163 || !m_inputStreamPreprocessor.peek(source)) \
164 return haveBufferedCharacterToken(); \
165 cc = m_inputStreamPreprocessor.nextInputCharacter(); \
// Flushes the buffered end tag via flushBufferedEndTag(source). The
// *EndTagName states in nextToken() return this function's result directly,
// passing DataState as `state`, when '>' closes an appropriate end tag.
// NOTE(review): how `state` is applied and which value is returned are not
// visible here -- confirm against the full file.
169 bool HTMLTokenizer::flushEmitAndResumeIn(SegmentedString
& source
, HTMLTokenizer::State state
)
172 flushBufferedEndTag(source
);
176 bool HTMLTokenizer::nextToken(SegmentedString
& source
, HTMLToken
& token
)
178 // If we have a token in progress, then we're supposed to be called back
179 // with the same token so we can finish it.
180 ASSERT(!m_token
|| m_token
== &token
|| token
.type() == HTMLToken::Uninitialized
);
183 if (!m_bufferedEndTagName
.isEmpty() && !isEndTagBufferingState(m_state
)) {
184 // FIXME: This should call flushBufferedEndTag().
185 // We started an end tag during our last iteration.
186 m_token
->beginEndTag(m_bufferedEndTagName
);
187 m_bufferedEndTagName
.clear();
188 m_appropriateEndTagName
.clear();
189 m_temporaryBuffer
.clear();
190 if (m_state
== HTMLTokenizer::DataState
) {
191 // We're back in the data state, so we must be done with the tag.
196 if (source
.isEmpty() || !m_inputStreamPreprocessor
.peek(source
))
197 return haveBufferedCharacterToken();
198 UChar cc
= m_inputStreamPreprocessor
.nextInputCharacter();
200 // Source: http://www.whatwg.org/specs/web-apps/current-work/#tokenisation0
202 HTML_BEGIN_STATE(DataState
) {
204 HTML_ADVANCE_TO(CharacterReferenceInDataState
);
205 else if (cc
== '<') {
206 if (m_token
->type() == HTMLToken::Character
) {
207 // We have a bunch of character tokens queued up that we
208 // are emitting lazily here.
211 HTML_ADVANCE_TO(TagOpenState
);
212 } else if (cc
== kEndOfFileMarker
)
213 return emitEndOfFile(source
);
216 HTML_ADVANCE_TO(DataState
);
221 HTML_BEGIN_STATE(CharacterReferenceInDataState
) {
222 if (!processEntity(source
))
223 return haveBufferedCharacterToken();
224 HTML_SWITCH_TO(DataState
);
228 HTML_BEGIN_STATE(RCDATAState
) {
230 HTML_ADVANCE_TO(CharacterReferenceInRCDATAState
);
232 HTML_ADVANCE_TO(RCDATALessThanSignState
);
233 else if (cc
== kEndOfFileMarker
)
234 return emitEndOfFile(source
);
237 HTML_ADVANCE_TO(RCDATAState
);
242 HTML_BEGIN_STATE(CharacterReferenceInRCDATAState
) {
243 if (!processEntity(source
))
244 return haveBufferedCharacterToken();
245 HTML_SWITCH_TO(RCDATAState
);
249 HTML_BEGIN_STATE(RAWTEXTState
) {
251 HTML_ADVANCE_TO(RAWTEXTLessThanSignState
);
252 else if (cc
== kEndOfFileMarker
)
253 return emitEndOfFile(source
);
256 HTML_ADVANCE_TO(RAWTEXTState
);
261 HTML_BEGIN_STATE(ScriptDataState
) {
263 HTML_ADVANCE_TO(ScriptDataLessThanSignState
);
264 else if (cc
== kEndOfFileMarker
)
265 return emitEndOfFile(source
);
268 HTML_ADVANCE_TO(ScriptDataState
);
273 HTML_BEGIN_STATE(PLAINTEXTState
) {
274 if (cc
== kEndOfFileMarker
)
275 return emitEndOfFile(source
);
277 HTML_ADVANCE_TO(PLAINTEXTState
);
281 HTML_BEGIN_STATE(TagOpenState
) {
283 HTML_ADVANCE_TO(MarkupDeclarationOpenState
);
285 HTML_ADVANCE_TO(EndTagOpenState
);
286 else if (isASCIIUpper(cc
)) {
287 m_token
->beginStartTag(toLowerCase(cc
));
288 HTML_ADVANCE_TO(TagNameState
);
289 } else if (isASCIILower(cc
)) {
290 m_token
->beginStartTag(cc
);
291 HTML_ADVANCE_TO(TagNameState
);
292 } else if (cc
== '?') {
294 // The spec consumes the current character before switching
295 // to the bogus comment state, but it's easier to implement
296 // if we reconsume the current character.
297 HTML_RECONSUME_IN(BogusCommentState
);
300 bufferCharacter('<');
301 HTML_RECONSUME_IN(DataState
);
306 HTML_BEGIN_STATE(EndTagOpenState
) {
307 if (isASCIIUpper(cc
)) {
308 m_token
->beginEndTag(static_cast<LChar
>(toLowerCase(cc
)));
309 m_appropriateEndTagName
.clear();
310 HTML_ADVANCE_TO(TagNameState
);
311 } else if (isASCIILower(cc
)) {
312 m_token
->beginEndTag(static_cast<LChar
>(cc
));
313 m_appropriateEndTagName
.clear();
314 HTML_ADVANCE_TO(TagNameState
);
315 } else if (cc
== '>') {
317 HTML_ADVANCE_TO(DataState
);
318 } else if (cc
== kEndOfFileMarker
) {
320 bufferCharacter('<');
321 bufferCharacter('/');
322 HTML_RECONSUME_IN(DataState
);
325 HTML_RECONSUME_IN(BogusCommentState
);
330 HTML_BEGIN_STATE(TagNameState
) {
331 if (isTokenizerWhitespace(cc
))
332 HTML_ADVANCE_TO(BeforeAttributeNameState
);
334 HTML_ADVANCE_TO(SelfClosingStartTagState
);
336 return emitAndResumeIn(source
, HTMLTokenizer::DataState
);
337 else if (isASCIIUpper(cc
)) {
338 m_token
->appendToName(toLowerCase(cc
));
339 HTML_ADVANCE_TO(TagNameState
);
340 } else if (cc
== kEndOfFileMarker
) {
342 HTML_RECONSUME_IN(DataState
);
344 m_token
->appendToName(cc
);
345 HTML_ADVANCE_TO(TagNameState
);
350 HTML_BEGIN_STATE(RCDATALessThanSignState
) {
352 m_temporaryBuffer
.clear();
353 ASSERT(m_bufferedEndTagName
.isEmpty());
354 HTML_ADVANCE_TO(RCDATAEndTagOpenState
);
356 bufferCharacter('<');
357 HTML_RECONSUME_IN(RCDATAState
);
362 HTML_BEGIN_STATE(RCDATAEndTagOpenState
) {
363 if (isASCIIUpper(cc
)) {
364 m_temporaryBuffer
.append(static_cast<LChar
>(cc
));
365 addToPossibleEndTag(static_cast<LChar
>(toLowerCase(cc
)));
366 HTML_ADVANCE_TO(RCDATAEndTagNameState
);
367 } else if (isASCIILower(cc
)) {
368 m_temporaryBuffer
.append(static_cast<LChar
>(cc
));
369 addToPossibleEndTag(static_cast<LChar
>(cc
));
370 HTML_ADVANCE_TO(RCDATAEndTagNameState
);
372 bufferCharacter('<');
373 bufferCharacter('/');
374 HTML_RECONSUME_IN(RCDATAState
);
379 HTML_BEGIN_STATE(RCDATAEndTagNameState
) {
380 if (isASCIIUpper(cc
)) {
381 m_temporaryBuffer
.append(static_cast<LChar
>(cc
));
382 addToPossibleEndTag(static_cast<LChar
>(toLowerCase(cc
)));
383 HTML_ADVANCE_TO(RCDATAEndTagNameState
);
384 } else if (isASCIILower(cc
)) {
385 m_temporaryBuffer
.append(static_cast<LChar
>(cc
));
386 addToPossibleEndTag(static_cast<LChar
>(cc
));
387 HTML_ADVANCE_TO(RCDATAEndTagNameState
);
389 if (isTokenizerWhitespace(cc
)) {
390 if (isAppropriateEndTag()) {
391 m_temporaryBuffer
.append(static_cast<LChar
>(cc
));
392 FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState
);
394 } else if (cc
== '/') {
395 if (isAppropriateEndTag()) {
396 m_temporaryBuffer
.append(static_cast<LChar
>(cc
));
397 FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState
);
399 } else if (cc
== '>') {
400 if (isAppropriateEndTag()) {
401 m_temporaryBuffer
.append(static_cast<LChar
>(cc
));
402 return flushEmitAndResumeIn(source
, HTMLTokenizer::DataState
);
405 bufferCharacter('<');
406 bufferCharacter('/');
407 m_token
->appendToCharacter(m_temporaryBuffer
);
408 m_bufferedEndTagName
.clear();
409 m_temporaryBuffer
.clear();
410 HTML_RECONSUME_IN(RCDATAState
);
415 HTML_BEGIN_STATE(RAWTEXTLessThanSignState
) {
417 m_temporaryBuffer
.clear();
418 ASSERT(m_bufferedEndTagName
.isEmpty());
419 HTML_ADVANCE_TO(RAWTEXTEndTagOpenState
);
421 bufferCharacter('<');
422 HTML_RECONSUME_IN(RAWTEXTState
);
427 HTML_BEGIN_STATE(RAWTEXTEndTagOpenState
) {
428 if (isASCIIUpper(cc
)) {
429 m_temporaryBuffer
.append(static_cast<LChar
>(cc
));
430 addToPossibleEndTag(static_cast<LChar
>(toLowerCase(cc
)));
431 HTML_ADVANCE_TO(RAWTEXTEndTagNameState
);
432 } else if (isASCIILower(cc
)) {
433 m_temporaryBuffer
.append(static_cast<LChar
>(cc
));
434 addToPossibleEndTag(static_cast<LChar
>(cc
));
435 HTML_ADVANCE_TO(RAWTEXTEndTagNameState
);
437 bufferCharacter('<');
438 bufferCharacter('/');
439 HTML_RECONSUME_IN(RAWTEXTState
);
444 HTML_BEGIN_STATE(RAWTEXTEndTagNameState
) {
445 if (isASCIIUpper(cc
)) {
446 m_temporaryBuffer
.append(static_cast<LChar
>(cc
));
447 addToPossibleEndTag(static_cast<LChar
>(toLowerCase(cc
)));
448 HTML_ADVANCE_TO(RAWTEXTEndTagNameState
);
449 } else if (isASCIILower(cc
)) {
450 m_temporaryBuffer
.append(static_cast<LChar
>(cc
));
451 addToPossibleEndTag(static_cast<LChar
>(cc
));
452 HTML_ADVANCE_TO(RAWTEXTEndTagNameState
);
454 if (isTokenizerWhitespace(cc
)) {
455 if (isAppropriateEndTag()) {
456 m_temporaryBuffer
.append(static_cast<LChar
>(cc
));
457 FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState
);
459 } else if (cc
== '/') {
460 if (isAppropriateEndTag()) {
461 m_temporaryBuffer
.append(static_cast<LChar
>(cc
));
462 FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState
);
464 } else if (cc
== '>') {
465 if (isAppropriateEndTag()) {
466 m_temporaryBuffer
.append(static_cast<LChar
>(cc
));
467 return flushEmitAndResumeIn(source
, HTMLTokenizer::DataState
);
470 bufferCharacter('<');
471 bufferCharacter('/');
472 m_token
->appendToCharacter(m_temporaryBuffer
);
473 m_bufferedEndTagName
.clear();
474 m_temporaryBuffer
.clear();
475 HTML_RECONSUME_IN(RAWTEXTState
);
480 HTML_BEGIN_STATE(ScriptDataLessThanSignState
) {
482 m_temporaryBuffer
.clear();
483 ASSERT(m_bufferedEndTagName
.isEmpty());
484 HTML_ADVANCE_TO(ScriptDataEndTagOpenState
);
485 } else if (cc
== '!') {
486 bufferCharacter('<');
487 bufferCharacter('!');
488 HTML_ADVANCE_TO(ScriptDataEscapeStartState
);
490 bufferCharacter('<');
491 HTML_RECONSUME_IN(ScriptDataState
);
496 HTML_BEGIN_STATE(ScriptDataEndTagOpenState
) {
497 if (isASCIIUpper(cc
)) {
498 m_temporaryBuffer
.append(static_cast<LChar
>(cc
));
499 addToPossibleEndTag(static_cast<LChar
>(toLowerCase(cc
)));
500 HTML_ADVANCE_TO(ScriptDataEndTagNameState
);
501 } else if (isASCIILower(cc
)) {
502 m_temporaryBuffer
.append(static_cast<LChar
>(cc
));
503 addToPossibleEndTag(static_cast<LChar
>(cc
));
504 HTML_ADVANCE_TO(ScriptDataEndTagNameState
);
506 bufferCharacter('<');
507 bufferCharacter('/');
508 HTML_RECONSUME_IN(ScriptDataState
);
513 HTML_BEGIN_STATE(ScriptDataEndTagNameState
) {
514 if (isASCIIUpper(cc
)) {
515 m_temporaryBuffer
.append(static_cast<LChar
>(cc
));
516 addToPossibleEndTag(static_cast<LChar
>(toLowerCase(cc
)));
517 HTML_ADVANCE_TO(ScriptDataEndTagNameState
);
518 } else if (isASCIILower(cc
)) {
519 m_temporaryBuffer
.append(static_cast<LChar
>(cc
));
520 addToPossibleEndTag(static_cast<LChar
>(cc
));
521 HTML_ADVANCE_TO(ScriptDataEndTagNameState
);
523 if (isTokenizerWhitespace(cc
)) {
524 if (isAppropriateEndTag()) {
525 m_temporaryBuffer
.append(static_cast<LChar
>(cc
));
526 FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState
);
528 } else if (cc
== '/') {
529 if (isAppropriateEndTag()) {
530 m_temporaryBuffer
.append(static_cast<LChar
>(cc
));
531 FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState
);
533 } else if (cc
== '>') {
534 if (isAppropriateEndTag()) {
535 m_temporaryBuffer
.append(static_cast<LChar
>(cc
));
536 return flushEmitAndResumeIn(source
, HTMLTokenizer::DataState
);
539 bufferCharacter('<');
540 bufferCharacter('/');
541 m_token
->appendToCharacter(m_temporaryBuffer
);
542 m_bufferedEndTagName
.clear();
543 m_temporaryBuffer
.clear();
544 HTML_RECONSUME_IN(ScriptDataState
);
549 HTML_BEGIN_STATE(ScriptDataEscapeStartState
) {
552 HTML_ADVANCE_TO(ScriptDataEscapeStartDashState
);
554 HTML_RECONSUME_IN(ScriptDataState
);
558 HTML_BEGIN_STATE(ScriptDataEscapeStartDashState
) {
561 HTML_ADVANCE_TO(ScriptDataEscapedDashDashState
);
563 HTML_RECONSUME_IN(ScriptDataState
);
567 HTML_BEGIN_STATE(ScriptDataEscapedState
) {
570 HTML_ADVANCE_TO(ScriptDataEscapedDashState
);
571 } else if (cc
== '<')
572 HTML_ADVANCE_TO(ScriptDataEscapedLessThanSignState
);
573 else if (cc
== kEndOfFileMarker
) {
575 HTML_RECONSUME_IN(DataState
);
578 HTML_ADVANCE_TO(ScriptDataEscapedState
);
583 HTML_BEGIN_STATE(ScriptDataEscapedDashState
) {
586 HTML_ADVANCE_TO(ScriptDataEscapedDashDashState
);
587 } else if (cc
== '<')
588 HTML_ADVANCE_TO(ScriptDataEscapedLessThanSignState
);
589 else if (cc
== kEndOfFileMarker
) {
591 HTML_RECONSUME_IN(DataState
);
594 HTML_ADVANCE_TO(ScriptDataEscapedState
);
599 HTML_BEGIN_STATE(ScriptDataEscapedDashDashState
) {
602 HTML_ADVANCE_TO(ScriptDataEscapedDashDashState
);
603 } else if (cc
== '<')
604 HTML_ADVANCE_TO(ScriptDataEscapedLessThanSignState
);
605 else if (cc
== '>') {
607 HTML_ADVANCE_TO(ScriptDataState
);
608 } else if (cc
== kEndOfFileMarker
) {
610 HTML_RECONSUME_IN(DataState
);
613 HTML_ADVANCE_TO(ScriptDataEscapedState
);
618 HTML_BEGIN_STATE(ScriptDataEscapedLessThanSignState
) {
620 m_temporaryBuffer
.clear();
621 ASSERT(m_bufferedEndTagName
.isEmpty());
622 HTML_ADVANCE_TO(ScriptDataEscapedEndTagOpenState
);
623 } else if (isASCIIUpper(cc
)) {
624 bufferCharacter('<');
626 m_temporaryBuffer
.clear();
627 m_temporaryBuffer
.append(toLowerCase(cc
));
628 HTML_ADVANCE_TO(ScriptDataDoubleEscapeStartState
);
629 } else if (isASCIILower(cc
)) {
630 bufferCharacter('<');
632 m_temporaryBuffer
.clear();
633 m_temporaryBuffer
.append(static_cast<LChar
>(cc
));
634 HTML_ADVANCE_TO(ScriptDataDoubleEscapeStartState
);
636 bufferCharacter('<');
637 HTML_RECONSUME_IN(ScriptDataEscapedState
);
642 HTML_BEGIN_STATE(ScriptDataEscapedEndTagOpenState
) {
643 if (isASCIIUpper(cc
)) {
644 m_temporaryBuffer
.append(static_cast<LChar
>(cc
));
645 addToPossibleEndTag(static_cast<LChar
>(toLowerCase(cc
)));
646 HTML_ADVANCE_TO(ScriptDataEscapedEndTagNameState
);
647 } else if (isASCIILower(cc
)) {
648 m_temporaryBuffer
.append(static_cast<LChar
>(cc
));
649 addToPossibleEndTag(static_cast<LChar
>(cc
));
650 HTML_ADVANCE_TO(ScriptDataEscapedEndTagNameState
);
652 bufferCharacter('<');
653 bufferCharacter('/');
654 HTML_RECONSUME_IN(ScriptDataEscapedState
);
659 HTML_BEGIN_STATE(ScriptDataEscapedEndTagNameState
) {
660 if (isASCIIUpper(cc
)) {
661 m_temporaryBuffer
.append(static_cast<LChar
>(cc
));
662 addToPossibleEndTag(static_cast<LChar
>(toLowerCase(cc
)));
663 HTML_ADVANCE_TO(ScriptDataEscapedEndTagNameState
);
664 } else if (isASCIILower(cc
)) {
665 m_temporaryBuffer
.append(static_cast<LChar
>(cc
));
666 addToPossibleEndTag(static_cast<LChar
>(cc
));
667 HTML_ADVANCE_TO(ScriptDataEscapedEndTagNameState
);
669 if (isTokenizerWhitespace(cc
)) {
670 if (isAppropriateEndTag()) {
671 m_temporaryBuffer
.append(static_cast<LChar
>(cc
));
672 FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState
);
674 } else if (cc
== '/') {
675 if (isAppropriateEndTag()) {
676 m_temporaryBuffer
.append(static_cast<LChar
>(cc
));
677 FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState
);
679 } else if (cc
== '>') {
680 if (isAppropriateEndTag()) {
681 m_temporaryBuffer
.append(static_cast<LChar
>(cc
));
682 return flushEmitAndResumeIn(source
, HTMLTokenizer::DataState
);
685 bufferCharacter('<');
686 bufferCharacter('/');
687 m_token
->appendToCharacter(m_temporaryBuffer
);
688 m_bufferedEndTagName
.clear();
689 m_temporaryBuffer
.clear();
690 HTML_RECONSUME_IN(ScriptDataEscapedState
);
695 HTML_BEGIN_STATE(ScriptDataDoubleEscapeStartState
) {
696 if (isTokenizerWhitespace(cc
) || cc
== '/' || cc
== '>') {
698 if (temporaryBufferIs(scriptTag
.localName()))
699 HTML_ADVANCE_TO(ScriptDataDoubleEscapedState
);
701 HTML_ADVANCE_TO(ScriptDataEscapedState
);
702 } else if (isASCIIUpper(cc
)) {
704 m_temporaryBuffer
.append(toLowerCase(cc
));
705 HTML_ADVANCE_TO(ScriptDataDoubleEscapeStartState
);
706 } else if (isASCIILower(cc
)) {
708 m_temporaryBuffer
.append(static_cast<LChar
>(cc
));
709 HTML_ADVANCE_TO(ScriptDataDoubleEscapeStartState
);
711 HTML_RECONSUME_IN(ScriptDataEscapedState
);
715 HTML_BEGIN_STATE(ScriptDataDoubleEscapedState
) {
718 HTML_ADVANCE_TO(ScriptDataDoubleEscapedDashState
);
719 } else if (cc
== '<') {
721 HTML_ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState
);
722 } else if (cc
== kEndOfFileMarker
) {
724 HTML_RECONSUME_IN(DataState
);
727 HTML_ADVANCE_TO(ScriptDataDoubleEscapedState
);
732 HTML_BEGIN_STATE(ScriptDataDoubleEscapedDashState
) {
735 HTML_ADVANCE_TO(ScriptDataDoubleEscapedDashDashState
);
736 } else if (cc
== '<') {
738 HTML_ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState
);
739 } else if (cc
== kEndOfFileMarker
) {
741 HTML_RECONSUME_IN(DataState
);
744 HTML_ADVANCE_TO(ScriptDataDoubleEscapedState
);
749 HTML_BEGIN_STATE(ScriptDataDoubleEscapedDashDashState
) {
752 HTML_ADVANCE_TO(ScriptDataDoubleEscapedDashDashState
);
753 } else if (cc
== '<') {
755 HTML_ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState
);
756 } else if (cc
== '>') {
758 HTML_ADVANCE_TO(ScriptDataState
);
759 } else if (cc
== kEndOfFileMarker
) {
761 HTML_RECONSUME_IN(DataState
);
764 HTML_ADVANCE_TO(ScriptDataDoubleEscapedState
);
769 HTML_BEGIN_STATE(ScriptDataDoubleEscapedLessThanSignState
) {
772 m_temporaryBuffer
.clear();
773 HTML_ADVANCE_TO(ScriptDataDoubleEscapeEndState
);
775 HTML_RECONSUME_IN(ScriptDataDoubleEscapedState
);
779 HTML_BEGIN_STATE(ScriptDataDoubleEscapeEndState
) {
780 if (isTokenizerWhitespace(cc
) || cc
== '/' || cc
== '>') {
782 if (temporaryBufferIs(scriptTag
.localName()))
783 HTML_ADVANCE_TO(ScriptDataEscapedState
);
785 HTML_ADVANCE_TO(ScriptDataDoubleEscapedState
);
786 } else if (isASCIIUpper(cc
)) {
788 m_temporaryBuffer
.append(toLowerCase(cc
));
789 HTML_ADVANCE_TO(ScriptDataDoubleEscapeEndState
);
790 } else if (isASCIILower(cc
)) {
792 m_temporaryBuffer
.append(static_cast<LChar
>(cc
));
793 HTML_ADVANCE_TO(ScriptDataDoubleEscapeEndState
);
795 HTML_RECONSUME_IN(ScriptDataDoubleEscapedState
);
799 HTML_BEGIN_STATE(BeforeAttributeNameState
) {
800 if (isTokenizerWhitespace(cc
))
801 HTML_ADVANCE_TO(BeforeAttributeNameState
);
803 HTML_ADVANCE_TO(SelfClosingStartTagState
);
805 return emitAndResumeIn(source
, HTMLTokenizer::DataState
);
806 else if (isASCIIUpper(cc
)) {
807 m_token
->addNewAttribute();
808 m_token
->beginAttributeName(source
.numberOfCharactersConsumed());
809 m_token
->appendToAttributeName(toLowerCase(cc
));
810 HTML_ADVANCE_TO(AttributeNameState
);
811 } else if (cc
== kEndOfFileMarker
) {
813 HTML_RECONSUME_IN(DataState
);
815 if (cc
== '"' || cc
== '\'' || cc
== '<' || cc
== '=')
817 m_token
->addNewAttribute();
818 m_token
->beginAttributeName(source
.numberOfCharactersConsumed());
819 m_token
->appendToAttributeName(cc
);
820 HTML_ADVANCE_TO(AttributeNameState
);
825 HTML_BEGIN_STATE(AttributeNameState
) {
826 if (isTokenizerWhitespace(cc
)) {
827 m_token
->endAttributeName(source
.numberOfCharactersConsumed());
828 HTML_ADVANCE_TO(AfterAttributeNameState
);
829 } else if (cc
== '/') {
830 m_token
->endAttributeName(source
.numberOfCharactersConsumed());
831 HTML_ADVANCE_TO(SelfClosingStartTagState
);
832 } else if (cc
== '=') {
833 m_token
->endAttributeName(source
.numberOfCharactersConsumed());
834 HTML_ADVANCE_TO(BeforeAttributeValueState
);
835 } else if (cc
== '>') {
836 m_token
->endAttributeName(source
.numberOfCharactersConsumed());
837 return emitAndResumeIn(source
, HTMLTokenizer::DataState
);
838 } else if (isASCIIUpper(cc
)) {
839 m_token
->appendToAttributeName(toLowerCase(cc
));
840 HTML_ADVANCE_TO(AttributeNameState
);
841 } else if (cc
== kEndOfFileMarker
) {
843 m_token
->endAttributeName(source
.numberOfCharactersConsumed());
844 HTML_RECONSUME_IN(DataState
);
846 if (cc
== '"' || cc
== '\'' || cc
== '<' || cc
== '=')
848 m_token
->appendToAttributeName(cc
);
849 HTML_ADVANCE_TO(AttributeNameState
);
854 HTML_BEGIN_STATE(AfterAttributeNameState
) {
855 if (isTokenizerWhitespace(cc
))
856 HTML_ADVANCE_TO(AfterAttributeNameState
);
858 HTML_ADVANCE_TO(SelfClosingStartTagState
);
860 HTML_ADVANCE_TO(BeforeAttributeValueState
);
862 return emitAndResumeIn(source
, HTMLTokenizer::DataState
);
863 else if (isASCIIUpper(cc
)) {
864 m_token
->addNewAttribute();
865 m_token
->beginAttributeName(source
.numberOfCharactersConsumed());
866 m_token
->appendToAttributeName(toLowerCase(cc
));
867 HTML_ADVANCE_TO(AttributeNameState
);
868 } else if (cc
== kEndOfFileMarker
) {
870 HTML_RECONSUME_IN(DataState
);
872 if (cc
== '"' || cc
== '\'' || cc
== '<')
874 m_token
->addNewAttribute();
875 m_token
->beginAttributeName(source
.numberOfCharactersConsumed());
876 m_token
->appendToAttributeName(cc
);
877 HTML_ADVANCE_TO(AttributeNameState
);
882 HTML_BEGIN_STATE(BeforeAttributeValueState
) {
883 if (isTokenizerWhitespace(cc
))
884 HTML_ADVANCE_TO(BeforeAttributeValueState
);
885 else if (cc
== '"') {
886 m_token
->beginAttributeValue(source
.numberOfCharactersConsumed() + 1);
887 HTML_ADVANCE_TO(AttributeValueDoubleQuotedState
);
888 } else if (cc
== '&') {
889 m_token
->beginAttributeValue(source
.numberOfCharactersConsumed());
890 HTML_RECONSUME_IN(AttributeValueUnquotedState
);
891 } else if (cc
== '\'') {
892 m_token
->beginAttributeValue(source
.numberOfCharactersConsumed() + 1);
893 HTML_ADVANCE_TO(AttributeValueSingleQuotedState
);
894 } else if (cc
== '>') {
896 return emitAndResumeIn(source
, HTMLTokenizer::DataState
);
897 } else if (cc
== kEndOfFileMarker
) {
899 HTML_RECONSUME_IN(DataState
);
901 if (cc
== '<' || cc
== '=' || cc
== '`')
903 m_token
->beginAttributeValue(source
.numberOfCharactersConsumed());
904 m_token
->appendToAttributeValue(cc
);
905 HTML_ADVANCE_TO(AttributeValueUnquotedState
);
910 HTML_BEGIN_STATE(AttributeValueDoubleQuotedState
) {
912 m_token
->endAttributeValue(source
.numberOfCharactersConsumed());
913 HTML_ADVANCE_TO(AfterAttributeValueQuotedState
);
914 } else if (cc
== '&') {
915 m_additionalAllowedCharacter
= '"';
916 HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState
);
917 } else if (cc
== kEndOfFileMarker
) {
919 m_token
->endAttributeValue(source
.numberOfCharactersConsumed());
920 HTML_RECONSUME_IN(DataState
);
922 m_token
->appendToAttributeValue(cc
);
923 HTML_ADVANCE_TO(AttributeValueDoubleQuotedState
);
928 HTML_BEGIN_STATE(AttributeValueSingleQuotedState
) {
930 m_token
->endAttributeValue(source
.numberOfCharactersConsumed());
931 HTML_ADVANCE_TO(AfterAttributeValueQuotedState
);
932 } else if (cc
== '&') {
933 m_additionalAllowedCharacter
= '\'';
934 HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState
);
935 } else if (cc
== kEndOfFileMarker
) {
937 m_token
->endAttributeValue(source
.numberOfCharactersConsumed());
938 HTML_RECONSUME_IN(DataState
);
940 m_token
->appendToAttributeValue(cc
);
941 HTML_ADVANCE_TO(AttributeValueSingleQuotedState
);
946 HTML_BEGIN_STATE(AttributeValueUnquotedState
) {
947 if (isTokenizerWhitespace(cc
)) {
948 m_token
->endAttributeValue(source
.numberOfCharactersConsumed());
949 HTML_ADVANCE_TO(BeforeAttributeNameState
);
950 } else if (cc
== '&') {
951 m_additionalAllowedCharacter
= '>';
952 HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState
);
953 } else if (cc
== '>') {
954 m_token
->endAttributeValue(source
.numberOfCharactersConsumed());
955 return emitAndResumeIn(source
, HTMLTokenizer::DataState
);
956 } else if (cc
== kEndOfFileMarker
) {
958 m_token
->endAttributeValue(source
.numberOfCharactersConsumed());
959 HTML_RECONSUME_IN(DataState
);
961 if (cc
== '"' || cc
== '\'' || cc
== '<' || cc
== '=' || cc
== '`')
963 m_token
->appendToAttributeValue(cc
);
964 HTML_ADVANCE_TO(AttributeValueUnquotedState
);
969 HTML_BEGIN_STATE(CharacterReferenceInAttributeValueState
) {
970 bool notEnoughCharacters
= false;
971 DecodedHTMLEntity decodedEntity
;
972 bool success
= consumeHTMLEntity(source
, decodedEntity
, notEnoughCharacters
, m_additionalAllowedCharacter
);
973 if (notEnoughCharacters
)
974 return haveBufferedCharacterToken();
976 ASSERT(decodedEntity
.isEmpty());
977 m_token
->appendToAttributeValue('&');
979 for (unsigned i
= 0; i
< decodedEntity
.length
; ++i
)
980 m_token
->appendToAttributeValue(decodedEntity
.data
[i
]);
982 // We're supposed to switch back to the attribute value state that
983 // we were in when we were switched into this state. Rather than
984 // keeping track of this explicitly, we observe that the previous
985 // state can be determined by m_additionalAllowedCharacter.
986 if (m_additionalAllowedCharacter
== '"')
987 HTML_SWITCH_TO(AttributeValueDoubleQuotedState
);
988 else if (m_additionalAllowedCharacter
== '\'')
989 HTML_SWITCH_TO(AttributeValueSingleQuotedState
);
990 else if (m_additionalAllowedCharacter
== '>')
991 HTML_SWITCH_TO(AttributeValueUnquotedState
);
993 ASSERT_NOT_REACHED();
997 HTML_BEGIN_STATE(AfterAttributeValueQuotedState
) {
998 if (isTokenizerWhitespace(cc
))
999 HTML_ADVANCE_TO(BeforeAttributeNameState
);
1001 HTML_ADVANCE_TO(SelfClosingStartTagState
);
1003 return emitAndResumeIn(source
, HTMLTokenizer::DataState
);
1004 else if (cc
== kEndOfFileMarker
) {
1006 HTML_RECONSUME_IN(DataState
);
1009 HTML_RECONSUME_IN(BeforeAttributeNameState
);
1014 HTML_BEGIN_STATE(SelfClosingStartTagState
) {
1016 m_token
->setSelfClosing();
1017 return emitAndResumeIn(source
, HTMLTokenizer::DataState
);
1018 } else if (cc
== kEndOfFileMarker
) {
1020 HTML_RECONSUME_IN(DataState
);
1023 HTML_RECONSUME_IN(BeforeAttributeNameState
);
1028 HTML_BEGIN_STATE(BogusCommentState
) {
1029 m_token
->beginComment();
1030 HTML_RECONSUME_IN(ContinueBogusCommentState
);
1034 HTML_BEGIN_STATE(ContinueBogusCommentState
) {
1036 return emitAndResumeIn(source
, HTMLTokenizer::DataState
);
1037 else if (cc
== kEndOfFileMarker
)
1038 return emitAndReconsumeIn(source
, HTMLTokenizer::DataState
);
1040 m_token
->appendToComment(cc
);
1041 HTML_ADVANCE_TO(ContinueBogusCommentState
);
1046 HTML_BEGIN_STATE(MarkupDeclarationOpenState
) {
1048 SegmentedString::LookAheadResult result
= source
.lookAhead(HTMLTokenizerNames::dashDash
);
1049 if (result
== SegmentedString::DidMatch
) {
1050 source
.advanceAndASSERT('-');
1051 source
.advanceAndASSERT('-');
1052 m_token
->beginComment();
1053 HTML_SWITCH_TO(CommentStartState
);
1054 } else if (result
== SegmentedString::NotEnoughCharacters
)
1055 return haveBufferedCharacterToken();
1056 } else if (cc
== 'D' || cc
== 'd') {
1057 SegmentedString::LookAheadResult result
= source
.lookAheadIgnoringCase(HTMLTokenizerNames::doctype
);
1058 if (result
== SegmentedString::DidMatch
) {
1059 advanceStringAndASSERTIgnoringCase(source
, "doctype");
1060 HTML_SWITCH_TO(DOCTYPEState
);
1061 } else if (result
== SegmentedString::NotEnoughCharacters
)
1062 return haveBufferedCharacterToken();
1063 } else if (cc
== '[' && shouldAllowCDATA()) {
1064 SegmentedString::LookAheadResult result
= source
.lookAhead(HTMLTokenizerNames::cdata
);
1065 if (result
== SegmentedString::DidMatch
) {
1066 advanceStringAndASSERT(source
, "[CDATA[");
1067 HTML_SWITCH_TO(CDATASectionState
);
1068 } else if (result
== SegmentedString::NotEnoughCharacters
)
1069 return haveBufferedCharacterToken();
1072 HTML_RECONSUME_IN(BogusCommentState
);
1076 HTML_BEGIN_STATE(CommentStartState
) {
1078 HTML_ADVANCE_TO(CommentStartDashState
);
1079 else if (cc
== '>') {
1081 return emitAndResumeIn(source
, HTMLTokenizer::DataState
);
1082 } else if (cc
== kEndOfFileMarker
) {
1084 return emitAndReconsumeIn(source
, HTMLTokenizer::DataState
);
1086 m_token
->appendToComment(cc
);
1087 HTML_ADVANCE_TO(CommentState
);
1092 HTML_BEGIN_STATE(CommentStartDashState
) {
1094 HTML_ADVANCE_TO(CommentEndState
);
1095 else if (cc
== '>') {
1097 return emitAndResumeIn(source
, HTMLTokenizer::DataState
);
1098 } else if (cc
== kEndOfFileMarker
) {
1100 return emitAndReconsumeIn(source
, HTMLTokenizer::DataState
);
1102 m_token
->appendToComment('-');
1103 m_token
->appendToComment(cc
);
1104 HTML_ADVANCE_TO(CommentState
);
1109 HTML_BEGIN_STATE(CommentState
) {
1111 HTML_ADVANCE_TO(CommentEndDashState
);
1112 else if (cc
== kEndOfFileMarker
) {
1114 return emitAndReconsumeIn(source
, HTMLTokenizer::DataState
);
1116 m_token
->appendToComment(cc
);
1117 HTML_ADVANCE_TO(CommentState
);
1122 HTML_BEGIN_STATE(CommentEndDashState
) {
1124 HTML_ADVANCE_TO(CommentEndState
);
1125 else if (cc
== kEndOfFileMarker
) {
1127 return emitAndReconsumeIn(source
, HTMLTokenizer::DataState
);
1129 m_token
->appendToComment('-');
1130 m_token
->appendToComment(cc
);
1131 HTML_ADVANCE_TO(CommentState
);
1136 HTML_BEGIN_STATE(CommentEndState
) {
1138 return emitAndResumeIn(source
, HTMLTokenizer::DataState
);
1139 else if (cc
== '!') {
1141 HTML_ADVANCE_TO(CommentEndBangState
);
1142 } else if (cc
== '-') {
1144 m_token
->appendToComment('-');
1145 HTML_ADVANCE_TO(CommentEndState
);
1146 } else if (cc
== kEndOfFileMarker
) {
1148 return emitAndReconsumeIn(source
, HTMLTokenizer::DataState
);
1151 m_token
->appendToComment('-');
1152 m_token
->appendToComment('-');
1153 m_token
->appendToComment(cc
);
1154 HTML_ADVANCE_TO(CommentState
);
1159 HTML_BEGIN_STATE(CommentEndBangState
) {
1161 m_token
->appendToComment('-');
1162 m_token
->appendToComment('-');
1163 m_token
->appendToComment('!');
1164 HTML_ADVANCE_TO(CommentEndDashState
);
1165 } else if (cc
== '>')
1166 return emitAndResumeIn(source
, HTMLTokenizer::DataState
);
1167 else if (cc
== kEndOfFileMarker
) {
1169 return emitAndReconsumeIn(source
, HTMLTokenizer::DataState
);
1171 m_token
->appendToComment('-');
1172 m_token
->appendToComment('-');
1173 m_token
->appendToComment('!');
1174 m_token
->appendToComment(cc
);
1175 HTML_ADVANCE_TO(CommentState
);
1180 HTML_BEGIN_STATE(DOCTYPEState
) {
1181 if (isTokenizerWhitespace(cc
))
1182 HTML_ADVANCE_TO(BeforeDOCTYPENameState
);
1183 else if (cc
== kEndOfFileMarker
) {
1185 m_token
->beginDOCTYPE();
1186 m_token
->setForceQuirks();
1187 return emitAndReconsumeIn(source
, HTMLTokenizer::DataState
);
1190 HTML_RECONSUME_IN(BeforeDOCTYPENameState
);
1195 HTML_BEGIN_STATE(BeforeDOCTYPENameState
) {
1196 if (isTokenizerWhitespace(cc
))
1197 HTML_ADVANCE_TO(BeforeDOCTYPENameState
);
1198 else if (isASCIIUpper(cc
)) {
1199 m_token
->beginDOCTYPE(toLowerCase(cc
));
1200 HTML_ADVANCE_TO(DOCTYPENameState
);
1201 } else if (cc
== '>') {
1203 m_token
->beginDOCTYPE();
1204 m_token
->setForceQuirks();
1205 return emitAndResumeIn(source
, HTMLTokenizer::DataState
);
1206 } else if (cc
== kEndOfFileMarker
) {
1208 m_token
->beginDOCTYPE();
1209 m_token
->setForceQuirks();
1210 return emitAndReconsumeIn(source
, HTMLTokenizer::DataState
);
1212 m_token
->beginDOCTYPE(cc
);
1213 HTML_ADVANCE_TO(DOCTYPENameState
);
1218 HTML_BEGIN_STATE(DOCTYPENameState
) {
1219 if (isTokenizerWhitespace(cc
))
1220 HTML_ADVANCE_TO(AfterDOCTYPENameState
);
1222 return emitAndResumeIn(source
, HTMLTokenizer::DataState
);
1223 else if (isASCIIUpper(cc
)) {
1224 m_token
->appendToName(toLowerCase(cc
));
1225 HTML_ADVANCE_TO(DOCTYPENameState
);
1226 } else if (cc
== kEndOfFileMarker
) {
1228 m_token
->setForceQuirks();
1229 return emitAndReconsumeIn(source
, HTMLTokenizer::DataState
);
1231 m_token
->appendToName(cc
);
1232 HTML_ADVANCE_TO(DOCTYPENameState
);
1237 HTML_BEGIN_STATE(AfterDOCTYPENameState
) {
1238 if (isTokenizerWhitespace(cc
))
1239 HTML_ADVANCE_TO(AfterDOCTYPENameState
);
1241 return emitAndResumeIn(source
, HTMLTokenizer::DataState
);
1242 else if (cc
== kEndOfFileMarker
) {
1244 m_token
->setForceQuirks();
1245 return emitAndReconsumeIn(source
, HTMLTokenizer::DataState
);
1247 if (cc
== 'P' || cc
== 'p') {
1248 SegmentedString::LookAheadResult result
= source
.lookAheadIgnoringCase(HTMLTokenizerNames::publicString
);
1249 if (result
== SegmentedString::DidMatch
) {
1250 advanceStringAndASSERTIgnoringCase(source
, "public");
1251 HTML_SWITCH_TO(AfterDOCTYPEPublicKeywordState
);
1252 } else if (result
== SegmentedString::NotEnoughCharacters
)
1253 return haveBufferedCharacterToken();
1254 } else if (cc
== 'S' || cc
== 's') {
1255 SegmentedString::LookAheadResult result
= source
.lookAheadIgnoringCase(HTMLTokenizerNames::system
);
1256 if (result
== SegmentedString::DidMatch
) {
1257 advanceStringAndASSERTIgnoringCase(source
, "system");
1258 HTML_SWITCH_TO(AfterDOCTYPESystemKeywordState
);
1259 } else if (result
== SegmentedString::NotEnoughCharacters
)
1260 return haveBufferedCharacterToken();
1263 m_token
->setForceQuirks();
1264 HTML_ADVANCE_TO(BogusDOCTYPEState
);
1269 HTML_BEGIN_STATE(AfterDOCTYPEPublicKeywordState
) {
1270 if (isTokenizerWhitespace(cc
))
1271 HTML_ADVANCE_TO(BeforeDOCTYPEPublicIdentifierState
);
1272 else if (cc
== '"') {
1274 m_token
->setPublicIdentifierToEmptyString();
1275 HTML_ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState
);
1276 } else if (cc
== '\'') {
1278 m_token
->setPublicIdentifierToEmptyString();
1279 HTML_ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState
);
1280 } else if (cc
== '>') {
1282 m_token
->setForceQuirks();
1283 return emitAndResumeIn(source
, HTMLTokenizer::DataState
);
1284 } else if (cc
== kEndOfFileMarker
) {
1286 m_token
->setForceQuirks();
1287 return emitAndReconsumeIn(source
, HTMLTokenizer::DataState
);
1290 m_token
->setForceQuirks();
1291 HTML_ADVANCE_TO(BogusDOCTYPEState
);
1296 HTML_BEGIN_STATE(BeforeDOCTYPEPublicIdentifierState
) {
1297 if (isTokenizerWhitespace(cc
))
1298 HTML_ADVANCE_TO(BeforeDOCTYPEPublicIdentifierState
);
1299 else if (cc
== '"') {
1300 m_token
->setPublicIdentifierToEmptyString();
1301 HTML_ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState
);
1302 } else if (cc
== '\'') {
1303 m_token
->setPublicIdentifierToEmptyString();
1304 HTML_ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState
);
1305 } else if (cc
== '>') {
1307 m_token
->setForceQuirks();
1308 return emitAndResumeIn(source
, HTMLTokenizer::DataState
);
1309 } else if (cc
== kEndOfFileMarker
) {
1311 m_token
->setForceQuirks();
1312 return emitAndReconsumeIn(source
, HTMLTokenizer::DataState
);
1315 m_token
->setForceQuirks();
1316 HTML_ADVANCE_TO(BogusDOCTYPEState
);
1321 HTML_BEGIN_STATE(DOCTYPEPublicIdentifierDoubleQuotedState
) {
1323 HTML_ADVANCE_TO(AfterDOCTYPEPublicIdentifierState
);
1324 else if (cc
== '>') {
1326 m_token
->setForceQuirks();
1327 return emitAndResumeIn(source
, HTMLTokenizer::DataState
);
1328 } else if (cc
== kEndOfFileMarker
) {
1330 m_token
->setForceQuirks();
1331 return emitAndReconsumeIn(source
, HTMLTokenizer::DataState
);
1333 m_token
->appendToPublicIdentifier(cc
);
1334 HTML_ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState
);
1339 HTML_BEGIN_STATE(DOCTYPEPublicIdentifierSingleQuotedState
) {
1341 HTML_ADVANCE_TO(AfterDOCTYPEPublicIdentifierState
);
1342 else if (cc
== '>') {
1344 m_token
->setForceQuirks();
1345 return emitAndResumeIn(source
, HTMLTokenizer::DataState
);
1346 } else if (cc
== kEndOfFileMarker
) {
1348 m_token
->setForceQuirks();
1349 return emitAndReconsumeIn(source
, HTMLTokenizer::DataState
);
1351 m_token
->appendToPublicIdentifier(cc
);
1352 HTML_ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState
);
1357 HTML_BEGIN_STATE(AfterDOCTYPEPublicIdentifierState
) {
1358 if (isTokenizerWhitespace(cc
))
1359 HTML_ADVANCE_TO(BetweenDOCTYPEPublicAndSystemIdentifiersState
);
1361 return emitAndResumeIn(source
, HTMLTokenizer::DataState
);
1362 else if (cc
== '"') {
1364 m_token
->setSystemIdentifierToEmptyString();
1365 HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState
);
1366 } else if (cc
== '\'') {
1368 m_token
->setSystemIdentifierToEmptyString();
1369 HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState
);
1370 } else if (cc
== kEndOfFileMarker
) {
1372 m_token
->setForceQuirks();
1373 return emitAndReconsumeIn(source
, HTMLTokenizer::DataState
);
1376 m_token
->setForceQuirks();
1377 HTML_ADVANCE_TO(BogusDOCTYPEState
);
1382 HTML_BEGIN_STATE(BetweenDOCTYPEPublicAndSystemIdentifiersState
) {
1383 if (isTokenizerWhitespace(cc
))
1384 HTML_ADVANCE_TO(BetweenDOCTYPEPublicAndSystemIdentifiersState
);
1386 return emitAndResumeIn(source
, HTMLTokenizer::DataState
);
1387 else if (cc
== '"') {
1388 m_token
->setSystemIdentifierToEmptyString();
1389 HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState
);
1390 } else if (cc
== '\'') {
1391 m_token
->setSystemIdentifierToEmptyString();
1392 HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState
);
1393 } else if (cc
== kEndOfFileMarker
) {
1395 m_token
->setForceQuirks();
1396 return emitAndReconsumeIn(source
, HTMLTokenizer::DataState
);
1399 m_token
->setForceQuirks();
1400 HTML_ADVANCE_TO(BogusDOCTYPEState
);
1405 HTML_BEGIN_STATE(AfterDOCTYPESystemKeywordState
) {
1406 if (isTokenizerWhitespace(cc
))
1407 HTML_ADVANCE_TO(BeforeDOCTYPESystemIdentifierState
);
1408 else if (cc
== '"') {
1410 m_token
->setSystemIdentifierToEmptyString();
1411 HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState
);
1412 } else if (cc
== '\'') {
1414 m_token
->setSystemIdentifierToEmptyString();
1415 HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState
);
1416 } else if (cc
== '>') {
1418 m_token
->setForceQuirks();
1419 return emitAndResumeIn(source
, HTMLTokenizer::DataState
);
1420 } else if (cc
== kEndOfFileMarker
) {
1422 m_token
->setForceQuirks();
1423 return emitAndReconsumeIn(source
, HTMLTokenizer::DataState
);
1426 m_token
->setForceQuirks();
1427 HTML_ADVANCE_TO(BogusDOCTYPEState
);
1432 HTML_BEGIN_STATE(BeforeDOCTYPESystemIdentifierState
) {
1433 if (isTokenizerWhitespace(cc
))
1434 HTML_ADVANCE_TO(BeforeDOCTYPESystemIdentifierState
);
1436 m_token
->setSystemIdentifierToEmptyString();
1437 HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState
);
1438 } else if (cc
== '\'') {
1439 m_token
->setSystemIdentifierToEmptyString();
1440 HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState
);
1441 } else if (cc
== '>') {
1443 m_token
->setForceQuirks();
1444 return emitAndResumeIn(source
, HTMLTokenizer::DataState
);
1445 } else if (cc
== kEndOfFileMarker
) {
1447 m_token
->setForceQuirks();
1448 return emitAndReconsumeIn(source
, HTMLTokenizer::DataState
);
1451 m_token
->setForceQuirks();
1452 HTML_ADVANCE_TO(BogusDOCTYPEState
);
1457 HTML_BEGIN_STATE(DOCTYPESystemIdentifierDoubleQuotedState
) {
1459 HTML_ADVANCE_TO(AfterDOCTYPESystemIdentifierState
);
1460 else if (cc
== '>') {
1462 m_token
->setForceQuirks();
1463 return emitAndResumeIn(source
, HTMLTokenizer::DataState
);
1464 } else if (cc
== kEndOfFileMarker
) {
1466 m_token
->setForceQuirks();
1467 return emitAndReconsumeIn(source
, HTMLTokenizer::DataState
);
1469 m_token
->appendToSystemIdentifier(cc
);
1470 HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState
);
1475 HTML_BEGIN_STATE(DOCTYPESystemIdentifierSingleQuotedState
) {
1477 HTML_ADVANCE_TO(AfterDOCTYPESystemIdentifierState
);
1478 else if (cc
== '>') {
1480 m_token
->setForceQuirks();
1481 return emitAndResumeIn(source
, HTMLTokenizer::DataState
);
1482 } else if (cc
== kEndOfFileMarker
) {
1484 m_token
->setForceQuirks();
1485 return emitAndReconsumeIn(source
, HTMLTokenizer::DataState
);
1487 m_token
->appendToSystemIdentifier(cc
);
1488 HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState
);
1493 HTML_BEGIN_STATE(AfterDOCTYPESystemIdentifierState
) {
1494 if (isTokenizerWhitespace(cc
))
1495 HTML_ADVANCE_TO(AfterDOCTYPESystemIdentifierState
);
1497 return emitAndResumeIn(source
, HTMLTokenizer::DataState
);
1498 else if (cc
== kEndOfFileMarker
) {
1500 m_token
->setForceQuirks();
1501 return emitAndReconsumeIn(source
, HTMLTokenizer::DataState
);
1504 HTML_ADVANCE_TO(BogusDOCTYPEState
);
1509 HTML_BEGIN_STATE(BogusDOCTYPEState
) {
1511 return emitAndResumeIn(source
, HTMLTokenizer::DataState
);
1512 else if (cc
== kEndOfFileMarker
)
1513 return emitAndReconsumeIn(source
, HTMLTokenizer::DataState
);
1514 HTML_ADVANCE_TO(BogusDOCTYPEState
);
1518 HTML_BEGIN_STATE(CDATASectionState
) {
1520 HTML_ADVANCE_TO(CDATASectionRightSquareBracketState
);
1521 else if (cc
== kEndOfFileMarker
)
1522 HTML_RECONSUME_IN(DataState
);
1524 bufferCharacter(cc
);
1525 HTML_ADVANCE_TO(CDATASectionState
);
1530 HTML_BEGIN_STATE(CDATASectionRightSquareBracketState
) {
1532 HTML_ADVANCE_TO(CDATASectionDoubleRightSquareBracketState
);
1534 bufferCharacter(']');
1535 HTML_RECONSUME_IN(CDATASectionState
);
1539 HTML_BEGIN_STATE(CDATASectionDoubleRightSquareBracketState
) {
1541 HTML_ADVANCE_TO(DataState
);
1543 bufferCharacter(']');
1544 bufferCharacter(']');
1545 HTML_RECONSUME_IN(CDATASectionState
);
1552 ASSERT_NOT_REACHED();
1556 String
HTMLTokenizer::bufferedCharacters() const
1558 // FIXME: Add an assert about m_state.
1559 StringBuilder characters
;
1560 characters
.reserveCapacity(numberOfBufferedCharacters());
1561 characters
.append('<');
1562 characters
.append('/');
1563 characters
.append(m_temporaryBuffer
.data(), m_temporaryBuffer
.size());
1564 return characters
.toString();
1567 void HTMLTokenizer::updateStateFor(const String
& tagName
)
1569 if (threadSafeMatch(tagName
, textareaTag
) || threadSafeMatch(tagName
, titleTag
))
1570 setState(HTMLTokenizer::RCDATAState
);
1571 else if (threadSafeMatch(tagName
, plaintextTag
))
1572 setState(HTMLTokenizer::PLAINTEXTState
);
1573 else if (threadSafeMatch(tagName
, scriptTag
))
1574 setState(HTMLTokenizer::ScriptDataState
);
1575 else if (threadSafeMatch(tagName
, styleTag
)
1576 || threadSafeMatch(tagName
, iframeTag
)
1577 || threadSafeMatch(tagName
, xmpTag
)
1578 || (threadSafeMatch(tagName
, noembedTag
) && m_options
.pluginsEnabled
)
1579 || threadSafeMatch(tagName
, noframesTag
)
1580 || (threadSafeMatch(tagName
, noscriptTag
) && m_options
.scriptEnabled
))
1581 setState(HTMLTokenizer::RAWTEXTState
);
1584 inline bool HTMLTokenizer::temporaryBufferIs(const String
& expectedString
)
1586 return vectorEqualsString(m_temporaryBuffer
, expectedString
);
1589 inline void HTMLTokenizer::addToPossibleEndTag(LChar cc
)
1591 ASSERT(isEndTagBufferingState(m_state
));
1592 m_bufferedEndTagName
.append(cc
);
1595 inline bool HTMLTokenizer::isAppropriateEndTag()
1597 if (m_bufferedEndTagName
.size() != m_appropriateEndTagName
.size())
1600 size_t numCharacters
= m_bufferedEndTagName
.size();
1602 for (size_t i
= 0; i
< numCharacters
; i
++) {
1603 if (m_bufferedEndTagName
[i
] != m_appropriateEndTagName
[i
])
1610 inline void HTMLTokenizer::parseError()