Move parseFontFaceDescriptor to CSSPropertyParser.cpp
[chromium-blink-merge.git] / third_party / WebKit / Source / core / html / parser / HTMLTokenizer.cpp
blob47b00b223c7a843dbc1adb1e6f832b9e59b8e9fa
1 /*
2 * Copyright (C) 2008 Apple Inc. All Rights Reserved.
3 * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/
4 * Copyright (C) 2010 Google, Inc. All Rights Reserved.
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
15 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
16 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
19 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 #include "config.h"
29 #include "core/html/parser/HTMLTokenizer.h"
31 #include "core/HTMLNames.h"
32 #include "core/HTMLTokenizerNames.h"
33 #include "core/html/parser/HTMLEntityParser.h"
34 #include "core/html/parser/HTMLParserIdioms.h"
35 #include "core/html/parser/HTMLTreeBuilder.h"
36 #include "core/xml/parser/MarkupTokenizerInlines.h"
37 #include "platform/NotImplemented.h"
38 #include "wtf/ASCIICType.h"
39 #include "wtf/text/AtomicString.h"
40 #include "wtf/text/Unicode.h"
42 // Please don't use DEFINE_STATIC_LOCAL in this file. The HTMLTokenizer is used
43 // from multiple threads and DEFINE_STATIC_LOCAL isn't threadsafe.
44 #undef DEFINE_STATIC_LOCAL
46 namespace blink {
48 using namespace HTMLNames;
50 // This has to go in a .cpp file, as the linker doesn't like it being included more than once.
51 // We don't have an HTMLToken.cpp though, so this is the next best place.
52 QualifiedName AtomicHTMLToken::nameForAttribute(const HTMLToken::Attribute& attribute) const
54 return QualifiedName(nullAtom, AtomicString(attribute.name), nullAtom);
57 bool AtomicHTMLToken::usesName() const
59 return m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag || m_type == HTMLToken::DOCTYPE;
62 bool AtomicHTMLToken::usesAttributes() const
64 return m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag;
67 static inline UChar toLowerCase(UChar cc)
69 ASSERT(isASCIIUpper(cc));
70 const int lowerCaseOffset = 0x20;
71 return cc + lowerCaseOffset;
74 static inline bool vectorEqualsString(const Vector<LChar, 32>& vector, const String& string)
76 if (vector.size() != string.length())
77 return false;
79 if (!string.length())
80 return true;
82 return equal(string.impl(), vector.data(), vector.size());
85 static inline bool isEndTagBufferingState(HTMLTokenizer::State state)
87 switch (state) {
88 case HTMLTokenizer::RCDATAEndTagOpenState:
89 case HTMLTokenizer::RCDATAEndTagNameState:
90 case HTMLTokenizer::RAWTEXTEndTagOpenState:
91 case HTMLTokenizer::RAWTEXTEndTagNameState:
92 case HTMLTokenizer::ScriptDataEndTagOpenState:
93 case HTMLTokenizer::ScriptDataEndTagNameState:
94 case HTMLTokenizer::ScriptDataEscapedEndTagOpenState:
95 case HTMLTokenizer::ScriptDataEscapedEndTagNameState:
96 return true;
97 default:
98 return false;
// Shorthands that bind the generic state-machine macros to HTMLTokenizer so
// the state bodies in nextToken() only have to spell the state name.
// NOTE(review): BEGIN_STATE / RECONSUME_IN / ADVANCE_TO / SWITCH_TO are not
// defined in this file; presumably they come from MarkupTokenizerInlines.h.
#define HTML_BEGIN_STATE(stateName) BEGIN_STATE(HTMLTokenizer, stateName)
#define HTML_RECONSUME_IN(stateName) RECONSUME_IN(HTMLTokenizer, stateName)
#define HTML_ADVANCE_TO(stateName) ADVANCE_TO(HTMLTokenizer, stateName)
#define HTML_SWITCH_TO(stateName) SWITCH_TO(HTMLTokenizer, stateName)
107 HTMLTokenizer::HTMLTokenizer(const HTMLParserOptions& options)
108 : m_inputStreamPreprocessor(this)
109 , m_options(options)
111 reset();
114 HTMLTokenizer::~HTMLTokenizer()
118 void HTMLTokenizer::reset()
120 m_state = HTMLTokenizer::DataState;
121 m_token = 0;
122 m_forceNullCharacterReplacement = false;
123 m_shouldAllowCDATA = false;
124 m_additionalAllowedCharacter = '\0';
127 inline bool HTMLTokenizer::processEntity(SegmentedString& source)
129 bool notEnoughCharacters = false;
130 DecodedHTMLEntity decodedEntity;
131 bool success = consumeHTMLEntity(source, decodedEntity, notEnoughCharacters);
132 if (notEnoughCharacters)
133 return false;
134 if (!success) {
135 ASSERT(decodedEntity.isEmpty());
136 bufferCharacter('&');
137 } else {
138 for (unsigned i = 0; i < decodedEntity.length; ++i)
139 bufferCharacter(decodedEntity.data[i]);
141 return true;
144 bool HTMLTokenizer::flushBufferedEndTag(SegmentedString& source)
146 ASSERT(m_token->type() == HTMLToken::Character || m_token->type() == HTMLToken::Uninitialized);
147 source.advanceAndUpdateLineNumber();
148 if (m_token->type() == HTMLToken::Character)
149 return true;
150 m_token->beginEndTag(m_bufferedEndTagName);
151 m_bufferedEndTagName.clear();
152 m_appropriateEndTagName.clear();
153 m_temporaryBuffer.clear();
154 return false;
// Sets m_state to |stateName|, flushes the buffered end tag, and jumps to the
// |stateName| label with the next input character loaded into |cc|.
//
// It must be expanded where |source| and |cc| are in scope and a label named
// |stateName| exists. It returns from the enclosing function in two cases:
// with true when flushBufferedEndTag() returns true, and with
// haveBufferedCharacterToken() when |source| is empty or the input stream
// preprocessor's peek() fails.
#define FLUSH_AND_ADVANCE_TO(stateName)                                    \
    do {                                                                   \
        m_state = HTMLTokenizer::stateName;                                \
        if (flushBufferedEndTag(source))                                   \
            return true;                                                   \
        if (source.isEmpty()                                               \
            || !m_inputStreamPreprocessor.peek(source))                    \
            return haveBufferedCharacterToken();                           \
        cc = m_inputStreamPreprocessor.nextInputCharacter();               \
        goto stateName;                                                    \
    } while (false)
169 bool HTMLTokenizer::flushEmitAndResumeIn(SegmentedString& source, HTMLTokenizer::State state)
171 m_state = state;
172 flushBufferedEndTag(source);
173 return true;
176 bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
178 // If we have a token in progress, then we're supposed to be called back
179 // with the same token so we can finish it.
180 ASSERT(!m_token || m_token == &token || token.type() == HTMLToken::Uninitialized);
181 m_token = &token;
183 if (!m_bufferedEndTagName.isEmpty() && !isEndTagBufferingState(m_state)) {
184 // FIXME: This should call flushBufferedEndTag().
185 // We started an end tag during our last iteration.
186 m_token->beginEndTag(m_bufferedEndTagName);
187 m_bufferedEndTagName.clear();
188 m_appropriateEndTagName.clear();
189 m_temporaryBuffer.clear();
190 if (m_state == HTMLTokenizer::DataState) {
191 // We're back in the data state, so we must be done with the tag.
192 return true;
196 if (source.isEmpty() || !m_inputStreamPreprocessor.peek(source))
197 return haveBufferedCharacterToken();
198 UChar cc = m_inputStreamPreprocessor.nextInputCharacter();
200 // Source: http://www.whatwg.org/specs/web-apps/current-work/#tokenisation0
201 switch (m_state) {
202 HTML_BEGIN_STATE(DataState) {
203 if (cc == '&')
204 HTML_ADVANCE_TO(CharacterReferenceInDataState);
205 else if (cc == '<') {
206 if (m_token->type() == HTMLToken::Character) {
207 // We have a bunch of character tokens queued up that we
208 // are emitting lazily here.
209 return true;
211 HTML_ADVANCE_TO(TagOpenState);
212 } else if (cc == kEndOfFileMarker)
213 return emitEndOfFile(source);
214 else {
215 bufferCharacter(cc);
216 HTML_ADVANCE_TO(DataState);
219 END_STATE()
221 HTML_BEGIN_STATE(CharacterReferenceInDataState) {
222 if (!processEntity(source))
223 return haveBufferedCharacterToken();
224 HTML_SWITCH_TO(DataState);
226 END_STATE()
228 HTML_BEGIN_STATE(RCDATAState) {
229 if (cc == '&')
230 HTML_ADVANCE_TO(CharacterReferenceInRCDATAState);
231 else if (cc == '<')
232 HTML_ADVANCE_TO(RCDATALessThanSignState);
233 else if (cc == kEndOfFileMarker)
234 return emitEndOfFile(source);
235 else {
236 bufferCharacter(cc);
237 HTML_ADVANCE_TO(RCDATAState);
240 END_STATE()
242 HTML_BEGIN_STATE(CharacterReferenceInRCDATAState) {
243 if (!processEntity(source))
244 return haveBufferedCharacterToken();
245 HTML_SWITCH_TO(RCDATAState);
247 END_STATE()
249 HTML_BEGIN_STATE(RAWTEXTState) {
250 if (cc == '<')
251 HTML_ADVANCE_TO(RAWTEXTLessThanSignState);
252 else if (cc == kEndOfFileMarker)
253 return emitEndOfFile(source);
254 else {
255 bufferCharacter(cc);
256 HTML_ADVANCE_TO(RAWTEXTState);
259 END_STATE()
261 HTML_BEGIN_STATE(ScriptDataState) {
262 if (cc == '<')
263 HTML_ADVANCE_TO(ScriptDataLessThanSignState);
264 else if (cc == kEndOfFileMarker)
265 return emitEndOfFile(source);
266 else {
267 bufferCharacter(cc);
268 HTML_ADVANCE_TO(ScriptDataState);
271 END_STATE()
273 HTML_BEGIN_STATE(PLAINTEXTState) {
274 if (cc == kEndOfFileMarker)
275 return emitEndOfFile(source);
276 bufferCharacter(cc);
277 HTML_ADVANCE_TO(PLAINTEXTState);
279 END_STATE()
281 HTML_BEGIN_STATE(TagOpenState) {
282 if (cc == '!')
283 HTML_ADVANCE_TO(MarkupDeclarationOpenState);
284 else if (cc == '/')
285 HTML_ADVANCE_TO(EndTagOpenState);
286 else if (isASCIIUpper(cc)) {
287 m_token->beginStartTag(toLowerCase(cc));
288 HTML_ADVANCE_TO(TagNameState);
289 } else if (isASCIILower(cc)) {
290 m_token->beginStartTag(cc);
291 HTML_ADVANCE_TO(TagNameState);
292 } else if (cc == '?') {
293 parseError();
294 // The spec consumes the current character before switching
295 // to the bogus comment state, but it's easier to implement
296 // if we reconsume the current character.
297 HTML_RECONSUME_IN(BogusCommentState);
298 } else {
299 parseError();
300 bufferCharacter('<');
301 HTML_RECONSUME_IN(DataState);
304 END_STATE()
306 HTML_BEGIN_STATE(EndTagOpenState) {
307 if (isASCIIUpper(cc)) {
308 m_token->beginEndTag(static_cast<LChar>(toLowerCase(cc)));
309 m_appropriateEndTagName.clear();
310 HTML_ADVANCE_TO(TagNameState);
311 } else if (isASCIILower(cc)) {
312 m_token->beginEndTag(static_cast<LChar>(cc));
313 m_appropriateEndTagName.clear();
314 HTML_ADVANCE_TO(TagNameState);
315 } else if (cc == '>') {
316 parseError();
317 HTML_ADVANCE_TO(DataState);
318 } else if (cc == kEndOfFileMarker) {
319 parseError();
320 bufferCharacter('<');
321 bufferCharacter('/');
322 HTML_RECONSUME_IN(DataState);
323 } else {
324 parseError();
325 HTML_RECONSUME_IN(BogusCommentState);
328 END_STATE()
330 HTML_BEGIN_STATE(TagNameState) {
331 if (isTokenizerWhitespace(cc))
332 HTML_ADVANCE_TO(BeforeAttributeNameState);
333 else if (cc == '/')
334 HTML_ADVANCE_TO(SelfClosingStartTagState);
335 else if (cc == '>')
336 return emitAndResumeIn(source, HTMLTokenizer::DataState);
337 else if (isASCIIUpper(cc)) {
338 m_token->appendToName(toLowerCase(cc));
339 HTML_ADVANCE_TO(TagNameState);
340 } else if (cc == kEndOfFileMarker) {
341 parseError();
342 HTML_RECONSUME_IN(DataState);
343 } else {
344 m_token->appendToName(cc);
345 HTML_ADVANCE_TO(TagNameState);
348 END_STATE()
350 HTML_BEGIN_STATE(RCDATALessThanSignState) {
351 if (cc == '/') {
352 m_temporaryBuffer.clear();
353 ASSERT(m_bufferedEndTagName.isEmpty());
354 HTML_ADVANCE_TO(RCDATAEndTagOpenState);
355 } else {
356 bufferCharacter('<');
357 HTML_RECONSUME_IN(RCDATAState);
360 END_STATE()
362 HTML_BEGIN_STATE(RCDATAEndTagOpenState) {
363 if (isASCIIUpper(cc)) {
364 m_temporaryBuffer.append(static_cast<LChar>(cc));
365 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc)));
366 HTML_ADVANCE_TO(RCDATAEndTagNameState);
367 } else if (isASCIILower(cc)) {
368 m_temporaryBuffer.append(static_cast<LChar>(cc));
369 addToPossibleEndTag(static_cast<LChar>(cc));
370 HTML_ADVANCE_TO(RCDATAEndTagNameState);
371 } else {
372 bufferCharacter('<');
373 bufferCharacter('/');
374 HTML_RECONSUME_IN(RCDATAState);
377 END_STATE()
379 HTML_BEGIN_STATE(RCDATAEndTagNameState) {
380 if (isASCIIUpper(cc)) {
381 m_temporaryBuffer.append(static_cast<LChar>(cc));
382 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc)));
383 HTML_ADVANCE_TO(RCDATAEndTagNameState);
384 } else if (isASCIILower(cc)) {
385 m_temporaryBuffer.append(static_cast<LChar>(cc));
386 addToPossibleEndTag(static_cast<LChar>(cc));
387 HTML_ADVANCE_TO(RCDATAEndTagNameState);
388 } else {
389 if (isTokenizerWhitespace(cc)) {
390 if (isAppropriateEndTag()) {
391 m_temporaryBuffer.append(static_cast<LChar>(cc));
392 FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState);
394 } else if (cc == '/') {
395 if (isAppropriateEndTag()) {
396 m_temporaryBuffer.append(static_cast<LChar>(cc));
397 FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState);
399 } else if (cc == '>') {
400 if (isAppropriateEndTag()) {
401 m_temporaryBuffer.append(static_cast<LChar>(cc));
402 return flushEmitAndResumeIn(source, HTMLTokenizer::DataState);
405 bufferCharacter('<');
406 bufferCharacter('/');
407 m_token->appendToCharacter(m_temporaryBuffer);
408 m_bufferedEndTagName.clear();
409 m_temporaryBuffer.clear();
410 HTML_RECONSUME_IN(RCDATAState);
413 END_STATE()
415 HTML_BEGIN_STATE(RAWTEXTLessThanSignState) {
416 if (cc == '/') {
417 m_temporaryBuffer.clear();
418 ASSERT(m_bufferedEndTagName.isEmpty());
419 HTML_ADVANCE_TO(RAWTEXTEndTagOpenState);
420 } else {
421 bufferCharacter('<');
422 HTML_RECONSUME_IN(RAWTEXTState);
425 END_STATE()
427 HTML_BEGIN_STATE(RAWTEXTEndTagOpenState) {
428 if (isASCIIUpper(cc)) {
429 m_temporaryBuffer.append(static_cast<LChar>(cc));
430 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc)));
431 HTML_ADVANCE_TO(RAWTEXTEndTagNameState);
432 } else if (isASCIILower(cc)) {
433 m_temporaryBuffer.append(static_cast<LChar>(cc));
434 addToPossibleEndTag(static_cast<LChar>(cc));
435 HTML_ADVANCE_TO(RAWTEXTEndTagNameState);
436 } else {
437 bufferCharacter('<');
438 bufferCharacter('/');
439 HTML_RECONSUME_IN(RAWTEXTState);
442 END_STATE()
444 HTML_BEGIN_STATE(RAWTEXTEndTagNameState) {
445 if (isASCIIUpper(cc)) {
446 m_temporaryBuffer.append(static_cast<LChar>(cc));
447 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc)));
448 HTML_ADVANCE_TO(RAWTEXTEndTagNameState);
449 } else if (isASCIILower(cc)) {
450 m_temporaryBuffer.append(static_cast<LChar>(cc));
451 addToPossibleEndTag(static_cast<LChar>(cc));
452 HTML_ADVANCE_TO(RAWTEXTEndTagNameState);
453 } else {
454 if (isTokenizerWhitespace(cc)) {
455 if (isAppropriateEndTag()) {
456 m_temporaryBuffer.append(static_cast<LChar>(cc));
457 FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState);
459 } else if (cc == '/') {
460 if (isAppropriateEndTag()) {
461 m_temporaryBuffer.append(static_cast<LChar>(cc));
462 FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState);
464 } else if (cc == '>') {
465 if (isAppropriateEndTag()) {
466 m_temporaryBuffer.append(static_cast<LChar>(cc));
467 return flushEmitAndResumeIn(source, HTMLTokenizer::DataState);
470 bufferCharacter('<');
471 bufferCharacter('/');
472 m_token->appendToCharacter(m_temporaryBuffer);
473 m_bufferedEndTagName.clear();
474 m_temporaryBuffer.clear();
475 HTML_RECONSUME_IN(RAWTEXTState);
478 END_STATE()
480 HTML_BEGIN_STATE(ScriptDataLessThanSignState) {
481 if (cc == '/') {
482 m_temporaryBuffer.clear();
483 ASSERT(m_bufferedEndTagName.isEmpty());
484 HTML_ADVANCE_TO(ScriptDataEndTagOpenState);
485 } else if (cc == '!') {
486 bufferCharacter('<');
487 bufferCharacter('!');
488 HTML_ADVANCE_TO(ScriptDataEscapeStartState);
489 } else {
490 bufferCharacter('<');
491 HTML_RECONSUME_IN(ScriptDataState);
494 END_STATE()
496 HTML_BEGIN_STATE(ScriptDataEndTagOpenState) {
497 if (isASCIIUpper(cc)) {
498 m_temporaryBuffer.append(static_cast<LChar>(cc));
499 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc)));
500 HTML_ADVANCE_TO(ScriptDataEndTagNameState);
501 } else if (isASCIILower(cc)) {
502 m_temporaryBuffer.append(static_cast<LChar>(cc));
503 addToPossibleEndTag(static_cast<LChar>(cc));
504 HTML_ADVANCE_TO(ScriptDataEndTagNameState);
505 } else {
506 bufferCharacter('<');
507 bufferCharacter('/');
508 HTML_RECONSUME_IN(ScriptDataState);
511 END_STATE()
513 HTML_BEGIN_STATE(ScriptDataEndTagNameState) {
514 if (isASCIIUpper(cc)) {
515 m_temporaryBuffer.append(static_cast<LChar>(cc));
516 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc)));
517 HTML_ADVANCE_TO(ScriptDataEndTagNameState);
518 } else if (isASCIILower(cc)) {
519 m_temporaryBuffer.append(static_cast<LChar>(cc));
520 addToPossibleEndTag(static_cast<LChar>(cc));
521 HTML_ADVANCE_TO(ScriptDataEndTagNameState);
522 } else {
523 if (isTokenizerWhitespace(cc)) {
524 if (isAppropriateEndTag()) {
525 m_temporaryBuffer.append(static_cast<LChar>(cc));
526 FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState);
528 } else if (cc == '/') {
529 if (isAppropriateEndTag()) {
530 m_temporaryBuffer.append(static_cast<LChar>(cc));
531 FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState);
533 } else if (cc == '>') {
534 if (isAppropriateEndTag()) {
535 m_temporaryBuffer.append(static_cast<LChar>(cc));
536 return flushEmitAndResumeIn(source, HTMLTokenizer::DataState);
539 bufferCharacter('<');
540 bufferCharacter('/');
541 m_token->appendToCharacter(m_temporaryBuffer);
542 m_bufferedEndTagName.clear();
543 m_temporaryBuffer.clear();
544 HTML_RECONSUME_IN(ScriptDataState);
547 END_STATE()
549 HTML_BEGIN_STATE(ScriptDataEscapeStartState) {
550 if (cc == '-') {
551 bufferCharacter(cc);
552 HTML_ADVANCE_TO(ScriptDataEscapeStartDashState);
553 } else
554 HTML_RECONSUME_IN(ScriptDataState);
556 END_STATE()
558 HTML_BEGIN_STATE(ScriptDataEscapeStartDashState) {
559 if (cc == '-') {
560 bufferCharacter(cc);
561 HTML_ADVANCE_TO(ScriptDataEscapedDashDashState);
562 } else
563 HTML_RECONSUME_IN(ScriptDataState);
565 END_STATE()
567 HTML_BEGIN_STATE(ScriptDataEscapedState) {
568 if (cc == '-') {
569 bufferCharacter(cc);
570 HTML_ADVANCE_TO(ScriptDataEscapedDashState);
571 } else if (cc == '<')
572 HTML_ADVANCE_TO(ScriptDataEscapedLessThanSignState);
573 else if (cc == kEndOfFileMarker) {
574 parseError();
575 HTML_RECONSUME_IN(DataState);
576 } else {
577 bufferCharacter(cc);
578 HTML_ADVANCE_TO(ScriptDataEscapedState);
581 END_STATE()
583 HTML_BEGIN_STATE(ScriptDataEscapedDashState) {
584 if (cc == '-') {
585 bufferCharacter(cc);
586 HTML_ADVANCE_TO(ScriptDataEscapedDashDashState);
587 } else if (cc == '<')
588 HTML_ADVANCE_TO(ScriptDataEscapedLessThanSignState);
589 else if (cc == kEndOfFileMarker) {
590 parseError();
591 HTML_RECONSUME_IN(DataState);
592 } else {
593 bufferCharacter(cc);
594 HTML_ADVANCE_TO(ScriptDataEscapedState);
597 END_STATE()
599 HTML_BEGIN_STATE(ScriptDataEscapedDashDashState) {
600 if (cc == '-') {
601 bufferCharacter(cc);
602 HTML_ADVANCE_TO(ScriptDataEscapedDashDashState);
603 } else if (cc == '<')
604 HTML_ADVANCE_TO(ScriptDataEscapedLessThanSignState);
605 else if (cc == '>') {
606 bufferCharacter(cc);
607 HTML_ADVANCE_TO(ScriptDataState);
608 } else if (cc == kEndOfFileMarker) {
609 parseError();
610 HTML_RECONSUME_IN(DataState);
611 } else {
612 bufferCharacter(cc);
613 HTML_ADVANCE_TO(ScriptDataEscapedState);
616 END_STATE()
618 HTML_BEGIN_STATE(ScriptDataEscapedLessThanSignState) {
619 if (cc == '/') {
620 m_temporaryBuffer.clear();
621 ASSERT(m_bufferedEndTagName.isEmpty());
622 HTML_ADVANCE_TO(ScriptDataEscapedEndTagOpenState);
623 } else if (isASCIIUpper(cc)) {
624 bufferCharacter('<');
625 bufferCharacter(cc);
626 m_temporaryBuffer.clear();
627 m_temporaryBuffer.append(toLowerCase(cc));
628 HTML_ADVANCE_TO(ScriptDataDoubleEscapeStartState);
629 } else if (isASCIILower(cc)) {
630 bufferCharacter('<');
631 bufferCharacter(cc);
632 m_temporaryBuffer.clear();
633 m_temporaryBuffer.append(static_cast<LChar>(cc));
634 HTML_ADVANCE_TO(ScriptDataDoubleEscapeStartState);
635 } else {
636 bufferCharacter('<');
637 HTML_RECONSUME_IN(ScriptDataEscapedState);
640 END_STATE()
642 HTML_BEGIN_STATE(ScriptDataEscapedEndTagOpenState) {
643 if (isASCIIUpper(cc)) {
644 m_temporaryBuffer.append(static_cast<LChar>(cc));
645 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc)));
646 HTML_ADVANCE_TO(ScriptDataEscapedEndTagNameState);
647 } else if (isASCIILower(cc)) {
648 m_temporaryBuffer.append(static_cast<LChar>(cc));
649 addToPossibleEndTag(static_cast<LChar>(cc));
650 HTML_ADVANCE_TO(ScriptDataEscapedEndTagNameState);
651 } else {
652 bufferCharacter('<');
653 bufferCharacter('/');
654 HTML_RECONSUME_IN(ScriptDataEscapedState);
657 END_STATE()
659 HTML_BEGIN_STATE(ScriptDataEscapedEndTagNameState) {
660 if (isASCIIUpper(cc)) {
661 m_temporaryBuffer.append(static_cast<LChar>(cc));
662 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc)));
663 HTML_ADVANCE_TO(ScriptDataEscapedEndTagNameState);
664 } else if (isASCIILower(cc)) {
665 m_temporaryBuffer.append(static_cast<LChar>(cc));
666 addToPossibleEndTag(static_cast<LChar>(cc));
667 HTML_ADVANCE_TO(ScriptDataEscapedEndTagNameState);
668 } else {
669 if (isTokenizerWhitespace(cc)) {
670 if (isAppropriateEndTag()) {
671 m_temporaryBuffer.append(static_cast<LChar>(cc));
672 FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState);
674 } else if (cc == '/') {
675 if (isAppropriateEndTag()) {
676 m_temporaryBuffer.append(static_cast<LChar>(cc));
677 FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState);
679 } else if (cc == '>') {
680 if (isAppropriateEndTag()) {
681 m_temporaryBuffer.append(static_cast<LChar>(cc));
682 return flushEmitAndResumeIn(source, HTMLTokenizer::DataState);
685 bufferCharacter('<');
686 bufferCharacter('/');
687 m_token->appendToCharacter(m_temporaryBuffer);
688 m_bufferedEndTagName.clear();
689 m_temporaryBuffer.clear();
690 HTML_RECONSUME_IN(ScriptDataEscapedState);
693 END_STATE()
695 HTML_BEGIN_STATE(ScriptDataDoubleEscapeStartState) {
696 if (isTokenizerWhitespace(cc) || cc == '/' || cc == '>') {
697 bufferCharacter(cc);
698 if (temporaryBufferIs(scriptTag.localName()))
699 HTML_ADVANCE_TO(ScriptDataDoubleEscapedState);
700 else
701 HTML_ADVANCE_TO(ScriptDataEscapedState);
702 } else if (isASCIIUpper(cc)) {
703 bufferCharacter(cc);
704 m_temporaryBuffer.append(toLowerCase(cc));
705 HTML_ADVANCE_TO(ScriptDataDoubleEscapeStartState);
706 } else if (isASCIILower(cc)) {
707 bufferCharacter(cc);
708 m_temporaryBuffer.append(static_cast<LChar>(cc));
709 HTML_ADVANCE_TO(ScriptDataDoubleEscapeStartState);
710 } else
711 HTML_RECONSUME_IN(ScriptDataEscapedState);
713 END_STATE()
715 HTML_BEGIN_STATE(ScriptDataDoubleEscapedState) {
716 if (cc == '-') {
717 bufferCharacter(cc);
718 HTML_ADVANCE_TO(ScriptDataDoubleEscapedDashState);
719 } else if (cc == '<') {
720 bufferCharacter(cc);
721 HTML_ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState);
722 } else if (cc == kEndOfFileMarker) {
723 parseError();
724 HTML_RECONSUME_IN(DataState);
725 } else {
726 bufferCharacter(cc);
727 HTML_ADVANCE_TO(ScriptDataDoubleEscapedState);
730 END_STATE()
732 HTML_BEGIN_STATE(ScriptDataDoubleEscapedDashState) {
733 if (cc == '-') {
734 bufferCharacter(cc);
735 HTML_ADVANCE_TO(ScriptDataDoubleEscapedDashDashState);
736 } else if (cc == '<') {
737 bufferCharacter(cc);
738 HTML_ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState);
739 } else if (cc == kEndOfFileMarker) {
740 parseError();
741 HTML_RECONSUME_IN(DataState);
742 } else {
743 bufferCharacter(cc);
744 HTML_ADVANCE_TO(ScriptDataDoubleEscapedState);
747 END_STATE()
749 HTML_BEGIN_STATE(ScriptDataDoubleEscapedDashDashState) {
750 if (cc == '-') {
751 bufferCharacter(cc);
752 HTML_ADVANCE_TO(ScriptDataDoubleEscapedDashDashState);
753 } else if (cc == '<') {
754 bufferCharacter(cc);
755 HTML_ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState);
756 } else if (cc == '>') {
757 bufferCharacter(cc);
758 HTML_ADVANCE_TO(ScriptDataState);
759 } else if (cc == kEndOfFileMarker) {
760 parseError();
761 HTML_RECONSUME_IN(DataState);
762 } else {
763 bufferCharacter(cc);
764 HTML_ADVANCE_TO(ScriptDataDoubleEscapedState);
767 END_STATE()
769 HTML_BEGIN_STATE(ScriptDataDoubleEscapedLessThanSignState) {
770 if (cc == '/') {
771 bufferCharacter(cc);
772 m_temporaryBuffer.clear();
773 HTML_ADVANCE_TO(ScriptDataDoubleEscapeEndState);
774 } else
775 HTML_RECONSUME_IN(ScriptDataDoubleEscapedState);
777 END_STATE()
779 HTML_BEGIN_STATE(ScriptDataDoubleEscapeEndState) {
780 if (isTokenizerWhitespace(cc) || cc == '/' || cc == '>') {
781 bufferCharacter(cc);
782 if (temporaryBufferIs(scriptTag.localName()))
783 HTML_ADVANCE_TO(ScriptDataEscapedState);
784 else
785 HTML_ADVANCE_TO(ScriptDataDoubleEscapedState);
786 } else if (isASCIIUpper(cc)) {
787 bufferCharacter(cc);
788 m_temporaryBuffer.append(toLowerCase(cc));
789 HTML_ADVANCE_TO(ScriptDataDoubleEscapeEndState);
790 } else if (isASCIILower(cc)) {
791 bufferCharacter(cc);
792 m_temporaryBuffer.append(static_cast<LChar>(cc));
793 HTML_ADVANCE_TO(ScriptDataDoubleEscapeEndState);
794 } else
795 HTML_RECONSUME_IN(ScriptDataDoubleEscapedState);
797 END_STATE()
799 HTML_BEGIN_STATE(BeforeAttributeNameState) {
800 if (isTokenizerWhitespace(cc))
801 HTML_ADVANCE_TO(BeforeAttributeNameState);
802 else if (cc == '/')
803 HTML_ADVANCE_TO(SelfClosingStartTagState);
804 else if (cc == '>')
805 return emitAndResumeIn(source, HTMLTokenizer::DataState);
806 else if (isASCIIUpper(cc)) {
807 m_token->addNewAttribute();
808 m_token->beginAttributeName(source.numberOfCharactersConsumed());
809 m_token->appendToAttributeName(toLowerCase(cc));
810 HTML_ADVANCE_TO(AttributeNameState);
811 } else if (cc == kEndOfFileMarker) {
812 parseError();
813 HTML_RECONSUME_IN(DataState);
814 } else {
815 if (cc == '"' || cc == '\'' || cc == '<' || cc == '=')
816 parseError();
817 m_token->addNewAttribute();
818 m_token->beginAttributeName(source.numberOfCharactersConsumed());
819 m_token->appendToAttributeName(cc);
820 HTML_ADVANCE_TO(AttributeNameState);
823 END_STATE()
825 HTML_BEGIN_STATE(AttributeNameState) {
826 if (isTokenizerWhitespace(cc)) {
827 m_token->endAttributeName(source.numberOfCharactersConsumed());
828 HTML_ADVANCE_TO(AfterAttributeNameState);
829 } else if (cc == '/') {
830 m_token->endAttributeName(source.numberOfCharactersConsumed());
831 HTML_ADVANCE_TO(SelfClosingStartTagState);
832 } else if (cc == '=') {
833 m_token->endAttributeName(source.numberOfCharactersConsumed());
834 HTML_ADVANCE_TO(BeforeAttributeValueState);
835 } else if (cc == '>') {
836 m_token->endAttributeName(source.numberOfCharactersConsumed());
837 return emitAndResumeIn(source, HTMLTokenizer::DataState);
838 } else if (isASCIIUpper(cc)) {
839 m_token->appendToAttributeName(toLowerCase(cc));
840 HTML_ADVANCE_TO(AttributeNameState);
841 } else if (cc == kEndOfFileMarker) {
842 parseError();
843 m_token->endAttributeName(source.numberOfCharactersConsumed());
844 HTML_RECONSUME_IN(DataState);
845 } else {
846 if (cc == '"' || cc == '\'' || cc == '<' || cc == '=')
847 parseError();
848 m_token->appendToAttributeName(cc);
849 HTML_ADVANCE_TO(AttributeNameState);
852 END_STATE()
854 HTML_BEGIN_STATE(AfterAttributeNameState) {
855 if (isTokenizerWhitespace(cc))
856 HTML_ADVANCE_TO(AfterAttributeNameState);
857 else if (cc == '/')
858 HTML_ADVANCE_TO(SelfClosingStartTagState);
859 else if (cc == '=')
860 HTML_ADVANCE_TO(BeforeAttributeValueState);
861 else if (cc == '>')
862 return emitAndResumeIn(source, HTMLTokenizer::DataState);
863 else if (isASCIIUpper(cc)) {
864 m_token->addNewAttribute();
865 m_token->beginAttributeName(source.numberOfCharactersConsumed());
866 m_token->appendToAttributeName(toLowerCase(cc));
867 HTML_ADVANCE_TO(AttributeNameState);
868 } else if (cc == kEndOfFileMarker) {
869 parseError();
870 HTML_RECONSUME_IN(DataState);
871 } else {
872 if (cc == '"' || cc == '\'' || cc == '<')
873 parseError();
874 m_token->addNewAttribute();
875 m_token->beginAttributeName(source.numberOfCharactersConsumed());
876 m_token->appendToAttributeName(cc);
877 HTML_ADVANCE_TO(AttributeNameState);
880 END_STATE()
882 HTML_BEGIN_STATE(BeforeAttributeValueState) {
883 if (isTokenizerWhitespace(cc))
884 HTML_ADVANCE_TO(BeforeAttributeValueState);
885 else if (cc == '"') {
886 m_token->beginAttributeValue(source.numberOfCharactersConsumed() + 1);
887 HTML_ADVANCE_TO(AttributeValueDoubleQuotedState);
888 } else if (cc == '&') {
889 m_token->beginAttributeValue(source.numberOfCharactersConsumed());
890 HTML_RECONSUME_IN(AttributeValueUnquotedState);
891 } else if (cc == '\'') {
892 m_token->beginAttributeValue(source.numberOfCharactersConsumed() + 1);
893 HTML_ADVANCE_TO(AttributeValueSingleQuotedState);
894 } else if (cc == '>') {
895 parseError();
896 return emitAndResumeIn(source, HTMLTokenizer::DataState);
897 } else if (cc == kEndOfFileMarker) {
898 parseError();
899 HTML_RECONSUME_IN(DataState);
900 } else {
901 if (cc == '<' || cc == '=' || cc == '`')
902 parseError();
903 m_token->beginAttributeValue(source.numberOfCharactersConsumed());
904 m_token->appendToAttributeValue(cc);
905 HTML_ADVANCE_TO(AttributeValueUnquotedState);
908 END_STATE()
910 HTML_BEGIN_STATE(AttributeValueDoubleQuotedState) {
911 if (cc == '"') {
912 m_token->endAttributeValue(source.numberOfCharactersConsumed());
913 HTML_ADVANCE_TO(AfterAttributeValueQuotedState);
914 } else if (cc == '&') {
915 m_additionalAllowedCharacter = '"';
916 HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState);
917 } else if (cc == kEndOfFileMarker) {
918 parseError();
919 m_token->endAttributeValue(source.numberOfCharactersConsumed());
920 HTML_RECONSUME_IN(DataState);
921 } else {
922 m_token->appendToAttributeValue(cc);
923 HTML_ADVANCE_TO(AttributeValueDoubleQuotedState);
926 END_STATE()
928 HTML_BEGIN_STATE(AttributeValueSingleQuotedState) {
929 if (cc == '\'') {
930 m_token->endAttributeValue(source.numberOfCharactersConsumed());
931 HTML_ADVANCE_TO(AfterAttributeValueQuotedState);
932 } else if (cc == '&') {
933 m_additionalAllowedCharacter = '\'';
934 HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState);
935 } else if (cc == kEndOfFileMarker) {
936 parseError();
937 m_token->endAttributeValue(source.numberOfCharactersConsumed());
938 HTML_RECONSUME_IN(DataState);
939 } else {
940 m_token->appendToAttributeValue(cc);
941 HTML_ADVANCE_TO(AttributeValueSingleQuotedState);
944 END_STATE()
946 HTML_BEGIN_STATE(AttributeValueUnquotedState) {
947 if (isTokenizerWhitespace(cc)) {
948 m_token->endAttributeValue(source.numberOfCharactersConsumed());
949 HTML_ADVANCE_TO(BeforeAttributeNameState);
950 } else if (cc == '&') {
951 m_additionalAllowedCharacter = '>';
952 HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState);
953 } else if (cc == '>') {
954 m_token->endAttributeValue(source.numberOfCharactersConsumed());
955 return emitAndResumeIn(source, HTMLTokenizer::DataState);
956 } else if (cc == kEndOfFileMarker) {
957 parseError();
958 m_token->endAttributeValue(source.numberOfCharactersConsumed());
959 HTML_RECONSUME_IN(DataState);
960 } else {
961 if (cc == '"' || cc == '\'' || cc == '<' || cc == '=' || cc == '`')
962 parseError();
963 m_token->appendToAttributeValue(cc);
964 HTML_ADVANCE_TO(AttributeValueUnquotedState);
967 END_STATE()
969 HTML_BEGIN_STATE(CharacterReferenceInAttributeValueState) {
970 bool notEnoughCharacters = false;
971 DecodedHTMLEntity decodedEntity;
972 bool success = consumeHTMLEntity(source, decodedEntity, notEnoughCharacters, m_additionalAllowedCharacter);
973 if (notEnoughCharacters)
974 return haveBufferedCharacterToken();
975 if (!success) {
976 ASSERT(decodedEntity.isEmpty());
977 m_token->appendToAttributeValue('&');
978 } else {
979 for (unsigned i = 0; i < decodedEntity.length; ++i)
980 m_token->appendToAttributeValue(decodedEntity.data[i]);
982 // We're supposed to switch back to the attribute value state that
983 // we were in when we were switched into this state. Rather than
984 // keeping track of this explicitly, we observe that the previous
985 // state can be determined by m_additionalAllowedCharacter.
986 if (m_additionalAllowedCharacter == '"')
987 HTML_SWITCH_TO(AttributeValueDoubleQuotedState);
988 else if (m_additionalAllowedCharacter == '\'')
989 HTML_SWITCH_TO(AttributeValueSingleQuotedState);
990 else if (m_additionalAllowedCharacter == '>')
991 HTML_SWITCH_TO(AttributeValueUnquotedState);
992 else
993 ASSERT_NOT_REACHED();
995 END_STATE()
997 HTML_BEGIN_STATE(AfterAttributeValueQuotedState) {
998 if (isTokenizerWhitespace(cc))
999 HTML_ADVANCE_TO(BeforeAttributeNameState);
1000 else if (cc == '/')
1001 HTML_ADVANCE_TO(SelfClosingStartTagState);
1002 else if (cc == '>')
1003 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1004 else if (cc == kEndOfFileMarker) {
1005 parseError();
1006 HTML_RECONSUME_IN(DataState);
1007 } else {
1008 parseError();
1009 HTML_RECONSUME_IN(BeforeAttributeNameState);
1012 END_STATE()
1014 HTML_BEGIN_STATE(SelfClosingStartTagState) {
1015 if (cc == '>') {
1016 m_token->setSelfClosing();
1017 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1018 } else if (cc == kEndOfFileMarker) {
1019 parseError();
1020 HTML_RECONSUME_IN(DataState);
1021 } else {
1022 parseError();
1023 HTML_RECONSUME_IN(BeforeAttributeNameState);
1026 END_STATE()
1028 HTML_BEGIN_STATE(BogusCommentState) {
1029 m_token->beginComment();
1030 HTML_RECONSUME_IN(ContinueBogusCommentState);
1032 END_STATE()
1034 HTML_BEGIN_STATE(ContinueBogusCommentState) {
1035 if (cc == '>')
1036 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1037 else if (cc == kEndOfFileMarker)
1038 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1039 else {
1040 m_token->appendToComment(cc);
1041 HTML_ADVANCE_TO(ContinueBogusCommentState);
1044 END_STATE()
1046 HTML_BEGIN_STATE(MarkupDeclarationOpenState) {
1047 if (cc == '-') {
1048 SegmentedString::LookAheadResult result = source.lookAhead(HTMLTokenizerNames::dashDash);
1049 if (result == SegmentedString::DidMatch) {
1050 source.advanceAndASSERT('-');
1051 source.advanceAndASSERT('-');
1052 m_token->beginComment();
1053 HTML_SWITCH_TO(CommentStartState);
1054 } else if (result == SegmentedString::NotEnoughCharacters)
1055 return haveBufferedCharacterToken();
1056 } else if (cc == 'D' || cc == 'd') {
1057 SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(HTMLTokenizerNames::doctype);
1058 if (result == SegmentedString::DidMatch) {
1059 advanceStringAndASSERTIgnoringCase(source, "doctype");
1060 HTML_SWITCH_TO(DOCTYPEState);
1061 } else if (result == SegmentedString::NotEnoughCharacters)
1062 return haveBufferedCharacterToken();
1063 } else if (cc == '[' && shouldAllowCDATA()) {
1064 SegmentedString::LookAheadResult result = source.lookAhead(HTMLTokenizerNames::cdata);
1065 if (result == SegmentedString::DidMatch) {
1066 advanceStringAndASSERT(source, "[CDATA[");
1067 HTML_SWITCH_TO(CDATASectionState);
1068 } else if (result == SegmentedString::NotEnoughCharacters)
1069 return haveBufferedCharacterToken();
1071 parseError();
1072 HTML_RECONSUME_IN(BogusCommentState);
1074 END_STATE()
1076 HTML_BEGIN_STATE(CommentStartState) {
1077 if (cc == '-')
1078 HTML_ADVANCE_TO(CommentStartDashState);
1079 else if (cc == '>') {
1080 parseError();
1081 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1082 } else if (cc == kEndOfFileMarker) {
1083 parseError();
1084 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1085 } else {
1086 m_token->appendToComment(cc);
1087 HTML_ADVANCE_TO(CommentState);
1090 END_STATE()
1092 HTML_BEGIN_STATE(CommentStartDashState) {
1093 if (cc == '-')
1094 HTML_ADVANCE_TO(CommentEndState);
1095 else if (cc == '>') {
1096 parseError();
1097 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1098 } else if (cc == kEndOfFileMarker) {
1099 parseError();
1100 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1101 } else {
1102 m_token->appendToComment('-');
1103 m_token->appendToComment(cc);
1104 HTML_ADVANCE_TO(CommentState);
1107 END_STATE()
1109 HTML_BEGIN_STATE(CommentState) {
1110 if (cc == '-')
1111 HTML_ADVANCE_TO(CommentEndDashState);
1112 else if (cc == kEndOfFileMarker) {
1113 parseError();
1114 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1115 } else {
1116 m_token->appendToComment(cc);
1117 HTML_ADVANCE_TO(CommentState);
1120 END_STATE()
1122 HTML_BEGIN_STATE(CommentEndDashState) {
1123 if (cc == '-')
1124 HTML_ADVANCE_TO(CommentEndState);
1125 else if (cc == kEndOfFileMarker) {
1126 parseError();
1127 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1128 } else {
1129 m_token->appendToComment('-');
1130 m_token->appendToComment(cc);
1131 HTML_ADVANCE_TO(CommentState);
1134 END_STATE()
1136 HTML_BEGIN_STATE(CommentEndState) {
1137 if (cc == '>')
1138 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1139 else if (cc == '!') {
1140 parseError();
1141 HTML_ADVANCE_TO(CommentEndBangState);
1142 } else if (cc == '-') {
1143 parseError();
1144 m_token->appendToComment('-');
1145 HTML_ADVANCE_TO(CommentEndState);
1146 } else if (cc == kEndOfFileMarker) {
1147 parseError();
1148 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1149 } else {
1150 parseError();
1151 m_token->appendToComment('-');
1152 m_token->appendToComment('-');
1153 m_token->appendToComment(cc);
1154 HTML_ADVANCE_TO(CommentState);
1157 END_STATE()
1159 HTML_BEGIN_STATE(CommentEndBangState) {
1160 if (cc == '-') {
1161 m_token->appendToComment('-');
1162 m_token->appendToComment('-');
1163 m_token->appendToComment('!');
1164 HTML_ADVANCE_TO(CommentEndDashState);
1165 } else if (cc == '>')
1166 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1167 else if (cc == kEndOfFileMarker) {
1168 parseError();
1169 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1170 } else {
1171 m_token->appendToComment('-');
1172 m_token->appendToComment('-');
1173 m_token->appendToComment('!');
1174 m_token->appendToComment(cc);
1175 HTML_ADVANCE_TO(CommentState);
1178 END_STATE()
1180 HTML_BEGIN_STATE(DOCTYPEState) {
1181 if (isTokenizerWhitespace(cc))
1182 HTML_ADVANCE_TO(BeforeDOCTYPENameState);
1183 else if (cc == kEndOfFileMarker) {
1184 parseError();
1185 m_token->beginDOCTYPE();
1186 m_token->setForceQuirks();
1187 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1188 } else {
1189 parseError();
1190 HTML_RECONSUME_IN(BeforeDOCTYPENameState);
1193 END_STATE()
1195 HTML_BEGIN_STATE(BeforeDOCTYPENameState) {
1196 if (isTokenizerWhitespace(cc))
1197 HTML_ADVANCE_TO(BeforeDOCTYPENameState);
1198 else if (isASCIIUpper(cc)) {
1199 m_token->beginDOCTYPE(toLowerCase(cc));
1200 HTML_ADVANCE_TO(DOCTYPENameState);
1201 } else if (cc == '>') {
1202 parseError();
1203 m_token->beginDOCTYPE();
1204 m_token->setForceQuirks();
1205 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1206 } else if (cc == kEndOfFileMarker) {
1207 parseError();
1208 m_token->beginDOCTYPE();
1209 m_token->setForceQuirks();
1210 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1211 } else {
1212 m_token->beginDOCTYPE(cc);
1213 HTML_ADVANCE_TO(DOCTYPENameState);
1216 END_STATE()
1218 HTML_BEGIN_STATE(DOCTYPENameState) {
1219 if (isTokenizerWhitespace(cc))
1220 HTML_ADVANCE_TO(AfterDOCTYPENameState);
1221 else if (cc == '>')
1222 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1223 else if (isASCIIUpper(cc)) {
1224 m_token->appendToName(toLowerCase(cc));
1225 HTML_ADVANCE_TO(DOCTYPENameState);
1226 } else if (cc == kEndOfFileMarker) {
1227 parseError();
1228 m_token->setForceQuirks();
1229 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1230 } else {
1231 m_token->appendToName(cc);
1232 HTML_ADVANCE_TO(DOCTYPENameState);
1235 END_STATE()
1237 HTML_BEGIN_STATE(AfterDOCTYPENameState) {
1238 if (isTokenizerWhitespace(cc))
1239 HTML_ADVANCE_TO(AfterDOCTYPENameState);
1240 if (cc == '>')
1241 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1242 else if (cc == kEndOfFileMarker) {
1243 parseError();
1244 m_token->setForceQuirks();
1245 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1246 } else {
1247 if (cc == 'P' || cc == 'p') {
1248 SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(HTMLTokenizerNames::publicString);
1249 if (result == SegmentedString::DidMatch) {
1250 advanceStringAndASSERTIgnoringCase(source, "public");
1251 HTML_SWITCH_TO(AfterDOCTYPEPublicKeywordState);
1252 } else if (result == SegmentedString::NotEnoughCharacters)
1253 return haveBufferedCharacterToken();
1254 } else if (cc == 'S' || cc == 's') {
1255 SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(HTMLTokenizerNames::system);
1256 if (result == SegmentedString::DidMatch) {
1257 advanceStringAndASSERTIgnoringCase(source, "system");
1258 HTML_SWITCH_TO(AfterDOCTYPESystemKeywordState);
1259 } else if (result == SegmentedString::NotEnoughCharacters)
1260 return haveBufferedCharacterToken();
1262 parseError();
1263 m_token->setForceQuirks();
1264 HTML_ADVANCE_TO(BogusDOCTYPEState);
1267 END_STATE()
1269 HTML_BEGIN_STATE(AfterDOCTYPEPublicKeywordState) {
1270 if (isTokenizerWhitespace(cc))
1271 HTML_ADVANCE_TO(BeforeDOCTYPEPublicIdentifierState);
1272 else if (cc == '"') {
1273 parseError();
1274 m_token->setPublicIdentifierToEmptyString();
1275 HTML_ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState);
1276 } else if (cc == '\'') {
1277 parseError();
1278 m_token->setPublicIdentifierToEmptyString();
1279 HTML_ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState);
1280 } else if (cc == '>') {
1281 parseError();
1282 m_token->setForceQuirks();
1283 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1284 } else if (cc == kEndOfFileMarker) {
1285 parseError();
1286 m_token->setForceQuirks();
1287 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1288 } else {
1289 parseError();
1290 m_token->setForceQuirks();
1291 HTML_ADVANCE_TO(BogusDOCTYPEState);
1294 END_STATE()
1296 HTML_BEGIN_STATE(BeforeDOCTYPEPublicIdentifierState) {
1297 if (isTokenizerWhitespace(cc))
1298 HTML_ADVANCE_TO(BeforeDOCTYPEPublicIdentifierState);
1299 else if (cc == '"') {
1300 m_token->setPublicIdentifierToEmptyString();
1301 HTML_ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState);
1302 } else if (cc == '\'') {
1303 m_token->setPublicIdentifierToEmptyString();
1304 HTML_ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState);
1305 } else if (cc == '>') {
1306 parseError();
1307 m_token->setForceQuirks();
1308 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1309 } else if (cc == kEndOfFileMarker) {
1310 parseError();
1311 m_token->setForceQuirks();
1312 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1313 } else {
1314 parseError();
1315 m_token->setForceQuirks();
1316 HTML_ADVANCE_TO(BogusDOCTYPEState);
1319 END_STATE()
1321 HTML_BEGIN_STATE(DOCTYPEPublicIdentifierDoubleQuotedState) {
1322 if (cc == '"')
1323 HTML_ADVANCE_TO(AfterDOCTYPEPublicIdentifierState);
1324 else if (cc == '>') {
1325 parseError();
1326 m_token->setForceQuirks();
1327 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1328 } else if (cc == kEndOfFileMarker) {
1329 parseError();
1330 m_token->setForceQuirks();
1331 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1332 } else {
1333 m_token->appendToPublicIdentifier(cc);
1334 HTML_ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState);
1337 END_STATE()
1339 HTML_BEGIN_STATE(DOCTYPEPublicIdentifierSingleQuotedState) {
1340 if (cc == '\'')
1341 HTML_ADVANCE_TO(AfterDOCTYPEPublicIdentifierState);
1342 else if (cc == '>') {
1343 parseError();
1344 m_token->setForceQuirks();
1345 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1346 } else if (cc == kEndOfFileMarker) {
1347 parseError();
1348 m_token->setForceQuirks();
1349 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1350 } else {
1351 m_token->appendToPublicIdentifier(cc);
1352 HTML_ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState);
1355 END_STATE()
1357 HTML_BEGIN_STATE(AfterDOCTYPEPublicIdentifierState) {
1358 if (isTokenizerWhitespace(cc))
1359 HTML_ADVANCE_TO(BetweenDOCTYPEPublicAndSystemIdentifiersState);
1360 else if (cc == '>')
1361 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1362 else if (cc == '"') {
1363 parseError();
1364 m_token->setSystemIdentifierToEmptyString();
1365 HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
1366 } else if (cc == '\'') {
1367 parseError();
1368 m_token->setSystemIdentifierToEmptyString();
1369 HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
1370 } else if (cc == kEndOfFileMarker) {
1371 parseError();
1372 m_token->setForceQuirks();
1373 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1374 } else {
1375 parseError();
1376 m_token->setForceQuirks();
1377 HTML_ADVANCE_TO(BogusDOCTYPEState);
1380 END_STATE()
1382 HTML_BEGIN_STATE(BetweenDOCTYPEPublicAndSystemIdentifiersState) {
1383 if (isTokenizerWhitespace(cc))
1384 HTML_ADVANCE_TO(BetweenDOCTYPEPublicAndSystemIdentifiersState);
1385 else if (cc == '>')
1386 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1387 else if (cc == '"') {
1388 m_token->setSystemIdentifierToEmptyString();
1389 HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
1390 } else if (cc == '\'') {
1391 m_token->setSystemIdentifierToEmptyString();
1392 HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
1393 } else if (cc == kEndOfFileMarker) {
1394 parseError();
1395 m_token->setForceQuirks();
1396 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1397 } else {
1398 parseError();
1399 m_token->setForceQuirks();
1400 HTML_ADVANCE_TO(BogusDOCTYPEState);
1403 END_STATE()
1405 HTML_BEGIN_STATE(AfterDOCTYPESystemKeywordState) {
1406 if (isTokenizerWhitespace(cc))
1407 HTML_ADVANCE_TO(BeforeDOCTYPESystemIdentifierState);
1408 else if (cc == '"') {
1409 parseError();
1410 m_token->setSystemIdentifierToEmptyString();
1411 HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
1412 } else if (cc == '\'') {
1413 parseError();
1414 m_token->setSystemIdentifierToEmptyString();
1415 HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
1416 } else if (cc == '>') {
1417 parseError();
1418 m_token->setForceQuirks();
1419 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1420 } else if (cc == kEndOfFileMarker) {
1421 parseError();
1422 m_token->setForceQuirks();
1423 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1424 } else {
1425 parseError();
1426 m_token->setForceQuirks();
1427 HTML_ADVANCE_TO(BogusDOCTYPEState);
1430 END_STATE()
1432 HTML_BEGIN_STATE(BeforeDOCTYPESystemIdentifierState) {
1433 if (isTokenizerWhitespace(cc))
1434 HTML_ADVANCE_TO(BeforeDOCTYPESystemIdentifierState);
1435 if (cc == '"') {
1436 m_token->setSystemIdentifierToEmptyString();
1437 HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
1438 } else if (cc == '\'') {
1439 m_token->setSystemIdentifierToEmptyString();
1440 HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
1441 } else if (cc == '>') {
1442 parseError();
1443 m_token->setForceQuirks();
1444 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1445 } else if (cc == kEndOfFileMarker) {
1446 parseError();
1447 m_token->setForceQuirks();
1448 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1449 } else {
1450 parseError();
1451 m_token->setForceQuirks();
1452 HTML_ADVANCE_TO(BogusDOCTYPEState);
1455 END_STATE()
1457 HTML_BEGIN_STATE(DOCTYPESystemIdentifierDoubleQuotedState) {
1458 if (cc == '"')
1459 HTML_ADVANCE_TO(AfterDOCTYPESystemIdentifierState);
1460 else if (cc == '>') {
1461 parseError();
1462 m_token->setForceQuirks();
1463 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1464 } else if (cc == kEndOfFileMarker) {
1465 parseError();
1466 m_token->setForceQuirks();
1467 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1468 } else {
1469 m_token->appendToSystemIdentifier(cc);
1470 HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
1473 END_STATE()
1475 HTML_BEGIN_STATE(DOCTYPESystemIdentifierSingleQuotedState) {
1476 if (cc == '\'')
1477 HTML_ADVANCE_TO(AfterDOCTYPESystemIdentifierState);
1478 else if (cc == '>') {
1479 parseError();
1480 m_token->setForceQuirks();
1481 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1482 } else if (cc == kEndOfFileMarker) {
1483 parseError();
1484 m_token->setForceQuirks();
1485 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1486 } else {
1487 m_token->appendToSystemIdentifier(cc);
1488 HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
1491 END_STATE()
1493 HTML_BEGIN_STATE(AfterDOCTYPESystemIdentifierState) {
1494 if (isTokenizerWhitespace(cc))
1495 HTML_ADVANCE_TO(AfterDOCTYPESystemIdentifierState);
1496 else if (cc == '>')
1497 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1498 else if (cc == kEndOfFileMarker) {
1499 parseError();
1500 m_token->setForceQuirks();
1501 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1502 } else {
1503 parseError();
1504 HTML_ADVANCE_TO(BogusDOCTYPEState);
1507 END_STATE()
1509 HTML_BEGIN_STATE(BogusDOCTYPEState) {
1510 if (cc == '>')
1511 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1512 else if (cc == kEndOfFileMarker)
1513 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1514 HTML_ADVANCE_TO(BogusDOCTYPEState);
1516 END_STATE()
1518 HTML_BEGIN_STATE(CDATASectionState) {
1519 if (cc == ']')
1520 HTML_ADVANCE_TO(CDATASectionRightSquareBracketState);
1521 else if (cc == kEndOfFileMarker)
1522 HTML_RECONSUME_IN(DataState);
1523 else {
1524 bufferCharacter(cc);
1525 HTML_ADVANCE_TO(CDATASectionState);
1528 END_STATE()
1530 HTML_BEGIN_STATE(CDATASectionRightSquareBracketState) {
1531 if (cc == ']')
1532 HTML_ADVANCE_TO(CDATASectionDoubleRightSquareBracketState);
1533 else {
1534 bufferCharacter(']');
1535 HTML_RECONSUME_IN(CDATASectionState);
1539 HTML_BEGIN_STATE(CDATASectionDoubleRightSquareBracketState) {
1540 if (cc == '>')
1541 HTML_ADVANCE_TO(DataState);
1542 else {
1543 bufferCharacter(']');
1544 bufferCharacter(']');
1545 HTML_RECONSUME_IN(CDATASectionState);
1548 END_STATE()
1552 ASSERT_NOT_REACHED();
1553 return false;
1556 String HTMLTokenizer::bufferedCharacters() const
1558 // FIXME: Add an assert about m_state.
1559 StringBuilder characters;
1560 characters.reserveCapacity(numberOfBufferedCharacters());
1561 characters.append('<');
1562 characters.append('/');
1563 characters.append(m_temporaryBuffer.data(), m_temporaryBuffer.size());
1564 return characters.toString();
1567 void HTMLTokenizer::updateStateFor(const String& tagName)
1569 if (threadSafeMatch(tagName, textareaTag) || threadSafeMatch(tagName, titleTag))
1570 setState(HTMLTokenizer::RCDATAState);
1571 else if (threadSafeMatch(tagName, plaintextTag))
1572 setState(HTMLTokenizer::PLAINTEXTState);
1573 else if (threadSafeMatch(tagName, scriptTag))
1574 setState(HTMLTokenizer::ScriptDataState);
1575 else if (threadSafeMatch(tagName, styleTag)
1576 || threadSafeMatch(tagName, iframeTag)
1577 || threadSafeMatch(tagName, xmpTag)
1578 || (threadSafeMatch(tagName, noembedTag) && m_options.pluginsEnabled)
1579 || threadSafeMatch(tagName, noframesTag)
1580 || (threadSafeMatch(tagName, noscriptTag) && m_options.scriptEnabled))
1581 setState(HTMLTokenizer::RAWTEXTState);
1584 inline bool HTMLTokenizer::temporaryBufferIs(const String& expectedString)
1586 return vectorEqualsString(m_temporaryBuffer, expectedString);
1589 inline void HTMLTokenizer::addToPossibleEndTag(LChar cc)
1591 ASSERT(isEndTagBufferingState(m_state));
1592 m_bufferedEndTagName.append(cc);
1595 inline bool HTMLTokenizer::isAppropriateEndTag()
1597 if (m_bufferedEndTagName.size() != m_appropriateEndTagName.size())
1598 return false;
1600 size_t numCharacters = m_bufferedEndTagName.size();
1602 for (size_t i = 0; i < numCharacters; i++) {
1603 if (m_bufferedEndTagName[i] != m_appropriateEndTagName[i])
1604 return false;
1607 return true;
1610 inline void HTMLTokenizer::parseError()
1612 notImplemented();