2 * Copyright (C) 2008 Apple Inc. All Rights Reserved.
3 * Copyright (C) 2010 Google, Inc. All Rights Reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 #ifndef HTMLTokenizer_h
28 #define HTMLTokenizer_h
30 #include "core/html/parser/HTMLParserOptions.h"
31 #include "core/html/parser/HTMLToken.h"
32 #include "core/html/parser/InputStreamPreprocessor.h"
33 #include "platform/text/SegmentedString.h"
38 WTF_MAKE_NONCOPYABLE(HTMLTokenizer
);
39 WTF_MAKE_FAST_ALLOCATED(HTMLTokenizer
);
41 static PassOwnPtr
<HTMLTokenizer
> create(const HTMLParserOptions
& options
) { return adoptPtr(new HTMLTokenizer(options
)); }
48 CharacterReferenceInDataState
,
50 CharacterReferenceInRCDATAState
,
57 RCDATALessThanSignState
,
58 RCDATAEndTagOpenState
,
59 RCDATAEndTagNameState
,
60 RAWTEXTLessThanSignState
,
61 RAWTEXTEndTagOpenState
,
62 RAWTEXTEndTagNameState
,
63 ScriptDataLessThanSignState
,
64 ScriptDataEndTagOpenState
,
65 ScriptDataEndTagNameState
,
66 ScriptDataEscapeStartState
,
67 ScriptDataEscapeStartDashState
,
68 ScriptDataEscapedState
,
69 ScriptDataEscapedDashState
,
70 ScriptDataEscapedDashDashState
,
71 ScriptDataEscapedLessThanSignState
,
72 ScriptDataEscapedEndTagOpenState
,
73 ScriptDataEscapedEndTagNameState
,
74 ScriptDataDoubleEscapeStartState
,
75 ScriptDataDoubleEscapedState
,
76 ScriptDataDoubleEscapedDashState
,
77 ScriptDataDoubleEscapedDashDashState
,
78 ScriptDataDoubleEscapedLessThanSignState
,
79 ScriptDataDoubleEscapeEndState
,
80 BeforeAttributeNameState
,
82 AfterAttributeNameState
,
83 BeforeAttributeValueState
,
84 AttributeValueDoubleQuotedState
,
85 AttributeValueSingleQuotedState
,
86 AttributeValueUnquotedState
,
87 CharacterReferenceInAttributeValueState
,
88 AfterAttributeValueQuotedState
,
89 SelfClosingStartTagState
,
91 // The ContinueBogusCommentState is not in the HTML5 spec, but we use
92 // it internally to keep track of whether we've started the bogus
94 ContinueBogusCommentState
,
95 MarkupDeclarationOpenState
,
97 CommentStartDashState
,
103 BeforeDOCTYPENameState
,
105 AfterDOCTYPENameState
,
106 AfterDOCTYPEPublicKeywordState
,
107 BeforeDOCTYPEPublicIdentifierState
,
108 DOCTYPEPublicIdentifierDoubleQuotedState
,
109 DOCTYPEPublicIdentifierSingleQuotedState
,
110 AfterDOCTYPEPublicIdentifierState
,
111 BetweenDOCTYPEPublicAndSystemIdentifiersState
,
112 AfterDOCTYPESystemKeywordState
,
113 BeforeDOCTYPESystemIdentifierState
,
114 DOCTYPESystemIdentifierDoubleQuotedState
,
115 DOCTYPESystemIdentifierSingleQuotedState
,
116 AfterDOCTYPESystemIdentifierState
,
119 // These CDATA states are not in the HTML5 spec, but we use them internally.
120 CDATASectionRightSquareBracketState
,
121 CDATASectionDoubleRightSquareBracketState
,
124 // This function returns true if it emits a token. Otherwise, callers
125 // must provide the same (in progress) token on the next call (unless
126 // they call reset() first).
127 bool nextToken(SegmentedString
&, HTMLToken
&);
129 // Returns a copy of any characters buffered internally by the tokenizer.
130 // The tokenizer buffers characters when searching for the </script> token
131 // that terminates a script element.
132 String
bufferedCharacters() const;
134 size_t numberOfBufferedCharacters() const
136 // Notice that we add 2 to the length of the m_temporaryBuffer to
137 // account for the "</" characters, which are effecitvely buffered in
138 // the tokenizer's state machine.
139 return m_temporaryBuffer
.size() ? m_temporaryBuffer
.size() + 2 : 0;
142 // Updates the tokenizer's state according to the given tag name. This is
143 // an approximation of how the tree builder would update the tokenizer's
144 // state. This method is useful for approximating HTML tokenization. To
145 // get exactly the correct tokenization, you need the real tree builder.
147 // The main failures in the approximation are as follows:
149 // * The first set of character tokens emitted for a <pre> element might
150 // contain an extra leading newline.
151 // * The replacement of U+0000 with U+FFFD will not be sensitive to the
152 // tree builder's insertion mode.
153 // * CDATA sections in foreign content will be tokenized as bogus comments
154 // instead of as character tokens.
156 void updateStateFor(const String
& tagName
);
158 bool forceNullCharacterReplacement() const { return m_forceNullCharacterReplacement
; }
159 void setForceNullCharacterReplacement(bool value
) { m_forceNullCharacterReplacement
= value
; }
161 bool shouldAllowCDATA() const { return m_shouldAllowCDATA
; }
162 void setShouldAllowCDATA(bool value
) { m_shouldAllowCDATA
= value
; }
164 State
state() const { return m_state
; }
165 void setState(State state
) { m_state
= state
; }
167 inline bool shouldSkipNullCharacters() const
169 return !m_forceNullCharacterReplacement
170 && (m_state
== HTMLTokenizer::DataState
171 || m_state
== HTMLTokenizer::RCDATAState
172 || m_state
== HTMLTokenizer::RAWTEXTState
);
176 explicit HTMLTokenizer(const HTMLParserOptions
&);
178 inline bool processEntity(SegmentedString
&);
180 inline void parseError();
182 inline void bufferCharacter(UChar character
)
184 ASSERT(character
!= kEndOfFileMarker
);
185 m_token
->ensureIsCharacterToken();
186 m_token
->appendToCharacter(character
);
// Helper used when a token is emitted: it calls saveEndTagNameIfNeeded()
// (which records the name of a start tag token in m_appropriateEndTagName)
// and then calls source.advanceAndUpdateLineNumber().
// NOTE(review): the use of |state| and the bool return statement are not
// visible in this extract; presumably the tokenizer state is updated to
// |state| here — confirm against the full header.
189 inline bool emitAndResumeIn(SegmentedString
& source
, State state
)
191 saveEndTagNameIfNeeded();
193 source
.advanceAndUpdateLineNumber();
// Helper used when a token is emitted without consuming input: the
// SegmentedString parameter is unnamed and therefore unused, and the visible
// body only calls saveEndTagNameIfNeeded().
// NOTE(review): the use of |state| and the bool return statement are not
// visible in this extract — confirm against the full header.
197 inline bool emitAndReconsumeIn(SegmentedString
&, State state
)
199 saveEndTagNameIfNeeded();
// Handles reaching the end of input. The visible statements check
// haveBufferedCharacterToken(), set m_state to DataState, call
// source.advanceAndUpdateLineNumber(), and call makeEndOfFile() on m_token.
// NOTE(review): the statement controlled by the `if` and the bool return
// statements are not visible in this extract; presumably a buffered
// character token causes an early return so it is emitted first — confirm
// against the full header.
204 inline bool emitEndOfFile(SegmentedString
& source
)
206 if (haveBufferedCharacterToken())
208 m_state
= HTMLTokenizer::DataState
;
209 source
.advanceAndUpdateLineNumber();
211 m_token
->makeEndOfFile();
215 inline bool flushEmitAndResumeIn(SegmentedString
&, State
);
217 // Return whether we need to emit a character token before dealing with
218 // the buffered end tag.
219 inline bool flushBufferedEndTag(SegmentedString
&);
220 inline bool temporaryBufferIs(const String
&);
222 // Sometimes we speculatively consume input characters and we don't
223 // know whether they represent end tags or RCDATA, etc. These
224 // functions help manage this state.
225 inline void addToPossibleEndTag(LChar cc
);
227 inline void saveEndTagNameIfNeeded()
229 ASSERT(m_token
->type() != HTMLToken::Uninitialized
);
230 if (m_token
->type() == HTMLToken::StartTag
)
231 m_appropriateEndTagName
= m_token
->name();
233 inline bool isAppropriateEndTag();
236 inline bool haveBufferedCharacterToken()
238 return m_token
->type() == HTMLToken::Character
;
242 bool m_forceNullCharacterReplacement
;
243 bool m_shouldAllowCDATA
;
245 // m_token is owned by the caller. If nextToken is not on the stack,
246 // this member might be pointing to unallocated memory.
249 // http://www.whatwg.org/specs/web-apps/current-work/#additional-allowed-character
250 UChar m_additionalAllowedCharacter
;
252 // http://www.whatwg.org/specs/web-apps/current-work/#preprocessing-the-input-stream
253 InputStreamPreprocessor
<HTMLTokenizer
> m_inputStreamPreprocessor
;
255 Vector
<UChar
, 32> m_appropriateEndTagName
;
257 // http://www.whatwg.org/specs/web-apps/current-work/#temporary-buffer
258 Vector
<LChar
, 32> m_temporaryBuffer
;
260 // We occasionally want to emit both a character token and an end tag
261 // token (e.g., when lexing script). We buffer the name of the end tag
262 // token here so we remember it next time we re-enter the tokenizer.
263 Vector
<LChar
, 32> m_bufferedEndTagName
;
265 HTMLParserOptions m_options
;