Move parseFontFaceDescriptor to CSSPropertyParser.cpp
[chromium-blink-merge.git] / third_party / WebKit / Source / core / html / parser / HTMLTokenizer.h
blobc23ab6ca3085ea819b6604fd04b741809900fa66
1 /*
2 * Copyright (C) 2008 Apple Inc. All Rights Reserved.
3 * Copyright (C) 2010 Google, Inc. All Rights Reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 #ifndef HTMLTokenizer_h
28 #define HTMLTokenizer_h
30 #include "core/html/parser/HTMLParserOptions.h"
31 #include "core/html/parser/HTMLToken.h"
32 #include "core/html/parser/InputStreamPreprocessor.h"
33 #include "platform/text/SegmentedString.h"
35 namespace blink {
37 class HTMLTokenizer {
38 WTF_MAKE_NONCOPYABLE(HTMLTokenizer);
39 WTF_MAKE_FAST_ALLOCATED(HTMLTokenizer);
40 public:
// Factory for HTMLTokenizer. The constructor is private, so this is the
// public way to build one: it allocates the tokenizer with `new`, forwarding
// the parser options, and returns the result of adoptPtr() as a
// PassOwnPtr<HTMLTokenizer>.
41 static PassOwnPtr<HTMLTokenizer> create(const HTMLParserOptions& options) { return adoptPtr(new HTMLTokenizer(options)); }
42 ~HTMLTokenizer();
// Declared here, defined elsewhere. Per the nextToken() comment in this
// class, a caller that calls reset() no longer has to pass the same
// in-progress token to the following nextToken() call.
// NOTE(review): exactly which members are reset is not visible from this
// header -- confirm against the definition.
44 void reset();
// States of the tokenizer's state machine; m_state holds the current one
// and state()/setState() expose it. Entries flagged by the inline comments
// below (ContinueBogusCommentState and the two CDATASection*Bracket states)
// are internal additions that are not part of the HTML5 spec.
46 enum State {
47 DataState,
48 CharacterReferenceInDataState,
49 RCDATAState,
50 CharacterReferenceInRCDATAState,
51 RAWTEXTState,
52 ScriptDataState,
53 PLAINTEXTState,
54 TagOpenState,
55 EndTagOpenState,
56 TagNameState,
57 RCDATALessThanSignState,
58 RCDATAEndTagOpenState,
59 RCDATAEndTagNameState,
60 RAWTEXTLessThanSignState,
61 RAWTEXTEndTagOpenState,
62 RAWTEXTEndTagNameState,
63 ScriptDataLessThanSignState,
64 ScriptDataEndTagOpenState,
65 ScriptDataEndTagNameState,
66 ScriptDataEscapeStartState,
67 ScriptDataEscapeStartDashState,
68 ScriptDataEscapedState,
69 ScriptDataEscapedDashState,
70 ScriptDataEscapedDashDashState,
71 ScriptDataEscapedLessThanSignState,
72 ScriptDataEscapedEndTagOpenState,
73 ScriptDataEscapedEndTagNameState,
74 ScriptDataDoubleEscapeStartState,
75 ScriptDataDoubleEscapedState,
76 ScriptDataDoubleEscapedDashState,
77 ScriptDataDoubleEscapedDashDashState,
78 ScriptDataDoubleEscapedLessThanSignState,
79 ScriptDataDoubleEscapeEndState,
80 BeforeAttributeNameState,
81 AttributeNameState,
82 AfterAttributeNameState,
83 BeforeAttributeValueState,
84 AttributeValueDoubleQuotedState,
85 AttributeValueSingleQuotedState,
86 AttributeValueUnquotedState,
87 CharacterReferenceInAttributeValueState,
88 AfterAttributeValueQuotedState,
89 SelfClosingStartTagState,
90 BogusCommentState,
91 // The ContinueBogusCommentState is not in the HTML5 spec, but we use
92 // it internally to keep track of whether we've started the bogus
93 // comment token yet.
94 ContinueBogusCommentState,
95 MarkupDeclarationOpenState,
96 CommentStartState,
97 CommentStartDashState,
98 CommentState,
99 CommentEndDashState,
100 CommentEndState,
101 CommentEndBangState,
102 DOCTYPEState,
103 BeforeDOCTYPENameState,
104 DOCTYPENameState,
105 AfterDOCTYPENameState,
106 AfterDOCTYPEPublicKeywordState,
107 BeforeDOCTYPEPublicIdentifierState,
108 DOCTYPEPublicIdentifierDoubleQuotedState,
109 DOCTYPEPublicIdentifierSingleQuotedState,
110 AfterDOCTYPEPublicIdentifierState,
111 BetweenDOCTYPEPublicAndSystemIdentifiersState,
112 AfterDOCTYPESystemKeywordState,
113 BeforeDOCTYPESystemIdentifierState,
114 DOCTYPESystemIdentifierDoubleQuotedState,
115 DOCTYPESystemIdentifierSingleQuotedState,
116 AfterDOCTYPESystemIdentifierState,
117 BogusDOCTYPEState,
118 CDATASectionState,
119 // These CDATA states are not in the HTML5 spec, but we use them internally.
120 CDATASectionRightSquareBracketState,
121 CDATASectionDoubleRightSquareBracketState,
124 // This function returns true if it emits a token. Otherwise, callers
125 // must provide the same (in progress) token on the next call (unless
126 // they call reset() first).
127 bool nextToken(SegmentedString&, HTMLToken&);
129 // Returns a copy of any characters buffered internally by the tokenizer.
130 // The tokenizer buffers characters when searching for the </script> token
131 // that terminates a script element.
132 String bufferedCharacters() const;
// Returns how many characters the tokenizer is currently holding back:
// 0 when m_temporaryBuffer is empty, otherwise its size plus 2.
134 size_t numberOfBufferedCharacters() const
136 // Notice that we add 2 to the length of the m_temporaryBuffer to
137 // account for the "</" characters, which are effectively buffered in
138 // the tokenizer's state machine.
139 return m_temporaryBuffer.size() ? m_temporaryBuffer.size() + 2 : 0;
142 // Updates the tokenizer's state according to the given tag name. This is
143 // an approximation of how the tree builder would update the tokenizer's
144 // state. This method is useful for approximating HTML tokenization. To
145 // get exactly the correct tokenization, you need the real tree builder.
147 // The main failures in the approximation are as follows:
149 // * The first set of character tokens emitted for a <pre> element might
150 // contain an extra leading newline.
151 // * The replacement of U+0000 with U+FFFD will not be sensitive to the
152 // tree builder's insertion mode.
153 // * CDATA sections in foreign content will be tokenized as bogus comments
154 // instead of as character tokens.
156 void updateStateFor(const String& tagName);
// Getter/setter for m_forceNullCharacterReplacement. While the flag is
// true, shouldSkipNullCharacters() returns false regardless of the state.
158 bool forceNullCharacterReplacement() const { return m_forceNullCharacterReplacement; }
159 void setForceNullCharacterReplacement(bool value) { m_forceNullCharacterReplacement = value; }
// Getter/setter for m_shouldAllowCDATA.
161 bool shouldAllowCDATA() const { return m_shouldAllowCDATA; }
162 void setShouldAllowCDATA(bool value) { m_shouldAllowCDATA = value; }
// Getter/setter for the current tokenizer state (m_state).
164 State state() const { return m_state; }
165 void setState(State state) { m_state = state; }
// Returns true only when null-character replacement is not being forced
// (m_forceNullCharacterReplacement is false) and the tokenizer is currently
// in DataState, RCDATAState or RAWTEXTState; false in every other case.
167 inline bool shouldSkipNullCharacters() const
169 return !m_forceNullCharacterReplacement
170 && (m_state == HTMLTokenizer::DataState
171 || m_state == HTMLTokenizer::RCDATAState
172 || m_state == HTMLTokenizer::RAWTEXTState);
175 private:
// Private constructor: outside code obtains instances through create().
// `explicit` prevents implicit conversion from HTMLParserOptions.
176 explicit HTMLTokenizer(const HTMLParserOptions&);
178 inline bool processEntity(SegmentedString&);
180 inline void parseError();
// Appends one character to the in-progress token (m_token), first making
// sure that token is a character token. The end-of-file marker must never
// be buffered this way, which the assertion enforces.
182 inline void bufferCharacter(UChar character)
184 ASSERT(character != kEndOfFileMarker);
185 m_token->ensureIsCharacterToken();
186 m_token->appendToCharacter(character);
// Finishes the current token and moves on: records the end tag name if the
// token is a start tag, switches the tokenizer to `state`, and advances
// `source` via advanceAndUpdateLineNumber(). Always returns true.
// NOTE(review): the true result presumably propagates out of nextToken() as
// "a token was emitted" -- confirm against the definition of nextToken().
189 inline bool emitAndResumeIn(SegmentedString& source, State state)
191 saveEndTagNameIfNeeded();
192 m_state = state;
193 source.advanceAndUpdateLineNumber();
194 return true;
// Same steps as emitAndResumeIn() except that the input is not advanced:
// the SegmentedString parameter is unnamed and unused. Records the end tag
// name if the token is a start tag, switches to `state`, and returns true.
197 inline bool emitAndReconsumeIn(SegmentedString&, State state)
199 saveEndTagNameIfNeeded();
200 m_state = state;
201 return true;
// Handles reaching the end of the input. Always returns true.
// If a character token is already buffered, returns immediately and leaves
// the state, the input and the token untouched (presumably so the buffered
// characters are emitted before end-of-file -- confirm against nextToken()).
// Otherwise switches to DataState, advances `source`, clears the token and
// turns it into an end-of-file token.
204 inline bool emitEndOfFile(SegmentedString& source)
206 if (haveBufferedCharacterToken())
207 return true;
208 m_state = HTMLTokenizer::DataState;
209 source.advanceAndUpdateLineNumber();
210 m_token->clear();
211 m_token->makeEndOfFile();
212 return true;
215 inline bool flushEmitAndResumeIn(SegmentedString&, State);
217 // Return whether we need to emit a character token before dealing with
218 // the buffered end tag.
219 inline bool flushBufferedEndTag(SegmentedString&);
220 inline bool temporaryBufferIs(const String&);
222 // Sometimes we speculatively consume input characters and we don't
223 // know whether they represent end tags or RCDATA, etc. These
224 // functions help manage this state.
225 inline void addToPossibleEndTag(LChar cc);
// Copies the name of the in-progress token into m_appropriateEndTagName
// when that token is a start tag; tokens of any other type leave the saved
// name unchanged. Calling this with an uninitialized token trips the
// assertion.
227 inline void saveEndTagNameIfNeeded()
229 ASSERT(m_token->type() != HTMLToken::Uninitialized);
230 if (m_token->type() == HTMLToken::StartTag)
231 m_appropriateEndTagName = m_token->name();
233 inline bool isAppropriateEndTag();
// Returns true when the in-progress token (m_token) is a character token.
236 inline bool haveBufferedCharacterToken()
238 return m_token->type() == HTMLToken::Character;
241 State m_state;
242 bool m_forceNullCharacterReplacement;
243 bool m_shouldAllowCDATA;
245 // m_token is owned by the caller. If nextToken is not on the stack,
246 // this member might be pointing to unallocated memory.
247 HTMLToken* m_token;
249 // http://www.whatwg.org/specs/web-apps/current-work/#additional-allowed-character
250 UChar m_additionalAllowedCharacter;
252 // http://www.whatwg.org/specs/web-apps/current-work/#preprocessing-the-input-stream
253 InputStreamPreprocessor<HTMLTokenizer> m_inputStreamPreprocessor;
255 Vector<UChar, 32> m_appropriateEndTagName;
257 // http://www.whatwg.org/specs/web-apps/current-work/#temporary-buffer
258 Vector<LChar, 32> m_temporaryBuffer;
260 // We occasionally want to emit both a character token and an end tag
261 // token (e.g., when lexing script). We buffer the name of the end tag
262 // token here so we remember it next time we re-enter the tokenizer.
263 Vector<LChar, 32> m_bufferedEndTagName;
265 HTMLParserOptions m_options;
270 #endif