third_party/WebKit/Source/core/html/parser/HTMLTokenizer.h

   1 /*
   2  * Copyright (C) 2008 Apple Inc. All Rights Reserved.
   3  * Copyright (C) 2010 Google, Inc. All Rights Reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  * 2. Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in the
  12  *    documentation and/or other materials provided with the distribution.
  13  *
  14  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
  15  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  17  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
  18  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  19  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  20  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  21  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  22  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  24  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  25  */
  26
  27 #ifndef HTMLTokenizer_h
  28 #define HTMLTokenizer_h
  29
  30 #include "core/html/parser/HTMLParserOptions.h"
  31 #include "core/html/parser/HTMLToken.h"
  32 #include "core/html/parser/InputStreamPreprocessor.h"
  33 #include "platform/text/SegmentedString.h"
  34
  35 namespace blink {
  36
   37 class HTMLTokenizer {
          // Incremental HTML tokenizer. nextToken() reads characters from a
          // SegmentedString and fills in a caller-owned HTMLToken, while the
          // current position in the tokenization state machine is kept in
          // m_state between calls.
   38     WTF_MAKE_NONCOPYABLE(HTMLTokenizer);
   39     WTF_MAKE_FAST_ALLOCATED(HTMLTokenizer);
   40 public:
          // Heap-allocates a tokenizer configured with |options| and returns it
          // through adoptPtr(). The constructor is private, so this is the only
          // way to build one from outside the class.
   41     static PassOwnPtr<HTMLTokenizer> create(const HTMLParserOptions& options) { return adoptPtr(new HTMLTokenizer(options)); }
   42     ~HTMLTokenizer();
   43
          // Defined out of line. Per the nextToken() contract below, calling
          // reset() releases the caller from having to resupply an in-progress
          // token on the next nextToken() call.
   44     void reset();
   45
          // States of the tokenizer state machine. Except where a comment says
          // otherwise, the names presumably mirror the tokenizer states of the
          // HTML5 spec. The enumerators have implicit consecutive values, so
          // their order matters to anything that relies on the numeric value.
   46     enum State {
   47         DataState,
   48         CharacterReferenceInDataState,
   49         RCDATAState,
   50         CharacterReferenceInRCDATAState,
   51         RAWTEXTState,
   52         ScriptDataState,
   53         PLAINTEXTState,
   54         TagOpenState,
   55         EndTagOpenState,
   56         TagNameState,
   57         RCDATALessThanSignState,
   58         RCDATAEndTagOpenState,
   59         RCDATAEndTagNameState,
   60         RAWTEXTLessThanSignState,
   61         RAWTEXTEndTagOpenState,
   62         RAWTEXTEndTagNameState,
   63         ScriptDataLessThanSignState,
   64         ScriptDataEndTagOpenState,
   65         ScriptDataEndTagNameState,
   66         ScriptDataEscapeStartState,
   67         ScriptDataEscapeStartDashState,
   68         ScriptDataEscapedState,
   69         ScriptDataEscapedDashState,
   70         ScriptDataEscapedDashDashState,
   71         ScriptDataEscapedLessThanSignState,
   72         ScriptDataEscapedEndTagOpenState,
   73         ScriptDataEscapedEndTagNameState,
   74         ScriptDataDoubleEscapeStartState,
   75         ScriptDataDoubleEscapedState,
   76         ScriptDataDoubleEscapedDashState,
   77         ScriptDataDoubleEscapedDashDashState,
   78         ScriptDataDoubleEscapedLessThanSignState,
   79         ScriptDataDoubleEscapeEndState,
   80         BeforeAttributeNameState,
   81         AttributeNameState,
   82         AfterAttributeNameState,
   83         BeforeAttributeValueState,
   84         AttributeValueDoubleQuotedState,
   85         AttributeValueSingleQuotedState,
   86         AttributeValueUnquotedState,
   87         CharacterReferenceInAttributeValueState,
   88         AfterAttributeValueQuotedState,
   89         SelfClosingStartTagState,
   90         BogusCommentState,
   91         // The ContinueBogusCommentState is not in the HTML5 spec, but we use
   92         // it internally to keep track of whether we've started the bogus
   93         // comment token yet.
   94         ContinueBogusCommentState,
   95         MarkupDeclarationOpenState,
   96         CommentStartState,
   97         CommentStartDashState,
   98         CommentState,
   99         CommentEndDashState,
  100         CommentEndState,
  101         CommentEndBangState,
  102         DOCTYPEState,
  103         BeforeDOCTYPENameState,
  104         DOCTYPENameState,
  105         AfterDOCTYPENameState,
  106         AfterDOCTYPEPublicKeywordState,
  107         BeforeDOCTYPEPublicIdentifierState,
  108         DOCTYPEPublicIdentifierDoubleQuotedState,
  109         DOCTYPEPublicIdentifierSingleQuotedState,
  110         AfterDOCTYPEPublicIdentifierState,
  111         BetweenDOCTYPEPublicAndSystemIdentifiersState,
  112         AfterDOCTYPESystemKeywordState,
  113         BeforeDOCTYPESystemIdentifierState,
  114         DOCTYPESystemIdentifierDoubleQuotedState,
  115         DOCTYPESystemIdentifierSingleQuotedState,
  116         AfterDOCTYPESystemIdentifierState,
  117         BogusDOCTYPEState,
  118         CDATASectionState,
  119         // These CDATA states are not in the HTML5 spec, but we use them internally.
  120         CDATASectionRightSquareBracketState,
  121         CDATASectionDoubleRightSquareBracketState,
  122     };
  123
  124     // This function returns true if it emits a token. Otherwise, callers
  125     // must provide the same (in progress) token on the next call (unless
  126     // they call reset() first).
  127     bool nextToken(SegmentedString&, HTMLToken&);
  128
  129     // Returns a copy of any characters buffered internally by the tokenizer.
  130     // The tokenizer buffers characters when searching for the </script> token
  131     // that terminates a script element.
  132     String bufferedCharacters() const;
  133
          // Number of characters currently buffered by the tokenizer; 0 when
          // m_temporaryBuffer is empty.
  134     size_t numberOfBufferedCharacters() const
  135     {
  136         // Notice that we add 2 to the length of the m_temporaryBuffer to
  137         // account for the "</" characters, which are effectively buffered in
  138         // the tokenizer's state machine.
  139         return m_temporaryBuffer.size() ? m_temporaryBuffer.size() + 2 : 0;
  140     }
  141
  142     // Updates the tokenizer's state according to the given tag name. This is
  143     // an approximation of how the tree builder would update the tokenizer's
  144     // state. This method is useful for approximating HTML tokenization. To
  145     // get exactly the correct tokenization, you need the real tree builder.
  146     //
  147     // The main failures in the approximation are as follows:
  148     //
  149     //  * The first set of character tokens emitted for a <pre> element might
  150     //    contain an extra leading newline.
  151     //  * The replacement of U+0000 with U+FFFD will not be sensitive to the
  152     //    tree builder's insertion mode.
  153     //  * CDATA sections in foreign content will be tokenized as bogus comments
  154     //    instead of as character tokens.
  155     //
  156     void updateStateFor(const String& tagName);
  157
          // Accessors for the flag that, when set, makes
          // shouldSkipNullCharacters() return false regardless of the state.
  158     bool forceNullCharacterReplacement() const { return m_forceNullCharacterReplacement; }
  159     void setForceNullCharacterReplacement(bool value) { m_forceNullCharacterReplacement = value; }
  160
          // Plain accessors for m_shouldAllowCDATA. How the flag is consumed is
          // not visible in this header.
  161     bool shouldAllowCDATA() const { return m_shouldAllowCDATA; }
  162     void setShouldAllowCDATA(bool value) { m_shouldAllowCDATA = value; }
  163
          // Direct read/write access to the current state-machine state.
  164     State state() const { return m_state; }
  165     void setState(State state) { m_state = state; }
  166
          // True only when null-character replacement is not forced and the
          // tokenizer is in DataState, RCDATAState or RAWTEXTState.
  167     inline bool shouldSkipNullCharacters() const
  168     {
  169         return !m_forceNullCharacterReplacement
  170             && (m_state == HTMLTokenizer::DataState
  171                 || m_state == HTMLTokenizer::RCDATAState
  172                 || m_state == HTMLTokenizer::RAWTEXTState);
  173     }
  174
  175 private:
          // Private: instances are created through create().
  176     explicit HTMLTokenizer(const HTMLParserOptions&);
  177
  178     inline bool processEntity(SegmentedString&);
  179
  180     inline void parseError();
  181
          // Appends |character| to the current token, first making sure the
          // token is a character token. Must not be called with the
          // end-of-file marker.
  182     inline void bufferCharacter(UChar character)
  183     {
  184         ASSERT(character != kEndOfFileMarker);
  185         m_token->ensureIsCharacterToken();
  186         m_token->appendToCharacter(character);
  187     }
  188
          // Finishes the current token: records the end tag name if the token
          // is a start tag, switches to |state| and advances |source| past the
          // current character. Always returns true (presumably forwarded as
          // nextToken()'s "emitted a token" result).
  189     inline bool emitAndResumeIn(SegmentedString& source, State state)
  190     {
  191         saveEndTagNameIfNeeded();
  192         m_state = state;
  193         source.advanceAndUpdateLineNumber();
  194         return true;
  195     }
  196
          // Like emitAndResumeIn(), but does not advance the source, so the
          // current input character is still pending when tokenization
          // continues in |state|. Always returns true.
  197     inline bool emitAndReconsumeIn(SegmentedString&, State state)
  198     {
  199         saveEndTagNameIfNeeded();
  200         m_state = state;
  201         return true;
  202     }
  203
          // Handles reaching the end of input. Always returns true.
  204     inline bool emitEndOfFile(SegmentedString& source)
  205     {
              // A pending character token goes out first; the state and the
              // source are left untouched in that case.
  206         if (haveBufferedCharacterToken())
  207             return true;
  208         m_state = HTMLTokenizer::DataState;
  209         source.advanceAndUpdateLineNumber();
              // Turn the caller's token into a fresh end-of-file token.
  210         m_token->clear();
  211         m_token->makeEndOfFile();
  212         return true;
  213     }
  214
  215     inline bool flushEmitAndResumeIn(SegmentedString&, State);
  216
  217     // Return whether we need to emit a character token before dealing with
  218     // the buffered end tag.
  219     inline bool flushBufferedEndTag(SegmentedString&);
  220     inline bool temporaryBufferIs(const String&);
  221
  222     // Sometimes we speculatively consume input characters and we don't
  223     // know whether they represent end tags or RCDATA, etc. These
  224     // functions help manage this state.
  225     inline void addToPossibleEndTag(LChar cc);
  226
          // Remembers the name of the current token in m_appropriateEndTagName
          // when that token is a start tag; does nothing for other token types.
          // The token must already be initialized.
  227     inline void saveEndTagNameIfNeeded()
  228     {
  229         ASSERT(m_token->type() != HTMLToken::Uninitialized);
  230         if (m_token->type() == HTMLToken::StartTag)
  231             m_appropriateEndTagName = m_token->name();
  232     }
  233     inline bool isAppropriateEndTag();
  234
  235
          // True if the in-progress token is a character token.
  236     inline bool haveBufferedCharacterToken()
  237     {
  238         return m_token->type() == HTMLToken::Character;
  239     }
  240
          // Current state of the tokenizer state machine.
  241     State m_state;
  242     bool m_forceNullCharacterReplacement;
  243     bool m_shouldAllowCDATA;
  244
  245     // m_token is owned by the caller. If nextToken is not on the stack,
  246     // this member might be pointing to unallocated memory.
  247     HTMLToken* m_token;
  248
  249     // http://www.whatwg.org/specs/web-apps/current-work/#additional-allowed-character
  250     UChar m_additionalAllowedCharacter;
  251
  252     // http://www.whatwg.org/specs/web-apps/current-work/#preprocessing-the-input-stream
  253     InputStreamPreprocessor<HTMLTokenizer> m_inputStreamPreprocessor;
  254
          // Name of the most recent start tag token, as saved by
          // saveEndTagNameIfNeeded().
  255     Vector<UChar, 32> m_appropriateEndTagName;
  256
  257     // http://www.whatwg.org/specs/web-apps/current-work/#temporary-buffer
  258     Vector<LChar, 32> m_temporaryBuffer;
  259
  260     // We occasionally want to emit both a character token and an end tag
  261     // token (e.g., when lexing script). We buffer the name of the end tag
  262     // token here so we remember it next time we re-enter the tokenizer.
  263     Vector<LChar, 32> m_bufferedEndTagName;
  264
          // Copy of the parser options passed to the constructor
          // (NOTE(review): presumably; the constructor body is not visible here).
  265     HTMLParserOptions m_options;
  266 };
 267
 268 }
 269
 270 #endif