2 * Copyright (C) 2008 Apple Inc. All Rights Reserved.
3 * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/
4 * Copyright (C) 2013 Google, Inc. All Rights Reserved.
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
15 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
16 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
19 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 #ifndef InputStreamPreprocessor_h
29 #define InputStreamPreprocessor_h
31 #include "html_character_provider.h"
35 // http://www.whatwg.org/specs/web-apps/current-work/#preprocessing-the-input-stream
// Input-stream preprocessor parameterised on the tokenizer type it serves.
// From the members visible in this file it: keeps the current preprocessed
// character in m_nextInputCharacter, reports a '\r' as '\n' and remembers that
// in m_skipNextNewLine, and handles U+0000 by asking the tokenizer
// (shouldSkipNullCharacters()) or substituting 0xFFFD.
// |Tokenizer| must therefore provide shouldSkipNullCharacters(); no other
// requirement on it is visible here.
// WTF_MAKE_NONCOPYABLE is a project macro; presumably it disables copying -- confirm.
36 template <typename Tokenizer
>
37 class InputStreamPreprocessor
{
38 WTF_MAKE_NONCOPYABLE(InputStreamPreprocessor
);
// Constructs a preprocessor for |tokenizer|; the raw pointer is stored in
// m_tokenizer by the initializer list below.
// NOTE(review): the constructor body is not visible in this view, so whether
// m_nextInputCharacter / m_skipNextNewLine are initialised here (e.g. through
// reset()) cannot be confirmed -- verify against the full file.
40 InputStreamPreprocessor(Tokenizer
* tokenizer
)
41 : m_tokenizer(tokenizer
)
46 ALWAYS_INLINE UChar
nextInputCharacter() const { return m_nextInputCharacter
; }
48 // Returns whether we succeeded in peeking at the next character.
49 // The only way we can fail to peek is if there are no more
50 // characters in |source| (after collapsing \r\n, etc).
//
// Copies source.currentCharacter() into m_nextInputCharacter, then either
// takes the cheap path below or hands off to processNextInputCharacter().
// NOTE(review): the closing brace and return statement of the fast-path
// branch are not visible in this view; confirm them against the full file.
51 ALWAYS_INLINE
bool peek(CharacterProvider
& source
)
53 m_nextInputCharacter
= source
.currentCharacter();
55 // Every branch in this function is expensive, so we have a
56 // fast-reject branch for characters that don't require special
57 // handling. Please run the parser benchmark whenever you touch
58 // this function. It's very hot.
// '\n' (0x0A) | '\r' (0x0D) | '\0' evaluates to 0x0F, so the masked test
// below is non-zero for every value >= 0x10; only 0x00-0x0F fall through
// to processNextInputCharacter().
59 static const UChar specialCharacterMask
= '\n' | '\r' | '\0';
60 if (m_nextInputCharacter
& ~specialCharacterMask
) {
// Not '\n', so any pending "skip the newline after '\r'" state is dropped.
61 m_skipNextNewLine
= false;
// Possibly '\n', '\r' or '\0': let processNextInputCharacter() decide.
64 return processNextInputCharacter(source
);
67 // Returns whether there are more characters in |source| after advancing.
// NOTE(review): only the signature of advance() is visible in this view; how
// |source| is advanced and how the result is computed must be checked in the
// full file.
68 ALWAYS_INLINE
bool advance(CharacterProvider
& source
)
76 void reset(bool skipNextNewLine
= false)
78 m_nextInputCharacter
= '\0';
79 m_skipNextNewLine
= skipNextNewLine
;
// Slow path called from peek() for characters that did not pass its
// fast-reject mask. Normalises newlines and handles U+0000, updating
// m_nextInputCharacter and m_skipNextNewLine.
// NOTE(review): several lines of this function (braces, calls that advance
// |source|, return statements and any else/label structure) are not visible
// in this view. The comments below describe only the visible statements;
// confirm the full control flow against the complete file.
83 bool processNextInputCharacter(CharacterProvider
& source
)
// Precondition: the caller has already loaded the current character.
86 ASSERT(m_nextInputCharacter
== source
.currentCharacter());
// A '\n' seen while m_skipNextNewLine is set follows a '\r' that was already
// reported as '\n': clear the flag and re-read the current character
// (presumably after advancing |source| -- that step is not visible here).
88 if (m_nextInputCharacter
== '\n' && m_skipNextNewLine
) {
89 m_skipNextNewLine
= false;
93 m_nextInputCharacter
= source
.currentCharacter();
// A carriage return is reported as '\n'; the flag records that a directly
// following '\n' belongs to the same line break.
95 if (m_nextInputCharacter
== '\r') {
96 m_nextInputCharacter
= '\n';
97 m_skipNextNewLine
= true;
// Presumably the non-'\r' branch: no line break is pending any more.
99 m_skipNextNewLine
= false;
100 // FIXME: The spec indicates that the surrogate pair range as well as
101 // a number of specific character values are parse errors and should be replaced
102 // by the replacement character. We suspect this is a problem with the spec as doing
103 // that filtering breaks surrogate pair handling and causes us not to match Minefield.
// U+0000 that is not acting as the end-of-file marker: the tokenizer decides
// whether it is skipped or replaced.
104 if (m_nextInputCharacter
== '\0' && !shouldTreatNullAsEndOfFileMarker(source
)) {
105 if (m_tokenizer
->shouldSkipNullCharacters()) {
// Skipping: checks whether |source| is exhausted, then re-reads the current
// character (the advance and the follow-up control flow are not visible).
107 if (source
.isEmpty())
109 m_nextInputCharacter
= source
.currentCharacter();
// Presumably reached when NULs are not skipped: substitute 0xFFFD.
112 m_nextInputCharacter
= 0xFFFD;
118 bool shouldTreatNullAsEndOfFileMarker(CharacterProvider
& source
) const
120 return source
.remainingBytes() == 1;
123 Tokenizer
* m_tokenizer
;
125 // http://www.whatwg.org/specs/web-apps/current-work/#next-input-character
126 UChar m_nextInputCharacter
;
127 bool m_skipNextNewLine
;
132 #endif // InputStreamPreprocessor_h