1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set sw=2 ts=2 et tw=78: */
3 /* ***** BEGIN LICENSE BLOCK *****
4 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
6 * The contents of this file are subject to the Mozilla Public License Version
7 * 1.1 (the "License"); you may not use this file except in compliance with
8 * the License. You may obtain a copy of the License at
9 * http://www.mozilla.org/MPL/
11 * Software distributed under the License is distributed on an "AS IS" basis,
12 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
13 * for the specific language governing rights and limitations under the
16 * The Original Code is mozilla.org code.
18 * The Initial Developer of the Original Code is
19 * Netscape Communications Corporation.
20 * Portions created by the Initial Developer are Copyright (C) 1998
21 * the Initial Developer. All Rights Reserved.
24 * Blake Kaplan <mrbkap@gmail.com>
26 * Alternatively, the contents of this file may be used under the terms of
27 * either of the GNU General Public License Version 2 or later (the "GPL"),
28 * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
29 * in which case the provisions of the GPL or the LGPL are applicable instead
30 * of those above. If you wish to allow use of your version of this file only
31 * under the terms of either the GPL or the LGPL, and not to allow others to
32 * use your version of this file under the terms of the MPL, indicate your
33 * decision by deleting the provisions above and replace them with the notice
34 * and other provisions required by the GPL or the LGPL. If you do not delete
35 * the provisions above, a recipient may use your version of this file under
36 * the terms of any one of the MPL, the GPL or the LGPL.
38 * ***** END LICENSE BLOCK ***** */
/**
 * @file nsHTMLTokenizer.cpp
 * This is an implementation of the nsITokenizer interface.
 * This file contains the implementation of a tokenizer to tokenize an HTML
 * document. It attempts to do so, making tradeoffs between compatibility with
 * older parsers and the SGML specification. Note that most of the real
 * "tokenization" takes place in nsHTMLTokens.cpp.
 */
51 #include "nsHTMLTokenizer.h"
52 #include "nsScanner.h"
53 #include "nsElementTable.h"
54 #include "CParserContext.h"
55 #include "nsReadableUtils.h"
56 #include "nsUnicharUtils.h"
/************************************************************************
  And now for the main class -- nsHTMLTokenizer...
 ************************************************************************/
63 * Satisfy the nsISupports interface.
65 NS_IMPL_ISUPPORTS1(nsHTMLTokenizer
, nsITokenizer
)
70 * @param aParseMode The current mode the document is in (quirks, etc.)
71 * @param aDocType The document type of the current document
72 * @param aCommand What we are trying to do (view-source, parse a fragment, etc.)
74 nsHTMLTokenizer::nsHTMLTokenizer(PRInt32 aParseMode
,
75 eParserDocType aDocType
,
76 eParserCommands aCommand
,
78 nsITokenizer(), mTokenDeque(0), mFlags(aFlags
)
80 if (aParseMode
== eDTDMode_full_standards
||
81 aParseMode
== eDTDMode_almost_standards
) {
82 mFlags
|= NS_IPARSER_FLAG_STRICT_MODE
;
83 } else if (aParseMode
== eDTDMode_quirks
) {
84 mFlags
|= NS_IPARSER_FLAG_QUIRKS_MODE
;
85 } else if (aParseMode
== eDTDMode_autodetect
) {
86 mFlags
|= NS_IPARSER_FLAG_AUTO_DETECT_MODE
;
88 mFlags
|= NS_IPARSER_FLAG_UNKNOWN_MODE
;
91 if (aDocType
== ePlainText
) {
92 mFlags
|= NS_IPARSER_FLAG_PLAIN_TEXT
;
93 } else if (aDocType
== eXML
) {
94 mFlags
|= NS_IPARSER_FLAG_XML
;
95 } else if (aDocType
== eHTML_Quirks
||
96 aDocType
== eHTML3_Quirks
||
97 aDocType
== eHTML_Strict
) {
98 mFlags
|= NS_IPARSER_FLAG_HTML
;
101 mFlags
|= aCommand
== eViewSource
102 ? NS_IPARSER_FLAG_VIEW_SOURCE
103 : NS_IPARSER_FLAG_VIEW_NORMAL
;
105 NS_ASSERTION(!(mFlags
& NS_IPARSER_FLAG_XML
) ||
106 (mFlags
& NS_IPARSER_FLAG_VIEW_SOURCE
),
107 "Why isn't this XML document going through our XML parser?");
109 mTokenAllocator
= nsnull
;
114 * The destructor ensures that we don't leak any left over tokens.
116 nsHTMLTokenizer::~nsHTMLTokenizer()
118 if (mTokenDeque
.GetSize()) {
119 CTokenDeallocator
theDeallocator(mTokenAllocator
->GetArenaPool());
120 mTokenDeque
.ForEach(theDeallocator
);
/*******************************************************************
  Here begins the real working methods for the tokenizer.
 *******************************************************************/
130 * Adds a token onto the end of the deque if aResult is a successful result.
131 * Otherwise, this function frees aToken and sets it to nsnull.
133 * @param aToken The token that wants to be added.
134 * @param aResult The error code that will be used to determine if we actually
135 * want to push this token.
136 * @param aDeque The deque we want to push aToken onto.
137 * @param aTokenAllocator The allocator we use to free aToken in case aResult
138 * is not a success code.
142 nsHTMLTokenizer::AddToken(CToken
*& aToken
,
145 nsTokenAllocator
* aTokenAllocator
)
147 if (aToken
&& aDeque
) {
148 if (NS_SUCCEEDED(aResult
)) {
149 aDeque
->Push(aToken
);
151 IF_FREE(aToken
, aTokenAllocator
);
157 * Retrieve a pointer to the global token recycler...
159 * @return Pointer to recycler (or null)
162 nsHTMLTokenizer::GetTokenAllocator()
164 return mTokenAllocator
;
168 * This method provides access to the topmost token in the tokenDeque.
169 * The token is not really removed from the list.
171 * @return Pointer to token
174 nsHTMLTokenizer::PeekToken()
176 return (CToken
*)mTokenDeque
.PeekFront();
180 * This method provides access to the topmost token in the tokenDeque.
181 * The token is really removed from the list; if the list is empty we return 0.
183 * @return Pointer to token or NULL
186 nsHTMLTokenizer::PopToken()
188 return (CToken
*)mTokenDeque
.PopFront();
193 * Pushes a token onto the front of our deque such that the next call to
194 * PopToken() or PeekToken() will return that token.
196 * @param theToken The next token to be processed
200 nsHTMLTokenizer::PushTokenFront(CToken
* theToken
)
202 mTokenDeque
.PushFront(theToken
);
207 * Pushes a token onto the deque.
209 * @param theToken the new token.
213 nsHTMLTokenizer::PushToken(CToken
* theToken
)
215 mTokenDeque
.Push(theToken
);
220 * Returns the size of the deque.
222 * @return The number of remaining tokens.
225 nsHTMLTokenizer::GetCount()
227 return mTokenDeque
.GetSize();
231 * Allows access to an arbitrary token in the deque. The accessed token is left
234 * @param anIndex The index of the target token. Token 0 would be the same as
235 * the result of a call to PeekToken()
236 * @return The requested token.
239 nsHTMLTokenizer::GetTokenAt(PRInt32 anIndex
)
241 return (CToken
*)mTokenDeque
.ObjectAt(anIndex
);
245 * This method is part of the "sandwich" that occurs when we want to tokenize
246 * a document. This prepares us to be able to tokenize properly.
248 * @param aIsFinalChunk Whether this is the last chunk of data that we will
250 * @param aTokenAllocator The token allocator to use for this document.
251 * @return Our success in setting up.
254 nsHTMLTokenizer::WillTokenize(PRBool aIsFinalChunk
,
255 nsTokenAllocator
* aTokenAllocator
)
257 mTokenAllocator
= aTokenAllocator
;
258 mIsFinalChunk
= aIsFinalChunk
;
260 // Cause ScanDocStructure to search from here for new tokens...
261 mTokenScanPos
= mTokenDeque
.GetSize();
266 * Pushes all of the tokens in aDeque onto the front of our deque so they
267 * get processed before any other tokens.
269 * @param aDeque The deque with the tokens in it.
272 nsHTMLTokenizer::PrependTokens(nsDeque
& aDeque
)
274 PRInt32 aCount
= aDeque
.GetSize();
276 for (PRInt32 anIndex
= 0; anIndex
< aCount
; ++anIndex
) {
277 CToken
* theToken
= (CToken
*)aDeque
.Pop();
278 PushTokenFront(theToken
);
283 * Copies the state flags from aTokenizer into this tokenizer. This is used
284 * to pass information around between the main tokenizer and tokenizers
285 * created for document.write() calls.
287 * @param aTokenizer The tokenizer with more information in it.
291 nsHTMLTokenizer::CopyState(nsITokenizer
* aTokenizer
)
294 mFlags
= ((nsHTMLTokenizer
*)aTokenizer
)->mFlags
;
301 * This is a utilty method for ScanDocStructure, which finds a given
302 * tag in the stack. The return value is meant to be used with
303 * nsDeque::ObjectAt() on aTagStack.
305 * @param aTag -- the ID of the tag we're seeking
306 * @param aTagStack -- the stack to be searched
307 * @return index position of tag in stack if found, otherwise kNotFound
310 FindLastIndexOfTag(eHTMLTags aTag
, nsDeque
&aTagStack
)
312 PRInt32 theCount
= aTagStack
.GetSize();
314 while (0 < theCount
) {
315 CHTMLToken
* theToken
= (CHTMLToken
*)aTagStack
.ObjectAt(--theCount
);
317 eHTMLTags theTag
= (eHTMLTags
)theToken
->GetTypeID();
318 if (theTag
== aTag
) {
328 * This method scans the sequence of tokens to determine whether or not the
329 * tag structure of the document is well formed. In well formed cases, we can
330 * skip doing residual style handling and allow inlines to contain block-level
333 * @param aFinalChunk Is unused.
334 * @return Success (currently, this function cannot fail).
336 nsresult
nsHTMLTokenizer::ScanDocStructure(PRBool aFinalChunk
)
338 nsresult result
= NS_OK
;
339 if (!mTokenDeque
.GetSize()) {
343 CHTMLToken
* theToken
= (CHTMLToken
*)mTokenDeque
.ObjectAt(mTokenScanPos
);
345 // Start by finding the first start tag that hasn't been reviewed.
346 while (mTokenScanPos
> 0) {
348 eHTMLTokenTypes theType
= eHTMLTokenTypes(theToken
->GetTokenType());
349 if (theType
== eToken_start
&&
350 theToken
->GetContainerInfo() == eFormUnknown
) {
354 theToken
= (CHTMLToken
*)mTokenDeque
.ObjectAt(--mTokenScanPos
);
357 // Now that we know where to start, let's walk through the
358 // tokens to see which are well-formed. Stop when you run out
362 nsDeque
tempStack(0);
363 PRInt32 theStackDepth
= 0;
364 // Don't bother if we get ridiculously deep.
365 static const PRInt32 theMaxStackDepth
= 200;
367 while (theToken
&& theStackDepth
< theMaxStackDepth
) {
368 eHTMLTokenTypes theType
= eHTMLTokenTypes(theToken
->GetTokenType());
369 eHTMLTags theTag
= (eHTMLTags
)theToken
->GetTypeID();
371 if (nsHTMLElement::IsContainer(theTag
)) { // Bug 54117
372 PRBool theTagIsBlock
= gHTMLElements
[theTag
].IsMemberOf(kBlockEntity
);
373 PRBool theTagIsInline
= theTagIsBlock
375 : gHTMLElements
[theTag
].IsMemberOf(kInlineEntity
);
377 if (theTagIsBlock
|| theTagIsInline
|| eHTMLTag_table
== theTag
) {
381 if (gHTMLElements
[theTag
].ShouldVerifyHierarchy()) {
382 PRInt32 earlyPos
= FindLastIndexOfTag(theTag
, theStack
);
383 if (earlyPos
!= kNotFound
) {
384 // Uh-oh, we've found a tag that is not allowed to nest at
385 // all. Mark the previous one and all of its children as
386 // malformed to increase our chances of doing RS handling
387 // on all of them. We want to do this for cases such as:
388 // <a><div><a></a></div></a>.
389 // Note that we have to iterate through all of the chilren
390 // of the original malformed tag to protect against:
391 // <a><font><div><a></a></div></font></a>, so that the <font>
392 // is allowed to contain the <div>.
393 // XXX What about <a><span><a>, where the second <a> closes
395 nsDequeIterator
it(theStack
, earlyPos
), end(theStack
.End());
397 CHTMLToken
*theMalformedToken
=
398 static_cast<CHTMLToken
*>(it
++);
400 theMalformedToken
->SetContainerInfo(eMalformed
);
405 theStack
.Push(theToken
);
411 CHTMLToken
*theLastToken
=
412 static_cast<CHTMLToken
*>(theStack
.Peek());
414 if (theTag
== theLastToken
->GetTypeID()) {
415 theStack
.Pop(); // Yank it for real
417 theLastToken
->SetContainerInfo(eWellFormed
);
419 // This token wasn't what we expected it to be! We need to
420 // go searching for its real start tag on our stack. Each
421 // tag in between the end tag and start tag must be malformed
423 if (FindLastIndexOfTag(theTag
, theStack
) != kNotFound
) {
424 // Find theTarget in the stack, marking each (malformed!)
426 theStack
.Pop(); // Pop off theLastToken for real.
428 theLastToken
->SetContainerInfo(eMalformed
);
429 tempStack
.Push(theLastToken
);
430 theLastToken
= static_cast<CHTMLToken
*>(theStack
.Pop());
431 } while (theLastToken
&& theTag
!= theLastToken
->GetTypeID());
432 // XXX The above test can confuse two different userdefined
435 NS_ASSERTION(theLastToken
,
436 "FindLastIndexOfTag lied to us!"
437 " We couldn't find theTag on theStack");
438 theLastToken
->SetContainerInfo(eMalformed
);
440 // Great, now push all of the other tokens back onto the
441 // stack to preserve the general structure of the document.
442 // Note that we don't push the target token back onto the
443 // the stack (since it was just closed).
444 while (tempStack
.GetSize() != 0) {
445 theStack
.Push(tempStack
.Pop());
458 theToken
= (CHTMLToken
*)mTokenDeque
.ObjectAt(++mTokenScanPos
);
465 * This method is called after we're done tokenizing a chunk of data.
467 * @param aFinalChunk Tells us if this was the last chunk of data.
468 * @return Error result.
471 nsHTMLTokenizer::DidTokenize(PRBool aFinalChunk
)
473 return ScanDocStructure(aFinalChunk
);
477 * This method is repeatedly called by the tokenizer.
478 * Each time, we determine the kind of token we're about to
479 * read, and then we call the appropriate method to handle
482 * @param aScanner The source of our input.
483 * @param aFlushTokens An OUT parameter to tell the caller whether it should
484 * process our queued tokens up to now (e.g., when we
486 * @return Success or error
489 nsHTMLTokenizer::ConsumeToken(nsScanner
& aScanner
, PRBool
& aFlushTokens
)
492 CToken
* theToken
= nsnull
;
494 nsresult result
= aScanner
.Peek(theChar
);
498 // Tell our caller that'we finished.
503 if (!(mFlags
& NS_IPARSER_FLAG_PLAIN_TEXT
)) {
504 if (kLessThan
== theChar
) {
505 return ConsumeTag(theChar
, theToken
, aScanner
, aFlushTokens
);
506 } else if (kAmpersand
== theChar
) {
507 return ConsumeEntity(theChar
, theToken
, aScanner
);
511 if (kCR
== theChar
|| kLF
== theChar
) {
512 return ConsumeNewline(theChar
, theToken
, aScanner
);
514 if (!nsCRT::IsAsciiSpace(theChar
)) {
515 if (theChar
!= '\0') {
516 result
= ConsumeText(theToken
, aScanner
);
518 // Skip the embedded null char. Fix bug 64098.
519 aScanner
.GetChar(theChar
);
523 result
= ConsumeWhitespace(theChar
, theToken
, aScanner
);
532 * This method is called just after a "<" has been consumed
533 * and we know we're at the start of some kind of tagged
534 * element. We don't know yet if it's a tag or a comment.
536 * @param aChar is the last char read
537 * @param aToken is the out arg holding our new token (the function allocates
538 * the return token using mTokenAllocator).
539 * @param aScanner represents our input source
540 * @param aFlushTokens is an OUT parameter use to tell consumers to flush
541 * the current tokens after processing the current one.
542 * @return error code.
545 nsHTMLTokenizer::ConsumeTag(PRUnichar aChar
,
548 PRBool
& aFlushTokens
)
550 PRUnichar theNextChar
, oldChar
;
551 nsresult result
= aScanner
.Peek(aChar
, 1);
553 if (NS_OK
== result
) {
556 result
= aScanner
.Peek(theNextChar
, 2);
558 if (NS_OK
== result
) {
559 // Get the original "<" (we've already seen it with a Peek)
560 aScanner
.GetChar(oldChar
);
562 // XML allows non ASCII tag names, consume this as an end tag. This
563 // is needed to make XML view source work
564 PRBool isXML
= !!(mFlags
& NS_IPARSER_FLAG_XML
);
565 if (nsCRT::IsAsciiAlpha(theNextChar
) ||
566 kGreaterThan
== theNextChar
||
567 (isXML
&& !nsCRT::IsAscii(theNextChar
))) {
568 result
= ConsumeEndTag(aChar
, aToken
, aScanner
);
570 result
= ConsumeComment(aChar
, aToken
, aScanner
);
577 result
= aScanner
.Peek(theNextChar
, 2);
579 if (NS_OK
== result
) {
580 // Get the original "<" (we've already seen it with a Peek)
581 aScanner
.GetChar(oldChar
);
583 if (kMinus
== theNextChar
|| kGreaterThan
== theNextChar
) {
584 result
= ConsumeComment(aChar
, aToken
, aScanner
);
586 result
= ConsumeSpecialMarkup(aChar
, aToken
, aScanner
);
592 // It must be a processing instruction...
593 // Get the original "<" (we've already seen it with a Peek)
594 aScanner
.GetChar(oldChar
);
595 result
= ConsumeProcessingInstruction(aChar
, aToken
, aScanner
);
599 // XML allows non ASCII tag names, consume this as a start tag.
600 PRBool isXML
= !!(mFlags
& NS_IPARSER_FLAG_XML
);
601 if (nsCRT::IsAsciiAlpha(aChar
) ||
602 (isXML
&& !nsCRT::IsAscii(aChar
))) {
603 // Get the original "<" (we've already seen it with a Peek)
604 aScanner
.GetChar(oldChar
);
605 result
= ConsumeStartTag(aChar
, aToken
, aScanner
, aFlushTokens
);
607 // We are not dealing with a tag. So, don't consume the original
608 // char and leave the decision to ConsumeText().
609 result
= ConsumeText(aToken
, aScanner
);
614 // Last ditch attempt to make sure we don't lose data.
615 if (kEOF
== result
&& !aScanner
.IsIncremental()) {
616 // Whoops, we don't want to lose any data! Consume the rest as text.
617 // This normally happens for either a trailing < or </
618 result
= ConsumeText(aToken
, aScanner
);
625 * This method is called just after we've consumed a start or end
626 * tag, and we now have to consume its attributes.
628 * @param aChar is the last char read
629 * @param aToken is the start or end tag that "owns" these attributes.
630 * @param aScanner represents our input source
631 * @return Error result.
634 nsHTMLTokenizer::ConsumeAttributes(PRUnichar aChar
,
638 PRBool done
= PR_FALSE
;
639 nsresult result
= NS_OK
;
640 PRInt16 theAttrCount
= 0;
642 nsTokenAllocator
* theAllocator
= this->GetTokenAllocator();
644 while (!done
&& result
== NS_OK
) {
645 CAttributeToken
* theToken
=
646 static_cast<CAttributeToken
*>
647 (theAllocator
->CreateTokenOfType(eToken_attribute
,
649 if (NS_LIKELY(theToken
!= nsnull
)) {
650 // Tell the new token to finish consuming text...
651 result
= theToken
->Consume(aChar
, aScanner
, mFlags
);
653 if (NS_SUCCEEDED(result
)) {
655 AddToken((CToken
*&)theToken
, result
, &mTokenDeque
, theAllocator
);
657 IF_FREE(theToken
, mTokenAllocator
);
658 // Bad attribute returns shouldn't propagate out.
659 if (NS_ERROR_HTMLPARSER_BADATTRIBUTE
== result
) {
665 result
= NS_ERROR_OUT_OF_MEMORY
;
669 if (NS_SUCCEEDED(result
)) {
671 aScanner
.SkipWhitespace(newline
);
672 NS_ASSERTION(newline
== 0,
673 "CAttribute::Consume() failed to collect all the newlines!");
676 if (NS_SUCCEEDED(result
)) {
677 result
= aScanner
.Peek(aChar
);
678 if (NS_SUCCEEDED(result
)) {
679 if (aChar
== kGreaterThan
) { // You just ate the '>'
680 aScanner
.GetChar(aChar
); // Skip the '>'
682 } else if (aChar
== kLessThan
) {
683 aToken
->SetInError(PR_TRUE
);
690 if (NS_FAILED(result
)) {
691 aToken
->SetInError(PR_TRUE
);
693 if (!aScanner
.IsIncremental()) {
698 aToken
->SetAttributeCount(theAttrCount
);
703 * This method consumes a start tag and all of its attributes.
705 * @param aChar The last character read from the scanner.
706 * @param aToken The OUT parameter that holds our resulting token. (allocated
707 * by the function using mTokenAllocator
708 * @param aScanner Our source of data
709 * @param aFlushTokens is an OUT parameter use to tell consumers to flush
710 * the current tokens after processing the current one.
711 * @return Error result.
714 nsHTMLTokenizer::ConsumeStartTag(PRUnichar aChar
,
717 PRBool
& aFlushTokens
)
719 // Remember this for later in case you have to unwind...
720 PRInt32 theDequeSize
= mTokenDeque
.GetSize();
721 nsresult result
= NS_OK
;
723 nsTokenAllocator
* theAllocator
= this->GetTokenAllocator();
724 aToken
= theAllocator
->CreateTokenOfType(eToken_start
, eHTMLTag_unknown
);
725 NS_ENSURE_TRUE(aToken
, NS_ERROR_OUT_OF_MEMORY
);
727 // Tell the new token to finish consuming text...
728 result
= aToken
->Consume(aChar
, aScanner
, mFlags
);
730 if (NS_SUCCEEDED(result
)) {
731 AddToken(aToken
, result
, &mTokenDeque
, theAllocator
);
733 eHTMLTags theTag
= (eHTMLTags
)aToken
->GetTypeID();
735 // Good. Now, let's see if the next char is ">".
736 // If so, we have a complete tag, otherwise, we have attributes.
737 result
= aScanner
.Peek(aChar
);
738 if (NS_FAILED(result
)) {
739 aToken
->SetInError(PR_TRUE
);
741 // Don't return early here so we can create a text and end token for
742 // the special <iframe>, <script> and similar tags down below.
745 if (kGreaterThan
!= aChar
) { // Look for a '>'
746 result
= ConsumeAttributes(aChar
, aToken
, aScanner
);
748 aScanner
.GetChar(aChar
);
752 /* Now that that's over with, we have one more problem to solve.
753 In the case that we just read a <SCRIPT> or <STYLE> tags, we should go and
754 consume all the content itself.
755 But XML doesn't treat these tags differently, so we shouldn't if the
758 if (NS_SUCCEEDED(result
) && !(mFlags
& NS_IPARSER_FLAG_XML
)) {
759 PRBool isCDATA
= gHTMLElements
[theTag
].CanContainType(kCDATA
);
760 PRBool isPCDATA
= eHTMLTag_textarea
== theTag
||
761 eHTMLTag_title
== theTag
;
763 // XXX This is an evil hack, we should be able to handle these properly
765 if ((eHTMLTag_iframe
== theTag
&&
766 (mFlags
& NS_IPARSER_FLAG_FRAMES_ENABLED
)) ||
767 (eHTMLTag_noframes
== theTag
&&
768 (mFlags
& NS_IPARSER_FLAG_FRAMES_ENABLED
)) ||
769 (eHTMLTag_noscript
== theTag
&&
770 (mFlags
& NS_IPARSER_FLAG_SCRIPT_ENABLED
)) ||
771 (eHTMLTag_noembed
== theTag
)) {
775 // Plaintext contains CDATA, but it's special, so we handle it
776 // differently than the other CDATA elements
777 if (eHTMLTag_plaintext
== theTag
) {
780 // Note: We check in ConsumeToken() for this flag, and if we see it
781 // we only construct text tokens (which is what we want).
782 mFlags
|= NS_IPARSER_FLAG_PLAIN_TEXT
;
786 if (isCDATA
|| isPCDATA
) {
787 PRBool done
= PR_FALSE
;
788 nsDependentString
endTagName(nsHTMLTags::GetStringValue(theTag
));
791 theAllocator
->CreateTokenOfType(eToken_text
, eHTMLTag_text
);
792 NS_ENSURE_TRUE(text
, NS_ERROR_OUT_OF_MEMORY
);
794 CTextToken
* textToken
= static_cast<CTextToken
*>(text
);
797 result
= textToken
->ConsumeCharacterData(theTag
!= eHTMLTag_script
,
803 // Only flush tokens for <script>, to give ourselves more of a
804 // chance of allowing inlines to contain blocks.
805 aFlushTokens
= done
&& theTag
== eHTMLTag_script
;
806 } else if (isPCDATA
) {
807 // Title is consumed conservatively in order to not regress
809 result
= textToken
->ConsumeParsedCharacterData(
810 theTag
== eHTMLTag_textarea
,
811 theTag
== eHTMLTag_title
,
817 // Note: we *don't* set aFlushTokens here.
820 // We want to do this unless result is kEOF, in which case we will
821 // simply unwind our stack and wait for more data anyway.
822 if (kEOF
!= result
) {
823 AddToken(text
, NS_OK
, &mTokenDeque
, theAllocator
);
824 CToken
* endToken
= nsnull
;
826 if (NS_SUCCEEDED(result
) && done
) {
829 result
= aScanner
.GetChar(theChar
);
830 NS_ASSERTION(NS_SUCCEEDED(result
) && theChar
== kLessThan
,
831 "CTextToken::Consume*Data is broken!");
833 // Ensure we have a /
834 PRUnichar tempChar
; // Don't change non-debug vars in debug-only code
835 result
= aScanner
.Peek(tempChar
);
836 NS_ASSERTION(NS_SUCCEEDED(result
) && tempChar
== kForwardSlash
,
837 "CTextToken::Consume*Data is broken!");
839 result
= ConsumeEndTag(PRUnichar('/'), endToken
, aScanner
);
840 if (!(mFlags
& NS_IPARSER_FLAG_VIEW_SOURCE
) &&
841 NS_SUCCEEDED(result
)) {
842 // If ConsumeCharacterData returned a success result (and
843 // we're not in view source), then we want to make sure that
844 // we're going to execute this script (since the result means
845 // that we've found an end tag that satisfies all of the right
847 endToken
->SetInError(PR_FALSE
);
849 } else if (result
== kFakeEndTag
&&
850 !(mFlags
& NS_IPARSER_FLAG_VIEW_SOURCE
)) {
852 endToken
= theAllocator
->CreateTokenOfType(eToken_end
, theTag
,
854 AddToken(endToken
, result
, &mTokenDeque
, theAllocator
);
855 if (NS_LIKELY(endToken
!= nsnull
)) {
856 endToken
->SetInError(PR_TRUE
);
859 result
= NS_ERROR_OUT_OF_MEMORY
;
861 } else if (result
== kFakeEndTag
) {
862 // If we are here, we are both faking having seen the end tag
863 // and are in view-source.
867 IF_FREE(text
, mTokenAllocator
);
872 // This code is confusing, so pay attention.
873 // If you're here, it's because we were in the midst of consuming a start
874 // tag but ran out of data (not in the stream, but in this *part* of the
875 // stream. For simplicity, we have to unwind our input. Therefore, we pop
876 // and discard any new tokens we've queued this round. Later we can get
877 // smarter about this.
878 if (NS_FAILED(result
)) {
879 while (mTokenDeque
.GetSize()>theDequeSize
) {
880 CToken
* theToken
= (CToken
*)mTokenDeque
.Pop();
881 IF_FREE(theToken
, mTokenAllocator
);
885 IF_FREE(aToken
, mTokenAllocator
);
892 * This method consumes an end tag and any "attributes" that may come after it.
894 * @param aChar The last character read from the scanner.
895 * @param aToken The OUT parameter that holds our resulting token.
896 * @param aScanner Our source of data
897 * @return Error result
900 nsHTMLTokenizer::ConsumeEndTag(PRUnichar aChar
,
904 // Get the "/" (we've already seen it with a Peek)
905 aScanner
.GetChar(aChar
);
907 nsTokenAllocator
* theAllocator
= this->GetTokenAllocator();
908 aToken
= theAllocator
->CreateTokenOfType(eToken_end
, eHTMLTag_unknown
);
909 NS_ENSURE_TRUE(aToken
, NS_ERROR_OUT_OF_MEMORY
);
911 // Remember this for later in case you have to unwind...
912 PRInt32 theDequeSize
= mTokenDeque
.GetSize();
913 nsresult result
= NS_OK
;
915 // Tell the new token to finish consuming text...
916 result
= aToken
->Consume(aChar
, aScanner
, mFlags
);
917 AddToken(aToken
, result
, &mTokenDeque
, theAllocator
);
918 if (NS_FAILED(result
)) {
919 // Note that this early-return here is safe because we have not yet
920 // added any of our tokens to the queue (AddToken only adds the token if
921 // result is a success), so we don't need to fall through.
925 result
= aScanner
.Peek(aChar
);
926 if (NS_FAILED(result
)) {
927 aToken
->SetInError(PR_TRUE
);
929 // Note: We know here that the scanner is not incremental since if
930 // this peek fails, then we've already masked over a kEOF coming from
931 // the Consume() call above.
935 if (kGreaterThan
!= aChar
) {
936 result
= ConsumeAttributes(aChar
, aToken
, aScanner
);
938 aScanner
.GetChar(aChar
);
941 // Do the same thing as we do in ConsumeStartTag. Basically, if we've run
942 // out of room in this *section* of the document, pop all of the tokens
943 // we've consumed this round and wait for more data.
944 if (NS_FAILED(result
)) {
945 while (mTokenDeque
.GetSize() > theDequeSize
) {
946 CToken
* theToken
= (CToken
*)mTokenDeque
.Pop();
947 IF_FREE(theToken
, mTokenAllocator
);
955 * This method is called just after a "&" has been consumed
956 * and we know we're at the start of an entity.
958 * @param aChar The last character read from the scanner.
959 * @param aToken The OUT parameter that holds our resulting token.
960 * @param aScanner Our source of data
961 * @return Error result.
964 nsHTMLTokenizer::ConsumeEntity(PRUnichar aChar
,
969 nsresult result
= aScanner
.Peek(theChar
, 1);
971 nsTokenAllocator
* theAllocator
= this->GetTokenAllocator();
972 if (NS_SUCCEEDED(result
)) {
973 if (nsCRT::IsAsciiAlpha(theChar
) || theChar
== kHashsign
) {
974 aToken
= theAllocator
->CreateTokenOfType(eToken_entity
, eHTMLTag_entity
);
975 NS_ENSURE_TRUE(aToken
, NS_ERROR_OUT_OF_MEMORY
);
976 result
= aToken
->Consume(theChar
, aScanner
, mFlags
);
978 if (result
== NS_HTMLTOKENS_NOT_AN_ENTITY
) {
979 IF_FREE(aToken
, mTokenAllocator
);
981 if (result
== kEOF
&& !aScanner
.IsIncremental()) {
982 result
= NS_OK
; // Use as much of the entity as you can get.
985 AddToken(aToken
, result
, &mTokenDeque
, theAllocator
);
990 // Oops, we're actually looking at plain text...
991 result
= ConsumeText(aToken
, aScanner
);
992 } else if (result
== kEOF
&& !aScanner
.IsIncremental()) {
993 // If the last character in the file is an &, consume it as text.
994 result
= ConsumeText(aToken
, aScanner
);
996 aToken
->SetInError(PR_TRUE
);
1005 * This method is called just after whitespace has been
1006 * consumed and we know we're at the start a whitespace run.
1008 * @param aChar The last character read from the scanner.
1009 * @param aToken The OUT parameter that holds our resulting token.
1010 * @param aScanner Our source of data
1011 * @return Error result.
1014 nsHTMLTokenizer::ConsumeWhitespace(PRUnichar aChar
,
1016 nsScanner
& aScanner
)
1018 // Get the whitespace character
1019 aScanner
.GetChar(aChar
);
1021 nsTokenAllocator
* theAllocator
= this->GetTokenAllocator();
1022 aToken
= theAllocator
->CreateTokenOfType(eToken_whitespace
,
1023 eHTMLTag_whitespace
);
1024 nsresult result
= NS_OK
;
1026 result
= aToken
->Consume(aChar
, aScanner
, mFlags
);
1027 AddToken(aToken
, result
, &mTokenDeque
, theAllocator
);
1034 * This method is called just after a "<!" has been consumed
1035 * and we know we're at the start of a comment.
1037 * @param aChar The last character read from the scanner.
1038 * @param aToken The OUT parameter that holds our resulting token.
1039 * @param aScanner Our source of data
1040 * @return Error result.
1043 nsHTMLTokenizer::ConsumeComment(PRUnichar aChar
,
1045 nsScanner
& aScanner
)
1048 aScanner
.GetChar(aChar
);
1050 nsTokenAllocator
* theAllocator
= this->GetTokenAllocator();
1051 aToken
= theAllocator
->CreateTokenOfType(eToken_comment
, eHTMLTag_comment
);
1052 nsresult result
= NS_OK
;
1054 result
= aToken
->Consume(aChar
, aScanner
, mFlags
);
1055 AddToken(aToken
, result
, &mTokenDeque
, theAllocator
);
1058 if (kNotAComment
== result
) {
1059 // AddToken has IF_FREE()'d our token, so...
1060 result
= ConsumeText(aToken
, aScanner
);
1067 * This method is called just after a known text char has
1068 * been consumed and we should read a text run. Note: we actually ignore the
1069 * first character of the text run so that we can consume invalid markup
1072 * @param aToken The OUT parameter that holds our resulting token.
1073 * @param aScanner Our source of data
1074 * @return Error result.
1077 nsHTMLTokenizer::ConsumeText(CToken
*& aToken
, nsScanner
& aScanner
)
1079 nsresult result
= NS_OK
;
1080 nsTokenAllocator
* theAllocator
= this->GetTokenAllocator();
1081 CTextToken
* theToken
=
1082 (CTextToken
*)theAllocator
->CreateTokenOfType(eToken_text
, eHTMLTag_text
);
1084 PRUnichar ch
= '\0';
1085 result
= theToken
->Consume(ch
, aScanner
, mFlags
);
1086 if (NS_FAILED(result
)) {
1087 if (0 == theToken
->GetTextLength()) {
1088 IF_FREE(aToken
, mTokenAllocator
);
1096 AddToken(aToken
, result
, &mTokenDeque
, theAllocator
);
1103 * This method is called just after a "<!" has been consumed.
1104 * NOTE: Here we might consume DOCTYPE and "special" markups.
1106 * @param aChar The last character read from the scanner.
1107 * @param aToken The OUT parameter that holds our resulting token.
1108 * @param aScanner Our source of data
1109 * @return Error result.
1112 nsHTMLTokenizer::ConsumeSpecialMarkup(PRUnichar aChar
,
1114 nsScanner
& aScanner
)
1117 aScanner
.GetChar(aChar
);
1119 nsresult result
= NS_OK
;
1120 nsAutoString theBufCopy
;
1121 aScanner
.Peek(theBufCopy
, 20);
1122 ToUpperCase(theBufCopy
);
1123 PRInt32 theIndex
= theBufCopy
.Find("DOCTYPE", PR_FALSE
, 0, 0);
1124 nsTokenAllocator
* theAllocator
= this->GetTokenAllocator();
1126 if (theIndex
== kNotFound
) {
1127 if ('[' == theBufCopy
.CharAt(0)) {
1128 aToken
= theAllocator
->CreateTokenOfType(eToken_cdatasection
,
1130 } else if (StringBeginsWith(theBufCopy
, NS_LITERAL_STRING("ELEMENT")) ||
1131 StringBeginsWith(theBufCopy
, NS_LITERAL_STRING("ATTLIST")) ||
1132 StringBeginsWith(theBufCopy
, NS_LITERAL_STRING("ENTITY")) ||
1133 StringBeginsWith(theBufCopy
, NS_LITERAL_STRING("NOTATION"))) {
1134 aToken
= theAllocator
->CreateTokenOfType(eToken_markupDecl
,
1135 eHTMLTag_markupDecl
);
1137 aToken
= theAllocator
->CreateTokenOfType(eToken_comment
,
1141 aToken
= theAllocator
->CreateTokenOfType(eToken_doctypeDecl
,
1142 eHTMLTag_doctypeDecl
);
1146 result
= aToken
->Consume(aChar
, aScanner
, mFlags
);
1147 AddToken(aToken
, result
, &mTokenDeque
, theAllocator
);
1150 if (result
== kNotAComment
) {
1151 result
= ConsumeText(aToken
, aScanner
);
1158 * This method is called just after a newline has been consumed.
1160 * @param aChar The last character read from the scanner.
1161 * @param aToken The OUT parameter that holds our resulting token.
1162 * @param aScanner Our source of data
1163 * @return Error result.
1166 nsHTMLTokenizer::ConsumeNewline(PRUnichar aChar
,
1168 nsScanner
& aScanner
)
1170 // Get the newline character
1171 aScanner
.GetChar(aChar
);
1173 nsTokenAllocator
* theAllocator
= this->GetTokenAllocator();
1174 aToken
= theAllocator
->CreateTokenOfType(eToken_newline
, eHTMLTag_newline
);
1175 nsresult result
= NS_OK
;
1177 result
= aToken
->Consume(aChar
, aScanner
, mFlags
);
1178 AddToken(aToken
, result
, &mTokenDeque
, theAllocator
);
1186 * This method is called just after a <? has been consumed.
1188 * @param aChar The last character read from the scanner.
1189 * @param aToken The OUT parameter that holds our resulting token.
1190 * @param aScanner Our source of data
1191 * @return Error result.
1194 nsHTMLTokenizer::ConsumeProcessingInstruction(PRUnichar aChar
,
1196 nsScanner
& aScanner
)
1199 aScanner
.GetChar(aChar
);
1201 nsTokenAllocator
* theAllocator
= this->GetTokenAllocator();
1202 aToken
= theAllocator
->CreateTokenOfType(eToken_instruction
,
1204 nsresult result
= NS_OK
;
1206 result
= aToken
->Consume(aChar
, aScanner
, mFlags
);
1207 AddToken(aToken
, result
, &mTokenDeque
, theAllocator
);