1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set sw=2 ts=2 et tw=78: */
3 /* ***** BEGIN LICENSE BLOCK *****
4 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
6 * The contents of this file are subject to the Mozilla Public License Version
7 * 1.1 (the "License"); you may not use this file except in compliance with
8 * the License. You may obtain a copy of the License at
9 * http://www.mozilla.org/MPL/
11 * Software distributed under the License is distributed on an "AS IS" basis,
12 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
13 * for the specific language governing rights and limitations under the
16 * The Original Code is mozilla.org code.
18 * The Initial Developer of the Original Code is
19 * Netscape Communications Corporation.
20 * Portions created by the Initial Developer are Copyright (C) 1998
21 * the Initial Developer. All Rights Reserved.
24 * Blake Kaplan <mrbkap@gmail.com>
26 * Alternatively, the contents of this file may be used under the terms of
27 * either of the GNU General Public License Version 2 or later (the "GPL"),
28 * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
29 * in which case the provisions of the GPL or the LGPL are applicable instead
30 * of those above. If you wish to allow use of your version of this file only
31 * under the terms of either the GPL or the LGPL, and not to allow others to
32 * use your version of this file under the terms of the MPL, indicate your
33 * decision by deleting the provisions above and replace them with the notice
34 * and other provisions required by the GPL or the LGPL. If you do not delete
35 * the provisions above, a recipient may use your version of this file under
36 * the terms of any one of the MPL, the GPL or the LGPL.
38 * ***** END LICENSE BLOCK ***** */
42 * @file nsHTMLTokenizer.cpp
43 * This is an implementation of the nsITokenizer interface.
44 * This file contains the implementation of a tokenizer to tokenize an HTML
45 * document. It attempts to do so, making tradeoffs between compatibility with
46 * older parsers and the SGML specification. Note that most of the real
47 * "tokenization" takes place in nsHTMLTokens.cpp.
51 #include "nsHTMLTokenizer.h"
52 #include "nsScanner.h"
53 #include "nsElementTable.h"
54 #include "nsReadableUtils.h"
55 #include "nsUnicharUtils.h"
57 /************************************************************************
58 And now for the main class -- nsHTMLTokenizer...
59 ************************************************************************/
/**
 * Satisfy the nsISupports interface: nsHTMLTokenizer exposes exactly one
 * interface, nsITokenizer.
 */
NS_IMPL_ISUPPORTS1(nsHTMLTokenizer, nsITokenizer)
69 * @param aParseMode The current mode the document is in (quirks, etc.)
70 * @param aDocType The document type of the current document
71 * @param aCommand What we are trying to do (view-source, parse a fragment, etc.)
73 nsHTMLTokenizer::nsHTMLTokenizer(PRInt32 aParseMode
,
74 eParserDocType aDocType
,
75 eParserCommands aCommand
,
77 nsITokenizer(), mTokenDeque(0), mFlags(aFlags
)
79 if (aParseMode
== eDTDMode_full_standards
||
80 aParseMode
== eDTDMode_almost_standards
) {
81 mFlags
|= NS_IPARSER_FLAG_STRICT_MODE
;
82 } else if (aParseMode
== eDTDMode_quirks
) {
83 mFlags
|= NS_IPARSER_FLAG_QUIRKS_MODE
;
84 } else if (aParseMode
== eDTDMode_autodetect
) {
85 mFlags
|= NS_IPARSER_FLAG_AUTO_DETECT_MODE
;
87 mFlags
|= NS_IPARSER_FLAG_UNKNOWN_MODE
;
90 if (aDocType
== ePlainText
) {
91 mFlags
|= NS_IPARSER_FLAG_PLAIN_TEXT
;
92 } else if (aDocType
== eXML
) {
93 mFlags
|= NS_IPARSER_FLAG_XML
;
94 } else if (aDocType
== eHTML_Quirks
||
95 aDocType
== eHTML_Strict
) {
96 mFlags
|= NS_IPARSER_FLAG_HTML
;
99 mFlags
|= aCommand
== eViewSource
100 ? NS_IPARSER_FLAG_VIEW_SOURCE
101 : NS_IPARSER_FLAG_VIEW_NORMAL
;
103 NS_ASSERTION(!(mFlags
& NS_IPARSER_FLAG_XML
) ||
104 (mFlags
& NS_IPARSER_FLAG_VIEW_SOURCE
),
105 "Why isn't this XML document going through our XML parser?");
107 mTokenAllocator
= nsnull
;
112 * The destructor ensures that we don't leak any left over tokens.
114 nsHTMLTokenizer::~nsHTMLTokenizer()
116 if (mTokenDeque
.GetSize()) {
117 CTokenDeallocator
theDeallocator(mTokenAllocator
->GetArenaPool());
118 mTokenDeque
.ForEach(theDeallocator
);
123 /*******************************************************************
124 Here begins the real working methods for the tokenizer.
125 *******************************************************************/
128 * Adds a token onto the end of the deque if aResult is a successful result.
129 * Otherwise, this function frees aToken and sets it to nsnull.
131 * @param aToken The token that wants to be added.
132 * @param aResult The error code that will be used to determine if we actually
133 * want to push this token.
134 * @param aDeque The deque we want to push aToken onto.
135 * @param aTokenAllocator The allocator we use to free aToken in case aResult
136 * is not a success code.
140 nsHTMLTokenizer::AddToken(CToken
*& aToken
,
143 nsTokenAllocator
* aTokenAllocator
)
145 if (aToken
&& aDeque
) {
146 if (NS_SUCCEEDED(aResult
)) {
147 aDeque
->Push(aToken
);
149 IF_FREE(aToken
, aTokenAllocator
);
155 * Retrieve a pointer to the global token recycler...
157 * @return Pointer to recycler (or null)
160 nsHTMLTokenizer::GetTokenAllocator()
162 return mTokenAllocator
;
166 * This method provides access to the topmost token in the tokenDeque.
167 * The token is not really removed from the list.
169 * @return Pointer to token
172 nsHTMLTokenizer::PeekToken()
174 return (CToken
*)mTokenDeque
.PeekFront();
178 * This method provides access to the topmost token in the tokenDeque.
179 * The token is really removed from the list; if the list is empty we return 0.
181 * @return Pointer to token or NULL
184 nsHTMLTokenizer::PopToken()
186 return (CToken
*)mTokenDeque
.PopFront();
191 * Pushes a token onto the front of our deque such that the next call to
192 * PopToken() or PeekToken() will return that token.
194 * @param theToken The next token to be processed
198 nsHTMLTokenizer::PushTokenFront(CToken
* theToken
)
200 mTokenDeque
.PushFront(theToken
);
205 * Pushes a token onto the deque.
207 * @param theToken the new token.
211 nsHTMLTokenizer::PushToken(CToken
* theToken
)
213 mTokenDeque
.Push(theToken
);
218 * Returns the size of the deque.
220 * @return The number of remaining tokens.
223 nsHTMLTokenizer::GetCount()
225 return mTokenDeque
.GetSize();
229 * Allows access to an arbitrary token in the deque. The accessed token is left
232 * @param anIndex The index of the target token. Token 0 would be the same as
233 * the result of a call to PeekToken()
234 * @return The requested token.
237 nsHTMLTokenizer::GetTokenAt(PRInt32 anIndex
)
239 return (CToken
*)mTokenDeque
.ObjectAt(anIndex
);
243 * This method is part of the "sandwich" that occurs when we want to tokenize
244 * a document. This prepares us to be able to tokenize properly.
246 * @param aIsFinalChunk Whether this is the last chunk of data that we will
248 * @param aTokenAllocator The token allocator to use for this document.
249 * @return Our success in setting up.
252 nsHTMLTokenizer::WillTokenize(PRBool aIsFinalChunk
,
253 nsTokenAllocator
* aTokenAllocator
)
255 mTokenAllocator
= aTokenAllocator
;
256 mIsFinalChunk
= aIsFinalChunk
;
258 // Cause ScanDocStructure to search from here for new tokens...
259 mTokenScanPos
= mTokenDeque
.GetSize();
264 * Pushes all of the tokens in aDeque onto the front of our deque so they
265 * get processed before any other tokens.
267 * @param aDeque The deque with the tokens in it.
270 nsHTMLTokenizer::PrependTokens(nsDeque
& aDeque
)
272 PRInt32 aCount
= aDeque
.GetSize();
274 for (PRInt32 anIndex
= 0; anIndex
< aCount
; ++anIndex
) {
275 CToken
* theToken
= (CToken
*)aDeque
.Pop();
276 PushTokenFront(theToken
);
281 * Copies the state flags from aTokenizer into this tokenizer. This is used
282 * to pass information around between the main tokenizer and tokenizers
283 * created for document.write() calls.
285 * @param aTokenizer The tokenizer with more information in it.
289 nsHTMLTokenizer::CopyState(nsITokenizer
* aTokenizer
)
292 mFlags
= ((nsHTMLTokenizer
*)aTokenizer
)->mFlags
;
299 * This is a utilty method for ScanDocStructure, which finds a given
300 * tag in the stack. The return value is meant to be used with
301 * nsDeque::ObjectAt() on aTagStack.
303 * @param aTag -- the ID of the tag we're seeking
304 * @param aTagStack -- the stack to be searched
305 * @return index position of tag in stack if found, otherwise kNotFound
308 FindLastIndexOfTag(eHTMLTags aTag
, nsDeque
&aTagStack
)
310 PRInt32 theCount
= aTagStack
.GetSize();
312 while (0 < theCount
) {
313 CHTMLToken
* theToken
= (CHTMLToken
*)aTagStack
.ObjectAt(--theCount
);
315 eHTMLTags theTag
= (eHTMLTags
)theToken
->GetTypeID();
316 if (theTag
== aTag
) {
326 * This method scans the sequence of tokens to determine whether or not the
327 * tag structure of the document is well formed. In well formed cases, we can
328 * skip doing residual style handling and allow inlines to contain block-level
331 * @param aFinalChunk Is unused.
332 * @return Success (currently, this function cannot fail).
334 nsresult
nsHTMLTokenizer::ScanDocStructure(PRBool aFinalChunk
)
336 nsresult result
= NS_OK
;
337 if (!mTokenDeque
.GetSize()) {
341 CHTMLToken
* theToken
= (CHTMLToken
*)mTokenDeque
.ObjectAt(mTokenScanPos
);
343 // Start by finding the first start tag that hasn't been reviewed.
344 while (mTokenScanPos
> 0) {
346 eHTMLTokenTypes theType
= eHTMLTokenTypes(theToken
->GetTokenType());
347 if (theType
== eToken_start
&&
348 theToken
->GetContainerInfo() == eFormUnknown
) {
352 theToken
= (CHTMLToken
*)mTokenDeque
.ObjectAt(--mTokenScanPos
);
355 // Now that we know where to start, let's walk through the
356 // tokens to see which are well-formed. Stop when you run out
360 nsDeque
tempStack(0);
361 PRInt32 theStackDepth
= 0;
362 // Don't bother if we get ridiculously deep.
363 static const PRInt32 theMaxStackDepth
= 200;
365 while (theToken
&& theStackDepth
< theMaxStackDepth
) {
366 eHTMLTokenTypes theType
= eHTMLTokenTypes(theToken
->GetTokenType());
367 eHTMLTags theTag
= (eHTMLTags
)theToken
->GetTypeID();
369 if (nsHTMLElement::IsContainer(theTag
)) { // Bug 54117
370 PRBool theTagIsBlock
= gHTMLElements
[theTag
].IsMemberOf(kBlockEntity
);
371 PRBool theTagIsInline
= theTagIsBlock
373 : gHTMLElements
[theTag
].IsMemberOf(kInlineEntity
);
375 if (theTagIsBlock
|| theTagIsInline
|| eHTMLTag_table
== theTag
) {
379 if (gHTMLElements
[theTag
].ShouldVerifyHierarchy()) {
380 PRInt32 earlyPos
= FindLastIndexOfTag(theTag
, theStack
);
381 if (earlyPos
!= kNotFound
) {
382 // Uh-oh, we've found a tag that is not allowed to nest at
383 // all. Mark the previous one and all of its children as
384 // malformed to increase our chances of doing RS handling
385 // on all of them. We want to do this for cases such as:
386 // <a><div><a></a></div></a>.
387 // Note that we have to iterate through all of the chilren
388 // of the original malformed tag to protect against:
389 // <a><font><div><a></a></div></font></a>, so that the <font>
390 // is allowed to contain the <div>.
391 // XXX What about <a><span><a>, where the second <a> closes
393 nsDequeIterator
it(theStack
, earlyPos
), end(theStack
.End());
395 CHTMLToken
*theMalformedToken
=
396 static_cast<CHTMLToken
*>(it
++);
398 theMalformedToken
->SetContainerInfo(eMalformed
);
403 theStack
.Push(theToken
);
409 CHTMLToken
*theLastToken
=
410 static_cast<CHTMLToken
*>(theStack
.Peek());
412 if (theTag
== theLastToken
->GetTypeID()) {
413 theStack
.Pop(); // Yank it for real
415 theLastToken
->SetContainerInfo(eWellFormed
);
417 // This token wasn't what we expected it to be! We need to
418 // go searching for its real start tag on our stack. Each
419 // tag in between the end tag and start tag must be malformed
421 if (FindLastIndexOfTag(theTag
, theStack
) != kNotFound
) {
422 // Find theTarget in the stack, marking each (malformed!)
424 theStack
.Pop(); // Pop off theLastToken for real.
426 theLastToken
->SetContainerInfo(eMalformed
);
427 tempStack
.Push(theLastToken
);
428 theLastToken
= static_cast<CHTMLToken
*>(theStack
.Pop());
429 } while (theLastToken
&& theTag
!= theLastToken
->GetTypeID());
430 // XXX The above test can confuse two different userdefined
433 NS_ASSERTION(theLastToken
,
434 "FindLastIndexOfTag lied to us!"
435 " We couldn't find theTag on theStack");
436 theLastToken
->SetContainerInfo(eMalformed
);
438 // Great, now push all of the other tokens back onto the
439 // stack to preserve the general structure of the document.
440 // Note that we don't push the target token back onto the
441 // the stack (since it was just closed).
442 while (tempStack
.GetSize() != 0) {
443 theStack
.Push(tempStack
.Pop());
456 theToken
= (CHTMLToken
*)mTokenDeque
.ObjectAt(++mTokenScanPos
);
463 * This method is called after we're done tokenizing a chunk of data.
465 * @param aFinalChunk Tells us if this was the last chunk of data.
466 * @return Error result.
469 nsHTMLTokenizer::DidTokenize(PRBool aFinalChunk
)
471 return ScanDocStructure(aFinalChunk
);
475 * This method is repeatedly called by the tokenizer.
476 * Each time, we determine the kind of token we're about to
477 * read, and then we call the appropriate method to handle
480 * @param aScanner The source of our input.
481 * @param aFlushTokens An OUT parameter to tell the caller whether it should
482 * process our queued tokens up to now (e.g., when we
484 * @return Success or error
487 nsHTMLTokenizer::ConsumeToken(nsScanner
& aScanner
, PRBool
& aFlushTokens
)
490 CToken
* theToken
= nsnull
;
492 nsresult result
= aScanner
.Peek(theChar
);
496 // Tell our caller that'we finished.
501 if (!(mFlags
& NS_IPARSER_FLAG_PLAIN_TEXT
)) {
502 if (kLessThan
== theChar
) {
503 return ConsumeTag(theChar
, theToken
, aScanner
, aFlushTokens
);
504 } else if (kAmpersand
== theChar
) {
505 return ConsumeEntity(theChar
, theToken
, aScanner
);
509 if (kCR
== theChar
|| kLF
== theChar
) {
510 return ConsumeNewline(theChar
, theToken
, aScanner
);
512 if (!nsCRT::IsAsciiSpace(theChar
)) {
513 if (theChar
!= '\0') {
514 result
= ConsumeText(theToken
, aScanner
);
516 // Skip the embedded null char. Fix bug 64098.
517 aScanner
.GetChar(theChar
);
521 result
= ConsumeWhitespace(theChar
, theToken
, aScanner
);
530 * This method is called just after a "<" has been consumed
531 * and we know we're at the start of some kind of tagged
532 * element. We don't know yet if it's a tag or a comment.
534 * @param aChar is the last char read
535 * @param aToken is the out arg holding our new token (the function allocates
536 * the return token using mTokenAllocator).
537 * @param aScanner represents our input source
538 * @param aFlushTokens is an OUT parameter use to tell consumers to flush
539 * the current tokens after processing the current one.
540 * @return error code.
543 nsHTMLTokenizer::ConsumeTag(PRUnichar aChar
,
546 PRBool
& aFlushTokens
)
548 PRUnichar theNextChar
, oldChar
;
549 nsresult result
= aScanner
.Peek(aChar
, 1);
551 if (NS_OK
== result
) {
554 result
= aScanner
.Peek(theNextChar
, 2);
556 if (NS_OK
== result
) {
557 // Get the original "<" (we've already seen it with a Peek)
558 aScanner
.GetChar(oldChar
);
560 // XML allows non ASCII tag names, consume this as an end tag. This
561 // is needed to make XML view source work
562 PRBool isXML
= !!(mFlags
& NS_IPARSER_FLAG_XML
);
563 if (nsCRT::IsAsciiAlpha(theNextChar
) ||
564 kGreaterThan
== theNextChar
||
565 (isXML
&& !nsCRT::IsAscii(theNextChar
))) {
566 result
= ConsumeEndTag(aChar
, aToken
, aScanner
);
568 result
= ConsumeComment(aChar
, aToken
, aScanner
);
575 result
= aScanner
.Peek(theNextChar
, 2);
577 if (NS_OK
== result
) {
578 // Get the original "<" (we've already seen it with a Peek)
579 aScanner
.GetChar(oldChar
);
581 if (kMinus
== theNextChar
|| kGreaterThan
== theNextChar
) {
582 result
= ConsumeComment(aChar
, aToken
, aScanner
);
584 result
= ConsumeSpecialMarkup(aChar
, aToken
, aScanner
);
590 // It must be a processing instruction...
591 // Get the original "<" (we've already seen it with a Peek)
592 aScanner
.GetChar(oldChar
);
593 result
= ConsumeProcessingInstruction(aChar
, aToken
, aScanner
);
597 // XML allows non ASCII tag names, consume this as a start tag.
598 PRBool isXML
= !!(mFlags
& NS_IPARSER_FLAG_XML
);
599 if (nsCRT::IsAsciiAlpha(aChar
) ||
600 (isXML
&& !nsCRT::IsAscii(aChar
))) {
601 // Get the original "<" (we've already seen it with a Peek)
602 aScanner
.GetChar(oldChar
);
603 result
= ConsumeStartTag(aChar
, aToken
, aScanner
, aFlushTokens
);
605 // We are not dealing with a tag. So, don't consume the original
606 // char and leave the decision to ConsumeText().
607 result
= ConsumeText(aToken
, aScanner
);
612 // Last ditch attempt to make sure we don't lose data.
613 if (kEOF
== result
&& !aScanner
.IsIncremental()) {
614 // Whoops, we don't want to lose any data! Consume the rest as text.
615 // This normally happens for either a trailing < or </
616 result
= ConsumeText(aToken
, aScanner
);
623 * This method is called just after we've consumed a start or end
624 * tag, and we now have to consume its attributes.
626 * @param aChar is the last char read
627 * @param aToken is the start or end tag that "owns" these attributes.
628 * @param aScanner represents our input source
629 * @return Error result.
632 nsHTMLTokenizer::ConsumeAttributes(PRUnichar aChar
,
636 PRBool done
= PR_FALSE
;
637 nsresult result
= NS_OK
;
638 PRInt16 theAttrCount
= 0;
640 nsTokenAllocator
* theAllocator
= this->GetTokenAllocator();
642 while (!done
&& result
== NS_OK
) {
643 CAttributeToken
* theToken
=
644 static_cast<CAttributeToken
*>
645 (theAllocator
->CreateTokenOfType(eToken_attribute
,
647 if (NS_LIKELY(theToken
!= nsnull
)) {
648 // Tell the new token to finish consuming text...
649 result
= theToken
->Consume(aChar
, aScanner
, mFlags
);
651 if (NS_SUCCEEDED(result
)) {
653 AddToken((CToken
*&)theToken
, result
, &mTokenDeque
, theAllocator
);
655 IF_FREE(theToken
, mTokenAllocator
);
656 // Bad attribute returns shouldn't propagate out.
657 if (NS_ERROR_HTMLPARSER_BADATTRIBUTE
== result
) {
663 result
= NS_ERROR_OUT_OF_MEMORY
;
667 if (NS_SUCCEEDED(result
)) {
669 aScanner
.SkipWhitespace(newline
);
670 NS_ASSERTION(newline
== 0,
671 "CAttribute::Consume() failed to collect all the newlines!");
674 if (NS_SUCCEEDED(result
)) {
675 result
= aScanner
.Peek(aChar
);
676 if (NS_SUCCEEDED(result
)) {
677 if (aChar
== kGreaterThan
) { // You just ate the '>'
678 aScanner
.GetChar(aChar
); // Skip the '>'
680 } else if (aChar
== kLessThan
) {
681 aToken
->SetInError(PR_TRUE
);
688 if (NS_FAILED(result
)) {
689 aToken
->SetInError(PR_TRUE
);
691 if (!aScanner
.IsIncremental()) {
696 aToken
->SetAttributeCount(theAttrCount
);
701 * This method consumes a start tag and all of its attributes.
703 * @param aChar The last character read from the scanner.
704 * @param aToken The OUT parameter that holds our resulting token. (allocated
705 * by the function using mTokenAllocator
706 * @param aScanner Our source of data
707 * @param aFlushTokens is an OUT parameter use to tell consumers to flush
708 * the current tokens after processing the current one.
709 * @return Error result.
712 nsHTMLTokenizer::ConsumeStartTag(PRUnichar aChar
,
715 PRBool
& aFlushTokens
)
717 // Remember this for later in case you have to unwind...
718 PRInt32 theDequeSize
= mTokenDeque
.GetSize();
719 nsresult result
= NS_OK
;
721 nsTokenAllocator
* theAllocator
= this->GetTokenAllocator();
722 aToken
= theAllocator
->CreateTokenOfType(eToken_start
, eHTMLTag_unknown
);
723 NS_ENSURE_TRUE(aToken
, NS_ERROR_OUT_OF_MEMORY
);
725 // Tell the new token to finish consuming text...
726 result
= aToken
->Consume(aChar
, aScanner
, mFlags
);
728 if (NS_SUCCEEDED(result
)) {
729 AddToken(aToken
, result
, &mTokenDeque
, theAllocator
);
731 eHTMLTags theTag
= (eHTMLTags
)aToken
->GetTypeID();
733 // Good. Now, let's see if the next char is ">".
734 // If so, we have a complete tag, otherwise, we have attributes.
735 result
= aScanner
.Peek(aChar
);
736 if (NS_FAILED(result
)) {
737 aToken
->SetInError(PR_TRUE
);
739 // Don't return early here so we can create a text and end token for
740 // the special <iframe>, <script> and similar tags down below.
743 if (kGreaterThan
!= aChar
) { // Look for a '>'
744 result
= ConsumeAttributes(aChar
, aToken
, aScanner
);
746 aScanner
.GetChar(aChar
);
750 /* Now that that's over with, we have one more problem to solve.
751 In the case that we just read a <SCRIPT> or <STYLE> tags, we should go and
752 consume all the content itself.
753 But XML doesn't treat these tags differently, so we shouldn't if the
756 if (NS_SUCCEEDED(result
) && !(mFlags
& NS_IPARSER_FLAG_XML
)) {
757 PRBool isCDATA
= gHTMLElements
[theTag
].CanContainType(kCDATA
);
758 PRBool isPCDATA
= eHTMLTag_textarea
== theTag
||
759 eHTMLTag_title
== theTag
;
761 // XXX This is an evil hack, we should be able to handle these properly
763 if ((eHTMLTag_iframe
== theTag
&&
764 (mFlags
& NS_IPARSER_FLAG_FRAMES_ENABLED
)) ||
765 (eHTMLTag_noframes
== theTag
&&
766 (mFlags
& NS_IPARSER_FLAG_FRAMES_ENABLED
)) ||
767 (eHTMLTag_noscript
== theTag
&&
768 (mFlags
& NS_IPARSER_FLAG_SCRIPT_ENABLED
)) ||
769 (eHTMLTag_noembed
== theTag
)) {
773 // Plaintext contains CDATA, but it's special, so we handle it
774 // differently than the other CDATA elements
775 if (eHTMLTag_plaintext
== theTag
) {
778 // Note: We check in ConsumeToken() for this flag, and if we see it
779 // we only construct text tokens (which is what we want).
780 mFlags
|= NS_IPARSER_FLAG_PLAIN_TEXT
;
784 if (isCDATA
|| isPCDATA
) {
785 PRBool done
= PR_FALSE
;
786 nsDependentString
endTagName(nsHTMLTags::GetStringValue(theTag
));
789 theAllocator
->CreateTokenOfType(eToken_text
, eHTMLTag_text
);
790 NS_ENSURE_TRUE(text
, NS_ERROR_OUT_OF_MEMORY
);
792 CTextToken
* textToken
= static_cast<CTextToken
*>(text
);
795 result
= textToken
->ConsumeCharacterData(theTag
!= eHTMLTag_script
,
801 // Only flush tokens for <script>, to give ourselves more of a
802 // chance of allowing inlines to contain blocks.
803 aFlushTokens
= done
&& theTag
== eHTMLTag_script
;
804 } else if (isPCDATA
) {
805 // Title is consumed conservatively in order to not regress
807 result
= textToken
->ConsumeParsedCharacterData(
808 theTag
== eHTMLTag_textarea
,
809 theTag
== eHTMLTag_title
,
815 // Note: we *don't* set aFlushTokens here.
818 // We want to do this unless result is kEOF, in which case we will
819 // simply unwind our stack and wait for more data anyway.
820 if (kEOF
!= result
) {
821 AddToken(text
, NS_OK
, &mTokenDeque
, theAllocator
);
822 CToken
* endToken
= nsnull
;
824 if (NS_SUCCEEDED(result
) && done
) {
827 result
= aScanner
.GetChar(theChar
);
828 NS_ASSERTION(NS_SUCCEEDED(result
) && theChar
== kLessThan
,
829 "CTextToken::Consume*Data is broken!");
831 // Ensure we have a /
832 PRUnichar tempChar
; // Don't change non-debug vars in debug-only code
833 result
= aScanner
.Peek(tempChar
);
834 NS_ASSERTION(NS_SUCCEEDED(result
) && tempChar
== kForwardSlash
,
835 "CTextToken::Consume*Data is broken!");
837 result
= ConsumeEndTag(PRUnichar('/'), endToken
, aScanner
);
838 if (!(mFlags
& NS_IPARSER_FLAG_VIEW_SOURCE
) &&
839 NS_SUCCEEDED(result
)) {
840 // If ConsumeCharacterData returned a success result (and
841 // we're not in view source), then we want to make sure that
842 // we're going to execute this script (since the result means
843 // that we've found an end tag that satisfies all of the right
845 endToken
->SetInError(PR_FALSE
);
847 } else if (result
== kFakeEndTag
&&
848 !(mFlags
& NS_IPARSER_FLAG_VIEW_SOURCE
)) {
850 endToken
= theAllocator
->CreateTokenOfType(eToken_end
, theTag
,
852 AddToken(endToken
, result
, &mTokenDeque
, theAllocator
);
853 if (NS_LIKELY(endToken
!= nsnull
)) {
854 endToken
->SetInError(PR_TRUE
);
857 result
= NS_ERROR_OUT_OF_MEMORY
;
859 } else if (result
== kFakeEndTag
) {
860 // If we are here, we are both faking having seen the end tag
861 // and are in view-source.
865 IF_FREE(text
, mTokenAllocator
);
870 // This code is confusing, so pay attention.
871 // If you're here, it's because we were in the midst of consuming a start
872 // tag but ran out of data (not in the stream, but in this *part* of the
873 // stream. For simplicity, we have to unwind our input. Therefore, we pop
874 // and discard any new tokens we've queued this round. Later we can get
875 // smarter about this.
876 if (NS_FAILED(result
)) {
877 while (mTokenDeque
.GetSize()>theDequeSize
) {
878 CToken
* theToken
= (CToken
*)mTokenDeque
.Pop();
879 IF_FREE(theToken
, mTokenAllocator
);
883 IF_FREE(aToken
, mTokenAllocator
);
890 * This method consumes an end tag and any "attributes" that may come after it.
892 * @param aChar The last character read from the scanner.
893 * @param aToken The OUT parameter that holds our resulting token.
894 * @param aScanner Our source of data
895 * @return Error result
898 nsHTMLTokenizer::ConsumeEndTag(PRUnichar aChar
,
902 // Get the "/" (we've already seen it with a Peek)
903 aScanner
.GetChar(aChar
);
905 nsTokenAllocator
* theAllocator
= this->GetTokenAllocator();
906 aToken
= theAllocator
->CreateTokenOfType(eToken_end
, eHTMLTag_unknown
);
907 NS_ENSURE_TRUE(aToken
, NS_ERROR_OUT_OF_MEMORY
);
909 // Remember this for later in case you have to unwind...
910 PRInt32 theDequeSize
= mTokenDeque
.GetSize();
911 nsresult result
= NS_OK
;
913 // Tell the new token to finish consuming text...
914 result
= aToken
->Consume(aChar
, aScanner
, mFlags
);
915 AddToken(aToken
, result
, &mTokenDeque
, theAllocator
);
916 if (NS_FAILED(result
)) {
917 // Note that this early-return here is safe because we have not yet
918 // added any of our tokens to the queue (AddToken only adds the token if
919 // result is a success), so we don't need to fall through.
923 result
= aScanner
.Peek(aChar
);
924 if (NS_FAILED(result
)) {
925 aToken
->SetInError(PR_TRUE
);
927 // Note: We know here that the scanner is not incremental since if
928 // this peek fails, then we've already masked over a kEOF coming from
929 // the Consume() call above.
933 if (kGreaterThan
!= aChar
) {
934 result
= ConsumeAttributes(aChar
, aToken
, aScanner
);
936 aScanner
.GetChar(aChar
);
939 // Do the same thing as we do in ConsumeStartTag. Basically, if we've run
940 // out of room in this *section* of the document, pop all of the tokens
941 // we've consumed this round and wait for more data.
942 if (NS_FAILED(result
)) {
943 while (mTokenDeque
.GetSize() > theDequeSize
) {
944 CToken
* theToken
= (CToken
*)mTokenDeque
.Pop();
945 IF_FREE(theToken
, mTokenAllocator
);
953 * This method is called just after a "&" has been consumed
954 * and we know we're at the start of an entity.
956 * @param aChar The last character read from the scanner.
957 * @param aToken The OUT parameter that holds our resulting token.
958 * @param aScanner Our source of data
959 * @return Error result.
962 nsHTMLTokenizer::ConsumeEntity(PRUnichar aChar
,
967 nsresult result
= aScanner
.Peek(theChar
, 1);
969 nsTokenAllocator
* theAllocator
= this->GetTokenAllocator();
970 if (NS_SUCCEEDED(result
)) {
971 if (nsCRT::IsAsciiAlpha(theChar
) || theChar
== kHashsign
) {
972 aToken
= theAllocator
->CreateTokenOfType(eToken_entity
, eHTMLTag_entity
);
973 NS_ENSURE_TRUE(aToken
, NS_ERROR_OUT_OF_MEMORY
);
974 result
= aToken
->Consume(theChar
, aScanner
, mFlags
);
976 if (result
== NS_HTMLTOKENS_NOT_AN_ENTITY
) {
977 IF_FREE(aToken
, mTokenAllocator
);
979 if (result
== kEOF
&& !aScanner
.IsIncremental()) {
980 result
= NS_OK
; // Use as much of the entity as you can get.
983 AddToken(aToken
, result
, &mTokenDeque
, theAllocator
);
988 // Oops, we're actually looking at plain text...
989 result
= ConsumeText(aToken
, aScanner
);
990 } else if (result
== kEOF
&& !aScanner
.IsIncremental()) {
991 // If the last character in the file is an &, consume it as text.
992 result
= ConsumeText(aToken
, aScanner
);
994 aToken
->SetInError(PR_TRUE
);
1003 * This method is called just after whitespace has been
1004 * consumed and we know we're at the start a whitespace run.
1006 * @param aChar The last character read from the scanner.
1007 * @param aToken The OUT parameter that holds our resulting token.
1008 * @param aScanner Our source of data
1009 * @return Error result.
1012 nsHTMLTokenizer::ConsumeWhitespace(PRUnichar aChar
,
1014 nsScanner
& aScanner
)
1016 // Get the whitespace character
1017 aScanner
.GetChar(aChar
);
1019 nsTokenAllocator
* theAllocator
= this->GetTokenAllocator();
1020 aToken
= theAllocator
->CreateTokenOfType(eToken_whitespace
,
1021 eHTMLTag_whitespace
);
1022 nsresult result
= NS_OK
;
1024 result
= aToken
->Consume(aChar
, aScanner
, mFlags
);
1025 AddToken(aToken
, result
, &mTokenDeque
, theAllocator
);
1032 * This method is called just after a "<!" has been consumed
1033 * and we know we're at the start of a comment.
1035 * @param aChar The last character read from the scanner.
1036 * @param aToken The OUT parameter that holds our resulting token.
1037 * @param aScanner Our source of data
1038 * @return Error result.
1041 nsHTMLTokenizer::ConsumeComment(PRUnichar aChar
,
1043 nsScanner
& aScanner
)
1046 aScanner
.GetChar(aChar
);
1048 nsTokenAllocator
* theAllocator
= this->GetTokenAllocator();
1049 aToken
= theAllocator
->CreateTokenOfType(eToken_comment
, eHTMLTag_comment
);
1050 nsresult result
= NS_OK
;
1052 result
= aToken
->Consume(aChar
, aScanner
, mFlags
);
1053 AddToken(aToken
, result
, &mTokenDeque
, theAllocator
);
1056 if (kNotAComment
== result
) {
1057 // AddToken has IF_FREE()'d our token, so...
1058 result
= ConsumeText(aToken
, aScanner
);
1065 * This method is called just after a known text char has
1066 * been consumed and we should read a text run. Note: we actually ignore the
1067 * first character of the text run so that we can consume invalid markup
1070 * @param aToken The OUT parameter that holds our resulting token.
1071 * @param aScanner Our source of data
1072 * @return Error result.
1075 nsHTMLTokenizer::ConsumeText(CToken
*& aToken
, nsScanner
& aScanner
)
1077 nsresult result
= NS_OK
;
1078 nsTokenAllocator
* theAllocator
= this->GetTokenAllocator();
1079 CTextToken
* theToken
=
1080 (CTextToken
*)theAllocator
->CreateTokenOfType(eToken_text
, eHTMLTag_text
);
1082 PRUnichar ch
= '\0';
1083 result
= theToken
->Consume(ch
, aScanner
, mFlags
);
1084 if (NS_FAILED(result
)) {
1085 if (0 == theToken
->GetTextLength()) {
1086 IF_FREE(aToken
, mTokenAllocator
);
1094 AddToken(aToken
, result
, &mTokenDeque
, theAllocator
);
1101 * This method is called just after a "<!" has been consumed.
1102 * NOTE: Here we might consume DOCTYPE and "special" markups.
1104 * @param aChar The last character read from the scanner.
1105 * @param aToken The OUT parameter that holds our resulting token.
1106 * @param aScanner Our source of data
1107 * @return Error result.
1110 nsHTMLTokenizer::ConsumeSpecialMarkup(PRUnichar aChar
,
1112 nsScanner
& aScanner
)
1115 aScanner
.GetChar(aChar
);
1117 nsresult result
= NS_OK
;
1118 nsAutoString theBufCopy
;
1119 aScanner
.Peek(theBufCopy
, 20);
1120 ToUpperCase(theBufCopy
);
1121 PRInt32 theIndex
= theBufCopy
.Find("DOCTYPE", PR_FALSE
, 0, 0);
1122 nsTokenAllocator
* theAllocator
= this->GetTokenAllocator();
1124 if (theIndex
== kNotFound
) {
1125 if ('[' == theBufCopy
.CharAt(0)) {
1126 aToken
= theAllocator
->CreateTokenOfType(eToken_cdatasection
,
1128 } else if (StringBeginsWith(theBufCopy
, NS_LITERAL_STRING("ELEMENT")) ||
1129 StringBeginsWith(theBufCopy
, NS_LITERAL_STRING("ATTLIST")) ||
1130 StringBeginsWith(theBufCopy
, NS_LITERAL_STRING("ENTITY")) ||
1131 StringBeginsWith(theBufCopy
, NS_LITERAL_STRING("NOTATION"))) {
1132 aToken
= theAllocator
->CreateTokenOfType(eToken_markupDecl
,
1133 eHTMLTag_markupDecl
);
1135 aToken
= theAllocator
->CreateTokenOfType(eToken_comment
,
1139 aToken
= theAllocator
->CreateTokenOfType(eToken_doctypeDecl
,
1140 eHTMLTag_doctypeDecl
);
1144 result
= aToken
->Consume(aChar
, aScanner
, mFlags
);
1145 AddToken(aToken
, result
, &mTokenDeque
, theAllocator
);
1148 if (result
== kNotAComment
) {
1149 result
= ConsumeText(aToken
, aScanner
);
1156 * This method is called just after a newline has been consumed.
1158 * @param aChar The last character read from the scanner.
1159 * @param aToken The OUT parameter that holds our resulting token.
1160 * @param aScanner Our source of data
1161 * @return Error result.
1164 nsHTMLTokenizer::ConsumeNewline(PRUnichar aChar
,
1166 nsScanner
& aScanner
)
1168 // Get the newline character
1169 aScanner
.GetChar(aChar
);
1171 nsTokenAllocator
* theAllocator
= this->GetTokenAllocator();
1172 aToken
= theAllocator
->CreateTokenOfType(eToken_newline
, eHTMLTag_newline
);
1173 nsresult result
= NS_OK
;
1175 result
= aToken
->Consume(aChar
, aScanner
, mFlags
);
1176 AddToken(aToken
, result
, &mTokenDeque
, theAllocator
);
1184 * This method is called just after a <? has been consumed.
1186 * @param aChar The last character read from the scanner.
1187 * @param aToken The OUT parameter that holds our resulting token.
1188 * @param aScanner Our source of data
1189 * @return Error result.
1192 nsHTMLTokenizer::ConsumeProcessingInstruction(PRUnichar aChar
,
1194 nsScanner
& aScanner
)
1197 aScanner
.GetChar(aChar
);
1199 nsTokenAllocator
* theAllocator
= this->GetTokenAllocator();
1200 aToken
= theAllocator
->CreateTokenOfType(eToken_instruction
,
1202 nsresult result
= NS_OK
;
1204 result
= aToken
->Consume(aChar
, aScanner
, mFlags
);
1205 AddToken(aToken
, result
, &mTokenDeque
, theAllocator
);