Bug 436663. Work around ATSUI crasher caused by long Hebrew sequence. r=roc, sr=vlad
[wine-gecko.git] / parser / htmlparser / src / nsHTMLTokenizer.cpp
blob4aae83a07aa27ed919166efe5979a56193c0779d
1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set sw=2 ts=2 et tw=78: */
3 /* ***** BEGIN LICENSE BLOCK *****
4 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
6 * The contents of this file are subject to the Mozilla Public License Version
7 * 1.1 (the "License"); you may not use this file except in compliance with
8 * the License. You may obtain a copy of the License at
9 * http://www.mozilla.org/MPL/
11 * Software distributed under the License is distributed on an "AS IS" basis,
12 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
13 * for the specific language governing rights and limitations under the
14 * License.
16 * The Original Code is mozilla.org code.
18 * The Initial Developer of the Original Code is
19 * Netscape Communications Corporation.
20 * Portions created by the Initial Developer are Copyright (C) 1998
21 * the Initial Developer. All Rights Reserved.
23 * Contributor(s):
24 * Blake Kaplan <mrbkap@gmail.com>
26 * Alternatively, the contents of this file may be used under the terms of
27 * either of the GNU General Public License Version 2 or later (the "GPL"),
28 * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
29 * in which case the provisions of the GPL or the LGPL are applicable instead
30 * of those above. If you wish to allow use of your version of this file only
31 * under the terms of either the GPL or the LGPL, and not to allow others to
32 * use your version of this file under the terms of the MPL, indicate your
33 * decision by deleting the provisions above and replace them with the notice
34 * and other provisions required by the GPL or the LGPL. If you do not delete
35 * the provisions above, a recipient may use your version of this file under
36 * the terms of any one of the MPL, the GPL or the LGPL.
38 * ***** END LICENSE BLOCK ***** */
41 /**
42 * @file nsHTMLTokenizer.cpp
43 * This is an implementation of the nsITokenizer interface.
44 * This file contains the implementation of a tokenizer to tokenize an HTML
45 * document. It attempts to do so, making tradeoffs between compatibility with
46 * older parsers and the SGML specification. Note that most of the real
47 * "tokenization" takes place in nsHTMLTokens.cpp.
50 #include "nsIAtom.h"
51 #include "nsHTMLTokenizer.h"
52 #include "nsScanner.h"
53 #include "nsElementTable.h"
54 #include "CParserContext.h"
55 #include "nsReadableUtils.h"
56 #include "nsUnicharUtils.h"
58 /************************************************************************
59 And now for the main class -- nsHTMLTokenizer...
60 ************************************************************************/
/**
 * Satisfy the nsISupports interface for nsHTMLTokenizer, exposing it to
 * callers as an nsITokenizer.
 */
65 NS_IMPL_ISUPPORTS1(nsHTMLTokenizer, nsITokenizer)
67 /**
68 * Default constructor
70 * @param aParseMode The current mode the document is in (quirks, etc.)
71 * @param aDocType The document type of the current document
72 * @param aCommand What we are trying to do (view-source, parse a fragment, etc.)
74 nsHTMLTokenizer::nsHTMLTokenizer(PRInt32 aParseMode,
75 eParserDocType aDocType,
76 eParserCommands aCommand,
77 PRUint16 aFlags) :
78 nsITokenizer(), mTokenDeque(0), mFlags(aFlags)
80 if (aParseMode == eDTDMode_full_standards ||
81 aParseMode == eDTDMode_almost_standards) {
82 mFlags |= NS_IPARSER_FLAG_STRICT_MODE;
83 } else if (aParseMode == eDTDMode_quirks) {
84 mFlags |= NS_IPARSER_FLAG_QUIRKS_MODE;
85 } else if (aParseMode == eDTDMode_autodetect) {
86 mFlags |= NS_IPARSER_FLAG_AUTO_DETECT_MODE;
87 } else {
88 mFlags |= NS_IPARSER_FLAG_UNKNOWN_MODE;
91 if (aDocType == ePlainText) {
92 mFlags |= NS_IPARSER_FLAG_PLAIN_TEXT;
93 } else if (aDocType == eXML) {
94 mFlags |= NS_IPARSER_FLAG_XML;
95 } else if (aDocType == eHTML_Quirks ||
96 aDocType == eHTML3_Quirks ||
97 aDocType == eHTML_Strict) {
98 mFlags |= NS_IPARSER_FLAG_HTML;
101 mFlags |= aCommand == eViewSource
102 ? NS_IPARSER_FLAG_VIEW_SOURCE
103 : NS_IPARSER_FLAG_VIEW_NORMAL;
105 NS_ASSERTION(!(mFlags & NS_IPARSER_FLAG_XML) ||
106 (mFlags & NS_IPARSER_FLAG_VIEW_SOURCE),
107 "Why isn't this XML document going through our XML parser?");
109 mTokenAllocator = nsnull;
110 mTokenScanPos = 0;
114 * The destructor ensures that we don't leak any left over tokens.
116 nsHTMLTokenizer::~nsHTMLTokenizer()
118 if (mTokenDeque.GetSize()) {
119 CTokenDeallocator theDeallocator(mTokenAllocator->GetArenaPool());
120 mTokenDeque.ForEach(theDeallocator);
125 /*******************************************************************
126 Here begins the real working methods for the tokenizer.
127 *******************************************************************/
130 * Adds a token onto the end of the deque if aResult is a successful result.
131 * Otherwise, this function frees aToken and sets it to nsnull.
133 * @param aToken The token that wants to be added.
134 * @param aResult The error code that will be used to determine if we actually
135 * want to push this token.
136 * @param aDeque The deque we want to push aToken onto.
137 * @param aTokenAllocator The allocator we use to free aToken in case aResult
138 * is not a success code.
140 /* static */
141 void
142 nsHTMLTokenizer::AddToken(CToken*& aToken,
143 nsresult aResult,
144 nsDeque* aDeque,
145 nsTokenAllocator* aTokenAllocator)
147 if (aToken && aDeque) {
148 if (NS_SUCCEEDED(aResult)) {
149 aDeque->Push(aToken);
150 } else {
151 IF_FREE(aToken, aTokenAllocator);
157 * Retrieve a pointer to the global token recycler...
159 * @return Pointer to recycler (or null)
161 nsTokenAllocator*
162 nsHTMLTokenizer::GetTokenAllocator()
164 return mTokenAllocator;
168 * This method provides access to the topmost token in the tokenDeque.
169 * The token is not really removed from the list.
171 * @return Pointer to token
173 CToken*
174 nsHTMLTokenizer::PeekToken()
176 return (CToken*)mTokenDeque.PeekFront();
180 * This method provides access to the topmost token in the tokenDeque.
181 * The token is really removed from the list; if the list is empty we return 0.
183 * @return Pointer to token or NULL
185 CToken*
186 nsHTMLTokenizer::PopToken()
188 return (CToken*)mTokenDeque.PopFront();
193 * Pushes a token onto the front of our deque such that the next call to
194 * PopToken() or PeekToken() will return that token.
196 * @param theToken The next token to be processed
197 * @return theToken
199 CToken*
200 nsHTMLTokenizer::PushTokenFront(CToken* theToken)
202 mTokenDeque.PushFront(theToken);
203 return theToken;
207 * Pushes a token onto the deque.
209 * @param theToken the new token.
210 * @return theToken
212 CToken*
213 nsHTMLTokenizer::PushToken(CToken* theToken)
215 mTokenDeque.Push(theToken);
216 return theToken;
220 * Returns the size of the deque.
222 * @return The number of remaining tokens.
224 PRInt32
225 nsHTMLTokenizer::GetCount()
227 return mTokenDeque.GetSize();
231 * Allows access to an arbitrary token in the deque. The accessed token is left
232 * in the deque.
234 * @param anIndex The index of the target token. Token 0 would be the same as
235 * the result of a call to PeekToken()
236 * @return The requested token.
238 CToken*
239 nsHTMLTokenizer::GetTokenAt(PRInt32 anIndex)
241 return (CToken*)mTokenDeque.ObjectAt(anIndex);
245 * This method is part of the "sandwich" that occurs when we want to tokenize
246 * a document. This prepares us to be able to tokenize properly.
248 * @param aIsFinalChunk Whether this is the last chunk of data that we will
249 * get to see.
250 * @param aTokenAllocator The token allocator to use for this document.
251 * @return Our success in setting up.
253 nsresult
254 nsHTMLTokenizer::WillTokenize(PRBool aIsFinalChunk,
255 nsTokenAllocator* aTokenAllocator)
257 mTokenAllocator = aTokenAllocator;
258 mIsFinalChunk = aIsFinalChunk;
260 // Cause ScanDocStructure to search from here for new tokens...
261 mTokenScanPos = mTokenDeque.GetSize();
262 return NS_OK;
266 * Pushes all of the tokens in aDeque onto the front of our deque so they
267 * get processed before any other tokens.
269 * @param aDeque The deque with the tokens in it.
271 void
272 nsHTMLTokenizer::PrependTokens(nsDeque& aDeque)
274 PRInt32 aCount = aDeque.GetSize();
276 for (PRInt32 anIndex = 0; anIndex < aCount; ++anIndex) {
277 CToken* theToken = (CToken*)aDeque.Pop();
278 PushTokenFront(theToken);
283 * Copies the state flags from aTokenizer into this tokenizer. This is used
284 * to pass information around between the main tokenizer and tokenizers
285 * created for document.write() calls.
287 * @param aTokenizer The tokenizer with more information in it.
288 * @return NS_OK
290 nsresult
291 nsHTMLTokenizer::CopyState(nsITokenizer* aTokenizer)
293 if (aTokenizer) {
294 mFlags = ((nsHTMLTokenizer*)aTokenizer)->mFlags;
297 return NS_OK;
301 * This is a utilty method for ScanDocStructure, which finds a given
302 * tag in the stack. The return value is meant to be used with
303 * nsDeque::ObjectAt() on aTagStack.
305 * @param aTag -- the ID of the tag we're seeking
306 * @param aTagStack -- the stack to be searched
307 * @return index position of tag in stack if found, otherwise kNotFound
309 static PRInt32
310 FindLastIndexOfTag(eHTMLTags aTag, nsDeque &aTagStack)
312 PRInt32 theCount = aTagStack.GetSize();
314 while (0 < theCount) {
315 CHTMLToken* theToken = (CHTMLToken*)aTagStack.ObjectAt(--theCount);
316 if (theToken) {
317 eHTMLTags theTag = (eHTMLTags)theToken->GetTypeID();
318 if (theTag == aTag) {
319 return theCount;
324 return kNotFound;
/**
 * This method scans the sequence of tokens to determine whether or not the
 * tag structure of the document is well formed. In well formed cases, we can
 * skip doing residual style handling and allow inlines to contain block-level
 * elements.
 *
 * @param aFinalChunk Is unused.
 * @return Success (currently, this function cannot fail).
 */
336 nsresult nsHTMLTokenizer::ScanDocStructure(PRBool aFinalChunk)
338 nsresult result = NS_OK;
339 if (!mTokenDeque.GetSize()) {
340 return result;
// mTokenScanPos was set by WillTokenize to the first token queued this chunk.
343 CHTMLToken* theToken = (CHTMLToken*)mTokenDeque.ObjectAt(mTokenScanPos);
345 // Start by finding the first start tag that hasn't been reviewed.
346 while (mTokenScanPos > 0) {
347 if (theToken) {
348 eHTMLTokenTypes theType = eHTMLTokenTypes(theToken->GetTokenType());
349 if (theType == eToken_start &&
350 theToken->GetContainerInfo() == eFormUnknown) {
351 break;
354 theToken = (CHTMLToken*)mTokenDeque.ObjectAt(--mTokenScanPos);
357 // Now that we know where to start, let's walk through the
358 // tokens to see which are well-formed. Stop when you run out
359 // of fresh tokens.
// theStack mirrors the currently-open containers; tempStack is scratch
// space used while unwinding to a mismatched end tag.
361 nsDeque theStack(0);
362 nsDeque tempStack(0);
363 PRInt32 theStackDepth = 0;
364 // Don't bother if we get ridiculously deep.
365 static const PRInt32 theMaxStackDepth = 200;
367 while (theToken && theStackDepth < theMaxStackDepth) {
368 eHTMLTokenTypes theType = eHTMLTokenTypes(theToken->GetTokenType());
369 eHTMLTags theTag = (eHTMLTags)theToken->GetTypeID();
371 if (nsHTMLElement::IsContainer(theTag)) { // Bug 54117
372 PRBool theTagIsBlock = gHTMLElements[theTag].IsMemberOf(kBlockEntity);
373 PRBool theTagIsInline = theTagIsBlock
374 ? PR_FALSE
375 : gHTMLElements[theTag].IsMemberOf(kInlineEntity);
377 if (theTagIsBlock || theTagIsInline || eHTMLTag_table == theTag) {
378 switch(theType) {
379 case eToken_start:
381 if (gHTMLElements[theTag].ShouldVerifyHierarchy()) {
382 PRInt32 earlyPos = FindLastIndexOfTag(theTag, theStack);
383 if (earlyPos != kNotFound) {
384 // Uh-oh, we've found a tag that is not allowed to nest at
385 // all. Mark the previous one and all of its children as
386 // malformed to increase our chances of doing RS handling
387 // on all of them. We want to do this for cases such as:
388 // <a><div><a></a></div></a>.
389 // Note that we have to iterate through all of the children
390 // of the original malformed tag to protect against:
391 // <a><font><div><a></a></div></font></a>, so that the <font>
392 // is allowed to contain the <div>.
393 // XXX What about <a><span><a>, where the second <a> closes
394 // the <span>?
395 nsDequeIterator it(theStack, earlyPos), end(theStack.End());
396 while (it < end) {
397 CHTMLToken *theMalformedToken =
398 static_cast<CHTMLToken*>(it++);
400 theMalformedToken->SetContainerInfo(eMalformed);
// Every start tag of interest is pushed as a newly-open container.
405 theStack.Push(theToken);
406 ++theStackDepth;
408 break;
409 case eToken_end:
411 CHTMLToken *theLastToken =
412 static_cast<CHTMLToken*>(theStack.Peek());
413 if (theLastToken) {
414 if (theTag == theLastToken->GetTypeID()) {
415 theStack.Pop(); // Yank it for real
416 theStackDepth--;
417 theLastToken->SetContainerInfo(eWellFormed);
418 } else {
419 // This token wasn't what we expected it to be! We need to
420 // go searching for its real start tag on our stack. Each
421 // tag in between the end tag and start tag must be malformed
423 if (FindLastIndexOfTag(theTag, theStack) != kNotFound) {
424 // Find theTarget in the stack, marking each (malformed!)
425 // tag in our way.
426 theStack.Pop(); // Pop off theLastToken for real.
427 do {
428 theLastToken->SetContainerInfo(eMalformed);
429 tempStack.Push(theLastToken);
430 theLastToken = static_cast<CHTMLToken*>(theStack.Pop());
431 } while (theLastToken && theTag != theLastToken->GetTypeID());
432 // XXX The above test can confuse two different userdefined
433 // tags.
435 NS_ASSERTION(theLastToken,
436 "FindLastIndexOfTag lied to us!"
437 " We couldn't find theTag on theStack");
438 theLastToken->SetContainerInfo(eMalformed);
440 // Great, now push all of the other tokens back onto the
441 // stack to preserve the general structure of the document.
442 // Note that we don't push the target token back onto the
443 // the stack (since it was just closed).
444 while (tempStack.GetSize() != 0) {
445 theStack.Push(tempStack.Pop());
451 break;
452 default:
453 break;
// Advance to the next queued token; loop ends when we run off the deque.
458 theToken = (CHTMLToken*)mTokenDeque.ObjectAt(++mTokenScanPos);
461 return result;
465 * This method is called after we're done tokenizing a chunk of data.
467 * @param aFinalChunk Tells us if this was the last chunk of data.
468 * @return Error result.
470 nsresult
471 nsHTMLTokenizer::DidTokenize(PRBool aFinalChunk)
473 return ScanDocStructure(aFinalChunk);
477 * This method is repeatedly called by the tokenizer.
478 * Each time, we determine the kind of token we're about to
479 * read, and then we call the appropriate method to handle
480 * that token type.
482 * @param aScanner The source of our input.
483 * @param aFlushTokens An OUT parameter to tell the caller whether it should
484 * process our queued tokens up to now (e.g., when we
485 * reach a <script>).
486 * @return Success or error
488 nsresult
489 nsHTMLTokenizer::ConsumeToken(nsScanner& aScanner, PRBool& aFlushTokens)
491 PRUnichar theChar;
492 CToken* theToken = nsnull;
494 nsresult result = aScanner.Peek(theChar);
496 switch(result) {
497 case kEOF:
498 // Tell our caller that'we finished.
499 return result;
501 case NS_OK:
502 default:
503 if (!(mFlags & NS_IPARSER_FLAG_PLAIN_TEXT)) {
504 if (kLessThan == theChar) {
505 return ConsumeTag(theChar, theToken, aScanner, aFlushTokens);
506 } else if (kAmpersand == theChar) {
507 return ConsumeEntity(theChar, theToken, aScanner);
511 if (kCR == theChar || kLF == theChar) {
512 return ConsumeNewline(theChar, theToken, aScanner);
513 } else {
514 if (!nsCRT::IsAsciiSpace(theChar)) {
515 if (theChar != '\0') {
516 result = ConsumeText(theToken, aScanner);
517 } else {
518 // Skip the embedded null char. Fix bug 64098.
519 aScanner.GetChar(theChar);
521 break;
523 result = ConsumeWhitespace(theChar, theToken, aScanner);
525 break;
528 return result;
/**
 * This method is called just after a "<" has been consumed
 * and we know we're at the start of some kind of tagged
 * element. We don't know yet if it's a tag or a comment.
 *
 * @param aChar is the last char read
 * @param aToken is the out arg holding our new token (the function allocates
 *        the return token using mTokenAllocator).
 * @param aScanner represents our input source
 * @param aFlushTokens is an OUT parameter use to tell consumers to flush
 *        the current tokens after processing the current one.
 * @return error code.
 */
544 nsresult
545 nsHTMLTokenizer::ConsumeTag(PRUnichar aChar,
546 CToken*& aToken,
547 nsScanner& aScanner,
548 PRBool& aFlushTokens)
// Peek one past the "<" to decide what kind of construct follows.
550 PRUnichar theNextChar, oldChar;
551 nsresult result = aScanner.Peek(aChar, 1);
553 if (NS_OK == result) {
554 switch (aChar) {
// "</..." is either a real end tag or a comment-like bogus construct.
555 case kForwardSlash:
556 result = aScanner.Peek(theNextChar, 2);
558 if (NS_OK == result) {
559 // Get the original "<" (we've already seen it with a Peek)
560 aScanner.GetChar(oldChar);
562 // XML allows non ASCII tag names, consume this as an end tag. This
563 // is needed to make XML view source work
564 PRBool isXML = !!(mFlags & NS_IPARSER_FLAG_XML);
565 if (nsCRT::IsAsciiAlpha(theNextChar) ||
566 kGreaterThan == theNextChar ||
567 (isXML && !nsCRT::IsAscii(theNextChar))) {
568 result = ConsumeEndTag(aChar, aToken, aScanner);
569 } else {
570 result = ConsumeComment(aChar, aToken, aScanner);
574 break;
// "<!..." is a comment ("<!-" / "<!>") or DOCTYPE-style special markup.
576 case kExclamation:
577 result = aScanner.Peek(theNextChar, 2);
579 if (NS_OK == result) {
580 // Get the original "<" (we've already seen it with a Peek)
581 aScanner.GetChar(oldChar);
583 if (kMinus == theNextChar || kGreaterThan == theNextChar) {
584 result = ConsumeComment(aChar, aToken, aScanner);
585 } else {
586 result = ConsumeSpecialMarkup(aChar, aToken, aScanner);
589 break;
591 case kQuestionMark:
592 // It must be a processing instruction...
593 // Get the original "<" (we've already seen it with a Peek)
594 aScanner.GetChar(oldChar);
595 result = ConsumeProcessingInstruction(aChar, aToken, aScanner);
596 break;
598 default:
599 // XML allows non ASCII tag names, consume this as a start tag.
600 PRBool isXML = !!(mFlags & NS_IPARSER_FLAG_XML);
601 if (nsCRT::IsAsciiAlpha(aChar) ||
602 (isXML && !nsCRT::IsAscii(aChar))) {
603 // Get the original "<" (we've already seen it with a Peek)
604 aScanner.GetChar(oldChar);
605 result = ConsumeStartTag(aChar, aToken, aScanner, aFlushTokens);
606 } else {
607 // We are not dealing with a tag. So, don't consume the original
608 // char and leave the decision to ConsumeText().
609 result = ConsumeText(aToken, aScanner);
614 // Last ditch attempt to make sure we don't lose data.
615 if (kEOF == result && !aScanner.IsIncremental()) {
616 // Whoops, we don't want to lose any data! Consume the rest as text.
617 // This normally happens for either a trailing < or </
618 result = ConsumeText(aToken, aScanner);
621 return result;
/**
 * This method is called just after we've consumed a start or end
 * tag, and we now have to consume its attributes.
 *
 * @param aChar is the last char read
 * @param aToken is the start or end tag that "owns" these attributes.
 * @param aScanner represents our input source
 * @return Error result.
 */
633 nsresult
634 nsHTMLTokenizer::ConsumeAttributes(PRUnichar aChar,
635 CToken* aToken,
636 nsScanner& aScanner)
638 PRBool done = PR_FALSE;
639 nsresult result = NS_OK;
640 PRInt16 theAttrCount = 0;
642 nsTokenAllocator* theAllocator = this->GetTokenAllocator();
// Consume one attribute token per iteration until we see '>' or an error.
644 while (!done && result == NS_OK) {
645 CAttributeToken* theToken =
646 static_cast<CAttributeToken*>
647 (theAllocator->CreateTokenOfType(eToken_attribute,
648 eHTMLTag_unknown));
649 if (NS_LIKELY(theToken != nsnull)) {
650 // Tell the new token to finish consuming text...
651 result = theToken->Consume(aChar, aScanner, mFlags);
653 if (NS_SUCCEEDED(result)) {
654 ++theAttrCount;
655 AddToken((CToken*&)theToken, result, &mTokenDeque, theAllocator);
656 } else {
657 IF_FREE(theToken, mTokenAllocator);
658 // Bad attribute returns shouldn't propagate out.
659 if (NS_ERROR_HTMLPARSER_BADATTRIBUTE == result) {
660 result = NS_OK;
664 else {
665 result = NS_ERROR_OUT_OF_MEMORY;
668 #ifdef DEBUG
669 if (NS_SUCCEEDED(result)) {
670 PRInt32 newline = 0;
671 aScanner.SkipWhitespace(newline);
672 NS_ASSERTION(newline == 0,
673 "CAttribute::Consume() failed to collect all the newlines!");
675 #endif
676 if (NS_SUCCEEDED(result)) {
677 result = aScanner.Peek(aChar);
678 if (NS_SUCCEEDED(result)) {
679 if (aChar == kGreaterThan) { // You just ate the '>'
680 aScanner.GetChar(aChar); // Skip the '>'
681 done = PR_TRUE;
// A stray '<' inside the tag: stop and mark the owning tag as in error.
682 } else if (aChar == kLessThan) {
683 aToken->SetInError(PR_TRUE);
684 done = PR_TRUE;
690 if (NS_FAILED(result)) {
691 aToken->SetInError(PR_TRUE);
// If we truly ran out of data (non-incremental), don't surface the failure.
693 if (!aScanner.IsIncremental()) {
694 result = NS_OK;
698 aToken->SetAttributeCount(theAttrCount);
699 return result;
/**
 * This method consumes a start tag and all of its attributes.
 *
 * @param aChar The last character read from the scanner.
 * @param aToken The OUT parameter that holds our resulting token (allocated
 *        by the function using mTokenAllocator).
 * @param aScanner Our source of data
 * @param aFlushTokens is an OUT parameter use to tell consumers to flush
 *        the current tokens after processing the current one.
 * @return Error result.
 */
713 nsresult
714 nsHTMLTokenizer::ConsumeStartTag(PRUnichar aChar,
715 CToken*& aToken,
716 nsScanner& aScanner,
717 PRBool& aFlushTokens)
719 // Remember this for later in case you have to unwind...
720 PRInt32 theDequeSize = mTokenDeque.GetSize();
721 nsresult result = NS_OK;
723 nsTokenAllocator* theAllocator = this->GetTokenAllocator();
724 aToken = theAllocator->CreateTokenOfType(eToken_start, eHTMLTag_unknown);
725 NS_ENSURE_TRUE(aToken, NS_ERROR_OUT_OF_MEMORY);
727 // Tell the new token to finish consuming text...
728 result = aToken->Consume(aChar, aScanner, mFlags);
730 if (NS_SUCCEEDED(result)) {
731 AddToken(aToken, result, &mTokenDeque, theAllocator);
733 eHTMLTags theTag = (eHTMLTags)aToken->GetTypeID();
735 // Good. Now, let's see if the next char is ">".
736 // If so, we have a complete tag, otherwise, we have attributes.
737 result = aScanner.Peek(aChar);
738 if (NS_FAILED(result)) {
739 aToken->SetInError(PR_TRUE);
741 // Don't return early here so we can create a text and end token for
742 // the special <iframe>, <script> and similar tags down below.
743 result = NS_OK;
744 } else {
745 if (kGreaterThan != aChar) { // Look for a '>'
746 result = ConsumeAttributes(aChar, aToken, aScanner);
747 } else {
748 aScanner.GetChar(aChar);
752 /* Now that that's over with, we have one more problem to solve.
753 In the case that we just read a <SCRIPT> or <STYLE> tags, we should go and
754 consume all the content itself.
755 But XML doesn't treat these tags differently, so we shouldn't if the
756 document is XML.
758 if (NS_SUCCEEDED(result) && !(mFlags & NS_IPARSER_FLAG_XML)) {
759 PRBool isCDATA = gHTMLElements[theTag].CanContainType(kCDATA);
760 PRBool isPCDATA = eHTMLTag_textarea == theTag ||
761 eHTMLTag_title == theTag;
763 // XXX This is an evil hack, we should be able to handle these properly
764 // in the DTD.
765 if ((eHTMLTag_iframe == theTag &&
766 (mFlags & NS_IPARSER_FLAG_FRAMES_ENABLED)) ||
767 (eHTMLTag_noframes == theTag &&
768 (mFlags & NS_IPARSER_FLAG_FRAMES_ENABLED)) ||
769 (eHTMLTag_noscript == theTag &&
770 (mFlags & NS_IPARSER_FLAG_SCRIPT_ENABLED)) ||
771 (eHTMLTag_noembed == theTag)) {
772 isCDATA = PR_TRUE;
775 // Plaintext contains CDATA, but it's special, so we handle it
776 // differently than the other CDATA elements
777 if (eHTMLTag_plaintext == theTag) {
778 isCDATA = PR_FALSE;
780 // Note: We check in ConsumeToken() for this flag, and if we see it
781 // we only construct text tokens (which is what we want).
782 mFlags |= NS_IPARSER_FLAG_PLAIN_TEXT;
// For raw-text-like content, swallow everything up to the matching close
// tag as one big text token, then consume (or fabricate) the end tag.
786 if (isCDATA || isPCDATA) {
787 PRBool done = PR_FALSE;
788 nsDependentString endTagName(nsHTMLTags::GetStringValue(theTag));
790 CToken* text =
791 theAllocator->CreateTokenOfType(eToken_text, eHTMLTag_text);
792 NS_ENSURE_TRUE(text, NS_ERROR_OUT_OF_MEMORY);
794 CTextToken* textToken = static_cast<CTextToken*>(text);
796 if (isCDATA) {
797 result = textToken->ConsumeCharacterData(theTag != eHTMLTag_script,
798 aScanner,
799 endTagName,
800 mFlags,
801 done);
803 // Only flush tokens for <script>, to give ourselves more of a
804 // chance of allowing inlines to contain blocks.
805 aFlushTokens = done && theTag == eHTMLTag_script;
806 } else if (isPCDATA) {
807 // Title is consumed conservatively in order to not regress
808 // bug 42945
809 result = textToken->ConsumeParsedCharacterData(
810 theTag == eHTMLTag_textarea,
811 theTag == eHTMLTag_title,
812 aScanner,
813 endTagName,
814 mFlags,
815 done);
817 // Note: we *don't* set aFlushTokens here.
820 // We want to do this unless result is kEOF, in which case we will
821 // simply unwind our stack and wait for more data anyway.
822 if (kEOF != result) {
823 AddToken(text, NS_OK, &mTokenDeque, theAllocator);
824 CToken* endToken = nsnull;
826 if (NS_SUCCEEDED(result) && done) {
827 PRUnichar theChar;
828 // Get the <
829 result = aScanner.GetChar(theChar);
830 NS_ASSERTION(NS_SUCCEEDED(result) && theChar == kLessThan,
831 "CTextToken::Consume*Data is broken!");
832 #ifdef DEBUG
833 // Ensure we have a /
834 PRUnichar tempChar; // Don't change non-debug vars in debug-only code
835 result = aScanner.Peek(tempChar);
836 NS_ASSERTION(NS_SUCCEEDED(result) && tempChar == kForwardSlash,
837 "CTextToken::Consume*Data is broken!");
838 #endif
839 result = ConsumeEndTag(PRUnichar('/'), endToken, aScanner);
840 if (!(mFlags & NS_IPARSER_FLAG_VIEW_SOURCE) &&
841 NS_SUCCEEDED(result)) {
842 // If ConsumeCharacterData returned a success result (and
843 // we're not in view source), then we want to make sure that
844 // we're going to execute this script (since the result means
845 // that we've found an end tag that satisfies all of the right
846 // conditions).
847 endToken->SetInError(PR_FALSE);
849 } else if (result == kFakeEndTag &&
850 !(mFlags & NS_IPARSER_FLAG_VIEW_SOURCE)) {
851 result = NS_OK;
852 endToken = theAllocator->CreateTokenOfType(eToken_end, theTag,
853 endTagName);
854 AddToken(endToken, result, &mTokenDeque, theAllocator);
855 if (NS_LIKELY(endToken != nsnull)) {
856 endToken->SetInError(PR_TRUE);
858 else {
859 result = NS_ERROR_OUT_OF_MEMORY;
861 } else if (result == kFakeEndTag) {
862 // If we are here, we are both faking having seen the end tag
863 // and are in view-source.
864 result = NS_OK;
866 } else {
867 IF_FREE(text, mTokenAllocator);
872 // This code is confusing, so pay attention.
873 // If you're here, it's because we were in the midst of consuming a start
874 // tag but ran out of data (not in the stream, but in this *part* of the
875 // stream. For simplicity, we have to unwind our input. Therefore, we pop
876 // and discard any new tokens we've queued this round. Later we can get
877 // smarter about this.
878 if (NS_FAILED(result)) {
879 while (mTokenDeque.GetSize()>theDequeSize) {
880 CToken* theToken = (CToken*)mTokenDeque.Pop();
881 IF_FREE(theToken, mTokenAllocator);
884 } else {
885 IF_FREE(aToken, mTokenAllocator);
888 return result;
892 * This method consumes an end tag and any "attributes" that may come after it.
894 * @param aChar The last character read from the scanner.
895 * @param aToken The OUT parameter that holds our resulting token.
896 * @param aScanner Our source of data
897 * @return Error result
899 nsresult
900 nsHTMLTokenizer::ConsumeEndTag(PRUnichar aChar,
901 CToken*& aToken,
902 nsScanner& aScanner)
904 // Get the "/" (we've already seen it with a Peek)
905 aScanner.GetChar(aChar);
907 nsTokenAllocator* theAllocator = this->GetTokenAllocator();
908 aToken = theAllocator->CreateTokenOfType(eToken_end, eHTMLTag_unknown);
909 NS_ENSURE_TRUE(aToken, NS_ERROR_OUT_OF_MEMORY);
911 // Remember this for later in case you have to unwind...
912 PRInt32 theDequeSize = mTokenDeque.GetSize();
913 nsresult result = NS_OK;
915 // Tell the new token to finish consuming text...
916 result = aToken->Consume(aChar, aScanner, mFlags);
917 AddToken(aToken, result, &mTokenDeque, theAllocator);
918 if (NS_FAILED(result)) {
919 // Note that this early-return here is safe because we have not yet
920 // added any of our tokens to the queue (AddToken only adds the token if
921 // result is a success), so we don't need to fall through.
922 return result;
925 result = aScanner.Peek(aChar);
926 if (NS_FAILED(result)) {
927 aToken->SetInError(PR_TRUE);
929 // Note: We know here that the scanner is not incremental since if
930 // this peek fails, then we've already masked over a kEOF coming from
931 // the Consume() call above.
932 return NS_OK;
935 if (kGreaterThan != aChar) {
936 result = ConsumeAttributes(aChar, aToken, aScanner);
937 } else {
938 aScanner.GetChar(aChar);
941 // Do the same thing as we do in ConsumeStartTag. Basically, if we've run
942 // out of room in this *section* of the document, pop all of the tokens
943 // we've consumed this round and wait for more data.
944 if (NS_FAILED(result)) {
945 while (mTokenDeque.GetSize() > theDequeSize) {
946 CToken* theToken = (CToken*)mTokenDeque.Pop();
947 IF_FREE(theToken, mTokenAllocator);
951 return result;
955 * This method is called just after a "&" has been consumed
956 * and we know we're at the start of an entity.
958 * @param aChar The last character read from the scanner.
959 * @param aToken The OUT parameter that holds our resulting token.
960 * @param aScanner Our source of data
961 * @return Error result.
963 nsresult
964 nsHTMLTokenizer::ConsumeEntity(PRUnichar aChar,
965 CToken*& aToken,
966 nsScanner& aScanner)
968 PRUnichar theChar;
969 nsresult result = aScanner.Peek(theChar, 1);
971 nsTokenAllocator* theAllocator = this->GetTokenAllocator();
972 if (NS_SUCCEEDED(result)) {
973 if (nsCRT::IsAsciiAlpha(theChar) || theChar == kHashsign) {
974 aToken = theAllocator->CreateTokenOfType(eToken_entity, eHTMLTag_entity);
975 NS_ENSURE_TRUE(aToken, NS_ERROR_OUT_OF_MEMORY);
976 result = aToken->Consume(theChar, aScanner, mFlags);
978 if (result == NS_HTMLTOKENS_NOT_AN_ENTITY) {
979 IF_FREE(aToken, mTokenAllocator);
980 } else {
981 if (result == kEOF && !aScanner.IsIncremental()) {
982 result = NS_OK; // Use as much of the entity as you can get.
985 AddToken(aToken, result, &mTokenDeque, theAllocator);
986 return result;
990 // Oops, we're actually looking at plain text...
991 result = ConsumeText(aToken, aScanner);
992 } else if (result == kEOF && !aScanner.IsIncremental()) {
993 // If the last character in the file is an &, consume it as text.
994 result = ConsumeText(aToken, aScanner);
995 if (aToken) {
996 aToken->SetInError(PR_TRUE);
1000 return result;
1005 * This method is called just after whitespace has been
1006 * consumed and we know we're at the start a whitespace run.
1008 * @param aChar The last character read from the scanner.
1009 * @param aToken The OUT parameter that holds our resulting token.
1010 * @param aScanner Our source of data
1011 * @return Error result.
1013 nsresult
1014 nsHTMLTokenizer::ConsumeWhitespace(PRUnichar aChar,
1015 CToken*& aToken,
1016 nsScanner& aScanner)
1018 // Get the whitespace character
1019 aScanner.GetChar(aChar);
1021 nsTokenAllocator* theAllocator = this->GetTokenAllocator();
1022 aToken = theAllocator->CreateTokenOfType(eToken_whitespace,
1023 eHTMLTag_whitespace);
1024 nsresult result = NS_OK;
1025 if (aToken) {
1026 result = aToken->Consume(aChar, aScanner, mFlags);
1027 AddToken(aToken, result, &mTokenDeque, theAllocator);
1030 return result;
1034 * This method is called just after a "<!" has been consumed
1035 * and we know we're at the start of a comment.
1037 * @param aChar The last character read from the scanner.
1038 * @param aToken The OUT parameter that holds our resulting token.
1039 * @param aScanner Our source of data
1040 * @return Error result.
1042 nsresult
1043 nsHTMLTokenizer::ConsumeComment(PRUnichar aChar,
1044 CToken*& aToken,
1045 nsScanner& aScanner)
1047 // Get the "!"
1048 aScanner.GetChar(aChar);
1050 nsTokenAllocator* theAllocator = this->GetTokenAllocator();
1051 aToken = theAllocator->CreateTokenOfType(eToken_comment, eHTMLTag_comment);
1052 nsresult result = NS_OK;
1053 if (aToken) {
1054 result = aToken->Consume(aChar, aScanner, mFlags);
1055 AddToken(aToken, result, &mTokenDeque, theAllocator);
1058 if (kNotAComment == result) {
1059 // AddToken has IF_FREE()'d our token, so...
1060 result = ConsumeText(aToken, aScanner);
1063 return result;
1067 * This method is called just after a known text char has
1068 * been consumed and we should read a text run. Note: we actually ignore the
1069 * first character of the text run so that we can consume invalid markup
1070 * as text.
1072 * @param aToken The OUT parameter that holds our resulting token.
1073 * @param aScanner Our source of data
1074 * @return Error result.
1076 nsresult
1077 nsHTMLTokenizer::ConsumeText(CToken*& aToken, nsScanner& aScanner)
1079 nsresult result = NS_OK;
1080 nsTokenAllocator* theAllocator = this->GetTokenAllocator();
// Create a text token and hand it the scanner; the token pulls the whole
// text run itself (the first character was already consumed by our caller).
1081 CTextToken* theToken =
1082 (CTextToken*)theAllocator->CreateTokenOfType(eToken_text, eHTMLTag_text);
1083 if (theToken) {
1084 PRUnichar ch = '\0';
1085 result = theToken->Consume(ch, aScanner, mFlags);
// On failure with nothing consumed, give up and clear the out-param;
// otherwise keep whatever text we did get and report success.
1086 if (NS_FAILED(result)) {
1087 if (0 == theToken->GetTextLength()) {
// NOTE(review): this frees via aToken rather than theToken — presumably
// releasing the caller's incoming token; confirm IF_FREE semantics here.
1088 IF_FREE(aToken, mTokenAllocator);
1089 aToken = nsnull;
1090 } else {
1091 result = NS_OK;
// Publish the text token to the caller and queue it in the token deque.
1095 aToken = theToken;
1096 AddToken(aToken, result, &mTokenDeque, theAllocator);
1099 return result;
1103 * This method is called just after a "<!" has been consumed.
1104 * NOTE: Here we might consume DOCTYPE and "special" markups.
1106 * @param aChar The last character read from the scanner.
1107 * @param aToken The OUT parameter that holds our resulting token.
1108 * @param aScanner Our source of data
1109 * @return Error result.
1111 nsresult
1112 nsHTMLTokenizer::ConsumeSpecialMarkup(PRUnichar aChar,
1113 CToken*& aToken,
1114 nsScanner& aScanner)
1116 // Get the "!"
1117 aScanner.GetChar(aChar);
1119 nsresult result = NS_OK;
1120 nsAutoString theBufCopy;
1121 aScanner.Peek(theBufCopy, 20);
1122 ToUpperCase(theBufCopy);
1123 PRInt32 theIndex = theBufCopy.Find("DOCTYPE", PR_FALSE, 0, 0);
1124 nsTokenAllocator* theAllocator = this->GetTokenAllocator();
1126 if (theIndex == kNotFound) {
1127 if ('[' == theBufCopy.CharAt(0)) {
1128 aToken = theAllocator->CreateTokenOfType(eToken_cdatasection,
1129 eHTMLTag_comment);
1130 } else if (StringBeginsWith(theBufCopy, NS_LITERAL_STRING("ELEMENT")) ||
1131 StringBeginsWith(theBufCopy, NS_LITERAL_STRING("ATTLIST")) ||
1132 StringBeginsWith(theBufCopy, NS_LITERAL_STRING("ENTITY")) ||
1133 StringBeginsWith(theBufCopy, NS_LITERAL_STRING("NOTATION"))) {
1134 aToken = theAllocator->CreateTokenOfType(eToken_markupDecl,
1135 eHTMLTag_markupDecl);
1136 } else {
1137 aToken = theAllocator->CreateTokenOfType(eToken_comment,
1138 eHTMLTag_comment);
1140 } else {
1141 aToken = theAllocator->CreateTokenOfType(eToken_doctypeDecl,
1142 eHTMLTag_doctypeDecl);
1145 if (aToken) {
1146 result = aToken->Consume(aChar, aScanner, mFlags);
1147 AddToken(aToken, result, &mTokenDeque, theAllocator);
1150 if (result == kNotAComment) {
1151 result = ConsumeText(aToken, aScanner);
1154 return result;
1158 * This method is called just after a newline has been consumed.
1160 * @param aChar The last character read from the scanner.
1161 * @param aToken The OUT parameter that holds our resulting token.
1162 * @param aScanner Our source of data
1163 * @return Error result.
1165 nsresult
1166 nsHTMLTokenizer::ConsumeNewline(PRUnichar aChar,
1167 CToken*& aToken,
1168 nsScanner& aScanner)
1170 // Get the newline character
1171 aScanner.GetChar(aChar);
1173 nsTokenAllocator* theAllocator = this->GetTokenAllocator();
1174 aToken = theAllocator->CreateTokenOfType(eToken_newline, eHTMLTag_newline);
1175 nsresult result = NS_OK;
1176 if (aToken) {
1177 result = aToken->Consume(aChar, aScanner, mFlags);
1178 AddToken(aToken, result, &mTokenDeque, theAllocator);
1181 return result;
1186 * This method is called just after a <? has been consumed.
1188 * @param aChar The last character read from the scanner.
1189 * @param aToken The OUT parameter that holds our resulting token.
1190 * @param aScanner Our source of data
1191 * @return Error result.
1193 nsresult
1194 nsHTMLTokenizer::ConsumeProcessingInstruction(PRUnichar aChar,
1195 CToken*& aToken,
1196 nsScanner& aScanner)
1198 // Get the "?"
1199 aScanner.GetChar(aChar);
1201 nsTokenAllocator* theAllocator = this->GetTokenAllocator();
1202 aToken = theAllocator->CreateTokenOfType(eToken_instruction,
1203 eHTMLTag_unknown);
1204 nsresult result = NS_OK;
1205 if (aToken) {
1206 result = aToken->Consume(aChar, aScanner, mFlags);
1207 AddToken(aToken, result, &mTokenDeque, theAllocator);
1210 return result;