/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set sw=2 ts=2 et tw=78: */
/* ***** BEGIN LICENSE BLOCK *****
 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
 *
 * The contents of this file are subject to the Mozilla Public License Version
 * 1.1 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
 *
 * The Original Code is mozilla.org code.
 *
 * The Initial Developer of the Original Code is
 * Netscape Communications Corporation.
 * Portions created by the Initial Developer are Copyright (C) 1998
 * the Initial Developer. All Rights Reserved.
 *
 * Contributor(s):
 *   Blake Kaplan <mrbkap@gmail.com>
 *
 * Alternatively, the contents of this file may be used under the terms of
 * either of the GNU General Public License Version 2 or later (the "GPL"),
 * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
 * in which case the provisions of the GPL or the LGPL are applicable instead
 * of those above. If you wish to allow use of your version of this file only
 * under the terms of either the GPL or the LGPL, and not to allow others to
 * use your version of this file under the terms of the MPL, indicate your
 * decision by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL or the LGPL. If you do not delete
 * the provisions above, a recipient may use your version of this file under
 * the terms of any one of the MPL, the GPL or the LGPL.
 *
 * ***** END LICENSE BLOCK ***** */
/**
 * @file nsHTMLTokenizer.cpp
 * This is an implementation of the nsITokenizer interface.
 * This file contains the implementation of a tokenizer to tokenize an HTML
 * document. It attempts to do so, making tradeoffs between compatibility with
 * older parsers and the SGML specification. Note that most of the real
 * "tokenization" takes place in nsHTMLTokens.cpp.
 */
#include "nsIAtom.h"
#include "nsHTMLTokenizer.h"
#include "nsScanner.h"
#include "nsElementTable.h"
#include "nsReadableUtils.h"
#include "nsUnicharUtils.h"
/************************************************************************
  And now for the main class -- nsHTMLTokenizer...
 ************************************************************************/
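/*
  A note on how this class is driven (see the individual method comments
  below for details): WillTokenize() is called before each chunk of data is
  tokenized, ConsumeToken() is then called repeatedly to pull tokens out of
  the scanner, and DidTokenize() runs ScanDocStructure() over the newly
  queued tokens.
*/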
/**
 * Satisfy the nsISupports interface.
 */
NS_IMPL_ISUPPORTS1(nsHTMLTokenizer, nsITokenizer)
/**
 * Default constructor
 *
 * @param aParseMode The current mode the document is in (quirks, etc.)
 * @param aDocType The document type of the current document
 * @param aCommand What we are trying to do (view-source, parse a fragment, etc.)
 */
nsHTMLTokenizer::nsHTMLTokenizer(PRInt32 aParseMode,
                                 eParserDocType aDocType,
                                 eParserCommands aCommand,
                                 PRUint16 aFlags) :
  nsITokenizer(), mTokenDeque(0), mFlags(aFlags)
{
  if (aParseMode == eDTDMode_full_standards ||
      aParseMode == eDTDMode_almost_standards) {
    mFlags |= NS_IPARSER_FLAG_STRICT_MODE;
  } else if (aParseMode == eDTDMode_quirks) {
    mFlags |= NS_IPARSER_FLAG_QUIRKS_MODE;
  } else if (aParseMode == eDTDMode_autodetect) {
    mFlags |= NS_IPARSER_FLAG_AUTO_DETECT_MODE;
  } else {
    mFlags |= NS_IPARSER_FLAG_UNKNOWN_MODE;
  }

  if (aDocType == ePlainText) {
    mFlags |= NS_IPARSER_FLAG_PLAIN_TEXT;
  } else if (aDocType == eXML) {
    mFlags |= NS_IPARSER_FLAG_XML;
  } else if (aDocType == eHTML_Quirks ||
             aDocType == eHTML_Strict) {
    mFlags |= NS_IPARSER_FLAG_HTML;
  }

  mFlags |= aCommand == eViewSource
            ? NS_IPARSER_FLAG_VIEW_SOURCE
            : NS_IPARSER_FLAG_VIEW_NORMAL;

  NS_ASSERTION(!(mFlags & NS_IPARSER_FLAG_XML) ||
               (mFlags & NS_IPARSER_FLAG_VIEW_SOURCE),
               "Why isn't this XML document going through our XML parser?");

  mTokenAllocator = nsnull;
  mTokenScanPos = 0;
}
/**
 * The destructor ensures that we don't leak any left over tokens.
 */
nsHTMLTokenizer::~nsHTMLTokenizer()
{
  if (mTokenDeque.GetSize()) {
    CTokenDeallocator theDeallocator(mTokenAllocator->GetArenaPool());
    mTokenDeque.ForEach(theDeallocator);
  }
}
/*******************************************************************
  Here begin the real working methods for the tokenizer.
 *******************************************************************/
/**
 * Adds a token onto the end of the deque if aResult is a successful result.
 * Otherwise, this function frees aToken and sets it to nsnull.
 *
 * @param aToken The token that wants to be added.
 * @param aResult The error code that will be used to determine if we actually
 *                want to push this token.
 * @param aDeque The deque we want to push aToken onto.
 * @param aTokenAllocator The allocator we use to free aToken in case aResult
 *                        is not a success code.
 */
/* static */
void
nsHTMLTokenizer::AddToken(CToken*& aToken,
                          nsresult aResult,
                          nsDeque* aDeque,
                          nsTokenAllocator* aTokenAllocator)
{
  if (aToken && aDeque) {
    if (NS_SUCCEEDED(aResult)) {
      aDeque->Push(aToken);
    } else {
      IF_FREE(aToken, aTokenAllocator);
    }
  }
}
/**
 * Retrieve a pointer to the global token recycler...
 *
 * @return Pointer to recycler (or null)
 */
nsTokenAllocator*
nsHTMLTokenizer::GetTokenAllocator()
{
  return mTokenAllocator;
}
/**
 * This method provides access to the topmost token in the tokenDeque.
 * The token is not really removed from the list.
 *
 * @return Pointer to token
 */
CToken*
nsHTMLTokenizer::PeekToken()
{
  return (CToken*)mTokenDeque.PeekFront();
}
/**
 * This method provides access to the topmost token in the tokenDeque.
 * The token is really removed from the list; if the list is empty we return 0.
 *
 * @return Pointer to token or NULL
 */
CToken*
nsHTMLTokenizer::PopToken()
{
  return (CToken*)mTokenDeque.PopFront();
}
/**
 * Pushes a token onto the front of our deque such that the next call to
 * PopToken() or PeekToken() will return that token.
 *
 * @param theToken The next token to be processed
 * @return theToken
 */
CToken*
nsHTMLTokenizer::PushTokenFront(CToken* theToken)
{
  mTokenDeque.PushFront(theToken);
  return theToken;
}
/**
 * Pushes a token onto the deque.
 *
 * @param theToken the new token.
 * @return theToken
 */
CToken*
nsHTMLTokenizer::PushToken(CToken* theToken)
{
  mTokenDeque.Push(theToken);
  return theToken;
}
/**
 * Returns the size of the deque.
 *
 * @return The number of remaining tokens.
 */
PRInt32
nsHTMLTokenizer::GetCount()
{
  return mTokenDeque.GetSize();
}
/**
 * Allows access to an arbitrary token in the deque. The accessed token is left
 * in the deque.
 *
 * @param anIndex The index of the target token. Token 0 would be the same as
 *                the result of a call to PeekToken()
 * @return The requested token.
 */
CToken*
nsHTMLTokenizer::GetTokenAt(PRInt32 anIndex)
{
  return (CToken*)mTokenDeque.ObjectAt(anIndex);
}
/**
 * This method is part of the "sandwich" that occurs when we want to tokenize
 * a document. This prepares us to be able to tokenize properly.
 *
 * @param aIsFinalChunk Whether this is the last chunk of data that we will
 *                      get to see.
 * @param aTokenAllocator The token allocator to use for this document.
 * @return Our success in setting up.
 */
nsresult
nsHTMLTokenizer::WillTokenize(PRBool aIsFinalChunk,
                              nsTokenAllocator* aTokenAllocator)
{
  mTokenAllocator = aTokenAllocator;
  mIsFinalChunk = aIsFinalChunk;

  // Cause ScanDocStructure to search from here for new tokens...
  mTokenScanPos = mTokenDeque.GetSize();
  return NS_OK;
}
/**
 * Pushes all of the tokens in aDeque onto the front of our deque so they
 * get processed before any other tokens.
 *
 * @param aDeque The deque with the tokens in it.
 */
void
nsHTMLTokenizer::PrependTokens(nsDeque& aDeque)
{
  PRInt32 aCount = aDeque.GetSize();

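  // nsDeque::Pop() takes tokens from the back of aDeque, and PushTokenFront()
  // puts each one at the front of our own deque, so the prepended tokens keep
  // their original order.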
  for (PRInt32 anIndex = 0; anIndex < aCount; ++anIndex) {
    CToken* theToken = (CToken*)aDeque.Pop();
    PushTokenFront(theToken);
  }
}
/**
 * Copies the state flags from aTokenizer into this tokenizer. This is used
 * to pass information around between the main tokenizer and tokenizers
 * created for document.write() calls.
 *
 * @param aTokenizer The tokenizer with more information in it.
 * @return NS_OK
 */
nsresult
nsHTMLTokenizer::CopyState(nsITokenizer* aTokenizer)
{
  if (aTokenizer) {
    mFlags = ((nsHTMLTokenizer*)aTokenizer)->mFlags;
  }

  return NS_OK;
}
/**
 * This is a utility method for ScanDocStructure, which finds a given
 * tag in the stack. The return value is meant to be used with
 * nsDeque::ObjectAt() on aTagStack.
 *
 * @param aTag -- the ID of the tag we're seeking
 * @param aTagStack -- the stack to be searched
 * @return index position of tag in stack if found, otherwise kNotFound
 */
static PRInt32
FindLastIndexOfTag(eHTMLTags aTag, nsDeque &aTagStack)
{
  PRInt32 theCount = aTagStack.GetSize();

  while (0 < theCount) {
    CHTMLToken* theToken = (CHTMLToken*)aTagStack.ObjectAt(--theCount);
    if (theToken) {
      eHTMLTags theTag = (eHTMLTags)theToken->GetTypeID();
      if (theTag == aTag) {
        return theCount;
      }
    }
  }

  return kNotFound;
}
/**
 * This method scans the sequence of tokens to determine whether or not the
 * tag structure of the document is well formed. In well formed cases, we can
 * skip doing residual style handling and allow inlines to contain block-level
 * elements.
 *
 * @param aFinalChunk Is unused.
 * @return Success (currently, this function cannot fail).
 */
nsresult nsHTMLTokenizer::ScanDocStructure(PRBool aFinalChunk)
{
  nsresult result = NS_OK;
  if (!mTokenDeque.GetSize()) {
    return result;
  }

  CHTMLToken* theToken = (CHTMLToken*)mTokenDeque.ObjectAt(mTokenScanPos);

  // Start by finding the first start tag that hasn't been reviewed.
  while (mTokenScanPos > 0) {
    if (theToken) {
      eHTMLTokenTypes theType = eHTMLTokenTypes(theToken->GetTokenType());
      if (theType == eToken_start &&
          theToken->GetContainerInfo() == eFormUnknown) {
        break;
      }
    }
    theToken = (CHTMLToken*)mTokenDeque.ObjectAt(--mTokenScanPos);
  }

  // Now that we know where to start, let's walk through the
  // tokens to see which are well-formed. Stop when you run out
  // of fresh tokens.

  nsDeque theStack(0);
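  // tempStack temporarily holds the tokens we pop off theStack while hunting
  // for a mis-nested end tag's real start tag, so they can be pushed back
  // onto theStack afterwards.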
  nsDeque tempStack(0);
  PRInt32 theStackDepth = 0;
  // Don't bother if we get ridiculously deep.
  static const PRInt32 theMaxStackDepth = 200;

  while (theToken && theStackDepth < theMaxStackDepth) {
    eHTMLTokenTypes theType = eHTMLTokenTypes(theToken->GetTokenType());
    eHTMLTags theTag = (eHTMLTags)theToken->GetTypeID();

    if (nsHTMLElement::IsContainer(theTag)) { // Bug 54117
      PRBool theTagIsBlock = gHTMLElements[theTag].IsMemberOf(kBlockEntity);
      PRBool theTagIsInline = theTagIsBlock
                              ? PR_FALSE
                              : gHTMLElements[theTag].IsMemberOf(kInlineEntity);

      if (theTagIsBlock || theTagIsInline || eHTMLTag_table == theTag) {
        switch(theType) {
          case eToken_start:
            {
              if (gHTMLElements[theTag].ShouldVerifyHierarchy()) {
                PRInt32 earlyPos = FindLastIndexOfTag(theTag, theStack);
                if (earlyPos != kNotFound) {
                  // Uh-oh, we've found a tag that is not allowed to nest at
                  // all. Mark the previous one and all of its children as
                  // malformed to increase our chances of doing RS handling
                  // on all of them. We want to do this for cases such as:
                  // <a><div><a></a></div></a>.
                  // Note that we have to iterate through all of the children
                  // of the original malformed tag to protect against:
                  // <a><font><div><a></a></div></font></a>, so that the <font>
                  // is allowed to contain the <div>.
                  // XXX What about <a><span><a>, where the second <a> closes
                  // the <span>?
                  nsDequeIterator it(theStack, earlyPos), end(theStack.End());
                  while (it < end) {
                    CHTMLToken *theMalformedToken =
                        static_cast<CHTMLToken*>(it++);

                    theMalformedToken->SetContainerInfo(eMalformed);
                  }
                }
              }

              theStack.Push(theToken);
              ++theStackDepth;
            }
            break;
          case eToken_end:
            {
              CHTMLToken *theLastToken =
                static_cast<CHTMLToken*>(theStack.Peek());
              if (theLastToken) {
                if (theTag == theLastToken->GetTypeID()) {
                  theStack.Pop(); // Yank it for real
                  theStackDepth--;
                  theLastToken->SetContainerInfo(eWellFormed);
                } else {
                  // This token wasn't what we expected it to be! We need to
                  // go searching for its real start tag on our stack. Each
                  // tag in between the end tag and start tag must be malformed

                  if (FindLastIndexOfTag(theTag, theStack) != kNotFound) {
                    // Find theTarget in the stack, marking each (malformed!)
                    // tag in our way.
                    theStack.Pop(); // Pop off theLastToken for real.
                    do {
                      theLastToken->SetContainerInfo(eMalformed);
                      tempStack.Push(theLastToken);
                      theLastToken = static_cast<CHTMLToken*>(theStack.Pop());
                    } while (theLastToken && theTag != theLastToken->GetTypeID());
                    // XXX The above test can confuse two different userdefined
                    // tags.

                    NS_ASSERTION(theLastToken,
                                 "FindLastIndexOfTag lied to us!"
                                 " We couldn't find theTag on theStack");
                    theLastToken->SetContainerInfo(eMalformed);

                    // Great, now push all of the other tokens back onto the
                    // stack to preserve the general structure of the document.
                    // Note that we don't push the target token back onto the
                    // stack (since it was just closed).
                    while (tempStack.GetSize() != 0) {
                      theStack.Push(tempStack.Pop());
                    }
                  }
                }
              }
            }
            break;
          default:
            break;
        }
      }
    }

    theToken = (CHTMLToken*)mTokenDeque.ObjectAt(++mTokenScanPos);
  }

  return result;
}
/**
 * This method is called after we're done tokenizing a chunk of data.
 *
 * @param aFinalChunk Tells us if this was the last chunk of data.
 * @return Error result.
 */
nsresult
nsHTMLTokenizer::DidTokenize(PRBool aFinalChunk)
{
  return ScanDocStructure(aFinalChunk);
}
/**
 * This method is repeatedly called by the tokenizer.
 * Each time, we determine the kind of token we're about to
 * read, and then we call the appropriate method to handle
 * that token type.
 *
 * @param aScanner The source of our input.
 * @param aFlushTokens An OUT parameter to tell the caller whether it should
 *                     process our queued tokens up to now (e.g., when we
 *                     reach a <script>).
 * @return Success or error
 */
nsresult
nsHTMLTokenizer::ConsumeToken(nsScanner& aScanner, PRBool& aFlushTokens)
{
  PRUnichar theChar;
  CToken* theToken = nsnull;

  nsresult result = aScanner.Peek(theChar);

  switch(result) {
    case kEOF:
      // Tell our caller that we're finished.
      return result;

    case NS_OK:
    default:
      if (!(mFlags & NS_IPARSER_FLAG_PLAIN_TEXT)) {
        if (kLessThan == theChar) {
          return ConsumeTag(theChar, theToken, aScanner, aFlushTokens);
        } else if (kAmpersand == theChar) {
          return ConsumeEntity(theChar, theToken, aScanner);
        }
      }

      if (kCR == theChar || kLF == theChar) {
        return ConsumeNewline(theChar, theToken, aScanner);
      } else {
        if (!nsCRT::IsAsciiSpace(theChar)) {
          if (theChar != '\0') {
            result = ConsumeText(theToken, aScanner);
          } else {
            // Skip the embedded null char. Fix bug 64098.
            aScanner.GetChar(theChar);
          }
          break;
        }
        result = ConsumeWhitespace(theChar, theToken, aScanner);
      }
      break;
  }

  return result;
}
/**
 * This method is called just after a "<" has been consumed
 * and we know we're at the start of some kind of tagged
 * element. We don't know yet if it's a tag or a comment.
 *
 * @param aChar is the last char read
 * @param aToken is the out arg holding our new token (the function allocates
 *        the return token using mTokenAllocator).
 * @param aScanner represents our input source
 * @param aFlushTokens is an OUT parameter used to tell consumers to flush
 *        the current tokens after processing the current one.
 * @return error code.
 */
nsresult
nsHTMLTokenizer::ConsumeTag(PRUnichar aChar,
                            CToken*& aToken,
                            nsScanner& aScanner,
                            PRBool& aFlushTokens)
{
  PRUnichar theNextChar, oldChar;
  nsresult result = aScanner.Peek(aChar, 1);

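  // Peek at the character just past the "<" so we can tell what kind of
  // construct this is: an end tag, a comment or markup declaration, a
  // processing instruction, or a start tag.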
  if (NS_OK == result) {
    switch (aChar) {
      case kForwardSlash:
        result = aScanner.Peek(theNextChar, 2);
        if (NS_OK == result) {
          // Get the original "<" (we've already seen it with a Peek)
          aScanner.GetChar(oldChar);

          // XML allows non ASCII tag names, consume this as an end tag. This
          // is needed to make XML view source work
          PRBool isXML = !!(mFlags & NS_IPARSER_FLAG_XML);
          if (nsCRT::IsAsciiAlpha(theNextChar) ||
              kGreaterThan == theNextChar ||
              (isXML && !nsCRT::IsAscii(theNextChar))) {
            result = ConsumeEndTag(aChar, aToken, aScanner);
          } else {
            result = ConsumeComment(aChar, aToken, aScanner);
          }
        }

        break;

      case kExclamation:
        result = aScanner.Peek(theNextChar, 2);
        if (NS_OK == result) {
          // Get the original "<" (we've already seen it with a Peek)
          aScanner.GetChar(oldChar);

          if (kMinus == theNextChar || kGreaterThan == theNextChar) {
            result = ConsumeComment(aChar, aToken, aScanner);
          } else {
            result = ConsumeSpecialMarkup(aChar, aToken, aScanner);
          }
        }
        break;

      case kQuestionMark:
        // It must be a processing instruction...
        // Get the original "<" (we've already seen it with a Peek)
        aScanner.GetChar(oldChar);
        result = ConsumeProcessingInstruction(aChar, aToken, aScanner);
        break;

      default:
        // XML allows non ASCII tag names, consume this as a start tag.
        PRBool isXML = !!(mFlags & NS_IPARSER_FLAG_XML);
        if (nsCRT::IsAsciiAlpha(aChar) ||
            (isXML && !nsCRT::IsAscii(aChar))) {
          // Get the original "<" (we've already seen it with a Peek)
          aScanner.GetChar(oldChar);
          result = ConsumeStartTag(aChar, aToken, aScanner, aFlushTokens);
        } else {
          // We are not dealing with a tag. So, don't consume the original
          // char and leave the decision to ConsumeText().
          result = ConsumeText(aToken, aScanner);
        }
    }
  }

  // Last ditch attempt to make sure we don't lose data.
  if (kEOF == result && !aScanner.IsIncremental()) {
    // Whoops, we don't want to lose any data! Consume the rest as text.
    // This normally happens for either a trailing < or </
    result = ConsumeText(aToken, aScanner);
  }

  return result;
}
/**
 * This method is called just after we've consumed a start or end
 * tag, and we now have to consume its attributes.
 *
 * @param aChar is the last char read
 * @param aToken is the start or end tag that "owns" these attributes.
 * @param aScanner represents our input source
 * @return Error result.
 */
nsresult
nsHTMLTokenizer::ConsumeAttributes(PRUnichar aChar,
                                   CToken* aToken,
                                   nsScanner& aScanner)
{
  PRBool done = PR_FALSE;
  nsresult result = NS_OK;
  PRInt16 theAttrCount = 0;

  nsTokenAllocator* theAllocator = this->GetTokenAllocator();

  while (!done && result == NS_OK) {
    CAttributeToken* theToken =
      static_cast<CAttributeToken*>
                 (theAllocator->CreateTokenOfType(eToken_attribute,
                                                  eHTMLTag_unknown));
    if (NS_LIKELY(theToken != nsnull)) {
      // Tell the new token to finish consuming text...
      result = theToken->Consume(aChar, aScanner, mFlags);

      if (NS_SUCCEEDED(result)) {
        ++theAttrCount;
        AddToken((CToken*&)theToken, result, &mTokenDeque, theAllocator);
      } else {
        IF_FREE(theToken, mTokenAllocator);
        // Bad attribute returns shouldn't propagate out.
        if (NS_ERROR_HTMLPARSER_BADATTRIBUTE == result) {
          result = NS_OK;
        }
      }
    }
    else {
      result = NS_ERROR_OUT_OF_MEMORY;
    }

#ifdef DEBUG
    if (NS_SUCCEEDED(result)) {
      PRInt32 newline = 0;
      aScanner.SkipWhitespace(newline);
      NS_ASSERTION(newline == 0,
          "CAttribute::Consume() failed to collect all the newlines!");
    }
#endif
    if (NS_SUCCEEDED(result)) {
      result = aScanner.Peek(aChar);
      if (NS_SUCCEEDED(result)) {
        if (aChar == kGreaterThan) { // You just ate the '>'
          aScanner.GetChar(aChar); // Skip the '>'
          done = PR_TRUE;
        } else if (aChar == kLessThan) {
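          // We hit another '<' before finding '>': this tag is malformed, so
          // flag it and stop collecting attributes. The '<' is left in the
          // scanner to start the next token.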
          aToken->SetInError(PR_TRUE);
          done = PR_TRUE;
        }
      }
    }
  }

  if (NS_FAILED(result)) {
    aToken->SetInError(PR_TRUE);

    if (!aScanner.IsIncremental()) {
      result = NS_OK;
    }
  }

  aToken->SetAttributeCount(theAttrCount);
  return result;
}
/**
 * This method consumes a start tag and all of its attributes.
 *
 * @param aChar The last character read from the scanner.
 * @param aToken The OUT parameter that holds our resulting token. (allocated
 *        by the function using mTokenAllocator).
 * @param aScanner Our source of data
 * @param aFlushTokens is an OUT parameter used to tell consumers to flush
 *        the current tokens after processing the current one.
 * @return Error result.
 */
nsresult
nsHTMLTokenizer::ConsumeStartTag(PRUnichar aChar,
                                 CToken*& aToken,
                                 nsScanner& aScanner,
                                 PRBool& aFlushTokens)
{
  // Remember this for later in case you have to unwind...
  PRInt32 theDequeSize = mTokenDeque.GetSize();
  nsresult result = NS_OK;

  nsTokenAllocator* theAllocator = this->GetTokenAllocator();
  aToken = theAllocator->CreateTokenOfType(eToken_start, eHTMLTag_unknown);
  NS_ENSURE_TRUE(aToken, NS_ERROR_OUT_OF_MEMORY);

  // Tell the new token to finish consuming text...
  result = aToken->Consume(aChar, aScanner, mFlags);

  if (NS_SUCCEEDED(result)) {
    AddToken(aToken, result, &mTokenDeque, theAllocator);

    eHTMLTags theTag = (eHTMLTags)aToken->GetTypeID();

    // Good. Now, let's see if the next char is ">".
    // If so, we have a complete tag, otherwise, we have attributes.
    result = aScanner.Peek(aChar);
    if (NS_FAILED(result)) {
      aToken->SetInError(PR_TRUE);

      // Don't return early here so we can create a text and end token for
      // the special <iframe>, <script> and similar tags down below.
      result = NS_OK;
    } else {
      if (kGreaterThan != aChar) { // Look for a '>'
        result = ConsumeAttributes(aChar, aToken, aScanner);
      } else {
        aScanner.GetChar(aChar);
      }
    }

    /* Now that that's over with, we have one more problem to solve.
       In the case that we just read a <SCRIPT> or <STYLE> tag, we should go
       and consume all the content itself.
       But XML doesn't treat these tags differently, so we shouldn't if the
       document is XML.
     */
    if (NS_SUCCEEDED(result) && !(mFlags & NS_IPARSER_FLAG_XML)) {
      PRBool isCDATA = gHTMLElements[theTag].CanContainType(kCDATA);
      PRBool isPCDATA = eHTMLTag_textarea == theTag ||
                        eHTMLTag_title == theTag;

      // XXX This is an evil hack, we should be able to handle these properly
      // in the DTD.
      if ((eHTMLTag_iframe == theTag &&
            (mFlags & NS_IPARSER_FLAG_FRAMES_ENABLED)) ||
          (eHTMLTag_noframes == theTag &&
            (mFlags & NS_IPARSER_FLAG_FRAMES_ENABLED)) ||
          (eHTMLTag_noscript == theTag &&
            (mFlags & NS_IPARSER_FLAG_SCRIPT_ENABLED)) ||
          (eHTMLTag_noembed == theTag)) {
        isCDATA = PR_TRUE;
      }

      // Plaintext contains CDATA, but it's special, so we handle it
      // differently than the other CDATA elements
      if (eHTMLTag_plaintext == theTag) {
        isCDATA = PR_FALSE;

        // Note: We check in ConsumeToken() for this flag, and if we see it
        // we only construct text tokens (which is what we want).
        mFlags |= NS_IPARSER_FLAG_PLAIN_TEXT;
      }

      if (isCDATA || isPCDATA) {
        PRBool done = PR_FALSE;
        nsDependentString endTagName(nsHTMLTags::GetStringValue(theTag));

        CToken* text =
          theAllocator->CreateTokenOfType(eToken_text, eHTMLTag_text);
        NS_ENSURE_TRUE(text, NS_ERROR_OUT_OF_MEMORY);

        CTextToken* textToken = static_cast<CTextToken*>(text);

        if (isCDATA) {
          result = textToken->ConsumeCharacterData(theTag != eHTMLTag_script,
                                                   aScanner,
                                                   endTagName,
                                                   mFlags,
                                                   done);

          // Only flush tokens for <script>, to give ourselves more of a
          // chance of allowing inlines to contain blocks.
          aFlushTokens = done && theTag == eHTMLTag_script;
        } else if (isPCDATA) {
          // Title is consumed conservatively in order to not regress
          // bug 42945
          result = textToken->ConsumeParsedCharacterData(
                                                  theTag == eHTMLTag_textarea,
                                                  theTag == eHTMLTag_title,
                                                  aScanner,
                                                  endTagName,
                                                  mFlags,
                                                  done);

          // Note: we *don't* set aFlushTokens here.
        }

        // We want to do this unless result is kEOF, in which case we will
        // simply unwind our stack and wait for more data anyway.
        if (kEOF != result) {
          AddToken(text, NS_OK, &mTokenDeque, theAllocator);
          CToken* endToken = nsnull;

          if (NS_SUCCEEDED(result) && done) {
            PRUnichar theChar;
            // Get the <
            result = aScanner.GetChar(theChar);
            NS_ASSERTION(NS_SUCCEEDED(result) && theChar == kLessThan,
                         "CTextToken::Consume*Data is broken!");
#ifdef DEBUG
            // Ensure we have a /
            PRUnichar tempChar; // Don't change non-debug vars in debug-only code
            result = aScanner.Peek(tempChar);
            NS_ASSERTION(NS_SUCCEEDED(result) && tempChar == kForwardSlash,
                         "CTextToken::Consume*Data is broken!");
#endif
            result = ConsumeEndTag(PRUnichar('/'), endToken, aScanner);
            if (!(mFlags & NS_IPARSER_FLAG_VIEW_SOURCE) &&
                NS_SUCCEEDED(result)) {
              // If ConsumeCharacterData returned a success result (and
              // we're not in view source), then we want to make sure that
              // we're going to execute this script (since the result means
              // that we've found an end tag that satisfies all of the right
              // conditions).
              endToken->SetInError(PR_FALSE);
            }
          } else if (result == kFakeEndTag &&
                     !(mFlags & NS_IPARSER_FLAG_VIEW_SOURCE)) {
            result = NS_OK;
            endToken = theAllocator->CreateTokenOfType(eToken_end, theTag,
                                                       endTagName);
            AddToken(endToken, result, &mTokenDeque, theAllocator);
            if (NS_LIKELY(endToken != nsnull)) {
              endToken->SetInError(PR_TRUE);
            }
            else {
              result = NS_ERROR_OUT_OF_MEMORY;
            }
          } else if (result == kFakeEndTag) {
            // If we are here, we are both faking having seen the end tag
            // and are in view-source.
            result = NS_OK;
          }
        } else {
          IF_FREE(text, mTokenAllocator);
        }
      }
    }

    // This code is confusing, so pay attention.
    // If you're here, it's because we were in the midst of consuming a start
    // tag but ran out of data (not in the stream, but in this *part* of the
    // stream). For simplicity, we have to unwind our input. Therefore, we pop
    // and discard any new tokens we've queued this round. Later we can get
    // smarter about this.
    if (NS_FAILED(result)) {
      while (mTokenDeque.GetSize() > theDequeSize) {
        CToken* theToken = (CToken*)mTokenDeque.Pop();
        IF_FREE(theToken, mTokenAllocator);
      }
    }
  } else {
    IF_FREE(aToken, mTokenAllocator);
  }

  return result;
}
/**
 * This method consumes an end tag and any "attributes" that may come after it.
 *
 * @param aChar The last character read from the scanner.
 * @param aToken The OUT parameter that holds our resulting token.
 * @param aScanner Our source of data
 * @return Error result
 */
nsresult
nsHTMLTokenizer::ConsumeEndTag(PRUnichar aChar,
                               CToken*& aToken,
                               nsScanner& aScanner)
{
  // Get the "/" (we've already seen it with a Peek)
  aScanner.GetChar(aChar);

  nsTokenAllocator* theAllocator = this->GetTokenAllocator();
  aToken = theAllocator->CreateTokenOfType(eToken_end, eHTMLTag_unknown);
  NS_ENSURE_TRUE(aToken, NS_ERROR_OUT_OF_MEMORY);

  // Remember this for later in case you have to unwind...
  PRInt32 theDequeSize = mTokenDeque.GetSize();
  nsresult result = NS_OK;

  // Tell the new token to finish consuming text...
  result = aToken->Consume(aChar, aScanner, mFlags);
  AddToken(aToken, result, &mTokenDeque, theAllocator);
  if (NS_FAILED(result)) {
    // Note that this early-return here is safe because we have not yet
    // added any of our tokens to the queue (AddToken only adds the token if
    // result is a success), so we don't need to fall through.
    return result;
  }

  result = aScanner.Peek(aChar);
  if (NS_FAILED(result)) {
    aToken->SetInError(PR_TRUE);

    // Note: We know here that the scanner is not incremental since if
    // this peek fails, then we've already masked over a kEOF coming from
    // the Consume() call above.
    return NS_OK;
  }

  if (kGreaterThan != aChar) {
    result = ConsumeAttributes(aChar, aToken, aScanner);
  } else {
    aScanner.GetChar(aChar);
  }

  // Do the same thing as we do in ConsumeStartTag. Basically, if we've run
  // out of room in this *section* of the document, pop all of the tokens
  // we've consumed this round and wait for more data.
  if (NS_FAILED(result)) {
    while (mTokenDeque.GetSize() > theDequeSize) {
      CToken* theToken = (CToken*)mTokenDeque.Pop();
      IF_FREE(theToken, mTokenAllocator);
    }
  }

  return result;
}
/**
 * This method is called just after a "&" has been consumed
 * and we know we're at the start of an entity.
 *
 * @param aChar The last character read from the scanner.
 * @param aToken The OUT parameter that holds our resulting token.
 * @param aScanner Our source of data
 * @return Error result.
 */
nsresult
nsHTMLTokenizer::ConsumeEntity(PRUnichar aChar,
                               CToken*& aToken,
                               nsScanner& aScanner)
{
  PRUnichar theChar;
  nsresult result = aScanner.Peek(theChar, 1);

  nsTokenAllocator* theAllocator = this->GetTokenAllocator();
  if (NS_SUCCEEDED(result)) {
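    // Named entities start with an ASCII letter; numeric character
    // references start with '#'. Anything else is treated as plain text.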
    if (nsCRT::IsAsciiAlpha(theChar) || theChar == kHashsign) {
      aToken = theAllocator->CreateTokenOfType(eToken_entity, eHTMLTag_entity);
      NS_ENSURE_TRUE(aToken, NS_ERROR_OUT_OF_MEMORY);
      result = aToken->Consume(theChar, aScanner, mFlags);

      if (result == NS_HTMLTOKENS_NOT_AN_ENTITY) {
        IF_FREE(aToken, mTokenAllocator);
      } else {
        if (result == kEOF && !aScanner.IsIncremental()) {
          result = NS_OK; // Use as much of the entity as you can get.
        }

        AddToken(aToken, result, &mTokenDeque, theAllocator);
        return result;
      }
    }

    // Oops, we're actually looking at plain text...
    result = ConsumeText(aToken, aScanner);
  } else if (result == kEOF && !aScanner.IsIncremental()) {
    // If the last character in the file is an &, consume it as text.
    result = ConsumeText(aToken, aScanner);
    if (aToken) {
      aToken->SetInError(PR_TRUE);
    }
  }

  return result;
}
/**
 * This method is called just after whitespace has been
 * consumed and we know we're at the start of a whitespace run.
 *
 * @param aChar The last character read from the scanner.
 * @param aToken The OUT parameter that holds our resulting token.
 * @param aScanner Our source of data
 * @return Error result.
 */
nsresult
nsHTMLTokenizer::ConsumeWhitespace(PRUnichar aChar,
                                   CToken*& aToken,
                                   nsScanner& aScanner)
{
  // Get the whitespace character
  aScanner.GetChar(aChar);

  nsTokenAllocator* theAllocator = this->GetTokenAllocator();
  aToken = theAllocator->CreateTokenOfType(eToken_whitespace,
                                           eHTMLTag_whitespace);
  nsresult result = NS_OK;
  if (aToken) {
    result = aToken->Consume(aChar, aScanner, mFlags);
    AddToken(aToken, result, &mTokenDeque, theAllocator);
  }

  return result;
}
/**
 * This method is called just after a "<!" has been consumed
 * and we know we're at the start of a comment.
 *
 * @param aChar The last character read from the scanner.
 * @param aToken The OUT parameter that holds our resulting token.
 * @param aScanner Our source of data
 * @return Error result.
 */
nsresult
nsHTMLTokenizer::ConsumeComment(PRUnichar aChar,
                                CToken*& aToken,
                                nsScanner& aScanner)
{
  // Get the "!"
  aScanner.GetChar(aChar);

  nsTokenAllocator* theAllocator = this->GetTokenAllocator();
  aToken = theAllocator->CreateTokenOfType(eToken_comment, eHTMLTag_comment);
  nsresult result = NS_OK;
  if (aToken) {
    result = aToken->Consume(aChar, aScanner, mFlags);
    AddToken(aToken, result, &mTokenDeque, theAllocator);
  }

  if (kNotAComment == result) {
    // AddToken has IF_FREE()'d our token, so...
    result = ConsumeText(aToken, aScanner);
  }

  return result;
}
/**
 * This method is called just after a known text char has
 * been consumed and we should read a text run. Note: we actually ignore the
 * first character of the text run so that we can consume invalid markup
 * as text.
 *
 * @param aToken The OUT parameter that holds our resulting token.
 * @param aScanner Our source of data
 * @return Error result.
 */
nsresult
nsHTMLTokenizer::ConsumeText(CToken*& aToken, nsScanner& aScanner)
{
  nsresult result = NS_OK;
  nsTokenAllocator* theAllocator = this->GetTokenAllocator();
  CTextToken* theToken =
    (CTextToken*)theAllocator->CreateTokenOfType(eToken_text, eHTMLTag_text);
  if (theToken) {
    PRUnichar ch = '\0';
    result = theToken->Consume(ch, aScanner, mFlags);
    if (NS_FAILED(result)) {
      if (0 == theToken->GetTextLength()) {
        IF_FREE(aToken, mTokenAllocator);
        aToken = nsnull;
      } else {
        result = NS_OK;
      }
    }

    aToken = theToken;
    AddToken(aToken, result, &mTokenDeque, theAllocator);
  }

  return result;
}
/**
 * This method is called just after a "<!" has been consumed.
 * NOTE: Here we might consume DOCTYPE and "special" markups.
 *
 * @param aChar The last character read from the scanner.
 * @param aToken The OUT parameter that holds our resulting token.
 * @param aScanner Our source of data
 * @return Error result.
 */
nsresult
nsHTMLTokenizer::ConsumeSpecialMarkup(PRUnichar aChar,
                                      CToken*& aToken,
                                      nsScanner& aScanner)
{
  // Get the "!"
  aScanner.GetChar(aChar);

  nsresult result = NS_OK;
  nsAutoString theBufCopy;
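  // Peek ahead at the next several characters so we can distinguish a
  // DOCTYPE declaration from a CDATA section, markup declaration, or comment.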
  aScanner.Peek(theBufCopy, 20);
  ToUpperCase(theBufCopy);
  PRInt32 theIndex = theBufCopy.Find("DOCTYPE", PR_FALSE, 0, 0);
  nsTokenAllocator* theAllocator = this->GetTokenAllocator();

  if (theIndex == kNotFound) {
    if ('[' == theBufCopy.CharAt(0)) {
      aToken = theAllocator->CreateTokenOfType(eToken_cdatasection,
                                               eHTMLTag_comment);
    } else if (StringBeginsWith(theBufCopy, NS_LITERAL_STRING("ELEMENT")) ||
               StringBeginsWith(theBufCopy, NS_LITERAL_STRING("ATTLIST")) ||
               StringBeginsWith(theBufCopy, NS_LITERAL_STRING("ENTITY")) ||
               StringBeginsWith(theBufCopy, NS_LITERAL_STRING("NOTATION"))) {
      aToken = theAllocator->CreateTokenOfType(eToken_markupDecl,
                                               eHTMLTag_markupDecl);
    } else {
      aToken = theAllocator->CreateTokenOfType(eToken_comment,
                                               eHTMLTag_comment);
    }
  } else {
    aToken = theAllocator->CreateTokenOfType(eToken_doctypeDecl,
                                             eHTMLTag_doctypeDecl);
  }

  if (aToken) {
    result = aToken->Consume(aChar, aScanner, mFlags);
    AddToken(aToken, result, &mTokenDeque, theAllocator);
  }

  if (result == kNotAComment) {
    result = ConsumeText(aToken, aScanner);
  }

  return result;
}
/**
 * This method is called just after a newline has been consumed.
 *
 * @param aChar The last character read from the scanner.
 * @param aToken The OUT parameter that holds our resulting token.
 * @param aScanner Our source of data
 * @return Error result.
 */
nsresult
nsHTMLTokenizer::ConsumeNewline(PRUnichar aChar,
                                CToken*& aToken,
                                nsScanner& aScanner)
{
  // Get the newline character
  aScanner.GetChar(aChar);

  nsTokenAllocator* theAllocator = this->GetTokenAllocator();
  aToken = theAllocator->CreateTokenOfType(eToken_newline, eHTMLTag_newline);
  nsresult result = NS_OK;
  if (aToken) {
    result = aToken->Consume(aChar, aScanner, mFlags);
    AddToken(aToken, result, &mTokenDeque, theAllocator);
  }

  return result;
}
/**
 * This method is called just after a "<?" has been consumed.
 *
 * @param aChar The last character read from the scanner.
 * @param aToken The OUT parameter that holds our resulting token.
 * @param aScanner Our source of data
 * @return Error result.
 */
nsresult
nsHTMLTokenizer::ConsumeProcessingInstruction(PRUnichar aChar,
                                              CToken*& aToken,
                                              nsScanner& aScanner)
{
  // Get the "?"
  aScanner.GetChar(aChar);

  nsTokenAllocator* theAllocator = this->GetTokenAllocator();
  aToken = theAllocator->CreateTokenOfType(eToken_instruction,
                                           eHTMLTag_unknown);
  nsresult result = NS_OK;
  if (aToken) {
    result = aToken->Consume(aChar, aScanner, mFlags);
    AddToken(aToken, result, &mTokenDeque, theAllocator);
  }

  return result;
}