Bug 436663. Work around ATSUI crasher caused by long Hebrew sequence. r=roc, sr=vlad
[wine-gecko.git] / parser / htmlparser / src / nsHTMLTokenizer.cpp
blob4aae83a07aa27ed919166efe5979a56193c0779d
1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set sw=2 ts=2 et tw=78: */
3 /* ***** BEGIN LICENSE BLOCK *****
4 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
6 * The contents of this file are subject to the Mozilla Public License Version
7 * 1.1 (the "License"); you may not use this file except in compliance with
8 * the License. You may obtain a copy of the License at
9 * http://www.mozilla.org/MPL/
11 * Software distributed under the License is distributed on an "AS IS" basis,
12 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
13 * for the specific language governing rights and limitations under the
14 * License.
16 * The Original Code is mozilla.org code.
18 * The Initial Developer of the Original Code is
19 * Netscape Communications Corporation.
20 * Portions created by the Initial Developer are Copyright (C) 1998
21 * the Initial Developer. All Rights Reserved.
23 * Contributor(s):
24 * Blake Kaplan <mrbkap@gmail.com>
26 * Alternatively, the contents of this file may be used under the terms of
27 * either of the GNU General Public License Version 2 or later (the "GPL"),
28 * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
29 * in which case the provisions of the GPL or the LGPL are applicable instead
30 * of those above. If you wish to allow use of your version of this file only
31 * under the terms of either the GPL or the LGPL, and not to allow others to
32 * use your version of this file under the terms of the MPL, indicate your
33 * decision by deleting the provisions above and replace them with the notice
34 * and other provisions required by the GPL or the LGPL. If you do not delete
35 * the provisions above, a recipient may use your version of this file under
36 * the terms of any one of the MPL, the GPL or the LGPL.
38 * ***** END LICENSE BLOCK ***** */
41 /**
42 * @file nsHTMLTokenizer.cpp
43 * This is an implementation of the nsITokenizer interface.
44 * This file contains the implementation of a tokenizer to tokenize an HTML
45 * document. It attempts to do so, making tradeoffs between compatibility with
46 * older parsers and the SGML specification. Note that most of the real
47 * "tokenization" takes place in nsHTMLTokens.cpp.
50 #include "nsIAtom.h"
51 #include "nsHTMLTokenizer.h"
52 #include "nsScanner.h"
53 #include "nsElementTable.h"
54 #include "CParserContext.h"
55 #include "nsReadableUtils.h"
56 #include "nsUnicharUtils.h"
58 /************************************************************************
59 And now for the main class -- nsHTMLTokenizer...
60 ************************************************************************/
/**
 * Satisfy the nsISupports interface for nsHTMLTokenizer, exposing it to
 * callers as an nsITokenizer.
 */
65 NS_IMPL_ISUPPORTS1(nsHTMLTokenizer, nsITokenizer)
67 /**
68 * Default constructor
70 * @param aParseMode The current mode the document is in (quirks, etc.)
71 * @param aDocType The document type of the current document
72 * @param aCommand What we are trying to do (view-source, parse a fragment, etc.)
74 nsHTMLTokenizer::nsHTMLTokenizer(PRInt32 aParseMode,
75 eParserDocType aDocType,
76 eParserCommands aCommand,
77 PRUint16 aFlags) :
78 nsITokenizer(), mTokenDeque(0), mFlags(aFlags)
80 if (aParseMode == eDTDMode_full_standards ||
81 aParseMode == eDTDMode_almost_standards) {
82 mFlags |= NS_IPARSER_FLAG_STRICT_MODE;
83 } else if (aParseMode == eDTDMode_quirks) {
84 mFlags |= NS_IPARSER_FLAG_QUIRKS_MODE;
85 } else if (aParseMode == eDTDMode_autodetect) {
86 mFlags |= NS_IPARSER_FLAG_AUTO_DETECT_MODE;
87 } else {
88 mFlags |= NS_IPARSER_FLAG_UNKNOWN_MODE;
91 if (aDocType == ePlainText) {
92 mFlags |= NS_IPARSER_FLAG_PLAIN_TEXT;
93 } else if (aDocType == eXML) {
94 mFlags |= NS_IPARSER_FLAG_XML;
95 } else if (aDocType == eHTML_Quirks ||
96 aDocType == eHTML3_Quirks ||
97 aDocType == eHTML_Strict) {
98 mFlags |= NS_IPARSER_FLAG_HTML;
101 mFlags |= aCommand == eViewSource
102 ? NS_IPARSER_FLAG_VIEW_SOURCE
103 : NS_IPARSER_FLAG_VIEW_NORMAL;
105 NS_ASSERTION(!(mFlags & NS_IPARSER_FLAG_XML) ||
106 (mFlags & NS_IPARSER_FLAG_VIEW_SOURCE),
107 "Why isn't this XML document going through our XML parser?");
109 mTokenAllocator = nsnull;
110 mTokenScanPos = 0;
114 * The destructor ensures that we don't leak any left over tokens.
116 nsHTMLTokenizer::~nsHTMLTokenizer()
118 if (mTokenDeque.GetSize()) {
119 CTokenDeallocator theDeallocator(mTokenAllocator->GetArenaPool());
120 mTokenDeque.ForEach(theDeallocator);
125 /*******************************************************************
126 Here begins the real working methods for the tokenizer.
127 *******************************************************************/
130 * Adds a token onto the end of the deque if aResult is a successful result.
131 * Otherwise, this function frees aToken and sets it to nsnull.
133 * @param aToken The token that wants to be added.
134 * @param aResult The error code that will be used to determine if we actually
135 * want to push this token.
136 * @param aDeque The deque we want to push aToken onto.
137 * @param aTokenAllocator The allocator we use to free aToken in case aResult
138 * is not a success code.
140 /* static */
141 void
142 nsHTMLTokenizer::AddToken(CToken*& aToken,
143 nsresult aResult,
144 nsDeque* aDeque,
145 nsTokenAllocator* aTokenAllocator)
147 if (aToken && aDeque) {
148 if (NS_SUCCEEDED(aResult)) {
149 aDeque->Push(aToken);
150 } else {
151 IF_FREE(aToken, aTokenAllocator);
157 * Retrieve a pointer to the global token recycler...
159 * @return Pointer to recycler (or null)
161 nsTokenAllocator*
162 nsHTMLTokenizer::GetTokenAllocator()
164 return mTokenAllocator;
168 * This method provides access to the topmost token in the tokenDeque.
169 * The token is not really removed from the list.
171 * @return Pointer to token
173 CToken*
174 nsHTMLTokenizer::PeekToken()
176 return (CToken*)mTokenDeque.PeekFront();
180 * This method provides access to the topmost token in the tokenDeque.
181 * The token is really removed from the list; if the list is empty we return 0.
183 * @return Pointer to token or NULL
185 CToken*
186 nsHTMLTokenizer::PopToken()
188 return (CToken*)mTokenDeque.PopFront();
193 * Pushes a token onto the front of our deque such that the next call to
194 * PopToken() or PeekToken() will return that token.
196 * @param theToken The next token to be processed
197 * @return theToken
199 CToken*
200 nsHTMLTokenizer::PushTokenFront(CToken* theToken)
202 mTokenDeque.PushFront(theToken);
203 return theToken;
207 * Pushes a token onto the deque.
209 * @param theToken the new token.
210 * @return theToken
212 CToken*
213 nsHTMLTokenizer::PushToken(CToken* theToken)
215 mTokenDeque.Push(theToken);
216 return theToken;
220 * Returns the size of the deque.
222 * @return The number of remaining tokens.
224 PRInt32
225 nsHTMLTokenizer::GetCount()
227 return mTokenDeque.GetSize();
231 * Allows access to an arbitrary token in the deque. The accessed token is left
232 * in the deque.
234 * @param anIndex The index of the target token. Token 0 would be the same as
235 * the result of a call to PeekToken()
236 * @return The requested token.
238 CToken*
239 nsHTMLTokenizer::GetTokenAt(PRInt32 anIndex)
241 return (CToken*)mTokenDeque.ObjectAt(anIndex);
245 * This method is part of the "sandwich" that occurs when we want to tokenize
246 * a document. This prepares us to be able to tokenize properly.
248 * @param aIsFinalChunk Whether this is the last chunk of data that we will
249 * get to see.
250 * @param aTokenAllocator The token allocator to use for this document.
251 * @return Our success in setting up.
253 nsresult
254 nsHTMLTokenizer::WillTokenize(PRBool aIsFinalChunk,
255 nsTokenAllocator* aTokenAllocator)
257 mTokenAllocator = aTokenAllocator;
258 mIsFinalChunk = aIsFinalChunk;
260 // Cause ScanDocStructure to search from here for new tokens...
261 mTokenScanPos = mTokenDeque.GetSize();
262 return NS_OK;
266 * Pushes all of the tokens in aDeque onto the front of our deque so they
267 * get processed before any other tokens.
269 * @param aDeque The deque with the tokens in it.
271 void
272 nsHTMLTokenizer::PrependTokens(nsDeque& aDeque)
274 PRInt32 aCount = aDeque.GetSize();
276 for (PRInt32 anIndex = 0; anIndex < aCount; ++anIndex) {
277 CToken* theToken = (CToken*)aDeque.Pop();
278 PushTokenFront(theToken);
283 * Copies the state flags from aTokenizer into this tokenizer. This is used
284 * to pass information around between the main tokenizer and tokenizers
285 * created for document.write() calls.
287 * @param aTokenizer The tokenizer with more information in it.
288 * @return NS_OK
290 nsresult
291 nsHTMLTokenizer::CopyState(nsITokenizer* aTokenizer)
293 if (aTokenizer) {
294 mFlags = ((nsHTMLTokenizer*)aTokenizer)->mFlags;
297 return NS_OK;
301 * This is a utilty method for ScanDocStructure, which finds a given
302 * tag in the stack. The return value is meant to be used with
303 * nsDeque::ObjectAt() on aTagStack.
305 * @param aTag -- the ID of the tag we're seeking
306 * @param aTagStack -- the stack to be searched
307 * @return index position of tag in stack if found, otherwise kNotFound
309 static PRInt32
310 FindLastIndexOfTag(eHTMLTags aTag, nsDeque &aTagStack)
312 PRInt32 theCount = aTagStack.GetSize();
314 while (0 < theCount) {
315 CHTMLToken* theToken = (CHTMLToken*)aTagStack.ObjectAt(--theCount);
316 if (theToken) {
317 eHTMLTags theTag = (eHTMLTags)theToken->GetTypeID();
318 if (theTag == aTag) {
319 return theCount;
324 return kNotFound;
/**
 * This method scans the sequence of tokens to determine whether or not the
 * tag structure of the document is well formed. In well formed cases, we can
 * skip doing residual style handling and allow inlines to contain block-level
 * elements.
 *
 * @param aFinalChunk Is unused.
 * @return Success (currently, this function cannot fail).
 */
336 nsresult nsHTMLTokenizer::ScanDocStructure(PRBool aFinalChunk)
338 nsresult result = NS_OK;
339 if (!mTokenDeque.GetSize()) {
340 return result;
// mTokenScanPos was set by WillTokenize to the first token queued this chunk.
343 CHTMLToken* theToken = (CHTMLToken*)mTokenDeque.ObjectAt(mTokenScanPos);
345 // Start by finding the first start tag that hasn't been reviewed.
346 while (mTokenScanPos > 0) {
347 if (theToken) {
348 eHTMLTokenTypes theType = eHTMLTokenTypes(theToken->GetTokenType());
349 if (theType == eToken_start &&
350 theToken->GetContainerInfo() == eFormUnknown) {
351 break;
354 theToken = (CHTMLToken*)mTokenDeque.ObjectAt(--mTokenScanPos);
357 // Now that we know where to start, let's walk through the
358 // tokens to see which are well-formed. Stop when you run out
359 // of fresh tokens.
// theStack mirrors the currently-open containers; tempStack is scratch
// space used while unwinding to a mismatched end tag.
361 nsDeque theStack(0);
362 nsDeque tempStack(0);
363 PRInt32 theStackDepth = 0;
364 // Don't bother if we get ridiculously deep.
365 static const PRInt32 theMaxStackDepth = 200;
367 while (theToken && theStackDepth < theMaxStackDepth) {
368 eHTMLTokenTypes theType = eHTMLTokenTypes(theToken->GetTokenType());
369 eHTMLTags theTag = (eHTMLTags)theToken->GetTypeID();
371 if (nsHTMLElement::IsContainer(theTag)) { // Bug 54117
372 PRBool theTagIsBlock = gHTMLElements[theTag].IsMemberOf(kBlockEntity);
373 PRBool theTagIsInline = theTagIsBlock
374 ? PR_FALSE
375 : gHTMLElements[theTag].IsMemberOf(kInlineEntity);
377 if (theTagIsBlock || theTagIsInline || eHTMLTag_table == theTag) {
378 switch(theType) {
379 case eToken_start:
381 if (gHTMLElements[theTag].ShouldVerifyHierarchy()) {
382 PRInt32 earlyPos = FindLastIndexOfTag(theTag, theStack);
383 if (earlyPos != kNotFound) {
384 // Uh-oh, we've found a tag that is not allowed to nest at
385 // all. Mark the previous one and all of its children as
386 // malformed to increase our chances of doing RS handling
387 // on all of them. We want to do this for cases such as:
388 // <a><div><a></a></div></a>.
389 // Note that we have to iterate through all of the children
390 // of the original malformed tag to protect against:
391 // <a><font><div><a></a></div></font></a>, so that the <font>
392 // is allowed to contain the <div>.
393 // XXX What about <a><span><a>, where the second <a> closes
394 // the <span>?
395 nsDequeIterator it(theStack, earlyPos), end(theStack.End());
396 while (it < end) {
397 CHTMLToken *theMalformedToken =
398 static_cast<CHTMLToken*>(it++);
400 theMalformedToken->SetContainerInfo(eMalformed);
// Every start tag of interest is pushed as a newly-open container.
405 theStack.Push(theToken);
406 ++theStackDepth;
408 break;
409 case eToken_end:
411 CHTMLToken *theLastToken =
412 static_cast<CHTMLToken*>(theStack.Peek());
413 if (theLastToken) {
414 if (theTag == theLastToken->GetTypeID()) {
415 theStack.Pop(); // Yank it for real
416 theStackDepth--;
417 theLastToken->SetContainerInfo(eWellFormed);
418 } else {
419 // This token wasn't what we expected it to be! We need to
420 // go searching for its real start tag on our stack. Each
421 // tag in between the end tag and start tag must be malformed
423 if (FindLastIndexOfTag(theTag, theStack) != kNotFound) {
424 // Find theTarget in the stack, marking each (malformed!)
425 // tag in our way.
426 theStack.Pop(); // Pop off theLastToken for real.
427 do {
428 theLastToken->SetContainerInfo(eMalformed);
429 tempStack.Push(theLastToken);
430 theLastToken = static_cast<CHTMLToken*>(theStack.Pop());
431 } while (theLastToken && theTag != theLastToken->GetTypeID());
432 // XXX The above test can confuse two different userdefined
433 // tags.
435 NS_ASSERTION(theLastToken,
436 "FindLastIndexOfTag lied to us!"
437 " We couldn't find theTag on theStack");
438 theLastToken->SetContainerInfo(eMalformed);
440 // Great, now push all of the other tokens back onto the
441 // stack to preserve the general structure of the document.
442 // Note that we don't push the target token back onto the
443 // the stack (since it was just closed).
444 while (tempStack.GetSize() != 0) {
445 theStack.Push(tempStack.Pop());
451 break;
452 default:
453 break;
// Advance to the next queued token; loop ends when we run off the deque.
458 theToken = (CHTMLToken*)mTokenDeque.ObjectAt(++mTokenScanPos);
461 return result;
465 * This method is called after we're done tokenizing a chunk of data.
467 * @param aFinalChunk Tells us if this was the last chunk of data.
468 * @return Error result.
470 nsresult
471 nsHTMLTokenizer::DidTokenize(PRBool aFinalChunk)
473 return ScanDocStructure(aFinalChunk);
477 * This method is repeatedly called by the tokenizer.
478 * Each time, we determine the kind of token we're about to
479 * read, and then we call the appropriate method to handle
480 * that token type.
482 * @param aScanner The source of our input.
483 * @param aFlushTokens An OUT parameter to tell the caller whether it should
484 * process our queued tokens up to now (e.g., when we
485 * reach a <script>).
486 * @return Success or error
488 nsresult
489 nsHTMLTokenizer::ConsumeToken(nsScanner& aScanner, PRBool& aFlushTokens)
491 PRUnichar theChar;
492 CToken* theToken = nsnull;
494 nsresult result = aScanner.Peek(theChar);
496 switch(result) {
497 case kEOF:
498 // Tell our caller that'we finished.
499 return result;
501 case NS_OK:
502 default:
503 if (!(mFlags & NS_IPARSER_FLAG_PLAIN_TEXT)) {
504 if (kLessThan == theChar) {
505 return ConsumeTag(theChar, theToken, aScanner, aFlushTokens);
506 } else if (kAmpersand == theChar) {
507 return ConsumeEntity(theChar, theToken, aScanner);
511 if (kCR == theChar || kLF == theChar) {
512 return ConsumeNewline(theChar, theToken, aScanner);
513 } else {
514 if (!nsCRT::IsAsciiSpace(theChar)) {
515 if (theChar != '\0') {
516 result = ConsumeText(theToken, aScanner);
517 } else {
518 // Skip the embedded null char. Fix bug 64098.
519 aScanner.GetChar(theChar);
521 break;
523 result = ConsumeWhitespace(theChar, theToken, aScanner);
525 break;
528 return result;
/**
 * This method is called just after a "<" has been consumed
 * and we know we're at the start of some kind of tagged
 * element. We don't know yet if it's a tag or a comment.
 *
 * @param aChar is the last char read
 * @param aToken is the out arg holding our new token (the function allocates
 *        the return token using mTokenAllocator).
 * @param aScanner represents our input source
 * @param aFlushTokens is an OUT parameter use to tell consumers to flush
 *        the current tokens after processing the current one.
 * @return error code.
 */
544 nsresult
545 nsHTMLTokenizer::ConsumeTag(PRUnichar aChar,
546 CToken*& aToken,
547 nsScanner& aScanner,
548 PRBool& aFlushTokens)
// Peek one past the "<" to decide what kind of construct follows.
550 PRUnichar theNextChar, oldChar;
551 nsresult result = aScanner.Peek(aChar, 1);
553 if (NS_OK == result) {
554 switch (aChar) {
// "</..." is either a real end tag or a comment-like bogus construct.
555 case kForwardSlash:
556 result = aScanner.Peek(theNextChar, 2);
558 if (NS_OK == result) {
559 // Get the original "<" (we've already seen it with a Peek)
560 aScanner.GetChar(oldChar);
562 // XML allows non ASCII tag names, consume this as an end tag. This
563 // is needed to make XML view source work
564 PRBool isXML = !!(mFlags & NS_IPARSER_FLAG_XML);
565 if (nsCRT::IsAsciiAlpha(theNextChar) ||
566 kGreaterThan == theNextChar ||
567 (isXML && !nsCRT::IsAscii(theNextChar))) {
568 result = ConsumeEndTag(aChar, aToken, aScanner);
569 } else {
570 result = ConsumeComment(aChar, aToken, aScanner);
574 break;
// "<!..." is a comment ("<!-" / "<!>") or DOCTYPE-style special markup.
576 case kExclamation:
577 result = aScanner.Peek(theNextChar, 2);
579 if (NS_OK == result) {
580 // Get the original "<" (we've already seen it with a Peek)
581 aScanner.GetChar(oldChar);
583 if (kMinus == theNextChar || kGreaterThan == theNextChar) {
584 result = ConsumeComment(aChar, aToken, aScanner);
585 } else {
586 result = ConsumeSpecialMarkup(aChar, aToken, aScanner);
589 break;
591 case kQuestionMark:
592 // It must be a processing instruction...
593 // Get the original "<" (we've already seen it with a Peek)
594 aScanner.GetChar(oldChar);
595 result = ConsumeProcessingInstruction(aChar, aToken, aScanner);
596 break;
598 default:
599 // XML allows non ASCII tag names, consume this as a start tag.
600 PRBool isXML = !!(mFlags & NS_IPARSER_FLAG_XML);
601 if (nsCRT::IsAsciiAlpha(aChar) ||
602 (isXML && !nsCRT::IsAscii(aChar))) {
603 // Get the original "<" (we've already seen it with a Peek)
604 aScanner.GetChar(oldChar);
605 result = ConsumeStartTag(aChar, aToken, aScanner, aFlushTokens);
606 } else {
607 // We are not dealing with a tag. So, don't consume the original
608 // char and leave the decision to ConsumeText().
609 result = ConsumeText(aToken, aScanner);
614 // Last ditch attempt to make sure we don't lose data.
615 if (kEOF == result && !aScanner.IsIncremental()) {
616 // Whoops, we don't want to lose any data! Consume the rest as text.
617 // This normally happens for either a trailing < or </
618 result = ConsumeText(aToken, aScanner);
621 return result;
/**
 * This method is called just after we've consumed a start or end
 * tag, and we now have to consume its attributes.
 *
 * @param aChar is the last char read
 * @param aToken is the start or end tag that "owns" these attributes.
 * @param aScanner represents our input source
 * @return Error result.
 */
633 nsresult
634 nsHTMLTokenizer::ConsumeAttributes(PRUnichar aChar,
635 CToken* aToken,
636 nsScanner& aScanner)
638 PRBool done = PR_FALSE;
639 nsresult result = NS_OK;
640 PRInt16 theAttrCount = 0;
642 nsTokenAllocator* theAllocator = this->GetTokenAllocator();
// Consume one attribute token per iteration until we see '>' or an error.
644 while (!done && result == NS_OK) {
645 CAttributeToken* theToken =
646 static_cast<CAttributeToken*>
647 (theAllocator->CreateTokenOfType(eToken_attribute,
648 eHTMLTag_unknown));
649 if (NS_LIKELY(theToken != nsnull)) {
650 // Tell the new token to finish consuming text...
651 result = theToken->Consume(aChar, aScanner, mFlags);
653 if (NS_SUCCEEDED(result)) {
654 ++theAttrCount;
655 AddToken((CToken*&)theToken, result, &mTokenDeque, theAllocator);
656 } else {
657 IF_FREE(theToken, mTokenAllocator);
658 // Bad attribute returns shouldn't propagate out.
659 if (NS_ERROR_HTMLPARSER_BADATTRIBUTE == result) {
660 result = NS_OK;
664 else {
665 result = NS_ERROR_OUT_OF_MEMORY;
668 #ifdef DEBUG
669 if (NS_SUCCEEDED(result)) {
670 PRInt32 newline = 0;
671 aScanner.SkipWhitespace(newline);
672 NS_ASSERTION(newline == 0,
673 "CAttribute::Consume() failed to collect all the newlines!");
675 #endif
676 if (NS_SUCCEEDED(result)) {
677 result = aScanner.Peek(aChar);
678 if (NS_SUCCEEDED(result)) {
679 if (aChar == kGreaterThan) { // You just ate the '>'
680 aScanner.GetChar(aChar); // Skip the '>'
681 done = PR_TRUE;
// A stray '<' inside the tag: stop and mark the owning tag as in error.
682 } else if (aChar == kLessThan) {
683 aToken->SetInError(PR_TRUE);
684 done = PR_TRUE;
690 if (NS_FAILED(result)) {
691 aToken->SetInError(PR_TRUE);
// If we truly ran out of data (non-incremental), don't surface the failure.
693 if (!aScanner.IsIncremental()) {
694 result = NS_OK;
698 aToken->SetAttributeCount(theAttrCount);
699 return result;
/**
 * This method consumes a start tag and all of its attributes.
 *
 * @param aChar The last character read from the scanner.
 * @param aToken The OUT parameter that holds our resulting token (allocated
 *        by the function using mTokenAllocator).
 * @param aScanner Our source of data
 * @param aFlushTokens is an OUT parameter use to tell consumers to flush
 *        the current tokens after processing the current one.
 * @return Error result.
 */
713 nsresult
714 nsHTMLTokenizer::ConsumeStartTag(PRUnichar aChar,
715 CToken*& aToken,
716 nsScanner& aScanner,
717 PRBool& aFlushTokens)
719 // Remember this for later in case you have to unwind...
720 PRInt32 theDequeSize = mTokenDeque.GetSize();
721 nsresult result = NS_OK;
723 nsTokenAllocator* theAllocator = this->GetTokenAllocator();
724 aToken = theAllocator->CreateTokenOfType(eToken_start, eHTMLTag_unknown);
725 NS_ENSURE_TRUE(aToken, NS_ERROR_OUT_OF_MEMORY);
727 // Tell the new token to finish consuming text...
728 result = aToken->Consume(aChar, aScanner, mFlags);
730 if (NS_SUCCEEDED(result)) {
731 AddToken(aToken, result, &mTokenDeque, theAllocator);
733 eHTMLTags theTag = (eHTMLTags)aToken->GetTypeID();
735 // Good. Now, let's see if the next char is ">".
736 // If so, we have a complete tag, otherwise, we have attributes.
737 result = aScanner.Peek(aChar);
738 if (NS_FAILED(result)) {
739 aToken->SetInError(PR_TRUE);
741 // Don't return early here so we can create a text and end token for
742 // the special <iframe>, <script> and similar tags down below.
743 result = NS_OK;
744 } else {
745 if (kGreaterThan != aChar) { // Look for a '>'
746 result = ConsumeAttributes(aChar, aToken, aScanner);
747 } else {
748 aScanner.GetChar(aChar);
752 /* Now that that's over with, we have one more problem to solve.
753 In the case that we just read a <SCRIPT> or <STYLE> tags, we should go and
754 consume all the content itself.
755 But XML doesn't treat these tags differently, so we shouldn't if the
756 document is XML.
758 if (NS_SUCCEEDED(result) && !(mFlags & NS_IPARSER_FLAG_XML)) {
759 PRBool isCDATA = gHTMLElements[theTag].CanContainType(kCDATA);
760 PRBool isPCDATA = eHTMLTag_textarea == theTag ||
761 eHTMLTag_title == theTag;
763 // XXX This is an evil hack, we should be able to handle these properly
764 // in the DTD.
765 if ((eHTMLTag_iframe == theTag &&
766 (mFlags & NS_IPARSER_FLAG_FRAMES_ENABLED)) ||
767 (eHTMLTag_noframes == theTag &&
768 (mFlags & NS_IPARSER_FLAG_FRAMES_ENABLED)) ||
769 (eHTMLTag_noscript == theTag &&
770 (mFlags & NS_IPARSER_FLAG_SCRIPT_ENABLED)) ||
771 (eHTMLTag_noembed == theTag)) {
772 isCDATA = PR_TRUE;
775 // Plaintext contains CDATA, but it's special, so we handle it
776 // differently than the other CDATA elements
777 if (eHTMLTag_plaintext == theTag) {
778 isCDATA = PR_FALSE;
780 // Note: We check in ConsumeToken() for this flag, and if we see it
781 // we only construct text tokens (which is what we want).
782 mFlags |= NS_IPARSER_FLAG_PLAIN_TEXT;
// For raw-text-like content, swallow everything up to the matching close
// tag as one big text token, then consume (or fabricate) the end tag.
786 if (isCDATA || isPCDATA) {
787 PRBool done = PR_FALSE;
788 nsDependentString endTagName(nsHTMLTags::GetStringValue(theTag));
790 CToken* text =
791 theAllocator->CreateTokenOfType(eToken_text, eHTMLTag_text);
792 NS_ENSURE_TRUE(text, NS_ERROR_OUT_OF_MEMORY);
794 CTextToken* textToken = static_cast<CTextToken*>(text);
796 if (isCDATA) {
797 result = textToken->ConsumeCharacterData(theTag != eHTMLTag_script,
798 aScanner,
799 endTagName,
800 mFlags,
801 done);
803 // Only flush tokens for <script>, to give ourselves more of a
804 // chance of allowing inlines to contain blocks.
805 aFlushTokens = done && theTag == eHTMLTag_script;
806 } else if (isPCDATA) {
807 // Title is consumed conservatively in order to not regress
808 // bug 42945
809 result = textToken->ConsumeParsedCharacterData(
810 theTag == eHTMLTag_textarea,
811 theTag == eHTMLTag_title,
812 aScanner,
813 endTagName,
814 mFlags,
815 done);
817 // Note: we *don't* set aFlushTokens here.
820 // We want to do this unless result is kEOF, in which case we will
821 // simply unwind our stack and wait for more data anyway.
822 if (kEOF != result) {
823 AddToken(text, NS_OK, &mTokenDeque, theAllocator);
824 CToken* endToken = nsnull;
826 if (NS_SUCCEEDED(result) && done) {
827 PRUnichar theChar;
828 // Get the <
829 result = aScanner.GetChar(theChar);
830 NS_ASSERTION(NS_SUCCEEDED(result) && theChar == kLessThan,
831 "CTextToken::Consume*Data is broken!");
832 #ifdef DEBUG
833 // Ensure we have a /
834 PRUnichar tempChar; // Don't change non-debug vars in debug-only code
835 result = aScanner.Peek(tempChar);
836 NS_ASSERTION(NS_SUCCEEDED(result) && tempChar == kForwardSlash,
837 "CTextToken::Consume*Data is broken!");
838 #endif
839 result = ConsumeEndTag(PRUnichar('/'), endToken, aScanner);
840 if (!(mFlags & NS_IPARSER_FLAG_VIEW_SOURCE) &&
841 NS_SUCCEEDED(result)) {
842 // If ConsumeCharacterData returned a success result (and
843 // we're not in view source), then we want to make sure that
844 // we're going to execute this script (since the result means
845 // that we've found an end tag that satisfies all of the right
846 // conditions).
847 endToken->SetInError(PR_FALSE);
849 } else if (result == kFakeEndTag &&
850 !(mFlags & NS_IPARSER_FLAG_VIEW_SOURCE)) {
851 result = NS_OK;
852 endToken = theAllocator->CreateTokenOfType(eToken_end, theTag,
853 endTagName);
854 AddToken(endToken, result, &mTokenDeque, theAllocator);
855 if (NS_LIKELY(endToken != nsnull)) {
856 endToken->SetInError(PR_TRUE);
858 else {
859 result = NS_ERROR_OUT_OF_MEMORY;
861 } else if (result == kFakeEndTag) {
862 // If we are here, we are both faking having seen the end tag
863 // and are in view-source.
864 result = NS_OK;
866 } else {
867 IF_FREE(text, mTokenAllocator);
872 // This code is confusing, so pay attention.
873 // If you're here, it's because we were in the midst of consuming a start
874 // tag but ran out of data (not in the stream, but in this *part* of the
875 // stream. For simplicity, we have to unwind our input. Therefore, we pop
876 // and discard any new tokens we've queued this round. Later we can get
877 // smarter about this.
878 if (NS_FAILED(result)) {
879 while (mTokenDeque.GetSize()>theDequeSize) {
880 CToken* theToken = (CToken*)mTokenDeque.Pop();
881 IF_FREE(theToken, mTokenAllocator);
884 } else {
885 IF_FREE(aToken, mTokenAllocator);
888 return result;
892 * This method consumes an end tag and any "attributes" that may come after it.
894 * @param aChar The last character read from the scanner.
895 * @param aToken The OUT parameter that holds our resulting token.
896 * @param aScanner Our source of data
897 * @return Error result
899 nsresult
900 nsHTMLTokenizer::ConsumeEndTag(PRUnichar aChar,
901 CToken*& aToken,
902 nsScanner& aScanner)
904 // Get the "/" (we've already seen it with a Peek)
905 aScanner.GetChar(aChar);
907 nsTokenAllocator* theAllocator = this->GetTokenAllocator();
908 aToken = theAllocator->CreateTokenOfType(eToken_end, eHTMLTag_unknown);
909 NS_ENSURE_TRUE(aToken, NS_ERROR_OUT_OF_MEMORY);
911 // Remember this for later in case you have to unwind...
912 PRInt32 theDequeSize = mTokenDeque.GetSize();
913 nsresult result = NS_OK;
915 // Tell the new token to finish consuming text...
916 result = aToken->Consume(aChar, aScanner, mFlags);
917 AddToken(aToken, result, &mTokenDeque, theAllocator);
918 if (NS_FAILED(result)) {
919 // Note that this early-return here is safe because we have not yet
920 // added any of our tokens to the queue (AddToken only adds the token if
921 // result is a success), so we don't need to fall through.
922 return result;
925 result = aScanner.Peek(aChar);
926 if (NS_FAILED(result)) {
927 aToken->SetInError(PR_TRUE);
929 // Note: We know here that the scanner is not incremental since if
930 // this peek fails, then we've already masked over a kEOF coming from
931 // the Consume() call above.
932 return NS_OK;
935 if (kGreaterThan != aChar) {
936 result = ConsumeAttributes(aChar, aToken, aScanner);
937 } else {
938 aScanner.GetChar(aChar);
941 // Do the same thing as we do in ConsumeStartTag. Basically, if we've run
942 // out of room in this *section* of the document, pop all of the tokens
943 // we've consumed this round and wait for more data.
944 if (NS_FAILED(result)) {
945 while (mTokenDeque.GetSize() > theDequeSize) {
946 CToken* theToken = (CToken*)mTokenDeque.Pop();
947 IF_FREE(theToken, mTokenAllocator);
951 return result;
955 * This method is called just after a "&" has been consumed
956 * and we know we're at the start of an entity.
958 * @param aChar The last character read from the scanner.
959 * @param aToken The OUT parameter that holds our resulting token.
960 * @param aScanner Our source of data
961 * @return Error result.
963 nsresult
964 nsHTMLTokenizer::ConsumeEntity(PRUnichar aChar,
965 CToken*& aToken,
966 nsScanner& aScanner)
968 PRUnichar theChar;
969 nsresult result = aScanner.Peek(theChar, 1);
971 nsTokenAllocator* theAllocator = this->GetTokenAllocator();
972 if (NS_SUCCEEDED(result)) {
973 if (nsCRT::IsAsciiAlpha(theChar) || theChar == kHashsign) {
974 aToken = theAllocator->CreateTokenOfType(eToken_entity, eHTMLTag_entity);
975 NS_ENSURE_TRUE(aToken, NS_ERROR_OUT_OF_MEMORY);
976 result = aToken->Consume(theChar, aScanner, mFlags);
978 if (result == NS_HTMLTOKENS_NOT_AN_ENTITY) {
979 IF_FREE(aToken, mTokenAllocator);
980 } else {
981 if (result == kEOF && !aScanner.IsIncremental()) {
982 result = NS_OK; // Use as much of the entity as you can get.
985 AddToken(aToken, result, &mTokenDeque, theAllocator);
986 return result;
990 // Oops, we're actually looking at plain text...
991 result = ConsumeText(aToken, aScanner);
992 } else if (result == kEOF && !aScanner.IsIncremental()) {
993 // If the last character in the file is an &, consume it as text.
994 result = ConsumeText(aToken, aScanner);
995 if (aToken) {
996 aToken->SetInError(PR_TRUE);
1000 return result;
1005 * This method is called just after whitespace has been
1006 * consumed and we know we're at the start a whitespace run.
1008 * @param aChar The last character read from the scanner.
1009 * @param aToken The OUT parameter that holds our resulting token.
1010 * @param aScanner Our source of data
1011 * @return Error result.
1013 nsresult
1014 nsHTMLTokenizer::ConsumeWhitespace(PRUnichar aChar,
1015 CToken*& aToken,
1016 nsScanner& aScanner)
1018 // Get the whitespace character
1019 aScanner.GetChar(aChar);
1021 nsTokenAllocator* theAllocator = this->GetTokenAllocator();
1022 aToken = theAllocator->CreateTokenOfType(eToken_whitespace,
1023 eHTMLTag_whitespace);
1024 nsresult result = NS_OK;
1025 if (aToken) {
1026 result = aToken->Consume(aChar, aScanner, mFlags);
1027 AddToken(aToken, result, &mTokenDeque, theAllocator);
1030 return result;
1034 * This method is called just after a "<!" has been consumed
1035 * and we know we're at the start of a comment.
1037 * @param aChar The last character read from the scanner.
1038 * @param aToken The OUT parameter that holds our resulting token.
1039 * @param aScanner Our source of data
1040 * @return Error result.
1042 nsresult
1043 nsHTMLTokenizer::ConsumeComment(PRUnichar aChar,
1044 CToken*& aToken,
1045 nsScanner& aScanner)
1047 // Get the "!"
1048 aScanner.GetChar(aChar);
1050 nsTokenAllocator* theAllocator = this->GetTokenAllocator();
1051 aToken = theAllocator->CreateTokenOfType(eToken_comment, eHTMLTag_comment);
1052 nsresult result = NS_OK;
1053 if (aToken) {
1054 result = aToken->Consume(aChar, aScanner, mFlags);
1055 AddToken(aToken, result, &mTokenDeque, theAllocator);
1058 if (kNotAComment == result) {
1059 // AddToken has IF_FREE()'d our token, so...
1060 result = ConsumeText(aToken, aScanner);
1063 return result;
1067 * This method is called just after a known text char has
1068 * been consumed and we should read a text run. Note: we actually ignore the
1069 * first character of the text run so that we can consume invalid markup
1070 * as text.
1072 * @param aToken The OUT parameter that holds our resulting token.
1073 * @param aScanner Our source of data
1074 * @return Error result.
1076 nsresult
1077 nsHTMLTokenizer::ConsumeText(CToken*& aToken, nsScanner& aScanner)
1079 nsresult result = NS_OK;
1080 nsTokenAllocator* theAllocator = this->GetTokenAllocator();
// Create a text token and hand it the scanner; the token pulls the whole
// text run itself (the first character was already consumed by our caller).
1081 CTextToken* theToken =
1082 (CTextToken*)theAllocator->CreateTokenOfType(eToken_text, eHTMLTag_text);
1083 if (theToken) {
1084 PRUnichar ch = '\0';
1085 result = theToken->Consume(ch, aScanner, mFlags);
// On failure with nothing consumed, give up and clear the out-param;
// otherwise keep whatever text we did get and report success.
1086 if (NS_FAILED(result)) {
1087 if (0 == theToken->GetTextLength()) {
// NOTE(review): this frees via aToken rather than theToken — presumably
// releasing the caller's incoming token; confirm IF_FREE semantics here.
1088 IF_FREE(aToken, mTokenAllocator);
1089 aToken = nsnull;
1090 } else {
1091 result = NS_OK;
// Publish the text token to the caller and queue it in the token deque.
1095 aToken = theToken;
1096 AddToken(aToken, result, &mTokenDeque, theAllocator);
1099 return result;
1103 * This method is called just after a "<!" has been consumed.
1104 * NOTE: Here we might consume DOCTYPE and "special" markups.
1106 * @param aChar The last character read from the scanner.
1107 * @param aToken The OUT parameter that holds our resulting token.
1108 * @param aScanner Our source of data
1109 * @return Error result.
1111 nsresult
1112 nsHTMLTokenizer::ConsumeSpecialMarkup(PRUnichar aChar,
1113 CToken*& aToken,
1114 nsScanner& aScanner)
1116 // Get the "!"
1117 aScanner.GetChar(aChar);
1119 nsresult result = NS_OK;
1120 nsAutoString theBufCopy;
1121 aScanner.Peek(theBufCopy, 20);
1122 ToUpperCase(theBufCopy);
1123 PRInt32 theIndex = theBufCopy.Find("DOCTYPE", PR_FALSE, 0, 0);
1124 nsTokenAllocator* theAllocator = this->GetTokenAllocator();
1126 if (theIndex == kNotFound) {
1127 if ('[' == theBufCopy.CharAt(0)) {
1128 aToken = theAllocator->CreateTokenOfType(eToken_cdatasection,
1129 eHTMLTag_comment);
1130 } else if (StringBeginsWith(theBufCopy, NS_LITERAL_STRING("ELEMENT")) ||
1131 StringBeginsWith(theBufCopy, NS_LITERAL_STRING("ATTLIST")) ||
1132 StringBeginsWith(theBufCopy, NS_LITERAL_STRING("ENTITY")) ||
1133 StringBeginsWith(theBufCopy, NS_LITERAL_STRING("NOTATION"))) {
1134 aToken = theAllocator->CreateTokenOfType(eToken_markupDecl,
1135 eHTMLTag_markupDecl);
1136 } else {
1137 aToken = theAllocator->CreateTokenOfType(eToken_comment,
1138 eHTMLTag_comment);
1140 } else {
1141 aToken = theAllocator->CreateTokenOfType(eToken_doctypeDecl,
1142 eHTMLTag_doctypeDecl);
1145 if (aToken) {
1146 result = aToken->Consume(aChar, aScanner, mFlags);
1147 AddToken(aToken, result, &mTokenDeque, theAllocator);
1150 if (result == kNotAComment) {
1151 result = ConsumeText(aToken, aScanner);
1154 return result;
1158 * This method is called just after a newline has been consumed.
1160 * @param aChar The last character read from the scanner.
1161 * @param aToken The OUT parameter that holds our resulting token.
1162 * @param aScanner Our source of data
1163 * @return Error result.
1165 nsresult
1166 nsHTMLTokenizer::ConsumeNewline(PRUnichar aChar,
1167 CToken*& aToken,
1168 nsScanner& aScanner)
1170 // Get the newline character
1171 aScanner.GetChar(aChar);
1173 nsTokenAllocator* theAllocator = this->GetTokenAllocator();
1174 aToken = theAllocator->CreateTokenOfType(eToken_newline, eHTMLTag_newline);
1175 nsresult result = NS_OK;
1176 if (aToken) {
1177 result = aToken->Consume(aChar, aScanner, mFlags);
1178 AddToken(aToken, result, &mTokenDeque, theAllocator);
1181 return result;
1186 * This method is called just after a <? has been consumed.
1188 * @param aChar The last character read from the scanner.
1189 * @param aToken The OUT parameter that holds our resulting token.
1190 * @param aScanner Our source of data
1191 * @return Error result.
1193 nsresult
1194 nsHTMLTokenizer::ConsumeProcessingInstruction(PRUnichar aChar,
1195 CToken*& aToken,
1196 nsScanner& aScanner)
1198 // Get the "?"
1199 aScanner.GetChar(aChar);
1201 nsTokenAllocator* theAllocator = this->GetTokenAllocator();
1202 aToken = theAllocator->CreateTokenOfType(eToken_instruction,
1203 eHTMLTag_unknown);
1204 nsresult result = NS_OK;
1205 if (aToken) {
1206 result = aToken->Consume(aChar, aScanner, mFlags);
1207 AddToken(aToken, result, &mTokenDeque, theAllocator);
1210 return result;