parser/htmlparser/src/nsHTMLTokenizer.cpp

   1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
   2 /* vim: set sw=2 ts=2 et tw=78: */
   3 /* ***** BEGIN LICENSE BLOCK *****
   4  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
   5  *
   6  * The contents of this file are subject to the Mozilla Public License Version
   7  * 1.1 (the "License"); you may not use this file except in compliance with
   8  * the License. You may obtain a copy of the License at
   9  * http://www.mozilla.org/MPL/
  10  *
  11  * Software distributed under the License is distributed on an "AS IS" basis,
  12  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
  13  * for the specific language governing rights and limitations under the
  14  * License.
  15  *
  16  * The Original Code is mozilla.org code.
  17  *
  18  * The Initial Developer of the Original Code is
  19  * Netscape Communications Corporation.
  20  * Portions created by the Initial Developer are Copyright (C) 1998
  21  * the Initial Developer. All Rights Reserved.
  22  *
  23  * Contributor(s):
  24  *   Blake Kaplan <mrbkap@gmail.com>
  25  *
  26  * Alternatively, the contents of this file may be used under the terms of
  27  * either of the GNU General Public License Version 2 or later (the "GPL"),
  28  * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
  29  * in which case the provisions of the GPL or the LGPL are applicable instead
  30  * of those above. If you wish to allow use of your version of this file only
  31  * under the terms of either the GPL or the LGPL, and not to allow others to
  32  * use your version of this file under the terms of the MPL, indicate your
  33  * decision by deleting the provisions above and replace them with the notice
  34  * and other provisions required by the GPL or the LGPL. If you do not delete
  35  * the provisions above, a recipient may use your version of this file under
  36  * the terms of any one of the MPL, the GPL or the LGPL.
  37  *
  38  * ***** END LICENSE BLOCK ***** */
  39
  40
  41 /**
  42  * @file nsHTMLTokenizer.cpp
  43  * This is an implementation of the nsITokenizer interface.
  44  * This file contains the implementation of a tokenizer to tokenize an HTML
  45  * document. It attempts to do so, making tradeoffs between compatibility with
  46  * older parsers and the SGML specification. Note that most of the real
  47  * "tokenization" takes place in nsHTMLTokens.cpp.
  48  */
  49
  50 #include "nsIAtom.h"
  51 #include "nsHTMLTokenizer.h"
  52 #include "nsScanner.h"
  53 #include "nsElementTable.h"
  54 #include "nsReadableUtils.h"
  55 #include "nsUnicharUtils.h"
  56
  57 /************************************************************************
  58   And now for the main class -- nsHTMLTokenizer...
  59  ************************************************************************/
  60
/**
 * Satisfy the nsISupports interface. The macro is invoked with
 * nsHTMLTokenizer as the implementing class and nsITokenizer as the one
 * interface listed for it.
 *
 * NOTE(review): presumably this expands to the reference-counting and
 * interface-query boilerplate for the class -- confirm against the macro's
 * definition, which is not visible in this file.
 */
NS_IMPL_ISUPPORTS1(nsHTMLTokenizer, nsITokenizer)
  65
  66 /**
  67  * Default constructor
  68  *
  69  * @param  aParseMode The current mode the document is in (quirks, etc.)
  70  * @param  aDocType The document type of the current document
  71  * @param  aCommand What we are trying to do (view-source, parse a fragment, etc.)
  72  */
  73 nsHTMLTokenizer::nsHTMLTokenizer(PRInt32 aParseMode,
  74                                  eParserDocType aDocType,
  75                                  eParserCommands aCommand,
  76                                  PRUint16 aFlags) :
  77   nsITokenizer(), mTokenDeque(0), mFlags(aFlags)
  78 {
  79   if (aParseMode == eDTDMode_full_standards ||
  80       aParseMode == eDTDMode_almost_standards) {
  81     mFlags |= NS_IPARSER_FLAG_STRICT_MODE;
  82   } else if (aParseMode == eDTDMode_quirks)  {
  83     mFlags |= NS_IPARSER_FLAG_QUIRKS_MODE;
  84   } else if (aParseMode == eDTDMode_autodetect) {
  85     mFlags |= NS_IPARSER_FLAG_AUTO_DETECT_MODE;
  86   } else {
  87     mFlags |= NS_IPARSER_FLAG_UNKNOWN_MODE;
  88   }
  89
  90   if (aDocType == ePlainText) {
  91     mFlags |= NS_IPARSER_FLAG_PLAIN_TEXT;
  92   } else if (aDocType == eXML) {
  93     mFlags |= NS_IPARSER_FLAG_XML;
  94   } else if (aDocType == eHTML_Quirks ||
  95              aDocType == eHTML_Strict) {
  96     mFlags |= NS_IPARSER_FLAG_HTML;
  97   }
  98
  99   mFlags |= aCommand == eViewSource
 100             ? NS_IPARSER_FLAG_VIEW_SOURCE
 101             : NS_IPARSER_FLAG_VIEW_NORMAL;
 102
 103   NS_ASSERTION(!(mFlags & NS_IPARSER_FLAG_XML) ||
 104                 (mFlags & NS_IPARSER_FLAG_VIEW_SOURCE),
 105               "Why isn't this XML document going through our XML parser?");
 106
 107   mTokenAllocator = nsnull;
 108   mTokenScanPos = 0;
 109 }
 110
 111 /**
 112  * The destructor ensures that we don't leak any left over tokens.
 113  */
 114 nsHTMLTokenizer::~nsHTMLTokenizer()
 115 {
 116   if (mTokenDeque.GetSize()) {
 117     CTokenDeallocator theDeallocator(mTokenAllocator->GetArenaPool());
 118     mTokenDeque.ForEach(theDeallocator);
 119   }
 120 }
 121
 122
 123 /*******************************************************************
 124   Here begins the real working methods for the tokenizer.
 125  *******************************************************************/
 126
 127 /**
 128  * Adds a token onto the end of the deque if aResult is a successful result.
 129  * Otherwise, this function frees aToken and sets it to nsnull.
 130  *
 131  * @param aToken The token that wants to be added.
 132  * @param aResult The error code that will be used to determine if we actually
 133  *                want to push this token.
 134  * @param aDeque The deque we want to push aToken onto.
 135  * @param aTokenAllocator The allocator we use to free aToken in case aResult
 136  *                        is not a success code.
 137  */
 138 /* static */
 139 void
 140 nsHTMLTokenizer::AddToken(CToken*& aToken,
 141                           nsresult aResult,
 142                           nsDeque* aDeque,
 143                           nsTokenAllocator* aTokenAllocator)
 144 {
 145   if (aToken && aDeque) {
 146     if (NS_SUCCEEDED(aResult)) {
 147       aDeque->Push(aToken);
 148     } else {
 149       IF_FREE(aToken, aTokenAllocator);
 150     }
 151   }
 152 }
 153
 154 /**
 155  * Retrieve a pointer to the global token recycler...
 156  *
 157  * @return Pointer to recycler (or null)
 158  */
 159 nsTokenAllocator*
 160 nsHTMLTokenizer::GetTokenAllocator()
 161 {
 162   return mTokenAllocator;
 163 }
 164
 165 /**
 166  * This method provides access to the topmost token in the tokenDeque.
 167  * The token is not really removed from the list.
 168  *
 169  * @return Pointer to token
 170  */
 171 CToken*
 172 nsHTMLTokenizer::PeekToken()
 173 {
 174   return (CToken*)mTokenDeque.PeekFront();
 175 }
 176
 177 /**
 178  * This method provides access to the topmost token in the tokenDeque.
 179  * The token is really removed from the list; if the list is empty we return 0.
 180  *
 181  * @return Pointer to token or NULL
 182  */
 183 CToken*
 184 nsHTMLTokenizer::PopToken()
 185 {
 186   return (CToken*)mTokenDeque.PopFront();
 187 }
 188
 189
 190 /**
 191  * Pushes a token onto the front of our deque such that the next call to
 192  * PopToken() or PeekToken() will return that token.
 193  *
 194  * @param theToken The next token to be processed
 195  * @return theToken
 196  */
 197 CToken*
 198 nsHTMLTokenizer::PushTokenFront(CToken* theToken)
 199 {
 200   mTokenDeque.PushFront(theToken);
 201   return theToken;
 202 }
 203
 204 /**
 205  * Pushes a token onto the deque.
 206  *
 207  * @param theToken the new token.
 208  * @return theToken
 209  */
 210 CToken*
 211 nsHTMLTokenizer::PushToken(CToken* theToken)
 212 {
 213   mTokenDeque.Push(theToken);
 214   return theToken;
 215 }
 216
 217 /**
 218  * Returns the size of the deque.
 219  *
 220  * @return The number of remaining tokens.
 221  */
 222 PRInt32
 223 nsHTMLTokenizer::GetCount()
 224 {
 225   return mTokenDeque.GetSize();
 226 }
 227
 228 /**
 229  * Allows access to an arbitrary token in the deque. The accessed token is left
 230  * in the deque.
 231  *
 232  * @param anIndex The index of the target token. Token 0 would be the same as
 233  *                the result of a call to PeekToken()
 234  * @return The requested token.
 235  */
 236 CToken*
 237 nsHTMLTokenizer::GetTokenAt(PRInt32 anIndex)
 238 {
 239   return (CToken*)mTokenDeque.ObjectAt(anIndex);
 240 }
 241
 242 /**
 243  * This method is part of the "sandwich" that occurs when we want to tokenize
 244  * a document. This prepares us to be able to tokenize properly.
 245  *
 246  * @param aIsFinalChunk Whether this is the last chunk of data that we will
 247  *                      get to see.
 248  * @param aTokenAllocator The token allocator to use for this document.
 249  * @return Our success in setting up.
 250  */
 251 nsresult
 252 nsHTMLTokenizer::WillTokenize(PRBool aIsFinalChunk,
 253                               nsTokenAllocator* aTokenAllocator)
 254 {
 255   mTokenAllocator = aTokenAllocator;
 256   mIsFinalChunk = aIsFinalChunk;
 257
 258   // Cause ScanDocStructure to search from here for new tokens...
 259   mTokenScanPos = mTokenDeque.GetSize();
 260   return NS_OK;
 261 }
 262
 263 /**
 264  * Pushes all of the tokens in aDeque onto the front of our deque so they
 265  * get processed before any other tokens.
 266  *
 267  * @param aDeque The deque with the tokens in it.
 268  */
 269 void
 270 nsHTMLTokenizer::PrependTokens(nsDeque& aDeque)
 271 {
 272   PRInt32 aCount = aDeque.GetSize();
 273
 274   for (PRInt32 anIndex = 0; anIndex < aCount; ++anIndex) {
 275     CToken* theToken = (CToken*)aDeque.Pop();
 276     PushTokenFront(theToken);
 277   }
 278 }
 279
 280 /**
 281  * Copies the state flags from aTokenizer into this tokenizer. This is used
 282  * to pass information around between the main tokenizer and tokenizers
 283  * created for document.write() calls.
 284  *
 285  * @param aTokenizer The tokenizer with more information in it.
 286  * @return NS_OK
 287  */
 288 nsresult
 289 nsHTMLTokenizer::CopyState(nsITokenizer* aTokenizer)
 290 {
 291   if (aTokenizer) {
 292     mFlags = ((nsHTMLTokenizer*)aTokenizer)->mFlags;
 293   }
 294
 295   return NS_OK;
 296 }
 297
 298 /**
 299  * This is a utilty method for ScanDocStructure, which finds a given
 300  * tag in the stack. The return value is meant to be used with
 301  * nsDeque::ObjectAt() on aTagStack.
 302  *
 303  * @param   aTag -- the ID of the tag we're seeking
 304  * @param   aTagStack -- the stack to be searched
 305  * @return  index position of tag in stack if found, otherwise kNotFound
 306  */
 307 static PRInt32
 308 FindLastIndexOfTag(eHTMLTags aTag, nsDeque &aTagStack)
 309 {
 310   PRInt32 theCount = aTagStack.GetSize();
 311
 312   while (0 < theCount) {
 313     CHTMLToken* theToken = (CHTMLToken*)aTagStack.ObjectAt(--theCount);
 314     if (theToken) {
 315       eHTMLTags theTag = (eHTMLTags)theToken->GetTypeID();
 316       if (theTag == aTag) {
 317         return theCount;
 318       }
 319     }
 320   }
 321
 322   return kNotFound;
 323 }
 324
 325 /**
 326  * This method scans the sequence of tokens to determine whether or not the
 327  * tag structure of the document is well formed. In well formed cases, we can
 328  * skip doing residual style handling and allow inlines to contain block-level
 329  * elements.
 330  *
 331  * @param aFinalChunk Is unused.
 332  * @return Success (currently, this function cannot fail).
 333  */
 334 nsresult nsHTMLTokenizer::ScanDocStructure(PRBool aFinalChunk)
 335 {
 336   nsresult result = NS_OK;
 337   if (!mTokenDeque.GetSize()) {
 338     return result;
 339   }
 340
 341   CHTMLToken* theToken = (CHTMLToken*)mTokenDeque.ObjectAt(mTokenScanPos);
 342
 343   // Start by finding the first start tag that hasn't been reviewed.
 344   while (mTokenScanPos > 0) {
 345     if (theToken) {
 346       eHTMLTokenTypes theType = eHTMLTokenTypes(theToken->GetTokenType());
 347       if (theType == eToken_start &&
 348           theToken->GetContainerInfo() == eFormUnknown) {
 349         break;
 350       }
 351     }
 352     theToken = (CHTMLToken*)mTokenDeque.ObjectAt(--mTokenScanPos);
 353   }
 354
 355   // Now that we know where to start, let's walk through the
 356   // tokens to see which are well-formed. Stop when you run out
 357   // of fresh tokens.
 358
 359   nsDeque       theStack(0);
 360   nsDeque       tempStack(0);
 361   PRInt32       theStackDepth = 0;
 362   // Don't bother if we get ridiculously deep.
 363   static  const PRInt32 theMaxStackDepth = 200;
 364
 365   while (theToken && theStackDepth < theMaxStackDepth) {
 366     eHTMLTokenTypes theType = eHTMLTokenTypes(theToken->GetTokenType());
 367     eHTMLTags       theTag  = (eHTMLTags)theToken->GetTypeID();
 368
 369     if (nsHTMLElement::IsContainer(theTag)) { // Bug 54117
 370       PRBool theTagIsBlock  = gHTMLElements[theTag].IsMemberOf(kBlockEntity);
 371       PRBool theTagIsInline = theTagIsBlock
 372                               ? PR_FALSE
 373                               : gHTMLElements[theTag].IsMemberOf(kInlineEntity);
 374
 375       if (theTagIsBlock || theTagIsInline || eHTMLTag_table == theTag) {
 376         switch(theType) {
 377           case eToken_start:
 378             {
 379               if (gHTMLElements[theTag].ShouldVerifyHierarchy()) {
 380                 PRInt32 earlyPos = FindLastIndexOfTag(theTag, theStack);
 381                 if (earlyPos != kNotFound) {
 382                   // Uh-oh, we've found a tag that is not allowed to nest at
 383                   // all. Mark the previous one and all of its children as
 384                   // malformed to increase our chances of doing RS handling
 385                   // on all of them. We want to do this for cases such as:
 386                   // <a><div><a></a></div></a>.
 387                   // Note that we have to iterate through all of the chilren
 388                   // of the original malformed tag to protect against:
 389                   // <a><font><div><a></a></div></font></a>, so that the <font>
 390                   // is allowed to contain the <div>.
 391                   // XXX What about <a><span><a>, where the second <a> closes
 392                   // the <span>?
 393                   nsDequeIterator it(theStack, earlyPos), end(theStack.End());
 394                   while (it < end) {
 395                     CHTMLToken *theMalformedToken =
 396                         static_cast<CHTMLToken*>(it++);
 397
 398                     theMalformedToken->SetContainerInfo(eMalformed);
 399                   }
 400                 }
 401               }
 402
 403               theStack.Push(theToken);
 404               ++theStackDepth;
 405             }
 406             break;
 407           case eToken_end:
 408             {
 409               CHTMLToken *theLastToken =
 410                 static_cast<CHTMLToken*>(theStack.Peek());
 411               if (theLastToken) {
 412                 if (theTag == theLastToken->GetTypeID()) {
 413                   theStack.Pop(); // Yank it for real
 414                   theStackDepth--;
 415                   theLastToken->SetContainerInfo(eWellFormed);
 416                 } else {
 417                   // This token wasn't what we expected it to be! We need to
 418                   // go searching for its real start tag on our stack. Each
 419                   // tag in between the end tag and start tag must be malformed
 420
 421                   if (FindLastIndexOfTag(theTag, theStack) != kNotFound) {
 422                     // Find theTarget in the stack, marking each (malformed!)
 423                     // tag in our way.
 424                     theStack.Pop(); // Pop off theLastToken for real.
 425                     do {
 426                       theLastToken->SetContainerInfo(eMalformed);
 427                       tempStack.Push(theLastToken);
 428                       theLastToken = static_cast<CHTMLToken*>(theStack.Pop());
 429                     } while (theLastToken && theTag != theLastToken->GetTypeID());
 430                     // XXX The above test can confuse two different userdefined
 431                     // tags.
 432
 433                     NS_ASSERTION(theLastToken,
 434                                  "FindLastIndexOfTag lied to us!"
 435                                  " We couldn't find theTag on theStack");
 436                     theLastToken->SetContainerInfo(eMalformed);
 437
 438                     // Great, now push all of the other tokens back onto the
 439                     // stack to preserve the general structure of the document.
 440                     // Note that we don't push the target token back onto the
 441                     // the stack (since it was just closed).
 442                     while (tempStack.GetSize() != 0) {
 443                       theStack.Push(tempStack.Pop());
 444                     }
 445                   }
 446                 }
 447               }
 448             }
 449             break;
 450           default:
 451             break;
 452         }
 453       }
 454     }
 455
 456     theToken = (CHTMLToken*)mTokenDeque.ObjectAt(++mTokenScanPos);
 457   }
 458
 459   return result;
 460 }
 461
 462 /**
 463  * This method is called after we're done tokenizing a chunk of data.
 464  *
 465  * @param aFinalChunk Tells us if this was the last chunk of data.
 466  * @return Error result.
 467  */
 468 nsresult
 469 nsHTMLTokenizer::DidTokenize(PRBool aFinalChunk)
 470 {
 471   return ScanDocStructure(aFinalChunk);
 472 }
 473
 474 /**
 475  * This method is repeatedly called by the tokenizer.
 476  * Each time, we determine the kind of token we're about to
 477  * read, and then we call the appropriate method to handle
 478  * that token type.
 479  *
 480  * @param  aScanner The source of our input.
 481  * @param  aFlushTokens An OUT parameter to tell the caller whether it should
 482  *                      process our queued tokens up to now (e.g., when we
 483  *                      reach a <script>).
 484  * @return Success or error
 485  */
 486 nsresult
 487 nsHTMLTokenizer::ConsumeToken(nsScanner& aScanner, PRBool& aFlushTokens)
 488 {
 489   PRUnichar theChar;
 490   CToken* theToken = nsnull;
 491
 492   nsresult result = aScanner.Peek(theChar);
 493
 494   switch(result) {
 495     case kEOF:
 496       // Tell our caller that'we finished.
 497       return result;
 498
 499     case NS_OK:
 500     default:
 501       if (!(mFlags & NS_IPARSER_FLAG_PLAIN_TEXT)) {
 502         if (kLessThan == theChar) {
 503           return ConsumeTag(theChar, theToken, aScanner, aFlushTokens);
 504         } else if (kAmpersand == theChar) {
 505           return ConsumeEntity(theChar, theToken, aScanner);
 506         }
 507       }
 508
 509       if (kCR == theChar || kLF == theChar) {
 510         return ConsumeNewline(theChar, theToken, aScanner);
 511       } else {
 512         if (!nsCRT::IsAsciiSpace(theChar)) {
 513           if (theChar != '\0') {
 514             result = ConsumeText(theToken, aScanner);
 515           } else {
 516             // Skip the embedded null char. Fix bug 64098.
 517             aScanner.GetChar(theChar);
 518           }
 519           break;
 520         }
 521         result = ConsumeWhitespace(theChar, theToken, aScanner);
 522       }
 523       break;
 524   }
 525
 526   return result;
 527 }
 528
 529 /**
 530  * This method is called just after a "<" has been consumed
 531  * and we know we're at the start of some kind of tagged
 532  * element. We don't know yet if it's a tag or a comment.
 533  *
 534  * @param   aChar is the last char read
 535  * @param   aToken is the out arg holding our new token (the function allocates
 536  *                 the return token using mTokenAllocator).
 537  * @param   aScanner represents our input source
 538  * @param   aFlushTokens is an OUT parameter use to tell consumers to flush
 539  *                       the current tokens after processing the current one.
 540  * @return  error code.
 541  */
 542 nsresult
 543 nsHTMLTokenizer::ConsumeTag(PRUnichar aChar,
 544                             CToken*& aToken,
 545                             nsScanner& aScanner,
 546                             PRBool& aFlushTokens)
 547 {
 548   PRUnichar theNextChar, oldChar;
 549   nsresult result = aScanner.Peek(aChar, 1);
 550
 551   if (NS_OK == result) {
 552     switch (aChar) {
 553       case kForwardSlash:
 554         result = aScanner.Peek(theNextChar, 2);
 555
 556         if (NS_OK == result) {
 557           // Get the original "<" (we've already seen it with a Peek)
 558           aScanner.GetChar(oldChar);
 559
 560           // XML allows non ASCII tag names, consume this as an end tag. This
 561           // is needed to make XML view source work
 562           PRBool isXML = !!(mFlags & NS_IPARSER_FLAG_XML);
 563           if (nsCRT::IsAsciiAlpha(theNextChar) ||
 564               kGreaterThan == theNextChar      ||
 565               (isXML && !nsCRT::IsAscii(theNextChar))) {
 566             result = ConsumeEndTag(aChar, aToken, aScanner);
 567           } else {
 568             result = ConsumeComment(aChar, aToken, aScanner);
 569           }
 570         }
 571
 572         break;
 573
 574       case kExclamation:
 575         result = aScanner.Peek(theNextChar, 2);
 576
 577         if (NS_OK == result) {
 578           // Get the original "<" (we've already seen it with a Peek)
 579           aScanner.GetChar(oldChar);
 580
 581           if (kMinus == theNextChar || kGreaterThan == theNextChar) {
 582             result = ConsumeComment(aChar, aToken, aScanner);
 583           } else {
 584             result = ConsumeSpecialMarkup(aChar, aToken, aScanner);
 585           }
 586         }
 587         break;
 588
 589       case kQuestionMark:
 590         // It must be a processing instruction...
 591         // Get the original "<" (we've already seen it with a Peek)
 592         aScanner.GetChar(oldChar);
 593         result = ConsumeProcessingInstruction(aChar, aToken, aScanner);
 594         break;
 595
 596       default:
 597         // XML allows non ASCII tag names, consume this as a start tag.
 598         PRBool isXML = !!(mFlags & NS_IPARSER_FLAG_XML);
 599         if (nsCRT::IsAsciiAlpha(aChar) ||
 600             (isXML && !nsCRT::IsAscii(aChar))) {
 601           // Get the original "<" (we've already seen it with a Peek)
 602           aScanner.GetChar(oldChar);
 603           result = ConsumeStartTag(aChar, aToken, aScanner, aFlushTokens);
 604         } else {
 605           // We are not dealing with a tag. So, don't consume the original
 606           // char and leave the decision to ConsumeText().
 607           result = ConsumeText(aToken, aScanner);
 608         }
 609     }
 610   }
 611
 612   // Last ditch attempt to make sure we don't lose data.
 613   if (kEOF == result && !aScanner.IsIncremental()) {
 614     // Whoops, we don't want to lose any data! Consume the rest as text.
 615     // This normally happens for either a trailing < or </
 616     result = ConsumeText(aToken, aScanner);
 617   }
 618
 619   return result;
 620 }
 621
 622 /**
 623  * This method is called just after we've consumed a start or end
 624  * tag, and we now have to consume its attributes.
 625  *
 626  * @param   aChar is the last char read
 627  * @param   aToken is the start or end tag that "owns" these attributes.
 628  * @param   aScanner represents our input source
 629  * @return  Error result.
 630  */
 631 nsresult
 632 nsHTMLTokenizer::ConsumeAttributes(PRUnichar aChar,
 633                                    CToken* aToken,
 634                                    nsScanner& aScanner)
 635 {
 636   PRBool done = PR_FALSE;
 637   nsresult result = NS_OK;
 638   PRInt16 theAttrCount = 0;
 639
 640   nsTokenAllocator* theAllocator = this->GetTokenAllocator();
 641
 642   while (!done && result == NS_OK) {
 643     CAttributeToken* theToken =
 644       static_cast<CAttributeToken*>
 645                  (theAllocator->CreateTokenOfType(eToken_attribute,
 646                                                      eHTMLTag_unknown));
 647     if (NS_LIKELY(theToken != nsnull)) {
 648       // Tell the new token to finish consuming text...
 649       result = theToken->Consume(aChar, aScanner, mFlags);
 650
 651       if (NS_SUCCEEDED(result)) {
 652         ++theAttrCount;
 653         AddToken((CToken*&)theToken, result, &mTokenDeque, theAllocator);
 654       } else {
 655         IF_FREE(theToken, mTokenAllocator);
 656         // Bad attribute returns shouldn't propagate out.
 657         if (NS_ERROR_HTMLPARSER_BADATTRIBUTE == result) {
 658           result = NS_OK;
 659         }
 660       }
 661     }
 662     else {
 663       result = NS_ERROR_OUT_OF_MEMORY;
 664     }
 665
 666 #ifdef DEBUG
 667     if (NS_SUCCEEDED(result)) {
 668       PRInt32 newline = 0;
 669       aScanner.SkipWhitespace(newline);
 670       NS_ASSERTION(newline == 0,
 671           "CAttribute::Consume() failed to collect all the newlines!");
 672     }
 673 #endif
 674     if (NS_SUCCEEDED(result)) {
 675       result = aScanner.Peek(aChar);
 676       if (NS_SUCCEEDED(result)) {
 677         if (aChar == kGreaterThan) { // You just ate the '>'
 678           aScanner.GetChar(aChar); // Skip the '>'
 679           done = PR_TRUE;
 680         } else if (aChar == kLessThan) {
 681           aToken->SetInError(PR_TRUE);
 682           done = PR_TRUE;
 683         }
 684       }
 685     }
 686   }
 687
 688   if (NS_FAILED(result)) {
 689     aToken->SetInError(PR_TRUE);
 690
 691     if (!aScanner.IsIncremental()) {
 692       result = NS_OK;
 693     }
 694   }
 695
 696   aToken->SetAttributeCount(theAttrCount);
 697   return result;
 698 }
 699
 700 /**
 701  * This method consumes a start tag and all of its attributes.
 702  *
 703  * @param aChar The last character read from the scanner.
 704  * @param aToken The OUT parameter that holds our resulting token. (allocated
 705  *               by the function using mTokenAllocator
 706  * @param aScanner Our source of data
 707  * @param aFlushTokens is an OUT parameter use to tell consumers to flush
 708  *                     the current tokens after processing the current one.
 709  * @return Error result.
 710  */
 711 nsresult
 712 nsHTMLTokenizer::ConsumeStartTag(PRUnichar aChar,
 713                                  CToken*& aToken,
 714                                  nsScanner& aScanner,
 715                                  PRBool& aFlushTokens)
 716 {
 717   // Remember this for later in case you have to unwind...
 718   PRInt32 theDequeSize = mTokenDeque.GetSize();
 719   nsresult result = NS_OK;
 720
 721   nsTokenAllocator* theAllocator = this->GetTokenAllocator();
 722   aToken = theAllocator->CreateTokenOfType(eToken_start, eHTMLTag_unknown);
 723   NS_ENSURE_TRUE(aToken, NS_ERROR_OUT_OF_MEMORY);
 724
 725   // Tell the new token to finish consuming text...
 726   result = aToken->Consume(aChar, aScanner, mFlags);
 727
 728   if (NS_SUCCEEDED(result)) {
 729     AddToken(aToken, result, &mTokenDeque, theAllocator);
 730
 731     eHTMLTags theTag = (eHTMLTags)aToken->GetTypeID();
 732
 733     // Good. Now, let's see if the next char is ">".
 734     // If so, we have a complete tag, otherwise, we have attributes.
 735     result = aScanner.Peek(aChar);
 736     if (NS_FAILED(result)) {
 737       aToken->SetInError(PR_TRUE);
 738
 739       // Don't return early here so we can create a text and end token for
 740       // the special <iframe>, <script> and similar tags down below.
 741       result = NS_OK;
 742     } else {
 743       if (kGreaterThan != aChar) { // Look for a '>'
 744         result = ConsumeAttributes(aChar, aToken, aScanner);
 745       } else {
 746         aScanner.GetChar(aChar);
 747       }
 748     }
 749
 750     /*  Now that that's over with, we have one more problem to solve.
 751         In the case that we just read a <SCRIPT> or <STYLE> tags, we should go and
 752         consume all the content itself.
 753         But XML doesn't treat these tags differently, so we shouldn't if the
 754         document is XML.
 755      */
 756     if (NS_SUCCEEDED(result) && !(mFlags & NS_IPARSER_FLAG_XML)) {
 757       PRBool isCDATA = gHTMLElements[theTag].CanContainType(kCDATA);
 758       PRBool isPCDATA = eHTMLTag_textarea == theTag ||
 759                         eHTMLTag_title    == theTag;
 760
 761       // XXX This is an evil hack, we should be able to handle these properly
 762       // in the DTD.
 763       if ((eHTMLTag_iframe == theTag &&
 764             (mFlags & NS_IPARSER_FLAG_FRAMES_ENABLED)) ||
 765           (eHTMLTag_noframes == theTag &&
 766             (mFlags & NS_IPARSER_FLAG_FRAMES_ENABLED)) ||
 767           (eHTMLTag_noscript == theTag &&
 768             (mFlags & NS_IPARSER_FLAG_SCRIPT_ENABLED)) ||
 769           (eHTMLTag_noembed == theTag)) {
 770         isCDATA = PR_TRUE;
 771       }
 772
 773       // Plaintext contains CDATA, but it's special, so we handle it
 774       // differently than the other CDATA elements
 775       if (eHTMLTag_plaintext == theTag) {
 776         isCDATA = PR_FALSE;
 777
 778         // Note: We check in ConsumeToken() for this flag, and if we see it
 779         // we only construct text tokens (which is what we want).
 780         mFlags |= NS_IPARSER_FLAG_PLAIN_TEXT;
 781       }
 782
 783
 784       if (isCDATA || isPCDATA) {
 785         PRBool done = PR_FALSE;
 786         nsDependentString endTagName(nsHTMLTags::GetStringValue(theTag));
 787
 788         CToken* text =
 789             theAllocator->CreateTokenOfType(eToken_text, eHTMLTag_text);
 790         NS_ENSURE_TRUE(text, NS_ERROR_OUT_OF_MEMORY);
 791
 792         CTextToken* textToken = static_cast<CTextToken*>(text);
 793
 794         if (isCDATA) {
 795           result = textToken->ConsumeCharacterData(theTag != eHTMLTag_script,
 796                                                    aScanner,
 797                                                    endTagName,
 798                                                    mFlags,
 799                                                    done);
 800
 801           // Only flush tokens for <script>, to give ourselves more of a
 802           // chance of allowing inlines to contain blocks.
 803           aFlushTokens = done && theTag == eHTMLTag_script;
 804         } else if (isPCDATA) {
 805           // Title is consumed conservatively in order to not regress
 806           // bug 42945
 807           result = textToken->ConsumeParsedCharacterData(
 808                                                   theTag == eHTMLTag_textarea,
 809                                                   theTag == eHTMLTag_title,
 810                                                   aScanner,
 811                                                   endTagName,
 812                                                   mFlags,
 813                                                   done);
 814
 815           // Note: we *don't* set aFlushTokens here.
 816         }
 817
 818         // We want to do this unless result is kEOF, in which case we will
 819         // simply unwind our stack and wait for more data anyway.
 820         if (kEOF != result) {
 821           AddToken(text, NS_OK, &mTokenDeque, theAllocator);
 822           CToken* endToken = nsnull;
 823
 824           if (NS_SUCCEEDED(result) && done) {
 825             PRUnichar theChar;
 826             // Get the <
 827             result = aScanner.GetChar(theChar);
 828             NS_ASSERTION(NS_SUCCEEDED(result) && theChar == kLessThan,
 829                          "CTextToken::Consume*Data is broken!");
 830 #ifdef DEBUG
 831             // Ensure we have a /
 832             PRUnichar tempChar;  // Don't change non-debug vars in debug-only code
 833             result = aScanner.Peek(tempChar);
 834             NS_ASSERTION(NS_SUCCEEDED(result) && tempChar == kForwardSlash,
 835                          "CTextToken::Consume*Data is broken!");
 836 #endif
 837             result = ConsumeEndTag(PRUnichar('/'), endToken, aScanner);
 838             if (!(mFlags & NS_IPARSER_FLAG_VIEW_SOURCE) &&
 839                 NS_SUCCEEDED(result)) {
 840               // If ConsumeCharacterData returned a success result (and
 841               // we're not in view source), then we want to make sure that
 842               // we're going to execute this script (since the result means
 843               // that we've found an end tag that satisfies all of the right
 844               // conditions).
 845               endToken->SetInError(PR_FALSE);
 846             }
 847           } else if (result == kFakeEndTag &&
 848                     !(mFlags & NS_IPARSER_FLAG_VIEW_SOURCE)) {
 849             result = NS_OK;
 850             endToken = theAllocator->CreateTokenOfType(eToken_end, theTag,
 851                                                        endTagName);
 852             AddToken(endToken, result, &mTokenDeque, theAllocator);
 853             if (NS_LIKELY(endToken != nsnull)) {
 854               endToken->SetInError(PR_TRUE);
 855             }
 856             else {
 857               result = NS_ERROR_OUT_OF_MEMORY;
 858             }
 859           } else if (result == kFakeEndTag) {
 860             // If we are here, we are both faking having seen the end tag
 861             // and are in view-source.
 862             result = NS_OK;
 863           }
 864         } else {
 865           IF_FREE(text, mTokenAllocator);
 866         }
 867       }
 868     }
 869
 870     // This code is confusing, so pay attention.
 871     // If you're here, it's because we were in the midst of consuming a start
 872     // tag but ran out of data (not in the stream, but in this *part* of the
 873     // stream. For simplicity, we have to unwind our input. Therefore, we pop
 874     // and discard any new tokens we've queued this round. Later we can get
 875     // smarter about this.
 876     if (NS_FAILED(result)) {
 877       while (mTokenDeque.GetSize()>theDequeSize) {
 878         CToken* theToken = (CToken*)mTokenDeque.Pop();
 879         IF_FREE(theToken, mTokenAllocator);
 880       }
 881     }
 882   } else {
 883     IF_FREE(aToken, mTokenAllocator);
 884   }
 885
 886   return result;
 887 }
 888
 889 /**
 890  * This method consumes an end tag and any "attributes" that may come after it.
 891  *
 892  * @param aChar The last character read from the scanner.
 893  * @param aToken The OUT parameter that holds our resulting token.
 894  * @param aScanner Our source of data
 895  * @return Error result
 896  */
 897 nsresult
 898 nsHTMLTokenizer::ConsumeEndTag(PRUnichar aChar,
 899                                CToken*& aToken,
 900                                nsScanner& aScanner)
 901 {
 902   // Get the "/" (we've already seen it with a Peek)
 903   aScanner.GetChar(aChar);
 904
 905   nsTokenAllocator* theAllocator = this->GetTokenAllocator();
 906   aToken = theAllocator->CreateTokenOfType(eToken_end, eHTMLTag_unknown);
 907   NS_ENSURE_TRUE(aToken, NS_ERROR_OUT_OF_MEMORY);
 908
 909   // Remember this for later in case you have to unwind...
 910   PRInt32 theDequeSize = mTokenDeque.GetSize();
 911   nsresult result = NS_OK;
 912
 913   // Tell the new token to finish consuming text...
 914   result = aToken->Consume(aChar, aScanner, mFlags);
 915   AddToken(aToken, result, &mTokenDeque, theAllocator);
 916   if (NS_FAILED(result)) {
 917     // Note that this early-return here is safe because we have not yet
 918     // added any of our tokens to the queue (AddToken only adds the token if
 919     // result is a success), so we don't need to fall through.
 920     return result;
 921   }
 922
 923   result = aScanner.Peek(aChar);
 924   if (NS_FAILED(result)) {
 925     aToken->SetInError(PR_TRUE);
 926
 927     // Note: We know here that the scanner is not incremental since if
 928     // this peek fails, then we've already masked over a kEOF coming from
 929     // the Consume() call above.
 930     return NS_OK;
 931   }
 932
 933   if (kGreaterThan != aChar) {
 934     result = ConsumeAttributes(aChar, aToken, aScanner);
 935   } else {
 936     aScanner.GetChar(aChar);
 937   }
 938
 939   // Do the same thing as we do in ConsumeStartTag. Basically, if we've run
 940   // out of room in this *section* of the document, pop all of the tokens
 941   // we've consumed this round and wait for more data.
 942   if (NS_FAILED(result)) {
 943     while (mTokenDeque.GetSize() > theDequeSize) {
 944       CToken* theToken = (CToken*)mTokenDeque.Pop();
 945       IF_FREE(theToken, mTokenAllocator);
 946     }
 947   }
 948
 949   return result;
 950 }
 951
 952 /**
 953  *  This method is called just after a "&" has been consumed
 954  *  and we know we're at the start of an entity.
 955  *
 956  * @param aChar The last character read from the scanner.
 957  * @param aToken The OUT parameter that holds our resulting token.
 958  * @param aScanner Our source of data
 959  * @return Error result.
 960  */
 961 nsresult
 962 nsHTMLTokenizer::ConsumeEntity(PRUnichar aChar,
 963                                CToken*& aToken,
 964                                nsScanner& aScanner)
 965 {
 966   PRUnichar  theChar;
 967   nsresult result = aScanner.Peek(theChar, 1);
 968
 969   nsTokenAllocator* theAllocator = this->GetTokenAllocator();
 970   if (NS_SUCCEEDED(result)) {
 971     if (nsCRT::IsAsciiAlpha(theChar) || theChar == kHashsign) {
 972       aToken = theAllocator->CreateTokenOfType(eToken_entity, eHTMLTag_entity);
 973       NS_ENSURE_TRUE(aToken, NS_ERROR_OUT_OF_MEMORY);
 974       result = aToken->Consume(theChar, aScanner, mFlags);
 975
 976       if (result == NS_HTMLTOKENS_NOT_AN_ENTITY) {
 977         IF_FREE(aToken, mTokenAllocator);
 978       } else {
 979         if (result == kEOF && !aScanner.IsIncremental()) {
 980           result = NS_OK; // Use as much of the entity as you can get.
 981         }
 982
 983         AddToken(aToken, result, &mTokenDeque, theAllocator);
 984         return result;
 985       }
 986     }
 987
 988     // Oops, we're actually looking at plain text...
 989     result = ConsumeText(aToken, aScanner);
 990   } else if (result == kEOF && !aScanner.IsIncremental()) {
 991     // If the last character in the file is an &, consume it as text.
 992     result = ConsumeText(aToken, aScanner);
 993     if (aToken) {
 994       aToken->SetInError(PR_TRUE);
 995     }
 996   }
 997
 998   return result;
 999 }
1000
1001
1002 /**
1003  *  This method is called just after whitespace has been
1004  *  consumed and we know we're at the start a whitespace run.
1005  *
1006  * @param aChar The last character read from the scanner.
1007  * @param aToken The OUT parameter that holds our resulting token.
1008  * @param aScanner Our source of data
1009  * @return Error result.
1010  */
1011 nsresult
1012 nsHTMLTokenizer::ConsumeWhitespace(PRUnichar aChar,
1013                                    CToken*& aToken,
1014                                    nsScanner& aScanner)
1015 {
1016   // Get the whitespace character
1017   aScanner.GetChar(aChar);
1018
1019   nsTokenAllocator* theAllocator = this->GetTokenAllocator();
1020   aToken = theAllocator->CreateTokenOfType(eToken_whitespace,
1021                                            eHTMLTag_whitespace);
1022   nsresult result = NS_OK;
1023   if (aToken) {
1024     result = aToken->Consume(aChar, aScanner, mFlags);
1025     AddToken(aToken, result, &mTokenDeque, theAllocator);
1026   }
1027
1028   return result;
1029 }
1030
1031 /**
1032  *  This method is called just after a "<!" has been consumed
1033  *  and we know we're at the start of a comment.
1034  *
1035  * @param aChar The last character read from the scanner.
1036  * @param aToken The OUT parameter that holds our resulting token.
1037  * @param aScanner Our source of data
1038  * @return Error result.
1039  */
1040 nsresult
1041 nsHTMLTokenizer::ConsumeComment(PRUnichar aChar,
1042                                 CToken*& aToken,
1043                                 nsScanner& aScanner)
1044 {
1045   // Get the "!"
1046   aScanner.GetChar(aChar);
1047
1048   nsTokenAllocator* theAllocator = this->GetTokenAllocator();
1049   aToken = theAllocator->CreateTokenOfType(eToken_comment, eHTMLTag_comment);
1050   nsresult result = NS_OK;
1051   if (aToken) {
1052     result = aToken->Consume(aChar, aScanner, mFlags);
1053     AddToken(aToken, result, &mTokenDeque, theAllocator);
1054   }
1055
1056   if (kNotAComment == result) {
1057     // AddToken has IF_FREE()'d our token, so...
1058     result = ConsumeText(aToken, aScanner);
1059   }
1060
1061   return result;
1062 }
1063
1064 /**
1065  * This method is called just after a known text char has
1066  * been consumed and we should read a text run. Note: we actually ignore the
1067  * first character of the text run so that we can consume invalid markup
1068  * as text.
1069  *
1070  * @param aToken The OUT parameter that holds our resulting token.
1071  * @param aScanner Our source of data
1072  * @return Error result.
1073  */
1074 nsresult
1075 nsHTMLTokenizer::ConsumeText(CToken*& aToken, nsScanner& aScanner)
1076 {
1077   nsresult result = NS_OK;
1078   nsTokenAllocator* theAllocator = this->GetTokenAllocator();
1079   CTextToken* theToken =
1080     (CTextToken*)theAllocator->CreateTokenOfType(eToken_text, eHTMLTag_text);
1081   if (theToken) {
1082     PRUnichar ch = '\0';
1083     result = theToken->Consume(ch, aScanner, mFlags);
1084     if (NS_FAILED(result)) {
1085       if (0 == theToken->GetTextLength()) {
1086         IF_FREE(aToken, mTokenAllocator);
1087         aToken = nsnull;
1088       } else {
1089         result = NS_OK;
1090       }
1091     }
1092
1093     aToken = theToken;
1094     AddToken(aToken, result, &mTokenDeque, theAllocator);
1095   }
1096
1097   return result;
1098 }
1099
1100 /**
1101  * This method is called just after a "<!" has been consumed.
1102  * NOTE: Here we might consume DOCTYPE and "special" markups.
1103  *
1104  * @param aChar The last character read from the scanner.
1105  * @param aToken The OUT parameter that holds our resulting token.
1106  * @param aScanner Our source of data
1107  * @return Error result.
1108  */
1109 nsresult
1110 nsHTMLTokenizer::ConsumeSpecialMarkup(PRUnichar aChar,
1111                                       CToken*& aToken,
1112                                       nsScanner& aScanner)
1113 {
1114   // Get the "!"
1115   aScanner.GetChar(aChar);
1116
1117   nsresult result = NS_OK;
1118   nsAutoString theBufCopy;
1119   aScanner.Peek(theBufCopy, 20);
1120   ToUpperCase(theBufCopy);
1121   PRInt32 theIndex = theBufCopy.Find("DOCTYPE", PR_FALSE, 0, 0);
1122   nsTokenAllocator* theAllocator = this->GetTokenAllocator();
1123
1124   if (theIndex == kNotFound) {
1125     if ('[' == theBufCopy.CharAt(0)) {
1126       aToken = theAllocator->CreateTokenOfType(eToken_cdatasection,
1127                                                eHTMLTag_comment);
1128     } else if (StringBeginsWith(theBufCopy, NS_LITERAL_STRING("ELEMENT")) ||
1129                StringBeginsWith(theBufCopy, NS_LITERAL_STRING("ATTLIST")) ||
1130                StringBeginsWith(theBufCopy, NS_LITERAL_STRING("ENTITY"))  ||
1131                StringBeginsWith(theBufCopy, NS_LITERAL_STRING("NOTATION"))) {
1132       aToken = theAllocator->CreateTokenOfType(eToken_markupDecl,
1133                                                eHTMLTag_markupDecl);
1134     } else {
1135       aToken = theAllocator->CreateTokenOfType(eToken_comment,
1136                                                eHTMLTag_comment);
1137     }
1138   } else {
1139     aToken = theAllocator->CreateTokenOfType(eToken_doctypeDecl,
1140                                              eHTMLTag_doctypeDecl);
1141   }
1142
1143   if (aToken) {
1144     result = aToken->Consume(aChar, aScanner, mFlags);
1145     AddToken(aToken, result, &mTokenDeque, theAllocator);
1146   }
1147
1148   if (result == kNotAComment) {
1149     result = ConsumeText(aToken, aScanner);
1150   }
1151
1152   return result;
1153 }
1154
1155 /**
1156  * This method is called just after a newline has been consumed.
1157  *
1158  * @param aChar The last character read from the scanner.
1159  * @param aToken The OUT parameter that holds our resulting token.
1160  * @param aScanner Our source of data
1161  * @return Error result.
1162  */
1163 nsresult
1164 nsHTMLTokenizer::ConsumeNewline(PRUnichar aChar,
1165                                 CToken*& aToken,
1166                                 nsScanner& aScanner)
1167 {
1168   // Get the newline character
1169   aScanner.GetChar(aChar);
1170
1171   nsTokenAllocator* theAllocator = this->GetTokenAllocator();
1172   aToken = theAllocator->CreateTokenOfType(eToken_newline, eHTMLTag_newline);
1173   nsresult result = NS_OK;
1174   if (aToken) {
1175     result = aToken->Consume(aChar, aScanner, mFlags);
1176     AddToken(aToken, result, &mTokenDeque, theAllocator);
1177   }
1178
1179   return result;
1180 }
1181
1182
1183 /**
1184  * This method is called just after a <? has been consumed.
1185  *
1186  * @param aChar The last character read from the scanner.
1187  * @param aToken The OUT parameter that holds our resulting token.
1188  * @param aScanner Our source of data
1189  * @return Error result.
1190  */
1191 nsresult
1192 nsHTMLTokenizer::ConsumeProcessingInstruction(PRUnichar aChar,
1193                                               CToken*& aToken,
1194                                               nsScanner& aScanner)
1195 {
1196   // Get the "?"
1197   aScanner.GetChar(aChar);
1198
1199   nsTokenAllocator* theAllocator = this->GetTokenAllocator();
1200   aToken = theAllocator->CreateTokenOfType(eToken_instruction,
1201                                            eHTMLTag_unknown);
1202   nsresult result = NS_OK;
1203   if (aToken) {
1204     result = aToken->Consume(aChar, aScanner, mFlags);
1205     AddToken(aToken, result, &mTokenDeque, theAllocator);
1206   }
1207
1208   return result;
1209 }