parser/htmlparser/src/nsParser.cpp

   1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
   2 /* vim: set sw=2 ts=2 et tw=79: */
   3 /* ***** BEGIN LICENSE BLOCK *****
   4  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
   5  *
   6  * The contents of this file are subject to the Mozilla Public License Version
   7  * 1.1 (the "License"); you may not use this file except in compliance with
   8  * the License. You may obtain a copy of the License at
   9  * http://www.mozilla.org/MPL/
  10  *
  11  * Software distributed under the License is distributed on an "AS IS" basis,
  12  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
  13  * for the specific language governing rights and limitations under the
  14  * License.
  15  *
  16  * The Original Code is mozilla.org code.
  17  *
  18  * The Initial Developer of the Original Code is
  19  * Netscape Communications Corporation.
  20  * Portions created by the Initial Developer are Copyright (C) 1998
  21  * the Initial Developer. All Rights Reserved.
  22  *
  23  * Contributor(s):
  24  *   Pierre Phaneuf <pp@ludusdesign.com>
  25  *
  26  * Alternatively, the contents of this file may be used under the terms of
  27  * either of the GNU General Public License Version 2 or later (the "GPL"),
  28  * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
  29  * in which case the provisions of the GPL or the LGPL are applicable instead
  30  * of those above. If you wish to allow use of your version of this file only
  31  * under the terms of either the GPL or the LGPL, and not to allow others to
  32  * use your version of this file under the terms of the MPL, indicate your
  33  * decision by deleting the provisions above and replace them with the notice
  34  * and other provisions required by the GPL or the LGPL. If you do not delete
  35  * the provisions above, a recipient may use your version of this file under
  36  * the terms of any one of the MPL, the GPL or the LGPL.
  37  *
  38  * ***** END LICENSE BLOCK ***** */
  39
  40 #include "nsIAtom.h"
  41 #include "nsParser.h"
  42 #include "nsString.h"
  43 #include "nsCRT.h"
  44 #include "nsScanner.h"
  45 #include "plstr.h"
  46 #include "nsIStringStream.h"
  47 #include "nsIChannel.h"
  48 #include "nsICachingChannel.h"
  49 #include "nsICacheEntryDescriptor.h"
  50 #include "nsICharsetAlias.h"
  51 #include "nsICharsetConverterManager.h"
  52 #include "nsIInputStream.h"
  53 #include "CNavDTD.h"
  54 #include "prenv.h"
  55 #include "prlock.h"
  56 #include "prcvar.h"
  57 #include "nsAutoLock.h"
  58 #include "nsParserCIID.h"
  59 #include "nsReadableUtils.h"
  60 #include "nsCOMPtr.h"
  61 #include "nsExpatDriver.h"
  62 #include "nsIServiceManager.h"
  63 #include "nsICategoryManager.h"
  64 #include "nsISupportsPrimitives.h"
  65 #include "nsIFragmentContentSink.h"
  66 #include "nsStreamUtils.h"
  67 #include "nsHTMLTokenizer.h"
  68 #include "nsIDocument.h"
  69 #include "nsNetUtil.h"
  70 #include "nsScriptLoader.h"
  71 #include "nsDataHashtable.h"
  72 #include "nsIThreadPool.h"
  73 #include "nsXPCOMCIDInternal.h"
  74
  75 #ifdef MOZ_VIEW_SOURCE
  76 #include "nsViewSourceHTML.h"
  77 #endif
  78
  79 #define NS_PARSER_FLAG_PARSER_ENABLED         0x00000002
  80 #define NS_PARSER_FLAG_OBSERVERS_ENABLED      0x00000004
  81 #define NS_PARSER_FLAG_PENDING_CONTINUE_EVENT 0x00000008
  82 #define NS_PARSER_FLAG_CAN_INTERRUPT          0x00000010
  83 #define NS_PARSER_FLAG_FLUSH_TOKENS           0x00000020
  84 #define NS_PARSER_FLAG_CAN_TOKENIZE           0x00000040
  85
  86 static NS_DEFINE_IID(kISupportsIID, NS_ISUPPORTS_IID);
  87 static NS_DEFINE_CID(kCParserCID, NS_PARSER_CID);
  88 static NS_DEFINE_IID(kIParserIID, NS_IPARSER_IID);
  89
  90 //-------------------------------------------------------------------
  91
  92 nsCOMArray<nsIUnicharStreamListener> *nsParser::sParserDataListeners;
  93
  94 //-------------- Begin ParseContinue Event Definition ------------------------
  95 /*
  96 The parser can be explicitly interrupted by passing a return value of
  97 NS_ERROR_HTMLPARSER_INTERRUPTED from BuildModel on the DTD. This will cause
  98 the parser to stop processing and allow the application to return to the event
  99 loop. The data which was left at the time of interruption will be processed
 100 the next time OnDataAvailable is called. If the parser has received its final
 101 chunk of data then OnDataAvailable will no longer be called by the networking
 102 module, so the parser will schedule a nsParserContinueEvent which will call
 103 the parser to process the remaining data after returning to the event loop.
 104 If the parser is interrupted while processing the remaining data it will
 105 schedule another ParseContinueEvent. The processing of data followed by
 106 scheduling of the continue events will proceed until either:
 107
 108   1) All of the remaining data can be processed without interrupting
 109   2) The parser has been cancelled.
 110
 111
 112 This capability is currently used in CNavDTD and nsHTMLContentSink. The
 113 nsHTMLContentSink is notified by CNavDTD when a chunk of tokens is going to be
 114 processed and when each token is processed. The nsHTML content sink records
 115 the time when the chunk has started processing and will return
 116 NS_ERROR_HTMLPARSER_INTERRUPTED if the token processing time has exceeded a
 117 threshold called max tokenizing processing time. This allows the content sink
 118 to limit how much data is processed in a single chunk which in turn gates how
 119 much time is spent away from the event loop. Processing smaller chunks of data
 120 also reduces the time spent in subsequent reflows.
 121
 122 This capability is most apparent when loading large documents. If the maximum
 123 token processing time is set small enough the application will remain
 124 responsive during document load.
 125
 126 A side-effect of this capability is that document load is not complete when
 127 the last chunk of data is passed to OnDataAvailable since the parser may have
 128 been interrupted when the last chunk of data arrived. The document is complete
 129 when all of the document has been tokenized and there aren't any pending
 130 nsParserContinueEvents. This can cause problems if the application assumes
 131 that it can monitor the load requests to determine when the document load has
 132 been completed. This is what happens in Mozilla. The document is considered
 133 completely loaded when all of the load requests have been satisfied. To delay
 134 the document load until all of the parsing has been completed the
 135 nsHTMLContentSink adds a dummy parser load request which is not removed until
 136 the nsHTMLContentSink's DidBuildModel is called. The CNavDTD will not call
 137 DidBuildModel until the final chunk of data has been passed to the parser
 138 through the OnDataAvailable and there aren't any pending
 139 nsParserContinueEvents.
 140
 141 Currently the parser ignores requests to be interrupted during the
 142 processing of script.  This is because a document.write followed by JavaScript
 143 calls to manipulate the DOM may fail if the parser was interrupted during the
 144 document.write.
 145
 146 For more details @see bugzilla bug 76722
 147 */
 148
 149
 150 class nsParserContinueEvent : public nsRunnable
 151 {
 152 public:
 153   nsRefPtr<nsParser> mParser;
 154
 155   nsParserContinueEvent(nsParser* aParser)
 156     : mParser(aParser)
 157   {}
 158
 159   NS_IMETHOD Run()
 160   {
 161     mParser->HandleParserContinueEvent(this);
 162     return NS_OK;
 163   }
 164 };
 165
 166 //-------------- End ParseContinue Event Definition ------------------------
 167
 168 template <class Type>
 169 class Holder {
 170 public:
 171   typedef void (*Reaper)(Type *);
 172
 173   Holder(Reaper aReaper)
 174     : mHoldee(nsnull), mReaper(aReaper)
 175   {
 176   }
 177
 178   ~Holder() {
 179     if (mHoldee) {
 180       mReaper(mHoldee);
 181     }
 182   }
 183
 184   Type *get() {
 185     return mHoldee;
 186   }
 187   const Holder &operator =(Type *aHoldee) {
 188     if (mHoldee && aHoldee != mHoldee) {
 189       mReaper(mHoldee);
 190     }
 191     mHoldee = aHoldee;
 192     return *this;
 193   }
 194
 195 private:
 196   Type *mHoldee;
 197   Reaper mReaper;
 198 };
 199
 200 class nsSpeculativeScriptThread : public nsIRunnable {
 201 public:
 202   nsSpeculativeScriptThread()
 203     : mLock(nsAutoLock::DestroyLock),
 204       mCVar(PR_DestroyCondVar),
 205       mKeepParsing(0),
 206       mCurrentlyParsing(0),
 207       mNumURIs(0),
 208       mNumConsumed(0),
 209       mContext(nsnull),
 210       mTerminated(PR_FALSE) {
 211   }
 212
 213   ~nsSpeculativeScriptThread() {
 214     NS_ASSERTION(NS_IsMainThread() || !mDocument,
 215                  "Destroying the document on the wrong thread");
 216   }
 217
 218   NS_DECL_ISUPPORTS
 219   NS_DECL_NSIRUNNABLE
 220
 221   nsresult StartParsing(nsParser *aParser);
 222   void StopParsing(PRBool aFromDocWrite);
 223
 224   enum PrefetchType { SCRIPT, STYLESHEET, IMAGE };
 225   struct PrefetchEntry {
 226     PrefetchType type;
 227     nsString uri;
 228     nsString charset;
 229     nsString elementType;
 230   };
 231
 232   nsIDocument *GetDocument() {
 233     NS_ASSERTION(NS_IsMainThread(), "Potential threadsafety hazard");
 234     return mDocument;
 235   }
 236
 237   PRBool Parsing() {
 238     return mCurrentlyParsing;
 239   }
 240
 241   CParserContext *Context() {
 242     return mContext;
 243   }
 244
 245   typedef nsDataHashtable<nsCStringHashKey, PRBool> PreloadedType;
 246   PreloadedType& GetPreloadedURIs() {
 247     return mPreloadedURIs;
 248   }
 249
 250   void Terminate() {
 251     mTerminated = PR_TRUE;
 252     StopParsing(PR_FALSE);
 253   }
 254   PRBool Terminated() {
 255     return mTerminated;
 256   }
 257
 258 private:
 259
 260   void ProcessToken(CToken *aToken);
 261
 262   void AddToPrefetchList(const nsAString &src,
 263                          const nsAString &charset,
 264                          const nsAString &elementType,
 265                          PrefetchType type);
 266
 267   // These members are only accessed on the speculatively parsing thread.
 268   nsTokenAllocator mTokenAllocator;
 269
 270   // The following members are shared across the main thread and the
 271   // speculatively parsing thread.
 272   Holder<PRLock> mLock;
 273   Holder<PRCondVar> mCVar;
 274
 275   volatile PRUint32 mKeepParsing;
 276   volatile PRUint32 mCurrentlyParsing;
 277   nsRefPtr<nsHTMLTokenizer> mTokenizer;
 278   nsAutoPtr<nsScanner> mScanner;
 279
 280   enum { kBatchPrefetchURIs = 5 };
 281   nsAutoTArray<PrefetchEntry, kBatchPrefetchURIs> mURIs;
 282   PRUint16 mNumURIs;
 283
 284   // Number of characters consumed by the last speculative parse.
 285   PRUint32 mNumConsumed;
 286
 287   // These members are only accessed on the main thread.
 288   nsCOMPtr<nsIDocument> mDocument;
 289   CParserContext *mContext;
 290   PreloadedType mPreloadedURIs;
 291   PRBool mTerminated;
 292 };
 293
 294 class nsPreloadURIs : public nsIRunnable {
 295 public:
 296   nsPreloadURIs(nsAutoTArray<nsSpeculativeScriptThread::PrefetchEntry, 5> &aURIs,
 297                 nsSpeculativeScriptThread *aScriptThread)
 298     : mURIs(aURIs),
 299       mScriptThread(aScriptThread) {
 300   }
 301
 302   NS_DECL_ISUPPORTS
 303   NS_DECL_NSIRUNNABLE
 304
 305   static void PreloadURIs(const nsAutoTArray<nsSpeculativeScriptThread::PrefetchEntry, 5> &aURIs,
 306                           nsSpeculativeScriptThread *aScriptThread);
 307
 308 private:
 309   nsAutoTArray<nsSpeculativeScriptThread::PrefetchEntry, 5> mURIs;
 310   nsRefPtr<nsSpeculativeScriptThread> mScriptThread;
 311 };
 312
 313 NS_IMPL_THREADSAFE_ISUPPORTS1(nsPreloadURIs, nsIRunnable)
 314
 315 NS_IMETHODIMP
 316 nsPreloadURIs::Run()
 317 {
 318   PreloadURIs(mURIs, mScriptThread);
 319   return NS_OK;
 320 }
 321
 322 void
 323 nsPreloadURIs::PreloadURIs(const nsAutoTArray<nsSpeculativeScriptThread::PrefetchEntry, 5> &aURIs,
 324                            nsSpeculativeScriptThread *aScriptThread)
 325 {
 326   NS_ASSERTION(NS_IsMainThread(), "Touching non-threadsafe objects off thread");
 327
 328   if (aScriptThread->Terminated()) {
 329     return;
 330   }
 331
 332   nsIDocument *doc = aScriptThread->GetDocument();
 333   NS_ASSERTION(doc, "We shouldn't have started preloading without a document");
 334
 335   // Note: Per the code in the HTML content sink, we should be keeping track
 336   // of each <base href> as it comes. However, because we do our speculative
 337   // parsing off the main thread, this is hard to emulate. For now, just load
 338   // the URIs using the document's base URI at the potential cost of being
 339   // wrong and having to re-load a given relative URI later.
 340   nsIURI *base = doc->GetBaseURI();
 341   const nsCString &charset = doc->GetDocumentCharacterSet();
 342   nsSpeculativeScriptThread::PreloadedType &alreadyPreloaded =
 343     aScriptThread->GetPreloadedURIs();
 344   for (PRUint32 i = 0, e = aURIs.Length(); i < e; ++i) {
 345     const nsSpeculativeScriptThread::PrefetchEntry &pe = aURIs[i];
 346     if (pe.type != nsSpeculativeScriptThread::SCRIPT) {
 347       continue;
 348     }
 349
 350     nsCOMPtr<nsIURI> uri;
 351     nsresult rv = NS_NewURI(getter_AddRefs(uri), pe.uri, charset.get(), base);
 352     if (NS_FAILED(rv)) {
 353       NS_WARNING("Failed to create a URI");
 354       continue;
 355     }
 356
 357     nsCAutoString spec;
 358     uri->GetSpec(spec);
 359     PRBool answer;
 360     if (alreadyPreloaded.Get(spec, &answer)) {
 361       // Already preloaded. Don't preload again.
 362       continue;
 363     }
 364
 365     alreadyPreloaded.Put(spec, PR_TRUE);
 366
 367     doc->ScriptLoader()->PreloadURI(uri, pe.charset, pe.elementType);
 368   }
 369 }
 370
 371 NS_IMPL_THREADSAFE_ISUPPORTS1(nsSpeculativeScriptThread, nsIRunnable)
 372
 373 NS_IMETHODIMP
 374 nsSpeculativeScriptThread::Run()
 375 {
 376   NS_ASSERTION(!NS_IsMainThread(), "Speculative parsing on the main thread?");
 377
 378   mNumConsumed = 0;
 379
 380   mTokenizer->WillTokenize(PR_FALSE, &mTokenAllocator);
 381   while (mKeepParsing) {
 382     PRBool flushTokens = PR_FALSE;
 383     nsresult rv = mTokenizer->ConsumeToken(*mScanner, flushTokens);
 384     if (NS_FAILED(rv)) {
 385       break;
 386     }
 387
 388     mNumConsumed += mScanner->Mark();
 389
 390     // TODO Don't pop the tokens.
 391     CToken *token;
 392     while (mKeepParsing && (token = mTokenizer->PopToken())) {
 393       ProcessToken(token);
 394     }
 395   }
 396   mTokenizer->DidTokenize(PR_FALSE);
 397
 398   {
 399     nsAutoLock al(mLock.get());
 400
 401     mCurrentlyParsing = 0;
 402     PR_NotifyCondVar(mCVar.get());
 403   }
 404   return NS_OK;
 405 }
 406
 407 nsresult
 408 nsSpeculativeScriptThread::StartParsing(nsParser *aParser)
 409 {
 410   NS_ASSERTION(NS_IsMainThread(), "Called on the wrong thread");
 411   NS_ASSERTION(!mCurrentlyParsing, "Bad race happening");
 412
 413   if (!aParser->ThreadPool()) {
 414     return NS_OK;
 415   }
 416
 417   nsIContentSink *sink = aParser->GetContentSink();
 418   if (!sink) {
 419     return NS_OK;
 420   }
 421
 422   nsCOMPtr<nsIDocument> doc = do_QueryInterface(sink->GetTarget());
 423   if (!doc) {
 424     return NS_OK;
 425   }
 426
 427   nsAutoString toScan;
 428   CParserContext *context = aParser->PeekContext();
 429   if (!mLock.get()) {
 430     mLock = nsAutoLock::NewLock("nsSpeculativeScriptThread::mLock");
 431     if (!mLock.get()) {
 432       return NS_ERROR_OUT_OF_MEMORY;
 433     }
 434
 435     mCVar = PR_NewCondVar(mLock.get());
 436     if (!mCVar.get()) {
 437       return NS_ERROR_OUT_OF_MEMORY;
 438     }
 439
 440     if (!mPreloadedURIs.Init(15)) {
 441       return NS_ERROR_OUT_OF_MEMORY;
 442     }
 443
 444     mTokenizer = new nsHTMLTokenizer(context->mDTDMode, context->mDocType,
 445                                      context->mParserCommand, 0);
 446     if (!mTokenizer) {
 447       return NS_ERROR_OUT_OF_MEMORY;
 448     }
 449     mTokenizer->CopyState(context->mTokenizer);
 450     context->mScanner->CopyUnusedData(toScan);
 451     if (toScan.IsEmpty()) {
 452       return NS_OK;
 453     }
 454   } else if (context == mContext) {
 455     // Don't parse the same part of the document twice.
 456     nsScannerIterator end;
 457     context->mScanner->EndReading(end);
 458
 459     nsScannerIterator start;
 460     context->mScanner->CurrentPosition(start);
 461
 462     if (mNumConsumed > context->mNumConsumed) {
 463       // We consumed more the last time we tried speculatively parsing than we
 464       // did the last time we actually parsed.
 465       PRUint32 distance = Distance(start, end);
 466       start.advance(PR_MIN(mNumConsumed - context->mNumConsumed, distance));
 467     }
 468
 469     if (start == end) {
 470       // We're at the end of this context's buffer, nothing else to do.
 471       return NS_OK;
 472     }
 473
 474     CopyUnicodeTo(start, end, toScan);
 475   } else {
 476     // Grab all of the context.
 477     context->mScanner->CopyUnusedData(toScan);
 478     if (toScan.IsEmpty()) {
 479       // Nothing to parse, don't do anything.
 480       return NS_OK;
 481     }
 482   }
 483
 484   nsCAutoString charset;
 485   PRInt32 source;
 486   aParser->GetDocumentCharset(charset, source);
 487
 488   mScanner = new nsScanner(toScan, charset, source);
 489   if (!mScanner) {
 490     return NS_ERROR_OUT_OF_MEMORY;
 491   }
 492
 493   mDocument.swap(doc);
 494   mKeepParsing = 1;
 495   mCurrentlyParsing = 1;
 496   mContext = context;
 497   return aParser->ThreadPool()->Dispatch(this, NS_DISPATCH_NORMAL);
 498 }
 499
 500 void
 501 nsSpeculativeScriptThread::StopParsing(PRBool /*aFromDocWrite*/)
 502 {
 503   NS_ASSERTION(NS_IsMainThread(), "Can't stop parsing from another thread");
 504
 505   if (!mLock.get()) {
 506     // If we bailed early out of StartParsing, don't do anything.
 507     return;
 508   }
 509
 510   {
 511     nsAutoLock al(mLock.get());
 512
 513     mKeepParsing = 0;
 514     if (mCurrentlyParsing) {
 515       PR_WaitCondVar(mCVar.get(), PR_INTERVAL_NO_TIMEOUT);
 516       NS_ASSERTION(!mCurrentlyParsing, "Didn't actually stop parsing?");
 517     }
 518   }
 519
 520   // The thread is now idle.
 521   if (mTerminated) {
 522     // If we're terminated, then we need to ensure that we release our document
 523     // and tokenizer here on the main thread so that our last reference to them
 524     // isn't our alter-ego rescheduled on another thread.
 525     mDocument = nsnull;
 526     mTokenizer = nsnull;
 527     mScanner = nsnull;
 528   } else if (mNumURIs) {
 529     // Note: Don't do this if we're terminated.
 530     nsPreloadURIs::PreloadURIs(mURIs, this);
 531     mNumURIs = 0;
 532     mURIs.Clear();
 533   }
 534
 535   // Note: Currently, we pop the tokens off (see the comment in Run) so this
 536   // isn't a problem. If and when we actually use the tokens created
 537   // off-thread, we'll need to use aFromDocWrite for real.
 538 }
 539
 540 void
 541 nsSpeculativeScriptThread::ProcessToken(CToken *aToken)
 542 {
 543   // Only called on the speculative script thread.
 544
 545   CHTMLToken *token = static_cast<CHTMLToken *>(aToken);
 546   switch (static_cast<eHTMLTokenTypes>(token->GetTokenType())) {
 547     case eToken_start: {
 548         CStartToken *start = static_cast<CStartToken *>(aToken);
 549         nsHTMLTag tag = static_cast<nsHTMLTag>(start->GetTypeID());
 550         PRInt16 attrs = start->GetAttributeCount();
 551         PRInt16 i = 0;
 552         nsAutoString src;
 553         nsAutoString elementType;
 554         nsAutoString charset;
 555         PrefetchType ptype;
 556
 557         switch (tag) {
 558 #if 0 // TODO Support stylesheet and image preloading.
 559           case eHTMLTag_link: {
 560             // If this is a <link rel=stylesheet> find the src.
 561             PRBool isRelStylesheet = PR_FALSE;
 562             for (; i < attrs; ++i) {
 563               CAttributeToken *attr = static_cast<CAttributeToken *>(mTokenizer->PopToken());
 564               NS_ASSERTION(attr->GetTokenType() == eToken_attribute, "Weird token");
 565
 566               if (attr->GetKey().EqualsLiteral("rel")) {
 567                 if (!attr->GetValue().EqualsLiteral("stylesheet")) {
 568                   IF_FREE(attr, &mTokenAllocator);
 569                   break;
 570                 }
 571                 isRelStylesheet = PR_TRUE;
 572               } else if (attr->GetKey().EqualsLiteral("src")) {
 573                 src.Assign(attr->GetValue());
 574                 if (isRelStylesheet) {
 575                   IF_FREE(attr, &mTokenAllocator);
 576                   break;
 577                 }
 578               }
 579
 580               IF_FREE(attr, &mTokenAllocator);
 581             }
 582
 583             if (isRelStylesheet && !src.IsEmpty()) {
 584               AddToPrefetchList(src, STYLESHEET);
 585             }
 586             break;
 587           }
 588
 589           case eHTMLTag_style:
 590             ptype = STYLESHEET;
 591           case eHTMLTag_img:
 592             if (tag == eHTMLTag_img)
 593               ptype = IMAGE;
 594 #endif
 595           case eHTMLTag_script:
 596             if (tag == eHTMLTag_script)
 597               ptype = SCRIPT;
 598
 599             for (; i < attrs; ++i) {
 600               CAttributeToken *attr = static_cast<CAttributeToken *>(mTokenizer->PopToken());
 601               NS_ASSERTION(attr->GetTokenType() == eToken_attribute, "Weird token");
 602
 603               if (attr->GetKey().EqualsLiteral("src")) {
 604                 src.Assign(attr->GetValue());
 605               } else if (attr->GetKey().EqualsLiteral("charset")) {
 606                 charset.Assign(attr->GetValue());
 607               } else if (attr->GetKey().EqualsLiteral("type")) {
 608                 elementType.Assign(attr->GetValue());
 609               }
 610               IF_FREE(attr, &mTokenAllocator);
 611             }
 612
 613             if (!src.IsEmpty()) {
 614               AddToPrefetchList(src, charset, elementType, ptype);
 615             }
 616             break;
 617
 618           default:
 619             break;
 620         }
 621
 622         for (; i < attrs; ++i) {
 623           CToken *attr = mTokenizer->PopToken();
 624           if (!attr) {
 625             break;
 626           }
 627           NS_ASSERTION(attr->GetTokenType() == eToken_attribute, "Weird token");
 628           IF_FREE(attr, &mTokenAllocator);
 629         }
 630
 631         break;
 632       }
 633
 634     default:
 635       break;
 636   }
 637
 638   IF_FREE(aToken, &mTokenAllocator);
 639 }
 640
 641 void
 642 nsSpeculativeScriptThread::AddToPrefetchList(const nsAString &src,
 643                                       const nsAString &charset,
 644                                       const nsAString &elementType,
 645                                       PrefetchType type)
 646 {
 647   PrefetchEntry *pe = mURIs.InsertElementAt(mNumURIs++);
 648   pe->type = type;
 649   pe->uri = src;
 650   pe->charset = charset;
 651   pe->elementType = elementType;
 652
 653   if (mNumURIs == kBatchPrefetchURIs) {
 654     nsCOMPtr<nsIRunnable> r = new nsPreloadURIs(mURIs, this);
 655
 656     mNumURIs = 0;
 657     mURIs.Clear();
 658     NS_DispatchToMainThread(r, NS_DISPATCH_NORMAL);
 659   }
 660 }
 661
 662 nsICharsetAlias* nsParser::sCharsetAliasService = nsnull;
 663 nsICharsetConverterManager* nsParser::sCharsetConverterManager = nsnull;
 664 nsIThreadPool* nsParser::sSpeculativeThreadPool = nsnull;
 665
 666 /**
 667  *  This gets called when the htmlparser module is initialized.
 668  */
 669 // static
 670 nsresult
 671 nsParser::Init()
 672 {
 673   nsresult rv;
 674   nsCOMPtr<nsICategoryManager> cm =
 675     do_GetService(NS_CATEGORYMANAGER_CONTRACTID, &rv);
 676   NS_ENSURE_SUCCESS(rv, rv);
 677
 678   nsCOMPtr<nsISimpleEnumerator> e;
 679   rv = cm->EnumerateCategory("Parser data listener", getter_AddRefs(e));
 680   NS_ENSURE_SUCCESS(rv, rv);
 681
 682   nsCAutoString categoryEntry;
 683   nsXPIDLCString contractId;
 684   nsCOMPtr<nsISupports> entry;
 685
 686   while (NS_SUCCEEDED(e->GetNext(getter_AddRefs(entry)))) {
 687     nsCOMPtr<nsISupportsCString> category(do_QueryInterface(entry));
 688
 689     if (!category) {
 690       NS_WARNING("Category entry not an nsISupportsCString!");
 691       continue;
 692     }
 693
 694     rv = category->GetData(categoryEntry);
 695     NS_ENSURE_SUCCESS(rv, rv);
 696
 697     rv = cm->GetCategoryEntry("Parser data listener", categoryEntry.get(),
 698                               getter_Copies(contractId));
 699     NS_ENSURE_SUCCESS(rv, rv);
 700
 701     nsCOMPtr<nsIUnicharStreamListener> listener =
 702       do_CreateInstance(contractId.get());
 703
 704     if (listener) {
 705       if (!sParserDataListeners) {
 706         sParserDataListeners = new nsCOMArray<nsIUnicharStreamListener>();
 707
 708         if (!sParserDataListeners)
 709           return NS_ERROR_OUT_OF_MEMORY;
 710       }
 711
 712       sParserDataListeners->AppendObject(listener);
 713     }
 714   }
 715
 716   nsCOMPtr<nsICharsetAlias> charsetAlias =
 717     do_GetService(NS_CHARSETALIAS_CONTRACTID, &rv);
 718   NS_ENSURE_SUCCESS(rv, rv);
 719
 720   nsCOMPtr<nsICharsetConverterManager> charsetConverter =
 721     do_GetService(NS_CHARSETCONVERTERMANAGER_CONTRACTID, &rv);
 722   NS_ENSURE_SUCCESS(rv, rv);
 723
 724   charsetAlias.swap(sCharsetAliasService);
 725   charsetConverter.swap(sCharsetConverterManager);
 726
 727   nsCOMPtr<nsIThreadPool> threadPool =
 728     do_CreateInstance(NS_THREADPOOL_CONTRACTID, &rv);
 729   NS_ENSURE_SUCCESS(rv, rv);
 730
 731   rv = threadPool->SetThreadLimit(kSpeculativeThreadLimit);
 732   NS_ENSURE_SUCCESS(rv, rv);
 733
 734   rv = threadPool->SetIdleThreadLimit(kIdleThreadLimit);
 735   NS_ENSURE_SUCCESS(rv, rv);
 736
 737   rv = threadPool->SetIdleThreadTimeout(kIdleThreadTimeout);
 738   NS_ENSURE_SUCCESS(rv, rv);
 739
 740   threadPool.swap(sSpeculativeThreadPool);
 741
 742   return NS_OK;
 743 }
 744
 745
 746 /**
 747  *  This gets called when the htmlparser module is shutdown.
 748  */
 749 // static
 750 void nsParser::Shutdown()
 751 {
 752   delete sParserDataListeners;
 753   sParserDataListeners = nsnull;
 754
 755   NS_IF_RELEASE(sCharsetAliasService);
 756   NS_IF_RELEASE(sCharsetConverterManager);
 757   if (sSpeculativeThreadPool) {
 758     sSpeculativeThreadPool->Shutdown();
 759     NS_RELEASE(sSpeculativeThreadPool);
 760   }
 761 }
 762
 763 #ifdef DEBUG
 764 static PRBool gDumpContent=PR_FALSE;
 765 #endif
 766
 767 /**
 768  *  default constructor
 769  */
 770 nsParser::nsParser()
 771 {
 772   Initialize(PR_TRUE);
 773 }
 774
 775 nsParser::~nsParser()
 776 {
 777   Cleanup();
 778 }
 779
 780 void
 781 nsParser::Initialize(PRBool aConstructor)
 782 {
 783 #ifdef NS_DEBUG
 784   if (!gDumpContent) {
 785     gDumpContent = PR_GetEnv("PARSER_DUMP_CONTENT") != nsnull;
 786   }
 787 #endif
 788
 789   if (aConstructor) {
 790     // Raw pointer
 791     mParserContext = 0;
 792   }
 793   else {
 794     // nsCOMPtrs
 795     mObserver = nsnull;
 796     mParserFilter = nsnull;
 797     mUnusedInput.Truncate();
 798   }
 799
 800   mContinueEvent = nsnull;
 801   mCharsetSource = kCharsetUninitialized;
 802   mCharset.AssignLiteral("ISO-8859-1");
 803   mInternalState = NS_OK;
 804   mStreamStatus = 0;
 805   mCommand = eViewNormal;
 806   mFlags = NS_PARSER_FLAG_OBSERVERS_ENABLED |
 807            NS_PARSER_FLAG_PARSER_ENABLED |
 808            NS_PARSER_FLAG_CAN_TOKENIZE;
 809   mScriptsExecuting = 0;
 810
 811   MOZ_TIMER_DEBUGLOG(("Reset: Parse Time: nsParser::nsParser(), this=%p\n", this));
 812   MOZ_TIMER_RESET(mParseTime);
 813   MOZ_TIMER_RESET(mDTDTime);
 814   MOZ_TIMER_RESET(mTokenizeTime);
 815 }
 816
 817 void
 818 nsParser::Cleanup()
 819 {
 820 #ifdef NS_DEBUG
 821   if (gDumpContent) {
 822     if (mSink) {
 823       // Sink (HTMLContentSink at this time) supports nsIDebugDumpContent
 824       // interface. We can get to the content model through the sink.
 825       nsresult result = NS_OK;
 826       nsCOMPtr<nsIDebugDumpContent> trigger = do_QueryInterface(mSink, &result);
 827       if (NS_SUCCEEDED(result)) {
 828         trigger->DumpContentModel();
 829       }
 830     }
 831   }
 832 #endif
 833
 834 #ifdef DEBUG
 835   if (mParserContext && mParserContext->mPrevContext) {
 836     NS_WARNING("Extra parser contexts still on the parser stack");
 837   }
 838 #endif
 839
 840   while (mParserContext) {
 841     CParserContext *pc = mParserContext->mPrevContext;
 842     delete mParserContext;
 843     mParserContext = pc;
 844   }
 845
 846   // It should not be possible for this flag to be set when we are getting
 847   // destroyed since this flag implies a pending nsParserContinueEvent, which
 848   // has an owning reference to |this|.
 849   NS_ASSERTION(!(mFlags & NS_PARSER_FLAG_PENDING_CONTINUE_EVENT), "bad");
 850   if (mSpeculativeScriptThread) {
 851     mSpeculativeScriptThread->Terminate();
 852     mSpeculativeScriptThread = nsnull;
 853   }
 854 }
 855
 856 NS_IMPL_CYCLE_COLLECTION_CLASS(nsParser)
 857
 858 NS_IMPL_CYCLE_COLLECTION_UNLINK_BEGIN(nsParser)
 859   NS_IMPL_CYCLE_COLLECTION_UNLINK_NSCOMPTR(mSink)
 860   NS_IMPL_CYCLE_COLLECTION_UNLINK_NSCOMPTR(mObserver)
 861 NS_IMPL_CYCLE_COLLECTION_UNLINK_END
 862
 863 NS_IMPL_CYCLE_COLLECTION_TRAVERSE_BEGIN(nsParser)
 864   NS_IMPL_CYCLE_COLLECTION_TRAVERSE_NSCOMPTR(mSink)
 865   NS_IMPL_CYCLE_COLLECTION_TRAVERSE_NSCOMPTR(mObserver)
 866   CParserContext *pc = tmp->mParserContext;
 867   while (pc) {
 868     cb.NoteXPCOMChild(pc->mDTD);
 869     cb.NoteXPCOMChild(pc->mTokenizer);
 870     pc = pc->mPrevContext;
 871   }
 872 NS_IMPL_CYCLE_COLLECTION_TRAVERSE_END
 873
 874 NS_IMPL_CYCLE_COLLECTING_ADDREF_AMBIGUOUS(nsParser, nsIParser)
 875 NS_IMPL_CYCLE_COLLECTING_RELEASE_AMBIGUOUS(nsParser, nsIParser)
 876 NS_INTERFACE_MAP_BEGIN_CYCLE_COLLECTION(nsParser)
 877   NS_INTERFACE_MAP_ENTRY(nsIStreamListener)
 878   NS_INTERFACE_MAP_ENTRY(nsIParser)
 879   NS_INTERFACE_MAP_ENTRY(nsIRequestObserver)
 880   NS_INTERFACE_MAP_ENTRY_AMBIGUOUS(nsISupports, nsIParser)
 881 NS_INTERFACE_MAP_END
 882
 883 // The parser continue event is posted only if
 884 // all of the data to parse has been passed to ::OnDataAvailable
 885 // and the parser has been interrupted by the content sink
 886 // because the processing of tokens took too long.
 887
 888 nsresult
 889 nsParser::PostContinueEvent()
 890 {
 891   if (!(mFlags & NS_PARSER_FLAG_PENDING_CONTINUE_EVENT)) {
 892     // If this flag isn't set, then there shouldn't be a live continue event!
 893     NS_ASSERTION(!mContinueEvent, "bad");
 894
 895     // This creates a reference cycle between this and the event that is
 896     // broken when the event fires.
 897     nsCOMPtr<nsIRunnable> event = new nsParserContinueEvent(this);
 898     if (NS_FAILED(NS_DispatchToCurrentThread(event))) {
 899         NS_WARNING("failed to dispatch parser continuation event");
 900     } else {
 901         mFlags |= NS_PARSER_FLAG_PENDING_CONTINUE_EVENT;
 902         mContinueEvent = event;
 903     }
 904   }
 905   return NS_OK;
 906 }
 907
 908 NS_IMETHODIMP_(void)
 909 nsParser::SetParserFilter(nsIParserFilter * aFilter)
 910 {
 911   mParserFilter = aFilter;
 912 }
 913
 914 NS_IMETHODIMP_(void)
 915 nsParser::GetCommand(nsCString& aCommand)
 916 {
 917   aCommand = mCommandStr;
 918 }
 919
 920 /**
 921  *  Call this method once you've created a parser, and want to instruct it
 922  *  about the command which caused the parser to be constructed. For example,
 923  *  this allows us to select a DTD which can do, say, view-source.
 924  *
 925  *  @param   aCommand the command string to set
 926  */
 927 NS_IMETHODIMP_(void)
 928 nsParser::SetCommand(const char* aCommand)
 929 {
 930   mCommandStr.Assign(aCommand);
 931   if (mCommandStr.Equals(kViewSourceCommand)) {
 932     mCommand = eViewSource;
 933   } else if (mCommandStr.Equals(kViewFragmentCommand)) {
 934     mCommand = eViewFragment;
 935   } else {
 936     mCommand = eViewNormal;
 937   }
 938 }
 939
 940 /**
 941  *  Call this method once you've created a parser, and want to instruct it
 942  *  about the command which caused the parser to be constructed. For example,
 943  *  this allows us to select a DTD which can do, say, view-source.
 944  *
 945  *  @param   aParserCommand the command to set
 946  */
 947 NS_IMETHODIMP_(void)
 948 nsParser::SetCommand(eParserCommands aParserCommand)
 949 {
 950   mCommand = aParserCommand;
 951 }
 952
 953 /**
 954  *  Call this method once you've created a parser, and want to instruct it
 955  *  about what charset to load
 956  *
 957  *  @param   aCharset- the charset of a document
 958  *  @param   aCharsetSource- the source of the charset
 959  */
 960 NS_IMETHODIMP_(void)
 961 nsParser::SetDocumentCharset(const nsACString& aCharset, PRInt32 aCharsetSource)
 962 {
 963   mCharset = aCharset;
 964   mCharsetSource = aCharsetSource;
 965   if (mParserContext && mParserContext->mScanner) {
 966      mParserContext->mScanner->SetDocumentCharset(aCharset, aCharsetSource);
 967   }
 968 }
 969
 970 void
 971 nsParser::SetSinkCharset(nsACString& aCharset)
 972 {
 973   if (mSink) {
 974     mSink->SetDocumentCharset(aCharset);
 975   }
 976 }
 977
 978 /**
 979  *  This method gets called in order to set the content
 980  *  sink for this parser to dump nodes to.
 981  *
 982  *  @param   nsIContentSink interface for node receiver
 983  */
 984 NS_IMETHODIMP_(void)
 985 nsParser::SetContentSink(nsIContentSink* aSink)
 986 {
 987   NS_PRECONDITION(aSink, "sink cannot be null!");
 988   mSink = aSink;
 989
 990   if (mSink) {
 991     mSink->SetParser(this);
 992   }
 993 }
 994
 995 /**
 996  * retrieve the sink set into the parser
 997  * @return  current sink
 998  */
 999 NS_IMETHODIMP_(nsIContentSink*)
1000 nsParser::GetContentSink()
1001 {
1002   return mSink;
1003 }
1004
1005 /**
1006  *  Retrieve parsemode from topmost parser context
1007  *
1008  *  @return  parsemode
1009  */
1010 NS_IMETHODIMP_(nsDTDMode)
1011 nsParser::GetParseMode()
1012 {
1013   if (mParserContext) {
1014     return mParserContext->mDTDMode;
1015   }
1016   NS_NOTREACHED("no parser context");
1017   return eDTDMode_unknown;
1018 }
1019
1020 /**
1021  * Determine what DTD mode (and thus what layout nsCompatibility mode)
1022  * to use for this document based on the first chunk of data received
1023  * from the network (each parsercontext can have its own mode).  (No,
1024  * this is not an optimal solution -- we really don't need to know until
1025  * after we've received the DOCTYPE, and this could easily be part of
1026  * the regular parsing process if the parser were designed in a way that
1027  * made such modifications easy.)
1028  */
1029
1030 // Parse the PS production in the SGML spec (excluding the part dealing
1031 // with entity references) starting at theIndex into theBuffer, and
1032 // return the first index after the end of the production.
1033 static PRInt32
1034 ParsePS(const nsString& aBuffer, PRInt32 aIndex)
1035 {
1036   for (;;) {
1037     PRUnichar ch = aBuffer.CharAt(aIndex);
1038     if ((ch == PRUnichar(' ')) || (ch == PRUnichar('\t')) ||
1039         (ch == PRUnichar('\n')) || (ch == PRUnichar('\r'))) {
1040       ++aIndex;
1041     } else if (ch == PRUnichar('-')) {
1042       PRInt32 tmpIndex;
1043       if (aBuffer.CharAt(aIndex+1) == PRUnichar('-') &&
1044           kNotFound != (tmpIndex=aBuffer.Find("--",PR_FALSE,aIndex+2,-1))) {
1045         aIndex = tmpIndex + 2;
1046       } else {
1047         return aIndex;
1048       }
1049     } else {
1050       return aIndex;
1051     }
1052   }
1053 }
1054
// Bit flags describing which parts of a DOCTYPE declaration were seen.
#define PARSE_DTD_HAVE_DOCTYPE          0x01
#define PARSE_DTD_HAVE_PUBLIC_ID        0x02
#define PARSE_DTD_HAVE_SYSTEM_ID        0x04
#define PARSE_DTD_HAVE_INTERNAL_SUBSET  0x08
1059
// Scan aBuffer for an HTML DOCTYPE declaration and report what it contains.
//
//   aBuffer      - the text to inspect
//   aResultFlags - out: reset to 0, then OR'ed with PARSE_DTD_HAVE_* bits
//                  for each part of the declaration that was recognized
//   aPublicID    - out: the public identifier, with whitespace trimmed and
//                  collapsed; only assigned when one was found
//   aSystemID    - out: the system identifier; only assigned when one was
//                  found
//
// return PR_TRUE on success (includes not present), PR_FALSE on failure
static PRBool
ParseDocTypeDecl(const nsString &aBuffer,
                 PRInt32 *aResultFlags,
                 nsString &aPublicID,
                 nsString &aSystemID)
{
  PRBool haveDoctype = PR_FALSE;
  *aResultFlags = 0;

  // Skip through any comments and processing instructions
  // The PI-skipping is a bit of a hack.
  PRInt32 theIndex = 0;
  do {
    theIndex = aBuffer.FindChar('<', theIndex);
    if (theIndex == kNotFound) break;
    PRUnichar nextChar = aBuffer.CharAt(theIndex+1);
    if (nextChar == PRUnichar('!')) {
      // "<!" starts either the DOCTYPE itself or a comment declaration.
      PRInt32 tmpIndex = theIndex + 2;
      if (kNotFound !=
          (theIndex=aBuffer.Find("DOCTYPE", PR_TRUE, tmpIndex, 0))) {
        haveDoctype = PR_TRUE;
        theIndex += 7; // skip "DOCTYPE"
        break;
      }
      // Not a DOCTYPE: step over the comment(s) and resume after the '>'.
      theIndex = ParsePS(aBuffer, tmpIndex);
      theIndex = aBuffer.FindChar('>', theIndex);
    } else if (nextChar == PRUnichar('?')) {
      // Processing instruction: resume after its closing '>'.
      theIndex = aBuffer.FindChar('>', theIndex);
    } else {
      // Any other markup means there is no DOCTYPE ahead of the content.
      break;
    }
  } while (theIndex != kNotFound);

  // No DOCTYPE at all still counts as success, with no flags set.
  if (!haveDoctype)
    return PR_TRUE;
  *aResultFlags |= PARSE_DTD_HAVE_DOCTYPE;

  // The document type name has to be "HTML".
  theIndex = ParsePS(aBuffer, theIndex);
  theIndex = aBuffer.Find("HTML", PR_TRUE, theIndex, 0);
  if (kNotFound == theIndex)
    return PR_FALSE;
  theIndex = ParsePS(aBuffer, theIndex+4);
  PRInt32 tmpIndex = aBuffer.Find("PUBLIC", PR_TRUE, theIndex, 0);

  if (kNotFound != tmpIndex) {
    theIndex = ParsePS(aBuffer, tmpIndex+6);

    // We get here only if we've read <!DOCTYPE HTML PUBLIC
    // (not case sensitive) possibly with comments within.

    // Now find the beginning and end of the public identifier
    // and the system identifier (if present).

    // The public identifier must be quoted; remember which quote opened
    // it so the same character is used to find its end.
    PRUnichar lit = aBuffer.CharAt(theIndex);
    if ((lit != PRUnichar('\"')) && (lit != PRUnichar('\'')))
      return PR_FALSE;

    // Start is the first character, excluding the quote, and End is
    // the final quote, so there are (end-start) characters.

    PRInt32 PublicIDStart = theIndex + 1;
    PRInt32 PublicIDEnd = aBuffer.FindChar(lit, PublicIDStart);
    if (kNotFound == PublicIDEnd)
      return PR_FALSE;
    theIndex = ParsePS(aBuffer, PublicIDEnd + 1);
    PRUnichar next = aBuffer.CharAt(theIndex);
    if (next == PRUnichar('>')) {
      // There was a public identifier, but no system
      // identifier,
      // so do nothing.
      // This is needed to avoid the else at the end, and it's
      // also the most common case.
    } else if ((next == PRUnichar('\"')) ||
               (next == PRUnichar('\''))) {
      // We found a system identifier.
      *aResultFlags |= PARSE_DTD_HAVE_SYSTEM_ID;
      PRInt32 SystemIDStart = theIndex + 1;
      PRInt32 SystemIDEnd = aBuffer.FindChar(next, SystemIDStart);
      if (kNotFound == SystemIDEnd)
        return PR_FALSE;
      aSystemID =
        Substring(aBuffer, SystemIDStart, SystemIDEnd - SystemIDStart);
    } else if (next == PRUnichar('[')) {
      // We found an internal subset.
      *aResultFlags |= PARSE_DTD_HAVE_INTERNAL_SUBSET;
    } else {
      // Something's wrong.
      return PR_FALSE;
    }

    // Since a public ID is a minimum literal, we must trim
    // and collapse whitespace
    aPublicID = Substring(aBuffer, PublicIDStart, PublicIDEnd - PublicIDStart);
    aPublicID.CompressWhitespace(PR_TRUE, PR_TRUE);
    *aResultFlags |= PARSE_DTD_HAVE_PUBLIC_ID;
  } else {
    // No PUBLIC keyword; the declaration may still carry SYSTEM "...".
    tmpIndex=aBuffer.Find("SYSTEM", PR_TRUE, theIndex, 0);
    if (kNotFound != tmpIndex) {
      // DOCTYPES with system ID but no Public ID
      *aResultFlags |= PARSE_DTD_HAVE_SYSTEM_ID;

      theIndex = ParsePS(aBuffer, tmpIndex+6);
      PRUnichar next = aBuffer.CharAt(theIndex);
      if (next != PRUnichar('\"') && next != PRUnichar('\''))
        return PR_FALSE;

      PRInt32 SystemIDStart = theIndex + 1;
      PRInt32 SystemIDEnd = aBuffer.FindChar(next, SystemIDStart);

      if (kNotFound == SystemIDEnd)
        return PR_FALSE;
      aSystemID =
        Substring(aBuffer, SystemIDStart, SystemIDEnd - SystemIDStart);
      theIndex = ParsePS(aBuffer, SystemIDEnd + 1);
    }

    // With or without a system identifier, the next character must open
    // an internal subset or close the declaration.
    PRUnichar nextChar = aBuffer.CharAt(theIndex);
    if (nextChar == PRUnichar('['))
      *aResultFlags |= PARSE_DTD_HAVE_INTERNAL_SUBSET;
    else if (nextChar != PRUnichar('>'))
      return PR_FALSE;
  }
  return PR_TRUE;
}
1185
// One known public identifier together with the mode it selects,
// depending on whether the DOCTYPE also carried a system identifier.
struct PubIDInfo
{
  // Public IDs that should trigger strict mode are deliberately not
  // listed anywhere, since we want all future public IDs to trigger
  // strict mode as well.
  enum eMode {
    // always quirks mode, unless there's an internal subset
    eQuirks = 0,
    // eCompatibility_AlmostStandards
    eAlmostStandards = 1,
    // eCompatibility_FullStandards
    eFullStandards = 2
  };

  // The public identifier.
  const char* name;
  // Mode to use when the DOCTYPE has no system identifier.
  eMode mode_if_no_sysid;
  // Mode to use when the DOCTYPE has a system identifier.
  eMode mode_if_sysid;
};
1203
// Number of elements in a statically sized array.
#define ELEMENTS_OF(array_) (sizeof(array_)/sizeof(array_[0]))

// These must be in nsCRT::strcmp order so binary-search can be used.
// This is verified, |#ifdef DEBUG|, below.

// Even though public identifiers should be case sensitive, we will do
// all comparisons after converting to lower case in order to do
// case-insensitive comparison since there are a number of existing web
// sites that use the incorrect case.  Therefore all of the public
// identifiers below are in lower case (with the correct case following,
// in comments).  The case is verified, |#ifdef DEBUG|, below.

// Each entry is { lower-cased public ID, mode when the DOCTYPE has no
// system ID, mode when it has one }, matching the field order of PubIDInfo.
static const PubIDInfo kPublicIDs[] = {
  {"+//silmaril//dtd html pro v0r11 19970101//en" /* "+//Silmaril//dtd html Pro v0r11 19970101//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//advasoft ltd//dtd html 3.0 aswedit + extensions//en" /* "-//AdvaSoft Ltd//DTD HTML 3.0 asWedit + extensions//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//as//dtd html 3.0 aswedit + extensions//en" /* "-//AS//DTD HTML 3.0 asWedit + extensions//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//ietf//dtd html 2.0 level 1//en" /* "-//IETF//DTD HTML 2.0 Level 1//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//ietf//dtd html 2.0 level 2//en" /* "-//IETF//DTD HTML 2.0 Level 2//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//ietf//dtd html 2.0 strict level 1//en" /* "-//IETF//DTD HTML 2.0 Strict Level 1//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//ietf//dtd html 2.0 strict level 2//en" /* "-//IETF//DTD HTML 2.0 Strict Level 2//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//ietf//dtd html 2.0 strict//en" /* "-//IETF//DTD HTML 2.0 Strict//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//ietf//dtd html 2.0//en" /* "-//IETF//DTD HTML 2.0//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//ietf//dtd html 2.1e//en" /* "-//IETF//DTD HTML 2.1E//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//ietf//dtd html 3.0//en" /* "-//IETF//DTD HTML 3.0//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//ietf//dtd html 3.0//en//" /* "-//IETF//DTD HTML 3.0//EN//" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//ietf//dtd html 3.2 final//en" /* "-//IETF//DTD HTML 3.2 Final//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//ietf//dtd html 3.2//en" /* "-//IETF//DTD HTML 3.2//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//ietf//dtd html 3//en" /* "-//IETF//DTD HTML 3//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//ietf//dtd html level 0//en" /* "-//IETF//DTD HTML Level 0//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//ietf//dtd html level 0//en//2.0" /* "-//IETF//DTD HTML Level 0//EN//2.0" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//ietf//dtd html level 1//en" /* "-//IETF//DTD HTML Level 1//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//ietf//dtd html level 1//en//2.0" /* "-//IETF//DTD HTML Level 1//EN//2.0" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//ietf//dtd html level 2//en" /* "-//IETF//DTD HTML Level 2//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//ietf//dtd html level 2//en//2.0" /* "-//IETF//DTD HTML Level 2//EN//2.0" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//ietf//dtd html level 3//en" /* "-//IETF//DTD HTML Level 3//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//ietf//dtd html level 3//en//3.0" /* "-//IETF//DTD HTML Level 3//EN//3.0" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//ietf//dtd html strict level 0//en" /* "-//IETF//DTD HTML Strict Level 0//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//ietf//dtd html strict level 0//en//2.0" /* "-//IETF//DTD HTML Strict Level 0//EN//2.0" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//ietf//dtd html strict level 1//en" /* "-//IETF//DTD HTML Strict Level 1//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//ietf//dtd html strict level 1//en//2.0" /* "-//IETF//DTD HTML Strict Level 1//EN//2.0" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//ietf//dtd html strict level 2//en" /* "-//IETF//DTD HTML Strict Level 2//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//ietf//dtd html strict level 2//en//2.0" /* "-//IETF//DTD HTML Strict Level 2//EN//2.0" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//ietf//dtd html strict level 3//en" /* "-//IETF//DTD HTML Strict Level 3//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//ietf//dtd html strict level 3//en//3.0" /* "-//IETF//DTD HTML Strict Level 3//EN//3.0" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//ietf//dtd html strict//en" /* "-//IETF//DTD HTML Strict//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//ietf//dtd html strict//en//2.0" /* "-//IETF//DTD HTML Strict//EN//2.0" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//ietf//dtd html strict//en//3.0" /* "-//IETF//DTD HTML Strict//EN//3.0" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//ietf//dtd html//en" /* "-//IETF//DTD HTML//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//ietf//dtd html//en//2.0" /* "-//IETF//DTD HTML//EN//2.0" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//ietf//dtd html//en//3.0" /* "-//IETF//DTD HTML//EN//3.0" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//metrius//dtd metrius presentational//en" /* "-//Metrius//DTD Metrius Presentational//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//microsoft//dtd internet explorer 2.0 html strict//en" /* "-//Microsoft//DTD Internet Explorer 2.0 HTML Strict//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//microsoft//dtd internet explorer 2.0 html//en" /* "-//Microsoft//DTD Internet Explorer 2.0 HTML//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//microsoft//dtd internet explorer 2.0 tables//en" /* "-//Microsoft//DTD Internet Explorer 2.0 Tables//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//microsoft//dtd internet explorer 3.0 html strict//en" /* "-//Microsoft//DTD Internet Explorer 3.0 HTML Strict//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//microsoft//dtd internet explorer 3.0 html//en" /* "-//Microsoft//DTD Internet Explorer 3.0 HTML//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//microsoft//dtd internet explorer 3.0 tables//en" /* "-//Microsoft//DTD Internet Explorer 3.0 Tables//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//netscape comm. corp.//dtd html//en" /* "-//Netscape Comm. Corp.//DTD HTML//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//netscape comm. corp.//dtd strict html//en" /* "-//Netscape Comm. Corp.//DTD Strict HTML//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//o'reilly and associates//dtd html 2.0//en" /* "-//O'Reilly and Associates//DTD HTML 2.0//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//o'reilly and associates//dtd html extended 1.0//en" /* "-//O'Reilly and Associates//DTD HTML Extended 1.0//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//o'reilly and associates//dtd html extended relaxed 1.0//en" /* "-//O'Reilly and Associates//DTD HTML Extended Relaxed 1.0//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//en" /* "-//SoftQuad Software//DTD HoTMetaL PRO 6.0::19990601::extensions to HTML 4.0//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//en" /* "-//SoftQuad//DTD HoTMetaL PRO 4.0::19971010::extensions to HTML 4.0//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//spyglass//dtd html 2.0 extended//en" /* "-//Spyglass//DTD HTML 2.0 Extended//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//sq//dtd html 2.0 hotmetal + extensions//en" /* "-//SQ//DTD HTML 2.0 HoTMetaL + extensions//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//sun microsystems corp.//dtd hotjava html//en" /* "-//Sun Microsystems Corp.//DTD HotJava HTML//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//sun microsystems corp.//dtd hotjava strict html//en" /* "-//Sun Microsystems Corp.//DTD HotJava Strict HTML//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//w3c//dtd html 3 1995-03-24//en" /* "-//W3C//DTD HTML 3 1995-03-24//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//w3c//dtd html 3.2 draft//en" /* "-//W3C//DTD HTML 3.2 Draft//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//w3c//dtd html 3.2 final//en" /* "-//W3C//DTD HTML 3.2 Final//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//w3c//dtd html 3.2//en" /* "-//W3C//DTD HTML 3.2//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//w3c//dtd html 3.2s draft//en" /* "-//W3C//DTD HTML 3.2S Draft//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//w3c//dtd html 4.0 frameset//en" /* "-//W3C//DTD HTML 4.0 Frameset//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//w3c//dtd html 4.0 transitional//en" /* "-//W3C//DTD HTML 4.0 Transitional//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//w3c//dtd html 4.01 frameset//en" /* "-//W3C//DTD HTML 4.01 Frameset//EN" */, PubIDInfo::eQuirks, PubIDInfo::eAlmostStandards},
  {"-//w3c//dtd html 4.01 transitional//en" /* "-//W3C//DTD HTML 4.01 Transitional//EN" */, PubIDInfo::eQuirks, PubIDInfo::eAlmostStandards},
  {"-//w3c//dtd html experimental 19960712//en" /* "-//W3C//DTD HTML Experimental 19960712//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//w3c//dtd html experimental 970421//en" /* "-//W3C//DTD HTML Experimental 970421//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//w3c//dtd w3 html//en" /* "-//W3C//DTD W3 HTML//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//w3c//dtd xhtml 1.0 frameset//en" /* "-//W3C//DTD XHTML 1.0 Frameset//EN" */, PubIDInfo::eAlmostStandards, PubIDInfo::eAlmostStandards},
  {"-//w3c//dtd xhtml 1.0 transitional//en" /* "-//W3C//DTD XHTML 1.0 Transitional//EN" */, PubIDInfo::eAlmostStandards, PubIDInfo::eAlmostStandards},
  {"-//w3o//dtd w3 html 3.0//en" /* "-//W3O//DTD W3 HTML 3.0//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//w3o//dtd w3 html 3.0//en//" /* "-//W3O//DTD W3 HTML 3.0//EN//" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//w3o//dtd w3 html strict 3.0//en//" /* "-//W3O//DTD W3 HTML Strict 3.0//EN//" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//webtechs//dtd mozilla html 2.0//en" /* "-//WebTechs//DTD Mozilla HTML 2.0//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-//webtechs//dtd mozilla html//en" /* "-//WebTechs//DTD Mozilla HTML//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"-/w3c/dtd html 4.0 transitional/en" /* "-/W3C/DTD HTML 4.0 Transitional/EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
  {"html" /* "HTML" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
};
1293
#ifdef DEBUG
/**
 * Debug-only sanity check of the kPublicIDs table, performed on the first
 * call only. It checks that the entries are in strictly ascending
 * nsCRT::strcmp order (the table is binary-searched) and that every
 * public ID is already lower case.
 *
 * For each violation the offending entries are printed *before* the
 * assertion fires, so the diagnostic is not lost when debug assertions
 * are configured to abort.
 */
static void
VerifyPublicIDs()
{
  static PRBool gVerified = PR_FALSE;
  if (gVerified) {
    return;
  }
  gVerified = PR_TRUE;

  const PRUint32 count = ELEMENTS_OF(kPublicIDs);
  PRUint32 i;

  // Compare each entry with its successor. Written as |i + 1 < count| so
  // the unsigned bound cannot wrap around if the table were ever empty.
  for (i = 0; i + 1 < count; ++i) {
    if (nsCRT::strcmp(kPublicIDs[i].name, kPublicIDs[i+1].name) >= 0) {
      printf("Doctypes %s and %s out of order.\n",
             kPublicIDs[i].name, kPublicIDs[i+1].name);
      NS_NOTREACHED("doctypes out of order");
    }
  }

  // Lower-casing an entry must leave it unchanged.
  for (i = 0; i < count; ++i) {
    nsCAutoString lcPubID(kPublicIDs[i].name);
    ToLowerCase(lcPubID);
    if (nsCRT::strcmp(kPublicIDs[i].name, lcPubID.get()) != 0) {
      printf("Doctype %s not lower case.\n", kPublicIDs[i].name);
      NS_NOTREACHED("doctype not lower case");
    }
  }
}
#endif
1320
1321 static void
1322 DetermineHTMLParseMode(const nsString& aBuffer,
1323                        nsDTDMode& aParseMode,
1324                        eParserDocType& aDocType)
1325 {
1326 #ifdef DEBUG
1327   VerifyPublicIDs();
1328 #endif
1329   PRInt32 resultFlags;
1330   nsAutoString publicIDUCS2, sysIDUCS2;
1331   if (ParseDocTypeDecl(aBuffer, &resultFlags, publicIDUCS2, sysIDUCS2)) {
1332     if (!(resultFlags & PARSE_DTD_HAVE_DOCTYPE)) {
1333       // no DOCTYPE
1334       aParseMode = eDTDMode_quirks;
1335       aDocType = eHTML_Quirks;
1336     } else if ((resultFlags & PARSE_DTD_HAVE_INTERNAL_SUBSET) ||
1337                !(resultFlags & PARSE_DTD_HAVE_PUBLIC_ID)) {
1338       // A doctype with an internal subset is always full_standards.
1339       // A doctype without a public ID is always full_standards.
1340       aDocType = eHTML_Strict;
1341       aParseMode = eDTDMode_full_standards;
1342
1343       // Special hack for IBM's custom DOCTYPE.
1344       if (!(resultFlags & PARSE_DTD_HAVE_INTERNAL_SUBSET) &&
1345           sysIDUCS2 == NS_LITERAL_STRING(
1346                "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd")) {
1347         aParseMode = eDTDMode_quirks;
1348         aDocType = eHTML_Quirks;
1349       }
1350
1351     } else {
1352       // We have to check our list of public IDs to see what to do.
1353       // Yes, we want UCS2 to ASCII lossy conversion.
1354       nsCAutoString publicID;
1355       publicID.AssignWithConversion(publicIDUCS2);
1356
1357       // See comment above definition of kPublicIDs about case
1358       // sensitivity.
1359       ToLowerCase(publicID);
1360
1361       // Binary search to see if we can find the correct public ID
1362       // These must be signed since maximum can go below zero and we'll
1363       // crash if it's unsigned.
1364       PRInt32 minimum = 0;
1365       PRInt32 maximum = ELEMENTS_OF(kPublicIDs) - 1;
1366       PRInt32 index;
1367       for (;;) {
1368         index = (minimum + maximum) / 2;
1369         PRInt32 comparison =
1370             nsCRT::strcmp(publicID.get(), kPublicIDs[index].name);
1371         if (comparison == 0)
1372           break;
1373         if (comparison < 0)
1374           maximum = index - 1;
1375         else
1376           minimum = index + 1;
1377
1378         if (maximum < minimum) {
1379           // The DOCTYPE is not in our list, so it must be full_standards.
1380           aParseMode = eDTDMode_full_standards;
1381           aDocType = eHTML_Strict;
1382           return;
1383         }
1384       }
1385
1386       switch ((resultFlags & PARSE_DTD_HAVE_SYSTEM_ID)
1387                 ? kPublicIDs[index].mode_if_sysid
1388                 : kPublicIDs[index].mode_if_no_sysid)
1389       {
1390         case PubIDInfo::eQuirks:
1391           aParseMode = eDTDMode_quirks;
1392           aDocType = eHTML_Quirks;
1393           break;
1394         case PubIDInfo::eAlmostStandards:
1395           aParseMode = eDTDMode_almost_standards;
1396           aDocType = eHTML_Strict;
1397           break;
1398         case PubIDInfo::eFullStandards:
1399           aParseMode = eDTDMode_full_standards;
1400           aDocType = eHTML_Strict;
1401           break;
1402         default:
1403           NS_NOTREACHED("no other cases!");
1404       }
1405     }
1406   } else {
1407     // badly formed DOCTYPE -> quirks
1408     aParseMode = eDTDMode_quirks;
1409     aDocType = eHTML_Quirks;
1410   }
1411 }
1412
1413 static void
1414 DetermineParseMode(const nsString& aBuffer, nsDTDMode& aParseMode,
1415                    eParserDocType& aDocType, const nsACString& aMimeType)
1416 {
1417   if (aMimeType.EqualsLiteral(kHTMLTextContentType)) {
1418     DetermineHTMLParseMode(aBuffer, aParseMode, aDocType);
1419   } else if (aMimeType.EqualsLiteral(kPlainTextContentType) ||
1420              aMimeType.EqualsLiteral(kTextCSSContentType) ||
1421              aMimeType.EqualsLiteral(kApplicationJSContentType) ||
1422              aMimeType.EqualsLiteral(kApplicationXJSContentType) ||
1423              aMimeType.EqualsLiteral(kTextECMAScriptContentType) ||
1424              aMimeType.EqualsLiteral(kApplicationECMAScriptContentType) ||
1425              aMimeType.EqualsLiteral(kTextJSContentType)) {
1426     aDocType = ePlainText;
1427     aParseMode = eDTDMode_quirks;
1428   } else { // Some form of XML
1429     aDocType = eXML;
1430     aParseMode = eDTDMode_full_standards;
1431   }
1432 }
1433
1434 static nsresult
1435 FindSuitableDTD(CParserContext& aParserContext)
1436 {
1437   NS_ASSERTION(!aParserContext.mDTD, "Already found a DTD");
1438
1439   // We always find a DTD.
1440   aParserContext.mAutoDetectStatus = ePrimaryDetect;
1441
1442 #ifdef MOZ_VIEW_SOURCE
1443   // Quick check for view source.
1444   if (aParserContext.mParserCommand == eViewSource) {
1445     aParserContext.mDTD = new CViewSourceHTML();
1446     return aParserContext.mDTD ? NS_OK : NS_ERROR_OUT_OF_MEMORY;
1447   }
1448 #endif
1449
1450   // Now see if we're parsing HTML (which, as far as we're concerned, simply
1451   // means "not XML").
1452   if (aParserContext.mDocType != eXML) {
1453     aParserContext.mDTD = new CNavDTD();
1454     return aParserContext.mDTD ? NS_OK : NS_ERROR_OUT_OF_MEMORY;
1455   }
1456
1457   // If we're here, then we'd better be parsing XML.
1458   NS_ASSERTION(aParserContext.mDocType == eXML, "What are you trying to send me, here?");
1459   aParserContext.mDTD = new nsExpatDriver();
1460   return aParserContext.mDTD ? NS_OK : NS_ERROR_OUT_OF_MEMORY;
1461 }
1462
1463 NS_IMETHODIMP
1464 nsParser::CancelParsingEvents()
1465 {
1466   if (mFlags & NS_PARSER_FLAG_PENDING_CONTINUE_EVENT) {
1467     NS_ASSERTION(mContinueEvent, "mContinueEvent is null");
1468     // Revoke the pending continue parsing event
1469     mContinueEvent = nsnull;
1470     mFlags &= ~NS_PARSER_FLAG_PENDING_CONTINUE_EVENT;
1471   }
1472   return NS_OK;
1473 }
1474
1475 ////////////////////////////////////////////////////////////////////////
1476
1477
1478 /**
1479  * This gets called just prior to the model actually
1480  * being constructed. It's important to make this the
1481  * last thing that happens right before parsing, so we
1482  * can delay until the last moment the resolution of
1483  * which DTD to use (unless of course we're assigned one).
1484  */
1485 nsresult
1486 nsParser::WillBuildModel(nsString& aFilename)
1487 {
1488   if (!mParserContext)
1489     return kInvalidParserContext;
1490
1491   if (eUnknownDetect != mParserContext->mAutoDetectStatus)
1492     return NS_OK;
1493
1494   if (eDTDMode_unknown == mParserContext->mDTDMode ||
1495       eDTDMode_autodetect == mParserContext->mDTDMode) {
1496     PRUnichar buf[1025];
1497     nsFixedString theBuffer(buf, 1024, 0);
1498
1499     // Grab 1024 characters, starting at the first non-whitespace
1500     // character, to look for the doctype in.
1501     mParserContext->mScanner->Peek(theBuffer, 1024, mParserContext->mScanner->FirstNonWhitespacePosition());
1502     DetermineParseMode(theBuffer, mParserContext->mDTDMode,
1503                        mParserContext->mDocType, mParserContext->mMimeType);
1504   }
1505
1506   nsresult rv = FindSuitableDTD(*mParserContext);
1507   NS_ENSURE_SUCCESS(rv, rv);
1508
1509   nsITokenizer* tokenizer;
1510   rv = mParserContext->GetTokenizer(mParserContext->mDTD->GetType(), mSink, tokenizer);
1511   NS_ENSURE_SUCCESS(rv, rv);
1512
1513   return mParserContext->mDTD->WillBuildModel(*mParserContext, tokenizer, mSink);
1514 }
1515
1516 /**
1517  * This gets called when the parser is done with its input.
1518  * Note that the parser may have been called recursively, so we
1519  * have to check for a prev. context before closing out the DTD/sink.
1520  */
1521 nsresult
1522 nsParser::DidBuildModel(nsresult anErrorCode)
1523 {
1524   nsresult result = anErrorCode;
1525
1526   if (IsComplete()) {
1527     if (mParserContext && !mParserContext->mPrevContext) {
1528       if (mParserContext->mDTD) {
1529         result = mParserContext->mDTD->DidBuildModel(anErrorCode,PR_TRUE,this,mSink);
1530       }
1531
1532       //Ref. to bug 61462.
1533       mParserContext->mRequest = 0;
1534     }
1535   }
1536
1537   return result;
1538 }
1539
1540 void
1541 nsParser::SpeculativelyParse()
1542 {
1543   if (mParserContext->mParserCommand == eViewNormal &&
1544       !mParserContext->mMimeType.EqualsLiteral("text/html")) {
1545     return;
1546   }
1547
1548   if (!mSpeculativeScriptThread) {
1549     mSpeculativeScriptThread = new nsSpeculativeScriptThread();
1550     if (!mSpeculativeScriptThread) {
1551       return;
1552     }
1553   }
1554
1555   nsresult rv = mSpeculativeScriptThread->StartParsing(this);
1556   if (NS_FAILED(rv)) {
1557     mSpeculativeScriptThread = nsnull;
1558   }
1559 }
1560
1561 /**
1562  * This method adds a new parser context to the list,
1563  * pushing the current one to the next position.
1564  *
1565  * @param   ptr to new context
1566  */
1567 void
1568 nsParser::PushContext(CParserContext& aContext)
1569 {
1570   aContext.mPrevContext = mParserContext;
1571   mParserContext = &aContext;
1572 }
1573
1574 /**
1575  * This method pops the topmost context off the stack,
1576  * returning it to the user. The next context  (if any)
1577  * becomes the current context.
1578  * @update      gess7/22/98
1579  * @return  prev. context
1580  */
1581 CParserContext*
1582 nsParser::PopContext()
1583 {
1584   CParserContext* oldContext = mParserContext;
1585   if (oldContext) {
1586     mParserContext = oldContext->mPrevContext;
1587     if (mParserContext) {
1588       // If the old context was blocked, propagate the blocked state
1589       // back to the new one. Also, propagate the stream listener state
1590       // but don't override onStop state to guarantee the call to DidBuildModel().
1591       if (mParserContext->mStreamListenerState != eOnStop) {
1592         mParserContext->mStreamListenerState = oldContext->mStreamListenerState;
1593       }
1594       // Update the current context's tokenizer to any information gleaned
1595       // while parsing document.write() calls (such as "a plaintext tag was
1596       // found")
1597       if (mParserContext->mTokenizer) {
1598         mParserContext->mTokenizer->CopyState(oldContext->mTokenizer);
1599       }
1600     }
1601   }
1602   return oldContext;
1603 }
1604
1605 /**
1606  *  Call this when you want control whether or not the parser will parse
1607  *  and tokenize input (TRUE), or whether it just caches input to be
1608  *  parsed later (FALSE).
1609  *
1610  *  @param   aState determines whether we parse/tokenize or just cache.
1611  *  @return  current state
1612  */
1613 void
1614 nsParser::SetUnusedInput(nsString& aBuffer)
1615 {
1616   mUnusedInput = aBuffer;
1617 }
1618
1619 NS_IMETHODIMP_(void *)
1620 nsParser::GetRootContextKey()
1621 {
1622   CParserContext* pc = mParserContext;
1623   if (!pc) {
1624     return nsnull;
1625   }
1626
1627   while (pc->mPrevContext) {
1628     pc = pc->mPrevContext;
1629   }
1630
1631   return pc->mKey;
1632 }
1633
1634 /**
1635  *  Call this when you want to *force* the parser to terminate the
1636  *  parsing process altogether. This is binary -- so once you terminate
1637  *  you can't resume without restarting altogether.
1638  */
1639 NS_IMETHODIMP
1640 nsParser::Terminate(void)
1641 {
1642   // We should only call DidBuildModel once, so don't do anything if this is
1643   // the second time that Terminate has been called.
1644   if (mInternalState == NS_ERROR_HTMLPARSER_STOPPARSING) {
1645     return NS_OK;
1646   }
1647
1648   nsresult result = NS_OK;
1649   // XXX - [ until we figure out a way to break parser-sink circularity ]
1650   // Hack - Hold a reference until we are completely done...
1651   nsCOMPtr<nsIParser> kungFuDeathGrip(this);
1652   mInternalState = result = NS_ERROR_HTMLPARSER_STOPPARSING;
1653
1654   // CancelParsingEvents must be called to avoid leaking the nsParser object
1655   // @see bug 108049
1656   // If NS_PARSER_FLAG_PENDING_CONTINUE_EVENT is set then CancelParsingEvents
1657   // will reset it so DidBuildModel will call DidBuildModel on the DTD. Note:
1658   // The IsComplete() call inside of DidBuildModel looks at the pendingContinueEvents flag.
1659   CancelParsingEvents();
1660   if (mSpeculativeScriptThread) {
1661     mSpeculativeScriptThread->Terminate();
1662     mSpeculativeScriptThread = nsnull;
1663   }
1664
1665   // If we got interrupted in the middle of a document.write, then we might
1666   // have more than one parser context on our parsercontext stack. This has
1667   // the effect of making DidBuildModel a no-op, meaning that we never call
1668   // our sink's DidBuildModel and break the reference cycle, causing a leak.
1669   // Since we're getting terminated, we manually clean up our context stack.
1670   while (mParserContext && mParserContext->mPrevContext) {
1671     CParserContext *prev = mParserContext->mPrevContext;
1672     NS_ASSERTION(prev->mPrevContext || prev->mDTD, "How is there no root DTD?");
1673
1674     delete mParserContext;
1675     mParserContext = prev;
1676   }
1677
1678   if (mParserContext && mParserContext->mDTD) {
1679     mParserContext->mDTD->Terminate();
1680     DidBuildModel(result);
1681   } else if (mSink) {
1682     // We have no parser context or no DTD yet (so we got terminated before we
1683     // got any data).  Manually break the reference cycle with the sink.
1684     result = mSink->DidBuildModel();
1685     NS_ENSURE_SUCCESS(result, result);
1686   }
1687
1688   return NS_OK;
1689 }
1690
1691 NS_IMETHODIMP
1692 nsParser::ContinueParsing()
1693 {
1694   if (mFlags & NS_PARSER_FLAG_PARSER_ENABLED) {
1695     NS_WARNING("Trying to continue parsing on a unblocked parser.");
1696     return NS_OK;
1697   }
1698
1699   mFlags |= NS_PARSER_FLAG_PARSER_ENABLED;
1700
1701   return ContinueInterruptedParsing();
1702 }
1703
1704 NS_IMETHODIMP
1705 nsParser::ContinueInterruptedParsing()
1706 {
1707   // If there are scripts executing, then the content sink is jumping the gun
1708   // (probably due to a synchronous XMLHttpRequest) and will re-enable us
1709   // later, see bug 460706.
1710   if (mScriptsExecuting) {
1711     return NS_OK;
1712   }
1713
1714   // If the stream has already finished, there's a good chance
1715   // that we might start closing things down when the parser
1716   // is reenabled. To make sure that we're not deleted across
1717   // the reenabling process, hold a reference to ourselves.
1718   nsresult result=NS_OK;
1719   nsCOMPtr<nsIParser> kungFuDeathGrip(this);
1720
1721 #ifdef DEBUG
1722   if (!(mFlags & NS_PARSER_FLAG_PARSER_ENABLED)) {
1723     NS_WARNING("Don't call ContinueInterruptedParsing on a blocked parser.");
1724   }
1725 #endif
1726
1727   if (mSpeculativeScriptThread) {
1728     mSpeculativeScriptThread->StopParsing(PR_FALSE);
1729   }
1730
1731   PRBool isFinalChunk = mParserContext &&
1732                         mParserContext->mStreamListenerState == eOnStop;
1733
1734   if (mSink) {
1735     mSink->WillParse();
1736   }
1737   result = ResumeParse(PR_TRUE, isFinalChunk); // Ref. bug 57999
1738
1739   if (result != NS_OK) {
1740     result=mInternalState;
1741   }
1742
1743   return result;
1744 }
1745
1746 /**
1747  *  Stops parsing temporarily. That's it will prevent the
1748  *  parser from building up content model.
1749  */
1750 NS_IMETHODIMP_(void)
1751 nsParser::BlockParser()
1752 {
1753   mFlags &= ~NS_PARSER_FLAG_PARSER_ENABLED;
1754   MOZ_TIMER_DEBUGLOG(("Stop: Parse Time: nsParser::BlockParser(), this=%p\n", this));
1755   MOZ_TIMER_STOP(mParseTime);
1756 }
1757
1758 /**
1759  *  Open up the parser for tokenization, building up content
1760  *  model..etc. However, this method does not resume parsing
1761  *  automatically. It's the callers' responsibility to restart
1762  *  the parsing engine.
1763  */
1764 NS_IMETHODIMP_(void)
1765 nsParser::UnblockParser()
1766 {
1767   if (!(mFlags & NS_PARSER_FLAG_PARSER_ENABLED)) {
1768     mFlags |= NS_PARSER_FLAG_PARSER_ENABLED;
1769     MOZ_TIMER_DEBUGLOG(("Start: Parse Time: nsParser::UnblockParser(), this=%p\n", this));
1770     MOZ_TIMER_START(mParseTime);
1771   } else {
1772     NS_WARNING("Trying to unblock an unblocked parser.");
1773   }
1774 }
1775
1776 /**
1777  * Call this to query whether the parser is enabled or not.
1778  */
1779 NS_IMETHODIMP_(PRBool)
1780 nsParser::IsParserEnabled()
1781 {
1782   return (mFlags & NS_PARSER_FLAG_PARSER_ENABLED) != 0;
1783 }
1784
1785 /**
1786  * Call this to query whether the parser thinks it's done with parsing.
1787  */
1788 NS_IMETHODIMP_(PRBool)
1789 nsParser::IsComplete()
1790 {
1791   return !(mFlags & NS_PARSER_FLAG_PENDING_CONTINUE_EVENT);
1792 }
1793
1794
1795 void nsParser::HandleParserContinueEvent(nsParserContinueEvent *ev)
1796 {
1797   // Ignore any revoked continue events...
1798   if (mContinueEvent != ev)
1799     return;
1800
1801   mFlags &= ~NS_PARSER_FLAG_PENDING_CONTINUE_EVENT;
1802   mContinueEvent = nsnull;
1803
1804   NS_ASSERTION(mScriptsExecuting == 0, "Interrupted in the middle of a script?");
1805   ContinueInterruptedParsing();
1806 }
1807
1808 void
1809 nsParser::ScriptExecuting()
1810 {
1811   ++mScriptsExecuting;
1812 }
1813
1814 void
1815 nsParser::ScriptDidExecute()
1816 {
1817   NS_ASSERTION(mScriptsExecuting > 0, "Too many calls to ScriptDidExecute");
1818   --mScriptsExecuting;
1819 }
1820
1821 nsresult
1822 nsParser::DataAdded(const nsSubstring& aData, nsIRequest *aRequest)
1823 {
1824   NS_ASSERTION(sParserDataListeners,
1825                "Don't call this with no parser data listeners!");
1826
1827   if (!mSink || !aRequest) {
1828     return NS_OK;
1829   }
1830
1831   nsISupports *ctx = mSink->GetTarget();
1832   PRInt32 count = sParserDataListeners->Count();
1833   nsresult rv = NS_OK;
1834   PRBool canceled = PR_FALSE;
1835
1836   while (count--) {
1837     rv |= sParserDataListeners->ObjectAt(count)->
1838       OnUnicharDataAvailable(aRequest, ctx, aData);
1839
1840     if (NS_FAILED(rv) && !canceled) {
1841       aRequest->Cancel(rv);
1842
1843       canceled = PR_TRUE;
1844     }
1845   }
1846
1847   return rv;
1848 }
1849
1850 PRBool
1851 nsParser::CanInterrupt()
1852 {
1853   return (mFlags & NS_PARSER_FLAG_CAN_INTERRUPT) != 0;
1854 }
1855
1856 void
1857 nsParser::SetCanInterrupt(PRBool aCanInterrupt)
1858 {
1859   if (aCanInterrupt) {
1860     mFlags |= NS_PARSER_FLAG_CAN_INTERRUPT;
1861   } else {
1862     mFlags &= ~NS_PARSER_FLAG_CAN_INTERRUPT;
1863   }
1864 }
1865
1866 /**
1867  *  This is the main controlling routine in the parsing process.
1868  *  Note that it may get called multiple times for the same scanner,
1869  *  since this is a pushed based system, and all the tokens may
1870  *  not have been consumed by the scanner during a given invocation
1871  *  of this method.
1872  */
1873 NS_IMETHODIMP
1874 nsParser::Parse(nsIURI* aURL,
1875                 nsIRequestObserver* aListener,
1876                 void* aKey,
1877                 nsDTDMode aMode)
1878 {
1879
1880   NS_PRECONDITION(aURL, "Error: Null URL given");
1881   NS_ASSERTION(!mSpeculativeScriptThread, "Can't reuse a parser like this");
1882
1883   nsresult result=kBadURL;
1884   mObserver = aListener;
1885
1886   if (aURL) {
1887     nsCAutoString spec;
1888     nsresult rv = aURL->GetSpec(spec);
1889     if (rv != NS_OK) {
1890       return rv;
1891     }
1892     NS_ConvertUTF8toUTF16 theName(spec);
1893
1894     nsScanner* theScanner = new nsScanner(theName, PR_FALSE, mCharset,
1895                                           mCharsetSource);
1896     CParserContext* pc = new CParserContext(theScanner, aKey, mCommand,
1897                                             aListener);
1898     if (pc && theScanner) {
1899       pc->mMultipart = PR_TRUE;
1900       pc->mContextType = CParserContext::eCTURL;
1901       pc->mDTDMode = aMode;
1902       PushContext(*pc);
1903
1904       // Here, and only here, hand this parser off to the scanner. We
1905       // only want to do that here since the only reason the scanner
1906       // needs the parser is to call DataAdded() on it, and that's
1907       // only ever wanted when parsing from an URI.
1908       theScanner->SetParser(this);
1909
1910       result = NS_OK;
1911     } else {
1912       result = mInternalState = NS_ERROR_HTMLPARSER_BADCONTEXT;
1913     }
1914   }
1915   return result;
1916 }
1917
1918 /**
1919  * Call this method if all you want to do is parse 1 string full of HTML text.
1920  * In particular, this method should be called by the DOM when it has an HTML
1921  * string to feed to the parser in real-time.
1922  *
1923  * @param   aSourceBuffer contains a string-full of real content
1924  * @param   aMimeType tells us what type of content to expect in the given string
1925  */
1926 NS_IMETHODIMP
1927 nsParser::Parse(const nsAString& aSourceBuffer,
1928                 void* aKey,
1929                 const nsACString& aMimeType,
1930                 PRBool aLastCall,
1931                 nsDTDMode aMode)
1932 {
1933   nsresult result = NS_OK;
1934
1935   // Don't bother if we're never going to parse this.
1936   if (mInternalState == NS_ERROR_HTMLPARSER_STOPPARSING) {
1937     return result;
1938   }
1939
1940   if (!aLastCall && aSourceBuffer.IsEmpty()) {
1941     // Nothing is being passed to the parser so return
1942     // immediately. mUnusedInput will get processed when
1943     // some data is actually passed in.
1944     // But if this is the last call, make sure to finish up
1945     // stuff correctly.
1946     return result;
1947   }
1948
1949   if (mSpeculativeScriptThread) {
1950     mSpeculativeScriptThread->StopParsing(PR_TRUE);
1951   }
1952
1953   // Hack to pass on to the dtd the caller's desire to
1954   // parse a fragment without worrying about containment rules
1955   if (aMode == eDTDMode_fragment)
1956     mCommand = eViewFragment;
1957
1958   // Maintain a reference to ourselves so we don't go away
1959   // till we're completely done.
1960   nsCOMPtr<nsIParser> kungFuDeathGrip(this);
1961
1962   if (aLastCall || !aSourceBuffer.IsEmpty() || !mUnusedInput.IsEmpty()) {
1963     // Note: The following code will always find the parser context associated
1964     // with the given key, even if that context has been suspended (e.g., for
1965     // another document.write call). This doesn't appear to be exactly what IE
1966     // does in the case where this happens, but this makes more sense.
1967     CParserContext* pc = mParserContext;
1968     while (pc && pc->mKey != aKey) {
1969       pc = pc->mPrevContext;
1970     }
1971
1972     if (!pc) {
1973       // Only make a new context if we don't have one, OR if we do, but has a
1974       // different context key.
1975       nsScanner* theScanner = new nsScanner(mUnusedInput, mCharset, mCharsetSource);
1976       NS_ENSURE_TRUE(theScanner, NS_ERROR_OUT_OF_MEMORY);
1977
1978       nsIDTD *theDTD = nsnull;
1979       eAutoDetectResult theStatus = eUnknownDetect;
1980
1981       if (mParserContext && mParserContext->mMimeType == aMimeType) {
1982         // Ref. Bug 90379
1983         NS_ASSERTION(mParserContext->mDTD, "How come the DTD is null?");
1984
1985         if (mParserContext) {
1986           // To fix bug 32263 we used create a new instance of the DTD!.
1987           // All we need is a new tokenizer which now gets created with
1988           // a parser context.
1989           theDTD = mParserContext->mDTD;
1990           theStatus = mParserContext->mAutoDetectStatus;
1991           // Added this to fix bug 32022.
1992         }
1993       }
1994
1995       pc = new CParserContext(theScanner, aKey, mCommand,
1996                               0, theDTD, theStatus, aLastCall);
1997       NS_ENSURE_TRUE(pc, NS_ERROR_OUT_OF_MEMORY);
1998
1999       PushContext(*pc);
2000
2001       pc->mMultipart = !aLastCall; // By default
2002       if (pc->mPrevContext) {
2003         pc->mMultipart |= pc->mPrevContext->mMultipart;
2004       }
2005
2006       // Start fix bug 40143
2007       if (pc->mMultipart) {
2008         pc->mStreamListenerState = eOnDataAvail;
2009         if (pc->mScanner) {
2010           pc->mScanner->SetIncremental(PR_TRUE);
2011         }
2012       } else {
2013         pc->mStreamListenerState = eOnStop;
2014         if (pc->mScanner) {
2015           pc->mScanner->SetIncremental(PR_FALSE);
2016         }
2017       }
2018       // end fix for 40143
2019
2020       pc->mContextType=CParserContext::eCTString;
2021       pc->SetMimeType(aMimeType);
2022       if (pc->mPrevContext && aMode == eDTDMode_autodetect) {
2023         // Preserve the DTD mode from the last context, bug 265814.
2024         pc->mDTDMode = pc->mPrevContext->mDTDMode;
2025       } else {
2026         pc->mDTDMode = aMode;
2027       }
2028
2029       mUnusedInput.Truncate();
2030
2031       pc->mScanner->Append(aSourceBuffer);
2032       // Do not interrupt document.write() - bug 95487
2033       result = ResumeParse(PR_FALSE, PR_FALSE, PR_FALSE);
2034     } else {
2035       pc->mScanner->Append(aSourceBuffer);
2036       if (!pc->mPrevContext) {
2037         // Set stream listener state to eOnStop, on the final context - Fix 68160,
2038         // to guarantee DidBuildModel() call - Fix 36148
2039         if (aLastCall) {
2040           pc->mStreamListenerState = eOnStop;
2041           pc->mScanner->SetIncremental(PR_FALSE);
2042         }
2043
2044         if (pc == mParserContext) {
2045           // If pc is not mParserContext, then this call to ResumeParse would
2046           // do the wrong thing and try to continue parsing using
2047           // mParserContext. We need to wait to actually resume parsing on pc.
2048           ResumeParse(PR_FALSE, PR_FALSE, PR_FALSE);
2049         }
2050       }
2051     }
2052   }
2053
2054   return result;
2055 }
2056
/**
 * Parses aSourceBuffer as a fragment inside the context described by
 * aTagStack.
 *
 * The tag stack is first turned into a string of start tags (walking
 * aTagStack from its last element to its first) and parsed with aLastCall
 * set to PR_FALSE. The actual content is then parsed under the same context
 * key. In XML mode with a non-empty tag stack, matching end tags are
 * generated from aTagStack and parsed as the final chunk.
 *
 * NS_PARSER_FLAG_OBSERVERS_ENABLED is cleared on entry and set again on
 * every return path.
 *
 * @param aSourceBuffer the fragment's markup
 * @param aKey          not referenced in this method; the address of the
 *                      local context string serves as the parse key
 * @param aTagStack     the context tags; for the generated end tags, each
 *                      entry is cut at its first space
 * @param aXMLMode      PR_TRUE to take the XML path (explicit end tags)
 * @param aMimeType     forwarded to Parse()
 * @param aMode         forwarded to Parse()
 * @return the result of the last Parse() or BuildModel() call that was made
 */
NS_IMETHODIMP
nsParser::ParseFragment(const nsAString& aSourceBuffer,
                        void* aKey,
                        nsTArray<nsString>& aTagStack,
                        PRBool aXMLMode,
                        const nsACString& aMimeType,
                        nsDTDMode aMode)
{
  nsresult result = NS_OK;
  nsAutoString  theContext;
  PRUint32 theCount = aTagStack.Length();
  PRUint32 theIndex = 0;

  // Disable observers for fragments
  mFlags &= ~NS_PARSER_FLAG_OBSERVERS_ENABLED;

  NS_ASSERTION(!mSpeculativeScriptThread, "Can't reuse a parser like this");

  // Build "<tagN>...<tag1><tag0>": the last stack entry is emitted first.
  for (theIndex = 0; theIndex < theCount; theIndex++) {
    theContext.AppendLiteral("<");
    theContext.Append(aTagStack[theCount - theIndex - 1]);
    theContext.AppendLiteral(">");
  }

  if (theCount == 0) {
    // Ensure that the buffer is not empty. Because none of the DTDs care
    // about leading whitespace, this doesn't change the result.
    theContext.AssignLiteral(" ");
  }

  // First, parse the context to build up the DTD's tag stack. Note that we
  // pass PR_FALSE for the aLastCall parameter.
  result = Parse(theContext, (void*)&theContext, aMimeType, PR_FALSE, aMode);
  if (NS_FAILED(result)) {
    mFlags |= NS_PARSER_FLAG_OBSERVERS_ENABLED;
    return result;
  }

  // NOTE(review): fragSink is only asserted to be non-null; it is
  // dereferenced below without a runtime check.
  nsCOMPtr<nsIFragmentContentSink> fragSink = do_QueryInterface(mSink);
  NS_ASSERTION(fragSink, "ParseFragment requires a fragment content sink");

  if (!aXMLMode && theCount) {
    // First, we have to flush any tags that don't belong in the head if there
    // was no <body> in the context.
    // XXX This is extremely ugly. Maybe CNavDTD should have FlushMisplaced()?
    NS_ASSERTION(mParserContext, "Parsing didn't create a parser context?");

    // NOTE(review): this unchecked downcast presumes the context's DTD is a
    // CNavDTD on the non-XML path -- confirm.
    CNavDTD* dtd = static_cast<CNavDTD*>
                              (static_cast<nsIDTD*>
                                          (mParserContext->mDTD));
    NS_ASSERTION(dtd, "How did we parse anything without a dtd?");

    // Open a synthetic <body> container on the DTD.
    CStartToken bodyToken(NS_LITERAL_STRING("BODY"), eHTMLTag_body);
    nsCParserNode bodyNode(&bodyToken, 0);

    dtd->OpenContainer(&bodyNode, eHTMLTag_body);

    // Now parse the flushed out tags.
    result = BuildModel();
    if (NS_FAILED(result)) {
      mFlags |= NS_PARSER_FLAG_OBSERVERS_ENABLED;
      return result;
    }

    // Now that we've flushed all of the tags out of the body, we have to make
    // sure that there aren't any context tags left in the scanner.
    NS_ASSERTION(mParserContext->mScanner, "Where'd the scanner go?");

    PRUnichar next;
    if (NS_SUCCEEDED(mParserContext->mScanner->Peek(next))) {
      // Uh, oh. This must mean that the context stack has a special tag on
      // it, such as <textarea> or <title> that requires its end tag before it
      // will be consumed. Tell the content sink that it will be coming.
      // Note: For now, we can assume that there is only one such tag.
      NS_ASSERTION(next == '<', "The tokenizer failed to consume a token");
      fragSink->IgnoreFirstContainer();
    }
  }

  fragSink->WillBuildContent();
  // Now, parse the actual content. Note that this is the last call
  // for HTML content, but for XML, we will want to build and parse
  // the end tags.  However, if tagStack is empty, it's the last call
  // for XML as well.
  if (!aXMLMode || (theCount == 0)) {
    result = Parse(aSourceBuffer, &theContext, aMimeType,
                   PR_TRUE, aMode);
    fragSink->DidBuildContent();
  } else {
    // Add an end tag chunk, so expat will read the whole source buffer,
    // and not worry about ']]' etc.
    result = Parse(aSourceBuffer + NS_LITERAL_STRING("</"),
                   &theContext, aMimeType, PR_FALSE, aMode);
    fragSink->DidBuildContent();

    if (NS_SUCCEEDED(result)) {
      // Build the closing tags in stack order: "tag0></tag1>...</tagN>".
      nsAutoString endContext;
      for (theIndex = 0; theIndex < theCount; theIndex++) {
        // The "</" of the first end tag was already appended to the source
        // buffer chunk above.
        if (theIndex > 0) {
          endContext.AppendLiteral("</");
        }

        nsString& thisTag = aTagStack[theIndex];
        // was there an xmlns=?
        // Only the part before the first space (the tag name) is used.
        PRInt32 endOfTag = thisTag.FindChar(PRUnichar(' '));
        if (endOfTag == -1) {
          endContext.Append(thisTag);
        } else {
          endContext.Append(Substring(thisTag,0,endOfTag));
        }

        endContext.AppendLiteral(">");
      }

      // The end tags are the final chunk for this context.
      result = Parse(endContext, &theContext, aMimeType,
                     PR_TRUE, aMode);
    }
  }

  mFlags |= NS_PARSER_FLAG_OBSERVERS_ENABLED;

  return result;
}
2181
/**
 *  This routine is called to cause the parser to continue parsing its
 *  underlying stream.  This call allows the parse process to happen in
 *  chunks, such as when the content is push based, and we need to parse in
 *  pieces.
 *
 *  An interesting change in how the parser gets used has led us to add extra
 *  processing to this method.  The case occurs when the parser is blocked in
 *  one context, and gets a parse(string) call in another context.  In this
 *  case, the parserContexts are linked. No problem.
 *
 *  The problem is that Parse(string) assumes that it can proceed unabated,
 *  but if the parser is already blocked that assumption is false. So we
 *  needed to add a mechanism here to allow the parser to continue to process
 *  (the pop and free) contexts until 1) it gets blocked again; 2) it runs
 *  out of contexts.
 *
 *  Nothing is parsed unless the parser is enabled and has not been told to
 *  stop parsing.
 *
 *  @param   allowIteration : set to true if non-script resumption is requested
 *  @param   aIsFinalChunk : tells us when the last chunk of data is provided.
 *  @param   aCanInterrupt : passed to SetCanInterrupt() for the duration of
 *                           each tokenize/build-model pass
 *  @return  error code -- 0 if ok, non-zero if error.
 *           NS_ERROR_HTMLPARSER_INTERRUPTED is reported as NS_OK.
 */
nsresult
nsParser::ResumeParse(PRBool allowIteration, PRBool aIsFinalChunk,
                      PRBool aCanInterrupt)
{
  nsresult result = NS_OK;

  if ((mFlags & NS_PARSER_FLAG_PARSER_ENABLED) &&
      mInternalState != NS_ERROR_HTMLPARSER_STOPPARSING) {
    MOZ_TIMER_DEBUGLOG(("Start: Parse Time: nsParser::ResumeParse(), this=%p\n", this));
    MOZ_TIMER_START(mParseTime);

    NS_ASSERTION(!mSpeculativeScriptThread || !mSpeculativeScriptThread->Parsing(),
                 "Bad races happening, expect to crash!");

    // NOTE(review): mParserContext and its scanner are dereferenced here
    // without a null check.
    result = WillBuildModel(mParserContext->mScanner->GetFilename());
    if (NS_FAILED(result)) {
      mFlags &= ~NS_PARSER_FLAG_CAN_TOKENIZE;
      return result;
    }

    if (mParserContext->mDTD) {
      mParserContext->mDTD->WillResumeParse(mSink);
      PRBool theIterationIsOk = PR_TRUE;

      // Each pass tokenizes what it can and feeds it to the DTD. The loop
      // ends when the scanner hits kEOF, the build is interrupted, an
      // error occurs, or one of the early returns below fires.
      while (result == NS_OK && theIterationIsOk) {
        if (!mUnusedInput.IsEmpty() && mParserContext->mScanner) {
          // -- Ref: Bug# 22485 --
          // Insert the unused input into the source buffer
          // as if it was read from the input stream.
          // Adding UngetReadable() per vidur!!
          mParserContext->mScanner->UngetReadable(mUnusedInput);
          mUnusedInput.Truncate(0);
        }

        // Only allow parsing to be interrupted in the subsequent call to
        // build model.
        SetCanInterrupt(aCanInterrupt);
        nsresult theTokenizerResult = (mFlags & NS_PARSER_FLAG_CAN_TOKENIZE)
                                      ? Tokenize(aIsFinalChunk)
                                      : NS_OK;
        result = BuildModel();

        // An interrupted final chunk posts a continue event.
        if (result == NS_ERROR_HTMLPARSER_INTERRUPTED && aIsFinalChunk) {
          PostContinueEvent();
        }
        SetCanInterrupt(PR_FALSE);

        theIterationIsOk = theTokenizerResult != kEOF &&
                           result != NS_ERROR_HTMLPARSER_INTERRUPTED;

        // Make sure not to stop parsing too early. Therefore, before shutting
        // down the parser, it's important to check whether the input buffer
        // has been scanned to completion (theTokenizerResult should be kEOF).
        // kEOF -> End of buffer.

        // If we're told to block the parser, we disable all further parsing
        // (and cache any data coming in) until the parser is re-enabled.
        if (NS_ERROR_HTMLPARSER_BLOCK == result) {
          if (mParserContext->mDTD) {
            mParserContext->mDTD->WillInterruptParse(mSink);
          }

          if (mFlags & NS_PARSER_FLAG_PARSER_ENABLED) {
            // If we were blocked by a recursive invocation, don't re-block.
            BlockParser();
            SpeculativelyParse();
          }
          return NS_OK;
        }
        if (NS_ERROR_HTMLPARSER_STOPPARSING == result) {
          // Note: Parser Terminate() calls DidBuildModel.
          if (mInternalState != NS_ERROR_HTMLPARSER_STOPPARSING) {
            DidBuildModel(mStreamStatus);
            mInternalState = result;
          }

          return NS_OK;
        }
        // The current buffer is exhausted (or the build was interrupted):
        // decide whether this context is finished.
        if ((NS_OK == result && theTokenizerResult == kEOF) ||
             result == NS_ERROR_HTMLPARSER_INTERRUPTED) {
          PRBool theContextIsStringBased =
            CParserContext::eCTString == mParserContext->mContextType;

          if (mParserContext->mStreamListenerState == eOnStop ||
              !mParserContext->mMultipart || theContextIsStringBased) {
            if (!mParserContext->mPrevContext) {
              // Root context: finish the model once the stream has stopped.
              if (mParserContext->mStreamListenerState == eOnStop) {
                DidBuildModel(mStreamStatus);

                MOZ_TIMER_DEBUGLOG(("Stop: Parse Time: nsParser::ResumeParse(), this=%p\n", this));
                MOZ_TIMER_STOP(mParseTime);

                MOZ_TIMER_LOG(("Parse Time (this=%p): ", this));
                MOZ_TIMER_PRINT(mParseTime);

                MOZ_TIMER_LOG(("DTD Time: "));
                MOZ_TIMER_PRINT(mDTDTime);

                MOZ_TIMER_LOG(("Tokenize Time: "));
                MOZ_TIMER_PRINT(mTokenizeTime);

                return NS_OK;
              }
            } else {
              // Nested context: pop it, keep its unused data if requested,
              // and carry on with the context underneath.
              CParserContext* theContext = PopContext();
              if (theContext) {
                theIterationIsOk = allowIteration && theContextIsStringBased;
                if (theContext->mCopyUnused) {
                  theContext->mScanner->CopyUnusedData(mUnusedInput);
                }

                delete theContext;
              }

              result = mInternalState;
              // Recompute "final chunk" for the context that is now current.
              aIsFinalChunk = mParserContext &&
                              mParserContext->mStreamListenerState == eOnStop;
              // ...then intentionally fall through to WillInterruptParse()...
            }
          }
        }

        if (theTokenizerResult == kEOF ||
            result == NS_ERROR_HTMLPARSER_INTERRUPTED) {
          // An interruption is not reported as an error.
          result = (result == NS_ERROR_HTMLPARSER_INTERRUPTED) ? NS_OK : result;
          if (mParserContext->mDTD) {
            mParserContext->mDTD->WillInterruptParse(mSink);
          }
        }
      }
    } else {
      // No DTD on the current context.
      mInternalState = result = NS_ERROR_HTMLPARSER_UNRESOLVEDDTD;
    }
  }

  MOZ_TIMER_DEBUGLOG(("Stop: Parse Time: nsParser::ResumeParse(), this=%p\n", this));
  MOZ_TIMER_STOP(mParseTime);

  return (result == NS_ERROR_HTMLPARSER_INTERRUPTED) ? NS_OK : result;
}
2344
2345 /**
2346  *  This is where we loop over the tokens created in the
2347  *  tokenization phase, and try to make sense out of them.
2348  */
2349 nsresult
2350 nsParser::BuildModel()
2351 {
2352   CParserContext* theRootContext = mParserContext;
2353   nsITokenizer*   theTokenizer = nsnull;
2354
2355   nsresult result = NS_OK;
2356   if (mParserContext) {
2357     PRInt32 type = mParserContext->mDTD ? mParserContext->mDTD->GetType() :
2358                                           NS_IPARSER_FLAG_HTML;
2359     result = mParserContext->GetTokenizer(type, mSink, theTokenizer);
2360   }
2361
2362   if (NS_SUCCEEDED(result)) {
2363     // Get the root DTD for use in model building...
2364     while (theRootContext->mPrevContext) {
2365       theRootContext = theRootContext->mPrevContext;
2366     }
2367
2368     nsIDTD* theRootDTD = theRootContext->mDTD;
2369     if (theRootDTD) {
2370       MOZ_TIMER_START(mDTDTime);
2371       result = theRootDTD->BuildModel(this, theTokenizer, nsnull, mSink);
2372       MOZ_TIMER_STOP(mDTDTime);
2373     }
2374   } else {
2375     mInternalState = result = NS_ERROR_HTMLPARSER_BADTOKENIZER;
2376   }
2377   return result;
2378 }
2379
2380 /*******************************************************************
2381   These methods are used to talk to the netlib system...
2382  *******************************************************************/
2383
2384 nsresult
2385 nsParser::OnStartRequest(nsIRequest *request, nsISupports* aContext)
2386 {
2387   NS_PRECONDITION(eNone == mParserContext->mStreamListenerState,
2388                   "Parser's nsIStreamListener API was not setup "
2389                   "correctly in constructor.");
2390   if (mObserver) {
2391     mObserver->OnStartRequest(request, aContext);
2392   }
2393   mParserContext->mStreamListenerState = eOnStart;
2394   mParserContext->mAutoDetectStatus = eUnknownDetect;
2395   mParserContext->mDTD = nsnull;
2396   mParserContext->mRequest = request;
2397
2398   nsresult rv;
2399   nsCAutoString contentType;
2400   nsCOMPtr<nsIChannel> channel = do_QueryInterface(request);
2401   if (channel) {
2402     rv = channel->GetContentType(contentType);
2403     if (NS_SUCCEEDED(rv)) {
2404       mParserContext->SetMimeType(contentType);
2405     }
2406   }
2407
2408   rv = NS_OK;
2409
2410   if (sParserDataListeners && mSink) {
2411     nsISupports *ctx = mSink->GetTarget();
2412     PRInt32 count = sParserDataListeners->Count();
2413
2414     while (count--) {
2415       rv |= sParserDataListeners->ObjectAt(count)->
2416               OnStartRequest(request, ctx);
2417     }
2418   }
2419
2420   return rv;
2421 }
2422
2423
// Charset names used when guessing a stream's encoding from its leading
// bytes (see DetectByteOrderMark below).
#define UTF16_BOM "UTF-16"
#define UTF16_BE "UTF-16BE"
#define UTF16_LE "UTF-16LE"
#define UCS4_BOM "UTF-32"
#define UCS4_BE "UTF-32BE"
#define UCS4_LE "UTF-32LE"
// UCS-4 in the unusual octet orders 2143 and 3412.
#define UCS4_2143 "X-ISO-10646-UCS-4-2143"
#define UCS4_3412 "X-ISO-10646-UCS-4-3412"
#define UTF8 "UTF-8"
2433
2434 static inline PRBool IsSecondMarker(unsigned char aChar)
2435 {
2436   switch (aChar) {
2437     case '!':
2438     case '?':
2439     case 'h':
2440     case 'H':
2441       return PR_TRUE;
2442     default:
2443       return PR_FALSE;
2444   }
2445 }
2446
2447 static PRBool
2448 DetectByteOrderMark(const unsigned char* aBytes, PRInt32 aLen,
2449                     nsCString& oCharset, PRInt32& oCharsetSource)
2450 {
2451  oCharsetSource= kCharsetFromAutoDetection;
2452  oCharset.Truncate();
2453  // See http://www.w3.org/TR/2000/REC-xml-20001006#sec-guessing
2454  // for details
2455  // Also, MS Win2K notepad now generate 3 bytes BOM in UTF8 as UTF8 signature
2456  // We need to check that
2457  // UCS2 BOM FEFF = UTF8 EF BB BF
2458  switch(aBytes[0])
2459          {
2460    case 0x00:
2461      if(0x00==aBytes[1]) {
2462         // 00 00
2463         if((0xFE==aBytes[2]) && (0xFF==aBytes[3])) {
2464            // 00 00 FE FF UCS-4, big-endian machine (1234 order)
2465            oCharset.Assign(UCS4_BOM);
2466         } else if((0x00==aBytes[2]) && (0x3C==aBytes[3])) {
2467            // 00 00 00 3C UCS-4, big-endian machine (1234 order)
2468            oCharset.Assign(UCS4_BE);
2469         } else if((0xFF==aBytes[2]) && (0xFE==aBytes[3])) {
2470            // 00 00 FF FE UCS-4, unusual octet order (2143)
2471            oCharset.Assign(UCS4_2143);
2472         } else if((0x3C==aBytes[2]) && (0x00==aBytes[3])) {
2473            // 00 00 3C 00 UCS-4, unusual octet order (2143)
2474            oCharset.Assign(UCS4_2143);
2475         }
2476         oCharsetSource = kCharsetFromByteOrderMark;
2477      } else if((0x3C==aBytes[1]) && (0x00==aBytes[2])) {
2478         // 00 3C 00
2479         if(IsSecondMarker(aBytes[3])) {
2480            // 00 3C 00 SM UTF-16,  big-endian, no Byte Order Mark
2481            oCharset.Assign(UTF16_BE);
2482         } else if((0x00==aBytes[3])) {
2483            // 00 3C 00 00 UCS-4, unusual octet order (3412)
2484            oCharset.Assign(UCS4_3412);
2485         }
2486         oCharsetSource = kCharsetFromByteOrderMark;
2487      }
2488    break;
2489    case 0x3C:
2490      if(0x00==aBytes[1] && (0x00==aBytes[3])) {
2491         // 3C 00 XX 00
2492         if(IsSecondMarker(aBytes[2])) {
2493            // 3C 00 SM 00 UTF-16,  little-endian, no Byte Order Mark
2494            oCharset.Assign(UTF16_LE);
2495         } else if((0x00==aBytes[2])) {
2496            // 3C 00 00 00 UCS-4, little-endian machine (4321 order)
2497            oCharset.Assign(UCS4_LE);
2498         }
2499         oCharsetSource = kCharsetFromByteOrderMark;
2500      // For html, meta tag detector is invoked before this so that we have
2501      // to deal only with XML here.
2502      } else if(                     (0x3F==aBytes[1]) &&
2503                (0x78==aBytes[2]) && (0x6D==aBytes[3]) &&
2504                (0 == PL_strncmp("<?xml", (char*)aBytes, 5 ))) {
2505        // 3C 3F 78 6D
2506        // ASCII characters are in their normal positions, so we can safely
2507        // deal with the XML declaration in the old C way
2508        // The shortest string so far (strlen==5):
2509        // <?xml
2510        PRInt32 i;
2511        PRBool versionFound = PR_FALSE, encodingFound = PR_FALSE;
2512        for (i=6; i < aLen && !encodingFound; ++i) {
2513          // end of XML declaration?
2514          if ((((char*)aBytes)[i] == '?') &&
2515            ((i+1) < aLen) &&
2516            (((char*)aBytes)[i+1] == '>')) {
2517            break;
2518          }
2519          // Version is required.
2520          if (!versionFound) {
2521            // Want to avoid string comparisons, hence looking for 'n'
2522            // and only if found check the string leading to it. Not
2523            // foolproof, but fast.
2524            // The shortest string allowed before this is  (strlen==13):
2525            // <?xml version
2526            if ((((char*)aBytes)[i] == 'n') &&
2527              (i >= 12) &&
2528              (0 == PL_strncmp("versio", (char*)(aBytes+i-6), 6 ))) {
2529              // Fast forward through version
2530              char q = 0;
2531              for (++i; i < aLen; ++i) {
2532                char qi = ((char*)aBytes)[i];
2533                if (qi == '\'' || qi == '"') {
2534                  if (q && q == qi) {
2535                    //  ending quote
2536                    versionFound = PR_TRUE;
2537                    break;
2538                  } else {
2539                    // Starting quote
2540                    q = qi;
2541                  }
2542                }
2543              }
2544            }
2545          } else {
2546            // encoding must follow version
2547            // Want to avoid string comparisons, hence looking for 'g'
2548            // and only if found check the string leading to it. Not
2549            // foolproof, but fast.
2550            // The shortest allowed string before this (strlen==26):
2551            // <?xml version="1" encoding
2552            if ((((char*)aBytes)[i] == 'g') &&
2553              (i >= 25) &&
2554              (0 == PL_strncmp("encodin", (char*)(aBytes+i-7), 7 ))) {
2555              PRInt32 encStart = 0;
2556              char q = 0;
2557              for (++i; i < aLen; ++i) {
2558                char qi = ((char*)aBytes)[i];
2559                if (qi == '\'' || qi == '"') {
2560                  if (q && q == qi) {
2561                    PRInt32 count = i - encStart;
2562                    // encoding value is invalid if it is UTF-16
2563                    if (count > 0 &&
2564                      (0 != PL_strcmp("UTF-16", (char*)(aBytes+encStart)))) {
2565                      oCharset.Assign((char*)(aBytes+encStart),count);
2566                      oCharsetSource = kCharsetFromMetaTag;
2567                    }
2568                    encodingFound = PR_TRUE;
2569                    break;
2570                  } else {
2571                    encStart = i+1;
2572                    q = qi;
2573                  }
2574                }
2575              }
2576            }
2577          } // if (!versionFound)
2578        } // for
2579      }
2580    break;
2581    case 0xEF:
2582      if((0xBB==aBytes[1]) && (0xBF==aBytes[2])) {
2583         // EF BB BF
2584         // Win2K UTF-8 BOM
2585         oCharset.Assign(UTF8);
2586         oCharsetSource= kCharsetFromByteOrderMark;
2587      }
2588    break;
2589    case 0xFE:
2590      if(0xFF==aBytes[1]) {
2591         if(0x00==aBytes[2] && 0x00==aBytes[3]) {
2592           // FE FF 00 00  UCS-4, unusual octet order (3412)
2593           oCharset.Assign(UCS4_3412);
2594         } else {
2595           // FE FF UTF-16, big-endian
2596           oCharset.Assign(UTF16_BOM);
2597         }
2598         oCharsetSource= kCharsetFromByteOrderMark;
2599      }
2600    break;
2601    case 0xFF:
2602      if(0xFE==aBytes[1]) {
2603         if(0x00==aBytes[2] && 0x00==aBytes[3])
2604          // FF FE 00 00  UTF-32, little-endian
2605            oCharset.Assign(UCS4_BOM);
2606         else
2607         // FF FE
2608         // UTF-16, little-endian
2609            oCharset.Assign(UTF16_BOM);
2610         oCharsetSource= kCharsetFromByteOrderMark;
2611      }
2612    break;
2613    // case 0x4C: if((0x6F==aBytes[1]) && ((0xA7==aBytes[2] && (0x94==aBytes[3])) {
2614    //   We do not care EBCIDIC here....
2615    // }
2616    // break;
2617  }  // switch
2618  return !oCharset.IsEmpty();
2619 }
2620
2621 inline const char
2622 GetNextChar(nsACString::const_iterator& aStart,
2623             nsACString::const_iterator& aEnd)
2624 {
2625   NS_ASSERTION(aStart != aEnd, "end of buffer");
2626   return (++aStart != aEnd) ? *aStart : '\0';
2627 }
2628
2629 PRBool
2630 nsParser::DetectMetaTag(const char* aBytes,
2631                         PRInt32 aLen,
2632                         nsCString& aCharset,
2633                         PRInt32& aCharsetSource)
2634 {
2635   aCharsetSource= kCharsetFromMetaTag;
2636   aCharset.SetLength(0);
2637
2638   // XXX Only look inside HTML documents for now. For XML
2639   // documents we should be looking inside the XMLDecl.
2640   if (!mParserContext->mMimeType.EqualsLiteral(kHTMLTextContentType)) {
2641     return PR_FALSE;
2642   }
2643
2644   // Fast and loose parsing to determine if we have a complete
2645   // META tag in this block, looking upto 2k into it.
2646   const nsASingleFragmentCString& str =
2647       Substring(aBytes, aBytes + PR_MIN(aLen, 2048));
2648   // XXXldb Should be const_char_iterator when FindInReadable supports it.
2649   nsACString::const_iterator begin, end;
2650
2651   str.BeginReading(begin);
2652   str.EndReading(end);
2653   nsACString::const_iterator currPos(begin);
2654   nsACString::const_iterator tokEnd;
2655   nsACString::const_iterator tagEnd(begin);
2656
2657   while (currPos != end) {
2658     if (!FindCharInReadable('<', currPos, end))
2659       break; // no tag found in this buffer
2660
2661     if (GetNextChar(currPos, end) == '!') {
2662       if (GetNextChar(currPos, end) != '-' ||
2663           GetNextChar(currPos, end) != '-') {
2664         // If we only see a <! not followed by --, just skip to the next >.
2665         if (!FindCharInReadable('>', currPos, end)) {
2666           return PR_FALSE; // No more tags to follow.
2667         }
2668
2669         // Continue searching for a meta tag following this "comment".
2670         ++currPos;
2671         continue;
2672       }
2673
2674       // Found MDO ( <!-- ). Now search for MDC ( --[*s]> )
2675       PRBool foundMDC = PR_FALSE;
2676       PRBool foundMatch = PR_FALSE;
2677       while (!foundMDC) {
2678         if (GetNextChar(currPos, end) == '-' &&
2679             GetNextChar(currPos, end) == '-') {
2680           foundMatch = !foundMatch; // toggle until we've matching "--"
2681         } else if (currPos == end) {
2682           return PR_FALSE; // Couldn't find --[*s]> in this buffer
2683         } else if (foundMatch && *currPos == '>') {
2684           foundMDC = PR_TRUE; // found comment end delimiter.
2685           ++currPos;
2686         }
2687       }
2688       continue; // continue searching for META tag.
2689     }
2690
2691     // Find the end of the tag, break if incomplete
2692     tagEnd = currPos;
2693     if (!FindCharInReadable('>', tagEnd, end))
2694       break;
2695
2696     // If this is not a META tag, continue to next loop
2697     if ( (*currPos != 'm' && *currPos != 'M') ||
2698          (*(++currPos) != 'e' && *currPos != 'E') ||
2699          (*(++currPos) != 't' && *currPos != 'T') ||
2700          (*(++currPos) != 'a' && *currPos != 'A') ||
2701          !nsCRT::IsAsciiSpace(*(++currPos))) {
2702       currPos = tagEnd;
2703       continue;
2704     }
2705
2706     // If could not find "charset" in this tag, skip this tag and try next
2707     tokEnd = tagEnd;
2708     if (!CaseInsensitiveFindInReadable(NS_LITERAL_CSTRING("CHARSET"),
2709                                        currPos, tokEnd)) {
2710       currPos = tagEnd;
2711       continue;
2712     }
2713     currPos = tokEnd;
2714
2715     // skip spaces before '='
2716     while (*currPos == kSpace || *currPos == kNewLine ||
2717            *currPos == kCR || *currPos == kTab) {
2718       ++currPos;
2719     }
2720     // skip '='
2721     if (*currPos != '=') {
2722       currPos = tagEnd;
2723       continue;
2724     }
2725     ++currPos;
2726     // skip spaces after '='
2727     while (*currPos == kSpace || *currPos == kNewLine ||
2728            *currPos == kCR || *currPos == kTab) {
2729       ++currPos;
2730     }
2731
2732     // skip open quote
2733     if (*currPos == '\'' || *currPos == '\"')
2734       ++currPos;
2735
2736     // find the end of charset string
2737     tokEnd = currPos;
2738     while (*tokEnd != '\'' && *tokEnd != '\"' && tokEnd != tagEnd)
2739       ++tokEnd;
2740
2741     // return true if we successfully got something for charset
2742     if (currPos != tokEnd) {
2743       aCharset.Assign(currPos.get(), tokEnd.get() - currPos.get());
2744       return PR_TRUE;
2745     }
2746
2747     // Nothing specified as charset, continue next loop
2748     currPos = tagEnd;
2749   }
2750
2751   return PR_FALSE;
2752 }
2753
2754 typedef struct {
2755   PRBool mNeedCharsetCheck;
2756   nsParser* mParser;
2757   nsIParserFilter* mParserFilter;
2758   nsScanner* mScanner;
2759   nsIRequest* mRequest;
2760 } ParserWriteStruct;
2761
2762 /*
2763  * This function is invoked as a result of a call to a stream's
2764  * ReadSegments() method. It is called for each contiguous buffer
2765  * of data in the underlying stream or pipe. Using ReadSegments
2766  * allows us to avoid copying data to read out of the stream.
2767  */
2768 static NS_METHOD
2769 ParserWriteFunc(nsIInputStream* in,
2770                 void* closure,
2771                 const char* fromRawSegment,
2772                 PRUint32 toOffset,
2773                 PRUint32 count,
2774                 PRUint32 *writeCount)
2775 {
2776   nsresult result;
2777   ParserWriteStruct* pws = static_cast<ParserWriteStruct*>(closure);
2778   const char* buf = fromRawSegment;
2779   PRUint32 theNumRead = count;
2780
2781   if (!pws) {
2782     return NS_ERROR_FAILURE;
2783   }
2784
2785   if (pws->mNeedCharsetCheck) {
2786     PRInt32 guessSource;
2787     nsCAutoString guess;
2788     nsCAutoString preferred;
2789
2790     pws->mNeedCharsetCheck = PR_FALSE;
2791     if (pws->mParser->DetectMetaTag(buf, theNumRead, guess, guessSource) ||
2792         ((count >= 4) &&
2793          DetectByteOrderMark((const unsigned char*)buf,
2794                              theNumRead, guess, guessSource))) {
2795       nsCOMPtr<nsICharsetAlias> alias(do_GetService(NS_CHARSETALIAS_CONTRACTID));
2796       result = alias->GetPreferred(guess, preferred);
2797       // Only continue if it's a recognized charset and not
2798       // one of a designated set that we ignore.
2799       if (NS_SUCCEEDED(result) &&
2800           ((kCharsetFromByteOrderMark == guessSource) ||
2801            (!preferred.EqualsLiteral("UTF-16") &&
2802             !preferred.EqualsLiteral("UTF-16BE") &&
2803             !preferred.EqualsLiteral("UTF-16LE") &&
2804             !preferred.EqualsLiteral("UTF-32") &&
2805             !preferred.EqualsLiteral("UTF-32BE") &&
2806             !preferred.EqualsLiteral("UTF-32LE")))) {
2807         guess = preferred;
2808         pws->mParser->SetDocumentCharset(guess, guessSource);
2809         pws->mParser->SetSinkCharset(preferred);
2810         nsCOMPtr<nsICachingChannel> channel(do_QueryInterface(pws->mRequest));
2811         if (channel) {
2812           nsCOMPtr<nsISupports> cacheToken;
2813           channel->GetCacheToken(getter_AddRefs(cacheToken));
2814           if (cacheToken) {
2815             nsCOMPtr<nsICacheEntryDescriptor> cacheDescriptor(do_QueryInterface(cacheToken));
2816             if (cacheDescriptor) {
2817 #ifdef DEBUG
2818               nsresult rv =
2819 #endif
2820                 cacheDescriptor->SetMetaDataElement("charset",
2821                                                     guess.get());
2822               NS_ASSERTION(NS_SUCCEEDED(rv),"cannot SetMetaDataElement");
2823             }
2824           }
2825         }
2826       }
2827     }
2828   }
2829
2830   if (pws->mParserFilter)
2831     pws->mParserFilter->RawBuffer(buf, &theNumRead);
2832
2833   result = pws->mScanner->Append(buf, theNumRead, pws->mRequest);
2834   if (NS_SUCCEEDED(result)) {
2835     *writeCount = count;
2836   }
2837
2838   return result;
2839 }
2840
2841 nsresult
2842 nsParser::OnDataAvailable(nsIRequest *request, nsISupports* aContext,
2843                           nsIInputStream *pIStream, PRUint32 sourceOffset,
2844                           PRUint32 aLength)
2845 {
2846   NS_PRECONDITION((eOnStart == mParserContext->mStreamListenerState ||
2847                    eOnDataAvail == mParserContext->mStreamListenerState),
2848             "Error: OnStartRequest() must be called before OnDataAvailable()");
2849   NS_PRECONDITION(NS_InputStreamIsBuffered(pIStream),
2850                   "Must have a buffered input stream");
2851
2852   nsresult rv = NS_OK;
2853
2854   CParserContext *theContext = mParserContext;
2855
2856   while (theContext && theContext->mRequest != request) {
2857     theContext = theContext->mPrevContext;
2858   }
2859
2860   if (theContext) {
2861     theContext->mStreamListenerState = eOnDataAvail;
2862
2863     if ((mFlags & NS_PARSER_FLAG_PARSER_ENABLED) &&
2864         mSpeculativeScriptThread) {
2865       mSpeculativeScriptThread->StopParsing(PR_FALSE);
2866     }
2867
2868     if (eInvalidDetect == theContext->mAutoDetectStatus) {
2869       if (theContext->mScanner) {
2870         nsScannerIterator iter;
2871         theContext->mScanner->EndReading(iter);
2872         theContext->mScanner->SetPosition(iter, PR_TRUE);
2873       }
2874     }
2875
2876     PRUint32 totalRead;
2877     ParserWriteStruct pws;
2878     pws.mNeedCharsetCheck =
2879       (0 == sourceOffset) && (mCharsetSource < kCharsetFromMetaTag);
2880     pws.mParser = this;
2881     pws.mParserFilter = mParserFilter;
2882     pws.mScanner = theContext->mScanner;
2883     pws.mRequest = request;
2884
2885     rv = pIStream->ReadSegments(ParserWriteFunc, &pws, aLength, &totalRead);
2886     if (NS_FAILED(rv)) {
2887       return rv;
2888     }
2889
2890     // Don't bother to start parsing until we've seen some
2891     // non-whitespace data
2892     if (mScriptsExecuting == 0 &&
2893         theContext->mScanner->FirstNonWhitespacePosition() >= 0) {
2894       if (mSink) {
2895         mSink->WillParse();
2896       }
2897       rv = ResumeParse();
2898     }
2899   } else {
2900     rv = NS_ERROR_UNEXPECTED;
2901   }
2902
2903   return rv;
2904 }
2905
2906 /**
2907  *  This is called by the networking library once the last block of data
2908  *  has been collected from the net.
2909  */
2910 nsresult
2911 nsParser::OnStopRequest(nsIRequest *request, nsISupports* aContext,
2912                         nsresult status)
2913 {
2914   nsresult rv = NS_OK;
2915
2916   if (mSpeculativeScriptThread) {
2917     mSpeculativeScriptThread->StopParsing(PR_FALSE);
2918   }
2919
2920   CParserContext *pc = mParserContext;
2921   while (pc) {
2922     if (pc->mRequest == request) {
2923       pc->mStreamListenerState = eOnStop;
2924       pc->mScanner->SetIncremental(PR_FALSE);
2925       break;
2926     }
2927
2928     pc = pc->mPrevContext;
2929   }
2930
2931   mStreamStatus = status;
2932
2933   if (mParserFilter)
2934     mParserFilter->Finish();
2935
2936   if (mScriptsExecuting == 0 && NS_SUCCEEDED(rv)) {
2937     if (mSink) {
2938       mSink->WillParse();
2939     }
2940     rv = ResumeParse(PR_TRUE, PR_TRUE);
2941   }
2942
2943   // If the parser isn't enabled, we don't finish parsing till
2944   // it is reenabled.
2945
2946
2947   // XXX Should we wait to notify our observers as well if the
2948   // parser isn't yet enabled?
2949   if (mObserver) {
2950     mObserver->OnStopRequest(request, aContext, status);
2951   }
2952
2953   if (sParserDataListeners && mSink) {
2954     nsISupports *ctx = mSink->GetTarget();
2955     PRInt32 count = sParserDataListeners->Count();
2956
2957     while (count--) {
2958       rv |= sParserDataListeners->ObjectAt(count)->OnStopRequest(request, ctx,
2959                                                                  status);
2960     }
2961   }
2962
2963   return rv;
2964 }
2965
2966
2967 /*******************************************************************
2968   Here come the tokenization methods...
2969  *******************************************************************/
2970
2971
2972 /**
2973  *  Part of the code sandwich, this gets called right before
2974  *  the tokenization process begins. The main reason for
2975  *  this call is to allow the delegate to do initialization.
2976  */
2977 PRBool
2978 nsParser::WillTokenize(PRBool aIsFinalChunk)
2979 {
2980   if (!mParserContext) {
2981     return PR_TRUE;
2982   }
2983
2984   nsITokenizer* theTokenizer;
2985   PRInt32 type = mParserContext->mDTD ? mParserContext->mDTD->GetType() :
2986                                         NS_IPARSER_FLAG_HTML;
2987   nsresult result = mParserContext->GetTokenizer(type, mSink, theTokenizer);
2988   NS_ENSURE_SUCCESS(result, PR_FALSE);
2989   return NS_SUCCEEDED(theTokenizer->WillTokenize(aIsFinalChunk,
2990                                                  &mTokenAllocator));
2991 }
2992
2993
2994 /**
2995  * This is the primary control routine to consume tokens.
2996  * It iteratively consumes tokens until an error occurs or
2997  * you run out of data.
2998  */
2999 nsresult nsParser::Tokenize(PRBool aIsFinalChunk)
3000 {
3001   nsITokenizer* theTokenizer;
3002
3003   nsresult result = NS_ERROR_NOT_AVAILABLE;
3004   if (mParserContext) {
3005     PRInt32 type = mParserContext->mDTD ? mParserContext->mDTD->GetType()
3006                                         : NS_IPARSER_FLAG_HTML;
3007     result = mParserContext->GetTokenizer(type, mSink, theTokenizer);
3008   }
3009
3010   if (NS_SUCCEEDED(result)) {
3011     if (mFlags & NS_PARSER_FLAG_FLUSH_TOKENS) {
3012       // For some reason tokens didn't get flushed (probably
3013       // the parser got blocked before all the tokens in the
3014       // stack got handled). Flush 'em now. Ref. bug 104856
3015       if (theTokenizer->GetCount() != 0) {
3016         return result;
3017       }
3018
3019       // Reset since the tokens have been flushed.
3020       mFlags &= ~NS_PARSER_FLAG_FLUSH_TOKENS;
3021     }
3022
3023     PRBool flushTokens = PR_FALSE;
3024
3025     MOZ_TIMER_START(mTokenizeTime);
3026
3027     mParserContext->mNumConsumed = 0;
3028
3029     PRBool killSink = PR_FALSE;
3030
3031     WillTokenize(aIsFinalChunk);
3032     while (NS_SUCCEEDED(result)) {
3033       mParserContext->mNumConsumed += mParserContext->mScanner->Mark();
3034       result = theTokenizer->ConsumeToken(*mParserContext->mScanner,
3035                                           flushTokens);
3036       if (NS_FAILED(result)) {
3037         mParserContext->mScanner->RewindToMark();
3038         if (kEOF == result){
3039           break;
3040         }
3041         if (NS_ERROR_HTMLPARSER_STOPPARSING == result) {
3042           killSink = PR_TRUE;
3043           result = Terminate();
3044           break;
3045         }
3046       } else if (flushTokens && (mFlags & NS_PARSER_FLAG_OBSERVERS_ENABLED)) {
3047         // I added the extra test of NS_PARSER_FLAG_OBSERVERS_ENABLED to fix Bug# 23931.
3048         // Flush tokens on seeing </SCRIPT> -- Ref: Bug# 22485 --
3049         // Also remember to update the marked position.
3050         mFlags |= NS_PARSER_FLAG_FLUSH_TOKENS;
3051         mParserContext->mNumConsumed += mParserContext->mScanner->Mark();
3052         break;
3053       }
3054     }
3055     DidTokenize(aIsFinalChunk);
3056
3057     MOZ_TIMER_STOP(mTokenizeTime);
3058
3059     if (killSink) {
3060       mSink = nsnull;
3061     }
3062   } else {
3063     result = mInternalState = NS_ERROR_HTMLPARSER_BADTOKENIZER;
3064   }
3065
3066   return result;
3067 }
3068
3069 /**
3070  *  This is the tail-end of the code sandwich for the
3071  *  tokenization process. It gets called once tokenziation
3072  *  has completed for each phase.
3073  */
3074 PRBool
3075 nsParser::DidTokenize(PRBool aIsFinalChunk)
3076 {
3077   if (!mParserContext) {
3078     return PR_TRUE;
3079   }
3080
3081   nsITokenizer* theTokenizer;
3082   PRInt32 type = mParserContext->mDTD ? mParserContext->mDTD->GetType() :
3083                                         NS_IPARSER_FLAG_HTML;
3084   nsresult rv = mParserContext->GetTokenizer(type, mSink, theTokenizer);
3085   NS_ENSURE_SUCCESS(rv, PR_FALSE);
3086
3087   rv = theTokenizer->DidTokenize(aIsFinalChunk);
3088   return NS_SUCCEEDED(rv);
3089 }
3090
3091 /**
3092  * Get the channel associated with this parser
3093  *
3094  * @param aChannel out param that will contain the result
3095  * @return NS_OK if successful
3096  */
3097 NS_IMETHODIMP
3098 nsParser::GetChannel(nsIChannel** aChannel)
3099 {
3100   nsresult result = NS_ERROR_NOT_AVAILABLE;
3101   if (mParserContext && mParserContext->mRequest) {
3102     result = CallQueryInterface(mParserContext->mRequest, aChannel);
3103   }
3104   return result;
3105 }
3106
3107 /**
3108  * Get the DTD associated with this parser
3109  */
3110 NS_IMETHODIMP
3111 nsParser::GetDTD(nsIDTD** aDTD)
3112 {
3113   if (mParserContext) {
3114     NS_IF_ADDREF(*aDTD = mParserContext->mDTD);
3115   }
3116
3117   return NS_OK;
3118 }
3119