parser/htmlparser/src/nsParser.h

   1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
   2 /* ***** BEGIN LICENSE BLOCK *****
   3  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
   4  *
   5  * The contents of this file are subject to the Mozilla Public License Version
   6  * 1.1 (the "License"); you may not use this file except in compliance with
   7  * the License. You may obtain a copy of the License at
   8  * http://www.mozilla.org/MPL/
   9  *
  10  * Software distributed under the License is distributed on an "AS IS" basis,
  11  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
  12  * for the specific language governing rights and limitations under the
  13  * License.
  14  *
  15  * The Original Code is mozilla.org code.
  16  *
  17  * The Initial Developer of the Original Code is
  18  * Netscape Communications Corporation.
  19  * Portions created by the Initial Developer are Copyright (C) 1998
  20  * the Initial Developer. All Rights Reserved.
  21  *
  22  * Contributor(s):
  23  *
  24  * Alternatively, the contents of this file may be used under the terms of
  25  * either of the GNU General Public License Version 2 or later (the "GPL"),
  26  * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
  27  * in which case the provisions of the GPL or the LGPL are applicable instead
  28  * of those above. If you wish to allow use of your version of this file only
  29  * under the terms of either the GPL or the LGPL, and not to allow others to
  30  * use your version of this file under the terms of the MPL, indicate your
  31  * decision by deleting the provisions above and replace them with the notice
  32  * and other provisions required by the GPL or the LGPL. If you do not delete
  33  * the provisions above, a recipient may use your version of this file under
  34  * the terms of any one of the MPL, the GPL or the LGPL.
  35  *
  36  * ***** END LICENSE BLOCK ***** */
  37
  38 /**
  39  * MODULE NOTES:
  40  *
  41  *  This class does two primary jobs:
  42  *    1) It iterates the tokens provided during the
   43  *       tokenization process, identifying where elements
  44  *       begin and end (doing validation and normalization).
  45  *    2) It controls and coordinates with an instance of
   46  *       the IContentSink interface, to coordinate
   47  *       the production of the content model.
  48  *
  49  *  The basic operation of this class assumes that an HTML
  50  *  document is non-normalized. Therefore, we don't process
  51  *  the document in a normalized way. Don't bother to look
  52  *  for methods like: doHead() or doBody().
  53  *
  54  *  Instead, in order to be backward compatible, we must
  55  *  scan the set of tokens and perform this basic set of
  56  *  operations:
  57  *    1)  Determine the token type (easy, since the tokens know)
  58  *    2)  Determine the appropriate section of the HTML document
  59  *        each token belongs in (HTML,HEAD,BODY,FRAMESET).
  60  *    3)  Insert content into our document (via the sink) into
  61  *        the correct section.
  62  *    4)  In the case of tags that belong in the BODY, we must
  63  *        ensure that our underlying document state reflects
  64  *        the appropriate context for our tag.
  65  *
   66  *        For example, if we see a <TR>, we must ensure our
  67  *        document contains a table into which the row can
  68  *        be placed. This may result in "implicit containers"
  69  *        created to ensure a well-formed document.
  70  *
  71  */
  72
  73 #ifndef NS_PARSER__
  74 #define NS_PARSER__
  75
  76 #include "nsIParser.h"
  77 #include "nsDeque.h"
  78 #include "nsParserNode.h"
  79 #include "nsIURL.h"
  80 #include "CParserContext.h"
  81 #include "nsParserCIID.h"
  82 #include "nsITokenizer.h"
  83 #include "nsHTMLTags.h"
  84 #include "nsDTDUtils.h"
  85 #include "nsTimer.h"
  86 #include "nsThreadUtils.h"
  87 #include "nsIContentSink.h"
  88 #include "nsIParserFilter.h"
  89 #include "nsCOMArray.h"
  90 #include "nsIUnicharStreamListener.h"
  91 #include "nsCycleCollectionParticipant.h"
  92
  93 class nsICharsetConverterManager;
  94 class nsICharsetAlias;
  95 class nsIDTD;
  96 class nsScanner;
  97 class nsSpeculativeScriptThread;
  98 class nsIThreadPool;
  99
 100 #ifdef _MSC_VER
 101 #pragma warning( disable : 4275 )
 102 #endif
 103
  104 // nsParser: parser class deriving from both nsIParser and nsIStreamListener.
  105 class nsParser : public nsIParser,
  106                  public nsIStreamListener
  107 {
  108   public:
  109     /**
  110      * Called on module init
  111      */
  112     static nsresult Init();
  113
  114     /**
  115      * Called on module shutdown
  116      */
  117     static void Shutdown();
  118
  119     NS_DECL_CYCLE_COLLECTING_ISUPPORTS
  120     NS_DECL_CYCLE_COLLECTION_CLASS_AMBIGUOUS(nsParser, nsIParser)
  121
  122     /**
  123      * default constructor
  124      * @update  gess5/11/98
  125      */
  126     nsParser();
  127
  128     /**
  129      * Destructor
  130      * @update  gess5/11/98
  131      */
  132     virtual ~nsParser();
  133
  134     /**
  135      * Select given content sink into parser for parser output
  136      * @update  gess5/11/98
  137      * @param   aSink is the new sink to be used by parser
  138      * @return  nothing (the method is declared NS_IMETHOD_(void))
  139      */
  140     NS_IMETHOD_(void) SetContentSink(nsIContentSink* aSink);
  141
  142     /**
  143      * Retrieve the content sink that was set into the parser
  144      * @update  gess5/11/98
  145      * (takes no parameters)
  146      * @return  the parser's content sink (presumably mSink, possibly NULL -- confirm)
  147      */
  148     NS_IMETHOD_(nsIContentSink*) GetContentSink(void);
  149
  150     /**
  151      *  Call this method once you've created a parser, and want to instruct it
  152      *  about the command which caused the parser to be constructed. For example,
  153      *  this allows us to select a DTD which can do, say, view-source.
  154      *  The command can be set as a string or as an eParserCommands value; GetCommand takes a string to receive it (presumably -- confirm).
  155      *  @update  gess 3/25/98
  156      *  @param   aCommand -- string that contains (or, for GetCommand, receives) the command
  157      *  @return  nada
  158      */
  159     NS_IMETHOD_(void) GetCommand(nsCString& aCommand);
  160     NS_IMETHOD_(void) SetCommand(const char* aCommand);
  161     NS_IMETHOD_(void) SetCommand(eParserCommands aParserCommand);
  162
  163     /**
  164      *  Call this method once you've created a parser, and want to instruct it
  165      *  about what charset to load
  166      *
  167      *  @update  ftang 4/23/99
  168      *  @param   aCharset- the charset of a document
  169      *  @param   aSource- the source of the charset
  170      *  @return  nada
  171      */
  172     NS_IMETHOD_(void) SetDocumentCharset(const nsACString& aCharset, PRInt32 aSource);
  173     // Copies the stored charset (mCharset) and its source (mCharsetSource) into the out-params.
  174     NS_IMETHOD_(void) GetDocumentCharset(nsACString& aCharset, PRInt32& aSource)
  175     {
  176          aCharset = mCharset;
  177          aSource = mCharsetSource;
  178     }
  179
  180
  181     NS_IMETHOD_(void) SetParserFilter(nsIParserFilter* aFilter);
  182
  183     /**
  184      *  Retrieve the DTD parse mode (presumably that of the topmost parser
  185      *  context -- confirm against the implementation).
  186      *  @update  gess 6/9/98
  187      *  @return  the current nsDTDMode
  188      */
  189     NS_IMETHOD_(nsDTDMode) GetParseMode(void);
  190
  191     /**
  192      * Cause parser to parse input from given URL
  193      * @update  gess5/11/98
  194      * @param   aURL is a descriptor for source document
  195      * @param   aListener is a listener to forward notifications to (defaults to nsnull); aKey defaults to 0 and aMode to eDTDMode_autodetect
  196      * @return  nsresult status code (presumably NS_OK if all went well -- confirm)
  197      */
  198     NS_IMETHOD Parse(nsIURI* aURL,
  199                      nsIRequestObserver* aListener = nsnull,
  200                      void* aKey = 0,
  201                      nsDTDMode aMode = eDTDMode_autodetect);
  202
  203     /**
  204      * @update  gess5/11/98
  205      * @param   aSourceBuffer contains the source text to be parsed
  206      * @param   aKey, aContentType, aLastCall, aMode -- NOTE(review): semantics are not documented here; see the implementation
  207      * @return  nsresult status code (presumably NS_OK if all went well -- confirm)
  208      */
  209     NS_IMETHOD Parse(const nsAString& aSourceBuffer,
  210                      void* aKey,
  211                      const nsACString& aContentType,
  212                      PRBool aLastCall,
  213                      nsDTDMode aMode = eDTDMode_autodetect);
  214     // NOTE(review): presumably returns the key of the root (bottom-most) parser context -- confirm.
  215     NS_IMETHOD_(void *) GetRootContextKey();
  216
  217     /**
  218      * Presumably parses aSourceBuffer as a fragment in the context described by aTagStack -- TODO confirm and document the parameters.
  219      */
  220     NS_IMETHOD ParseFragment(const nsAString& aSourceBuffer,
  221                              void* aKey,
  222                              nsTArray<nsString>& aTagStack,
  223                              PRBool aXMLMode,
  224                              const nsACString& aContentType,
  225                              nsDTDMode aMode = eDTDMode_autodetect);
  226
  227
  228     /**
  229      * This method gets called when the tokens have been consumed, and it's time
  230      * to build the model via the content sink.
  231      * @update  gess5/11/98
  232      * @return  nsresult status code (presumably NS_OK if model building went well -- confirm)
  233      */
  234     NS_IMETHOD BuildModel(void);
  235
  236     /**
  237      *  Call these when you want to control whether or not the parser will parse
  238      *  and tokenize input, or whether it just caches input to be
  239      *  parsed later.
  240      *
  241      *  @update  gess 9/1/98
  242      *  @param   none (the aState flag formerly documented here is not a parameter of any of these methods)
  243      *  @return  nsresult for the NS_IMETHOD methods; nothing for BlockParser/UnblockParser
  244      */
  245     NS_IMETHOD        ContinueParsing();
  246     NS_IMETHOD        ContinueInterruptedParsing();
  247     NS_IMETHOD_(void) BlockParser();
  248     NS_IMETHOD_(void) UnblockParser();
  249     NS_IMETHOD        Terminate(void);
  250
  251     /**
  252      * Call this to query whether the parser is enabled or not.
  253      *
  254      *  @update  vidur 4/12/99
  255      *  @return  current state
  256      */
  257     NS_IMETHOD_(PRBool) IsParserEnabled();
  258
  259     /**
  260      * Call this to query whether the parser thinks it's done with parsing.
  261      *
  262      *  @update  rickg 5/12/01
  263      *  @return  complete state
  264      */
  265     NS_IMETHOD_(PRBool) IsComplete();
  266
  267     /**
  268      *  This rather arcane method (hack) is used as a signal between the
  269      *  DTD and the parser. It allows the DTD to tell the parser that content
  270      *  that comes through (parser::parser(string)) but not consumed should
  271      *  propagate into the next string based parse call.
  272      *
  273      *  @update  gess 9/1/98
  274      *  @param   aBuffer the unused string content (presumably stored in mUnusedInput -- confirm)
  275      *  @return  nothing
  276      */
  277     void SetUnusedInput(nsString& aBuffer);
  278
  279     /**
  280      * This method gets called (automatically) during incremental parsing
  281      * @update  gess5/11/98
  282      * @return  nsresult status code (presumably NS_OK if all went well -- confirm)
  283      */
  284     virtual nsresult ResumeParse(PRBool allowIteration = PR_TRUE,
  285                                  PRBool aIsFinalChunk = PR_FALSE,
  286                                  PRBool aCanInterrupt = PR_TRUE);
  287
  288      //*********************************************
  289       // These methods are callback methods used by
  290       // net lib to let us know about our inputstream.
  291       //*********************************************
  292     // nsIRequestObserver methods:
  293     NS_DECL_NSIREQUESTOBSERVER
  294
  295     // nsIStreamListener methods:
  296     NS_DECL_NSISTREAMLISTENER
  297     // Push/pop/peek the current parser context; PeekContext simply returns mParserContext.
  298     void              PushContext(CParserContext& aContext);
  299     CParserContext*   PopContext();
  300     CParserContext*   PeekContext() {return mParserContext;}
  301
  302     /**
  303      * Get the channel associated with this parser
  304      * @update harishd,gagan 07/17/01
  305      * @param aChannel out param that will contain the result
  306      * @return NS_OK if successful
  307      */
  308     NS_IMETHOD GetChannel(nsIChannel** aChannel);
  309
  310     /**
  311      * Get the DTD associated with this parser
  312      * @update vidur 9/29/99
  313      * @param aDTD out param that will contain the result
  314      * @return NS_OK if successful, NS_ERROR_FAILURE for runtime error
  315      */
  316     NS_IMETHOD GetDTD(nsIDTD** aDTD);
  317
  318     /**
  319      * Detects the existence of a META tag with charset information in
  320      * the given buffer (aBytes, of length aLen); oCharset and oCharsetSource are presumably filled in on success -- confirm.
  321      */
  322     PRBool DetectMetaTag(const char* aBytes,
  323                          PRInt32 aLen,
  324                          nsCString& oCharset,
  325                          PRInt32& oCharsetSource);
  326     // NOTE(review): presumably passes aCharset on to the content sink -- confirm.
  327     void SetSinkCharset(nsACString& aCharset);
  328
  329     /**
  330      *  Removes continue parsing events
  331      *  @update  kmcclusk 5/18/98
  332      */
  333     // NOTE(review): declared with NS_IMETHODIMP while sibling declarations use NS_IMETHOD -- confirm this is intentional.
  334     NS_IMETHODIMP CancelParsingEvents();
  335
  336     /**
  337      *  Indicates whether the parser is in a state where it
  338      *  can be interrupted.
  339      *  @return PR_TRUE if parser can be interrupted, PR_FALSE if it can not be interrupted.
  340      *  @update  kmcclusk 5/18/98
  341      */
  342     PRBool CanInterrupt(void);
  343
  344     /**
  345      *  Set the parser state to indicate whether parsing tokens can be interrupted
  346      *  @param aCanInterrupt PR_TRUE if parser can be interrupted, PR_FALSE if it can not be interrupted.
  347      *  @update  kmcclusk 5/18/98
  348      */
  349     void SetCanInterrupt(PRBool aCanInterrupt);
  350
  351     /**
  352      * This is called when the final chunk has been
  353      * passed to the parser and the content sink has
  354      * interrupted token processing. It schedules
  355      * a ParserContinue PL_Event which will ask the parser
  356      * to HandleParserContinueEvent when it is handled.
  357      * @update  kmcclusk6/1/2001
  358      */
  359     nsresult PostContinueEvent();
  360
  361     /**
  362      *  Fired when the continue parse event is triggered.
  363      *  @update  kmcclusk 5/18/98
  364      */
  365     void HandleParserContinueEvent(class nsParserContinueEvent *);
  366
  367     /**
  368      * Called by top-level scanners when data from necko is added to
  369      * the scanner.
  370      */
  371     nsresult DataAdded(const nsSubstring& aData, nsIRequest *aRequest);
  372
  373     static nsCOMArray<nsIUnicharStreamListener> *sParserDataListeners;
  374
  375     static nsICharsetAlias* GetCharsetAliasService() {
  376       return sCharsetAliasService;
  377     }
  378
  379     static nsICharsetConverterManager* GetCharsetConverterManager() {
  380       return sCharsetConverterManager;
  381     }
  382     // Resets the parser by running Cleanup() followed by Initialize().
  383     virtual void Reset() {
  384       Cleanup();
  385       Initialize();
  386     }
  387     // Returns the static sSpeculativeThreadPool pointer.
  388     nsIThreadPool* ThreadPool() {
  389       return sSpeculativeThreadPool;
  390     }
  391
  392     /**
  393      * Tells the parser that a script is now executing. The only data we
  394      * should resume parsing for is document.written data. We'll deal with any
  395      * data that comes in over the network later.
  396      */
  397     virtual void ScriptExecuting();
  398
  399     /**
  400      * Tells the parser that the script is done executing. We should now
  401      * continue the regular parsing process.
  402      */
  403     virtual void ScriptDidExecute();
  404
  405  protected:
  406
  407     void Initialize(PRBool aConstructor = PR_FALSE);
  408     void Cleanup();
  409
  410     /**
  411      * NOTE(review): presumably invoked before model building begins -- confirm.
  412      * @update  gess5/18/98
  413      * @param   aFilename -- NOTE(review): presumably the name of the source being parsed -- confirm
  414      * @return  nsresult status code
  415      */
  416     nsresult WillBuildModel(nsString& aFilename);
  417
  418     /**
  419      * NOTE(review): presumably invoked once model building has finished -- confirm.
  420      * @update  gess5/18/98
  421      * @param   anErrorCode -- NOTE(review): presumably the status of the model-building pass -- confirm
  422      * @return  nsresult status code
  423      */
  424     nsresult DidBuildModel(nsresult anErrorCode);
  425
  426     void SpeculativelyParse();
  427
  428 private:
  429
  430     /*******************************************
  431       These are the tokenization methods...
  432      *******************************************/
  433
  434     /**
  435      *  Part of the code sandwich, this gets called right before
  436      *  the tokenization process begins. The main reason for
  437      *  this call is to allow the delegate to do initialization.
  438      *
  439      *  @update  gess 3/25/98
  440      *  @param   aIsFinalChunk defaults to PR_FALSE (presumably PR_TRUE for the last chunk of input -- confirm)
  441      *  @return  TRUE if it's ok to proceed
  442      */
  443     PRBool WillTokenize(PRBool aIsFinalChunk = PR_FALSE);
  444
  445
  446     /**
  447      *  This is the primary control routine. It iteratively
  448      *  consumes tokens until an error occurs or you run out
  449      *  of data.
  450      *
  451      *  @update  gess 3/25/98
  452      *  @return  error code
  453      */
  454     nsresult Tokenize(PRBool aIsFinalChunk = PR_FALSE);
  455
  456     /**
  457      *  This is the tail-end of the code sandwich for the
  458      *  tokenization process. It gets called once tokenization
  459      *  has completed.
  460      *
  461      *  @update  gess 3/25/98
  462      *  @param   aIsFinalChunk defaults to PR_FALSE (presumably PR_TRUE for the last chunk of input -- confirm)
  463      *  @return  TRUE if all went well
  464      */
  465     PRBool DidTokenize(PRBool aIsFinalChunk = PR_FALSE);
  466
  467 protected:
  468     //*********************************************
  469     // And now, some data members...
  470     //*********************************************
  471
  472
  473     CParserContext*              mParserContext;
  474     nsCOMPtr<nsIRequestObserver> mObserver;
  475     nsCOMPtr<nsIContentSink>     mSink;
  476     nsIRunnable*                 mContinueEvent;  // weak ref
  477     nsRefPtr<nsSpeculativeScriptThread> mSpeculativeScriptThread;
  478
  479     nsCOMPtr<nsIParserFilter> mParserFilter;
  480     nsTokenAllocator          mTokenAllocator;
  481
  482     eParserCommands     mCommand;
  483     nsresult            mInternalState;
  484     PRInt32             mStreamStatus;
  485     PRInt32             mCharsetSource;
  486
  487     PRUint16            mFlags;
  488     PRUint32            mScriptsExecuting;
  489
  490     nsString            mUnusedInput;
  491     nsCString           mCharset;
  492     nsCString           mCommandStr;
  493
  494     static nsICharsetAlias*            sCharsetAliasService;
  495     static nsICharsetConverterManager* sCharsetConverterManager;
  496     static nsIThreadPool*              sSpeculativeThreadPool;
  497     // NOTE(review): presumably limits applied to sSpeculativeThreadPool; the unit of kIdleThreadTimeout is not visible here -- confirm.
  498     enum {
  499       kSpeculativeThreadLimit = 15,
  500       kIdleThreadLimit = 0,
  501       kIdleThreadTimeout = 50
  502     };
  503
  504 public:
  505
  506     MOZ_TIMER_DECLARE(mParseTime)
  507     MOZ_TIMER_DECLARE(mDTDTime)
  508     MOZ_TIMER_DECLARE(mTokenizeTime)
  509 };
 510
 511 #endif
 512