1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /* ***** BEGIN LICENSE BLOCK *****
3 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
5 * The contents of this file are subject to the Mozilla Public License Version
6 * 1.1 (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 * http://www.mozilla.org/MPL/
10 * Software distributed under the License is distributed on an "AS IS" basis,
11 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12 * for the specific language governing rights and limitations under the
15 * The Original Code is mozilla.org code.
17 * The Initial Developer of the Original Code is
18 * Netscape Communications Corporation.
19 * Portions created by the Initial Developer are Copyright (C) 1998
20 * the Initial Developer. All Rights Reserved.
24 * Alternatively, the contents of this file may be used under the terms of
25 * either of the GNU General Public License Version 2 or later (the "GPL"),
26 * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
27 * in which case the provisions of the GPL or the LGPL are applicable instead
28 * of those above. If you wish to allow use of your version of this file only
29 * under the terms of either the GPL or the LGPL, and not to allow others to
30 * use your version of this file under the terms of the MPL, indicate your
31 * decision by deleting the provisions above and replace them with the notice
32 * and other provisions required by the GPL or the LGPL. If you do not delete
33 * the provisions above, a recipient may use your version of this file under
34 * the terms of any one of the MPL, the GPL or the LGPL.
36 * ***** END LICENSE BLOCK ***** */
41 * This class does two primary jobs:
42 * 1) It iterates the tokens provided during the
43 * tokenization process, identifying where elements
44 * begin and end (doing validation and normalization).
45 * 2) It controls and coordinates with an instance of
46 * the IContentSink interface, to coordinate the
47 * production of the content model.
49 * The basic operation of this class assumes that an HTML
50 * document is non-normalized. Therefore, we don't process
51 * the document in a normalized way. Don't bother to look
52 * for methods like: doHead() or doBody().
54 * Instead, in order to be backward compatible, we must
55 * scan the set of tokens and perform this basic set of
57 * 1) Determine the token type (easy, since the tokens know)
58 * 2) Determine the appropriate section of the HTML document
59 * each token belongs in (HTML,HEAD,BODY,FRAMESET).
60 * 3) Insert content into our document (via the sink) into
61 * the correct section.
62 * 4) In the case of tags that belong in the BODY, we must
63 * ensure that our underlying document state reflects
64 * the appropriate context for our tag.
66 * For example,if we see a <TR>, we must ensure our
67 * document contains a table into which the row can
68 * be placed. This may result in "implicit containers"
69 * created to ensure a well-formed document.
76 #include "nsIParser.h"
78 #include "nsParserNode.h"
80 #include "CParserContext.h"
81 #include "nsParserCIID.h"
82 #include "nsITokenizer.h"
83 #include "nsHTMLTags.h"
84 #include "nsDTDUtils.h"
86 #include "nsThreadUtils.h"
87 #include "nsIContentSink.h"
88 #include "nsIParserFilter.h"
89 #include "nsCOMArray.h"
90 #include "nsIUnicharStreamListener.h"
91 #include "nsCycleCollectionParticipant.h"
// Forward declarations for types this header refers to only through
// pointers or smart-pointer template arguments.
class nsICharsetConverterManager;
class nsICharsetAlias;
class nsSpeculativeScriptThread;
101 #pragma warning( disable : 4275 )
105 class nsParser
: public nsIParser
,
106 public nsIStreamListener
110 * Called on module init
112 static nsresult
Init();
115 * Called on module shutdown
117 static void Shutdown();
119 NS_DECL_CYCLE_COLLECTING_ISUPPORTS
120 NS_DECL_CYCLE_COLLECTION_CLASS_AMBIGUOUS(nsParser
, nsIParser
)
123 * default constructor
124 * @update gess5/11/98
130 * @update gess5/11/98
135 * Select given content sink into parser for parser output
136 * @update gess5/11/98
137 * @param aSink is the new sink to be used by parser
138 * @return nothing (the method is declared void)
140 NS_IMETHOD_(void) SetContentSink(nsIContentSink
* aSink
);
143 * retrieve the sink set into the parser
144 * @update gess5/11/98
145 * @return the sink currently set into the parser, or NULL
148 NS_IMETHOD_(nsIContentSink
*) GetContentSink(void);
151 * Call this method once you've created a parser, and want to instruct it
152 * about the command which caused the parser to be constructed. For example,
153 * this allows us to select a DTD which can do, say, view-source.
155 * @update gess 3/25/98
156 * @param aCommand -- ptrs to string that contains command
159 NS_IMETHOD_(void) GetCommand(nsCString
& aCommand
);
160 NS_IMETHOD_(void) SetCommand(const char* aCommand
);
161 NS_IMETHOD_(void) SetCommand(eParserCommands aParserCommand
);
164 * Call this method once you've created a parser, and want to instruct it
165 * about what charset to load
167 * @update ftang 4/23/99
168 * @param aCharset- the charset of a document
169 * @param aCharsetSource- the source of the charset
172 NS_IMETHOD_(void) SetDocumentCharset(const nsACString
& aCharset
, PRInt32 aSource
);
174 NS_IMETHOD_(void) GetDocumentCharset(nsACString
& aCharset
, PRInt32
& aSource
)
177 aSource
= mCharsetSource
;
181 NS_IMETHOD_(void) SetParserFilter(nsIParserFilter
* aFilter
);
184 * Retrieve the scanner from the topmost parser context
186 * @update gess 6/9/98
187 * @return ptr to scanner
189 NS_IMETHOD_(nsDTDMode
) GetParseMode(void);
192 * Cause parser to parse input from given URL
193 * @update gess5/11/98
194 * @param aURL is a descriptor for source document
195 * @param aListener is a listener to forward notifications to
196 * @return TRUE if all went well -- FALSE otherwise
198 NS_IMETHOD
Parse(nsIURI
* aURL
,
199 nsIRequestObserver
* aListener
= nsnull
,
201 nsDTDMode aMode
= eDTDMode_autodetect
);
204 * @update gess5/11/98
205 * @param anHTMLString contains a string-full of real HTML
206 * @param appendTokens tells us whether we should insert tokens inline, or append them.
207 * @return TRUE if all went well -- FALSE otherwise
209 NS_IMETHOD
Parse(const nsAString
& aSourceBuffer
,
211 const nsACString
& aContentType
,
213 nsDTDMode aMode
= eDTDMode_autodetect
);
215 NS_IMETHOD_(void *) GetRootContextKey();
218 * This method needs documentation
220 NS_IMETHOD
ParseFragment(const nsAString
& aSourceBuffer
,
222 nsTArray
<nsString
>& aTagStack
,
224 const nsACString
& aContentType
,
225 nsDTDMode aMode
= eDTDMode_autodetect
);
229 * This method gets called when the tokens have been consumed, and it's time
230 * to build the model via the content sink.
231 * @update gess5/11/98
232 * @return YES if model building went well -- NO otherwise.
234 NS_IMETHOD
BuildModel(void);
237 * Call this when you want control whether or not the parser will parse
238 * and tokenize input (TRUE), or whether it just caches input to be
239 * parsed later (FALSE).
241 * @update gess 9/1/98
242 * @param aState determines whether we parse/tokenize or just cache.
243 * @return current state
245 NS_IMETHOD
ContinueParsing();
246 NS_IMETHOD
ContinueInterruptedParsing();
247 NS_IMETHOD_(void) BlockParser();
248 NS_IMETHOD_(void) UnblockParser();
249 NS_IMETHOD
Terminate(void);
252 * Call this to query whether the parser is enabled or not.
254 * @update vidur 4/12/99
255 * @return current state
257 NS_IMETHOD_(PRBool
) IsParserEnabled();
260 * Call this to query whether the parser thinks it's done with parsing.
262 * @update rickg 5/12/01
263 * @return complete state
265 NS_IMETHOD_(PRBool
) IsComplete();
268 * This rather arcane method (hack) is used as a signal between the
269 * DTD and the parser. It allows the DTD to tell the parser that content
270 * that comes through (parser::parser(string)) but not consumed should
271 * propagate into the next string based parse call.
273 * @update gess 9/1/98
274 * @param aState determines whether we propagate unused string content.
275 * @return current state
277 void SetUnusedInput(nsString
& aBuffer
);
280 * This method gets called (automatically) during incremental parsing
281 * @update gess5/11/98
282 * @return TRUE if all went well, otherwise FALSE
284 virtual nsresult
ResumeParse(PRBool allowIteration
= PR_TRUE
,
285 PRBool aIsFinalChunk
= PR_FALSE
,
286 PRBool aCanInterrupt
= PR_TRUE
);
288 //*********************************************
289 // These methods are callback methods used by
290 // net lib to let us know about our inputstream.
291 //*********************************************
292 // nsIRequestObserver methods:
293 NS_DECL_NSIREQUESTOBSERVER
295 // nsIStreamListener methods:
296 NS_DECL_NSISTREAMLISTENER
298 void PushContext(CParserContext
& aContext
);
299 CParserContext
* PopContext();
300 CParserContext
* PeekContext() {return mParserContext
;}
303 * Get the channel associated with this parser
304 * @update harishd,gagan 07/17/01
305 * @param aChannel out param that will contain the result
306 * @return NS_OK if successful
308 NS_IMETHOD
GetChannel(nsIChannel
** aChannel
);
311 * Get the DTD associated with this parser
312 * @update vidur 9/29/99
313 * @param aDTD out param that will contain the result
314 * @return NS_OK if successful, NS_ERROR_FAILURE for runtime error
316 NS_IMETHOD
GetDTD(nsIDTD
** aDTD
);
319 * Detects the existence of a META tag with charset information in
322 PRBool
DetectMetaTag(const char* aBytes
,
325 PRInt32
& oCharsetSource
);
327 void SetSinkCharset(nsACString
& aCharset
);
330 * Removes continue parsing events
331 * @update kmcclusk 5/18/98
334 NS_IMETHODIMP
CancelParsingEvents();
337 * Indicates whether the parser is in a state where it
338 * can be interrupted.
339 * @return PR_TRUE if parser can be interrupted, PR_FALSE if it can not be interrupted.
340 * @update kmcclusk 5/18/98
342 PRBool
CanInterrupt(void);
345 * Set the parser state to indicate whether parsing tokens can be interrupted
346 * @param aCanInterrupt PR_TRUE if parser can be interrupted, PR_FALSE if it can not be interrupted.
347 * @update kmcclusk 5/18/98
349 void SetCanInterrupt(PRBool aCanInterrupt
);
352 * This is called when the final chunk has been
353 * passed to the parser and the content sink has
354 * interrupted token processing. It schedules
355 * a ParserContinue PL_Event which will ask the parser
356 * to HandleParserContinueEvent when it is handled.
357 * @update kmcclusk6/1/2001
359 nsresult
PostContinueEvent();
362 * Fired when the continue parse event is triggered.
363 * @update kmcclusk 5/18/98
365 void HandleParserContinueEvent(class nsParserContinueEvent
*);
368 * Called by top-level scanners when data from necko is added to
371 nsresult
DataAdded(const nsSubstring
& aData
, nsIRequest
*aRequest
);
373 static nsCOMArray
<nsIUnicharStreamListener
> *sParserDataListeners
;
375 static nsICharsetAlias
* GetCharsetAliasService() {
376 return sCharsetAliasService
;
379 static nsICharsetConverterManager
* GetCharsetConverterManager() {
380 return sCharsetConverterManager
;
383 virtual void Reset() {
388 nsIThreadPool
* ThreadPool() {
389 return sSpeculativeThreadPool
;
393 * Tells the parser that a script is now executing. The only data we
394 * should resume parsing for is document.written data. We'll deal with any
395 * data that comes in over the network later.
397 virtual void ScriptExecuting();
400 * Tells the parser that the script is done executing. We should now
401 * continue the regular parsing process.
403 virtual void ScriptDidExecute();
407 void Initialize(PRBool aConstructor
= PR_FALSE
);
412 * @update gess5/18/98
416 nsresult
WillBuildModel(nsString
& aFilename
);
420 * @update gess5/18/98
424 nsresult
DidBuildModel(nsresult anErrorCode
);
426 void SpeculativelyParse();
430 /*******************************************
431 These are the tokenization methods...
432 *******************************************/
435 * Part of the code sandwich, this gets called right before
436 * the tokenization process begins. The main reason for
437 * this call is to allow the delegate to do initialization.
439 * @update gess 3/25/98
441 * @return TRUE if it's ok to proceed
443 PRBool
WillTokenize(PRBool aIsFinalChunk
= PR_FALSE
);
447 * This is the primary control routine. It iteratively
448 * consumes tokens until an error occurs or you run out
451 * @update gess 3/25/98
454 nsresult
Tokenize(PRBool aIsFinalChunk
= PR_FALSE
);
457 * This is the tail-end of the code sandwich for the
458 * tokenization process. It gets called once tokenization
461 * @update gess 3/25/98
463 * @return TRUE if all went well
465 PRBool
DidTokenize(PRBool aIsFinalChunk
= PR_FALSE
);
468 //*********************************************
469 // And now, some data members...
470 //*********************************************
473 CParserContext
* mParserContext
;
474 nsCOMPtr
<nsIRequestObserver
> mObserver
;
475 nsCOMPtr
<nsIContentSink
> mSink
;
476 nsIRunnable
* mContinueEvent
; // weak ref
477 nsRefPtr
<nsSpeculativeScriptThread
> mSpeculativeScriptThread
;
479 nsCOMPtr
<nsIParserFilter
> mParserFilter
;
480 nsTokenAllocator mTokenAllocator
;
482 eParserCommands mCommand
;
483 nsresult mInternalState
;
484 PRInt32 mStreamStatus
;
485 PRInt32 mCharsetSource
;
488 PRUint32 mScriptsExecuting
;
490 nsString mUnusedInput
;
492 nsCString mCommandStr
;
494 static nsICharsetAlias
* sCharsetAliasService
;
495 static nsICharsetConverterManager
* sCharsetConverterManager
;
496 static nsIThreadPool
* sSpeculativeThreadPool
;
499 kSpeculativeThreadLimit
= 15,
500 kIdleThreadLimit
= 0,
501 kIdleThreadTimeout
= 50
506 MOZ_TIMER_DECLARE(mParseTime
)
507 MOZ_TIMER_DECLARE(mDTDTime
)
508 MOZ_TIMER_DECLARE(mTokenizeTime
)