1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /* ***** BEGIN LICENSE BLOCK *****
3 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
5 * The contents of this file are subject to the Mozilla Public License Version
6 * 1.1 (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 * http://www.mozilla.org/MPL/
10 * Software distributed under the License is distributed on an "AS IS" basis,
11 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12 * for the specific language governing rights and limitations under the
15 * The Original Code is mozilla.org code.
17 * The Initial Developer of the Original Code is
18 * Netscape Communications Corporation.
19 * Portions created by the Initial Developer are Copyright (C) 1998
20 * the Initial Developer. All Rights Reserved.
24 * Alternatively, the contents of this file may be used under the terms of
25 * either of the GNU General Public License Version 2 or later (the "GPL"),
26 * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
27 * in which case the provisions of the GPL or the LGPL are applicable instead
28 * of those above. If you wish to allow use of your version of this file only
29 * under the terms of either the GPL or the LGPL, and not to allow others to
30 * use your version of this file under the terms of the MPL, indicate your
31 * decision by deleting the provisions above and replace them with the notice
32 * and other provisions required by the GPL or the LGPL. If you do not delete
33 * the provisions above, a recipient may use your version of this file under
34 * the terms of any one of the MPL, the GPL or the LGPL.
36 * ***** END LICENSE BLOCK ***** */
41 * This class does two primary jobs:
42 * 1) It iterates the tokens provided during the
43 * tokenization process, identifying where elements
44 * begin and end (doing validation and normalization).
45 * 2) It controls and coordinates with an instance of
46 * the IContentSink interface, to coordinate the
47 * production of the content model.
49 * The basic operation of this class assumes that an HTML
50 * document is non-normalized. Therefore, we don't process
51 * the document in a normalized way. Don't bother to look
52 * for methods like: doHead() or doBody().
54 * Instead, in order to be backward compatible, we must
55 * scan the set of tokens and perform this basic set of
57 * 1) Determine the token type (easy, since the tokens know)
58 * 2) Determine the appropriate section of the HTML document
59 * each token belongs in (HTML,HEAD,BODY,FRAMESET).
60 * 3) Insert content into our document (via the sink) into
61 * the correct section.
62 * 4) In the case of tags that belong in the BODY, we must
63 * ensure that our underlying document state reflects
64 * the appropriate context for our tag.
66 * For example,if we see a <TR>, we must ensure our
67 * document contains a table into which the row can
68 * be placed. This may result in "implicit containers"
69 * created to ensure a well-formed document.
76 #include "nsIParser.h"
78 #include "nsParserNode.h"
80 #include "CParserContext.h"
81 #include "nsParserCIID.h"
82 #include "nsITokenizer.h"
83 #include "nsHTMLTags.h"
84 #include "nsDTDUtils.h"
86 #include "nsThreadUtils.h"
87 #include "nsIContentSink.h"
88 #include "nsIParserFilter.h"
89 #include "nsCOMArray.h"
90 #include "nsIUnicharStreamListener.h"
91 #include "nsCycleCollectionParticipant.h"
// Forward declarations. The two charset types are only used through
// pointers in this header (the static service members and their accessors),
// so their full definitions are not required here.
93 class nsICharsetConverterManager
;
94 class nsICharsetAlias
;
// NOTE(review): nsIProgressEventSink is not referenced in the visible part
// of this header -- confirm it is still needed.
97 class nsIProgressEventSink
;
// MSVC-specific: silences warning C4275 ("non dll-interface class used as
// base for dll-interface class").
// NOTE(review): any compiler guard around this pragma is not visible in this
// excerpt -- confirm it is only seen by MSVC.
100 #pragma warning( disable : 4275 )
104 class nsParser
: public nsIParser
,
105 public nsIStreamListener
{
// CTokenHandler is granted access to nsParser's non-public members.
109 friend class CTokenHandler
;
111 * Called on module init
113 static nsresult
Init();
116 * Called on module shutdown
118 static void Shutdown();
// XPCOM boilerplate: declares the nsISupports methods and the
// cycle-collection participant class for nsParser.
// NOTE(review): the _AMBIGUOUS form presumably names nsIParser as the base
// used to reach nsISupports because the class header lists two interface
// bases (nsIParser and nsIStreamListener) -- confirm against
// nsCycleCollectionParticipant.h.
120 NS_DECL_CYCLE_COLLECTING_ISUPPORTS
121 NS_DECL_CYCLE_COLLECTION_CLASS_AMBIGUOUS(nsParser
, nsIParser
)
124 * default constructor
125 * @update gess5/11/98
131 * @update gess5/11/98
136 * Select given content sink into parser for parser output
137 * @update gess5/11/98
138 * @param aSink is the new sink to be used by parser
139 * @return nothing (the method is declared void)
141 NS_IMETHOD_(void) SetContentSink(nsIContentSink
* aSink
);
144 * retrieve the sink set into the parser
145 * @update gess5/11/98
147 * @return the content sink currently set, or NULL
149 NS_IMETHOD_(nsIContentSink
*) GetContentSink(void);
152 * Call this method once you've created a parser, and want to instruct it
153 * about the command which caused the parser to be constructed. For example,
154 * this allows us to select a DTD which can do, say, view-source.
156 * @update gess 3/25/98
157 * @param aCommand -- ptrs to string that contains command
160 NS_IMETHOD_(void) GetCommand(nsCString
& aCommand
);
161 NS_IMETHOD_(void) SetCommand(const char* aCommand
);
162 NS_IMETHOD_(void) SetCommand(eParserCommands aParserCommand
);
165 * Call this method once you've created a parser, and want to instruct it
166 * about what charset to load
168 * @update ftang 4/23/99
169 * @param aCharset- the charset of a document
170 * @param aCharsetSource- the source of the charset
173 NS_IMETHOD_(void) SetDocumentCharset(const nsACString
& aCharset
, PRInt32 aSource
);
// Out-parameter getter for the document charset and its source.
// @param aCharset  out: receives the charset name
// @param aSource   out: receives mCharsetSource
// NOTE(review): in this excerpt the braces of the inline body and the
// statement that fills aCharset are not visible; only the assignment to
// aSource below can be confirmed.
175 NS_IMETHOD_(void) GetDocumentCharset(nsACString
& aCharset
, PRInt32
& aSource
)
178 aSource
= mCharsetSource
;
// Installs a parser filter.
// @param aFilter  the filter to use
// NOTE(review): presumably stored in mParserFilter -- confirm in the
// implementation file.
182 NS_IMETHOD_(void) SetParserFilter(nsIParserFilter
* aFilter
);
185 * Retrieve the parse mode (DTD mode) from the topmost parser context
187 * @update gess 6/9/98
188 * @return the current nsDTDMode
190 NS_IMETHOD_(nsDTDMode
) GetParseMode(void);
193 * Cause parser to parse input from given URL
194 * @update gess5/11/98
195 * @param aURL is a descriptor for source document
196 * @param aListener is a listener to forward notifications to
197 * @return TRUE if all went well -- FALSE otherwise
199 NS_IMETHOD
Parse(nsIURI
* aURL
,
200 nsIRequestObserver
* aListener
= nsnull
,
202 nsDTDMode aMode
= eDTDMode_autodetect
);
205 * @update gess5/11/98
206 * @param anHTMLString contains a string-full of real HTML
207 * @param appendTokens tells us whether we should insert tokens inline, or append them.
208 * @return TRUE if all went well -- FALSE otherwise
210 NS_IMETHOD
Parse(const nsAString
& aSourceBuffer
,
212 const nsACString
& aContentType
,
214 nsDTDMode aMode
= eDTDMode_autodetect
);
// Returns an opaque (void*) key.
// NOTE(review): the name suggests the key identifies the root parser
// context; the implementation is not visible here -- confirm.
216 NS_IMETHOD_(void *) GetRootContextKey();
219 * This method needs documentation
221 NS_IMETHOD
ParseFragment(const nsAString
& aSourceBuffer
,
223 nsTArray
<nsString
>& aTagStack
,
225 const nsACString
& aContentType
,
226 nsDTDMode aMode
= eDTDMode_autodetect
);
230 * This method gets called when the tokens have been consumed, and it's time
231 * to build the model via the content sink.
232 * @update gess5/11/98
233 * @return YES if model building went well -- NO otherwise.
235 NS_IMETHOD
BuildModel(void);
238 * Call this when you want control whether or not the parser will parse
239 * and tokenize input (TRUE), or whether it just caches input to be
240 * parsed later (FALSE).
242 * @update gess 9/1/98
243 * @param aState determines whether we parse/tokenize or just cache.
244 * @return current state
246 NS_IMETHOD
ContinueParsing();
247 NS_IMETHOD
ContinueInterruptedParsing();
248 NS_IMETHOD_(void) BlockParser();
249 NS_IMETHOD_(void) UnblockParser();
250 NS_IMETHOD
Terminate(void);
253 * Call this to query whether the parser is enabled or not.
255 * @update vidur 4/12/99
256 * @return current state
258 NS_IMETHOD_(PRBool
) IsParserEnabled();
261 * Call this to query whether the parser thinks it's done with parsing.
263 * @update rickg 5/12/01
264 * @return complete state
266 NS_IMETHOD_(PRBool
) IsComplete();
269 * This rather arcane method (hack) is used as a signal between the
270 * DTD and the parser. It allows the DTD to tell the parser that content
271 * that comes through (parser::parser(string)) but not consumed should
272 * propagate into the next string based parse call.
274 * @update gess 9/1/98
275 * @param aState determines whether we propagate unused string content.
276 * @return current state
278 void SetUnusedInput(nsString
& aBuffer
);
281 * This method gets called (automatically) during incremental parsing
282 * @update gess5/11/98
283 * @return TRUE if all went well, otherwise FALSE
285 virtual nsresult
ResumeParse(PRBool allowIteration
= PR_TRUE
,
286 PRBool aIsFinalChunk
= PR_FALSE
,
287 PRBool aCanInterrupt
= PR_TRUE
);
289 //*********************************************
290 // These methods are callback methods used by
291 // net lib to let us know about our inputstream.
292 //*********************************************
293 // nsIRequestObserver methods:
294 NS_DECL_NSIREQUESTOBSERVER
296 // nsIStreamListener methods:
297 NS_DECL_NSISTREAMLISTENER
// Pushes a parser context.
// @param aContext  the context to push (passed by reference)
// NOTE(review): presumably aContext becomes the context returned by
// PeekContext() -- confirm in the implementation file.
299 void PushContext(CParserContext
& aContext
);
// Pops a parser context and returns a pointer to it.
// NOTE(review): ownership of the returned pointer cannot be determined from
// this header -- confirm in the implementation file.
300 CParserContext
* PopContext();
// Returns the mParserContext member pointer without modifying it.
301 CParserContext
* PeekContext() {return mParserContext
;}
304 * Get the channel associated with this parser
305 * @update harishd,gagan 07/17/01
306 * @param aChannel out param that will contain the result
307 * @return NS_OK if successful
309 NS_IMETHOD
GetChannel(nsIChannel
** aChannel
);
312 * Get the DTD associated with this parser
313 * @update vidur 9/29/99
314 * @param aDTD out param that will contain the result
315 * @return NS_OK if successful, NS_ERROR_FAILURE for runtime error
317 NS_IMETHOD
GetDTD(nsIDTD
** aDTD
);
320 * Detects the existence of a META tag with charset information in
323 PRBool
DetectMetaTag(const char* aBytes
,
326 PRInt32
& oCharsetSource
);
// @param aCharset  charset name, taken by non-const reference
// NOTE(review): the name suggests the charset is forwarded to the content
// sink; the implementation is not visible here -- confirm.
328 void SetSinkCharset(nsACString
& aCharset
);
331 * Removes continue parsing events
332 * @update kmcclusk 5/18/98
// NOTE(review): this in-class declaration uses NS_IMETHODIMP, while the
// sibling declarations in this class use NS_IMETHOD / NS_IMETHOD_(type).
// Confirm whether that is intentional.
335 NS_IMETHODIMP
CancelParsingEvents();
338 * Indicates whether the parser is in a state where it
339 * can be interrupted.
340 * @return PR_TRUE if parser can be interrupted, PR_FALSE if it can not be interrupted.
341 * @update kmcclusk 5/18/98
343 PRBool
CanInterrupt(void);
346 * Set the parser state to indicate whether parsing tokens can be interrupted
347 * @param aCanInterrupt PR_TRUE if parser can be interrupted, PR_FALSE if it can not be interrupted.
348 * @update kmcclusk 5/18/98
350 void SetCanInterrupt(PRBool aCanInterrupt
);
353 * This is called when the final chunk has been
354 * passed to the parser and the content sink has
355 * interrupted token processing. It schedules
356 * a ParserContinue PL_Event which will ask the parser
357 * to HandleParserContinueEvent when it is handled.
358 * @update kmcclusk6/1/2001
360 nsresult
PostContinueEvent();
363 * Fired when the continue parse event is triggered.
364 * @update kmcclusk 5/18/98
366 void HandleParserContinueEvent(class nsParserContinueEvent
*);
369 * Called by top-level scanners when data from necko is added to
372 nsresult
DataAdded(const nsSubstring
& aData
, nsIRequest
*aRequest
);
// Class-wide (static) pointer to an array of nsIUnicharStreamListener.
// NOTE(review): presumably the listeners notified from DataAdded() --
// confirm in the implementation file.
374 static nsCOMArray
<nsIUnicharStreamListener
> *sParserDataListeners
;
// Returns the static sCharsetAliasService pointer.
// NOTE(review): the closing brace of this inline body is not visible in
// this excerpt.
376 static nsICharsetAlias
* GetCharsetAliasService() {
377 return sCharsetAliasService
;
// Returns the static sCharsetConverterManager pointer.
// NOTE(review): the closing brace of this inline body is not visible in
// this excerpt.
380 static nsICharsetConverterManager
* GetCharsetConverterManager() {
381 return sCharsetConverterManager
;
384 virtual void Reset() {
// @param aConstructor  defaults to PR_FALSE
// NOTE(review): presumably PR_TRUE is passed when called from the
// constructor -- confirm in the implementation file.
391 void Initialize(PRBool aConstructor
= PR_FALSE
);
396 * @update gess5/18/98
400 nsresult
WillBuildModel(nsString
& aFilename
);
404 * @update gess5/18/98
408 nsresult
DidBuildModel(nsresult anErrorCode
);
412 /*******************************************
413 These are the tokenization methods...
414 *******************************************/
417 * Part of the code sandwich, this gets called right before
418 * the tokenization process begins. The main reason for
419 * this call is to allow the delegate to do initialization.
421 * @update gess 3/25/98
423 * @return TRUE if it's ok to proceed
425 PRBool
WillTokenize(PRBool aIsFinalChunk
= PR_FALSE
);
429 * This is the primary control routine. It iteratively
430 * consumes tokens until an error occurs or you run out
433 * @update gess 3/25/98
436 nsresult
Tokenize(PRBool aIsFinalChunk
= PR_FALSE
);
439 * This is the tail-end of the code sandwich for the
440 * tokenization process. It gets called once tokenziation
443 * @update gess 3/25/98
445 * @return TRUE if all went well
447 PRBool
DidTokenize(PRBool aIsFinalChunk
= PR_FALSE
);
451 //*********************************************
452 // And now, some data members...
453 //*********************************************
// Pointer returned by PeekContext().
// NOTE(review): presumably the top of the context stack maintained by
// PushContext()/PopContext() -- confirm.
456 CParserContext
* mParserContext
;
// Owning (nsCOMPtr) reference to a request observer.
// NOTE(review): presumably the aListener handed to Parse() -- confirm.
457 nsCOMPtr
<nsIRequestObserver
> mObserver
;
// Owning (nsCOMPtr) reference to the content sink.
// NOTE(review): presumably set through SetContentSink() -- confirm.
458 nsCOMPtr
<nsIContentSink
> mSink
;
// Raw, non-owning pointer (see "weak ref" below).
// NOTE(review): presumably the event posted by PostContinueEvent() --
// confirm.
459 nsIRunnable
* mContinueEvent
; // weak ref
// Owning (nsCOMPtr) reference to the parser filter.
// NOTE(review): presumably set through SetParserFilter() -- confirm.
461 nsCOMPtr
<nsIParserFilter
> mParserFilter
;
// Token allocator held by value.
462 nsTokenAllocator mTokenAllocator
;
// NOTE(review): mCommand / mCommandStr presumably mirror the two
// SetCommand() overloads (enum and string form) -- confirm.
464 eParserCommands mCommand
;
465 nsresult mInternalState
;
466 PRInt32 mStreamStatus
;
// Value that GetDocumentCharset() copies into its aSource out-parameter.
467 PRInt32 mCharsetSource
;
// NOTE(review): presumably filled by SetUnusedInput() -- confirm.
471 nsString mUnusedInput
;
473 nsCString mCommandStr
;
// Static pointer returned by GetCharsetAliasService().
475 static nsICharsetAlias
* sCharsetAliasService
;
// Static pointer returned by GetCharsetConverterManager().
476 static nsICharsetConverterManager
* sCharsetConverterManager
;
// Timer members declared through the MOZ_TIMER_DECLARE macro.
// NOTE(review): judging by the names, these time overall parsing, DTD work
// and tokenization; the macro presumably expands to nothing unless timing
// is enabled in the build -- confirm against the macro definition.
480 MOZ_TIMER_DECLARE(mParseTime
)
481 MOZ_TIMER_DECLARE(mDTDTime
)
482 MOZ_TIMER_DECLARE(mTokenizeTime
)