1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* ***** BEGIN LICENSE BLOCK *****
3 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
5 * The contents of this file are subject to the Mozilla Public License Version
6 * 1.1 (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 * http://www.mozilla.org/MPL/
10 * Software distributed under the License is distributed on an "AS IS" basis,
11 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12 * for the specific language governing rights and limitations under the
15 * The Original Code is mozilla.org code.
17 * The Initial Developer of the Original Code is
18 * Netscape Communications Corporation.
19 * Portions created by the Initial Developer are Copyright (C) 1998
20 * the Initial Developer. All Rights Reserved.
24 * Alternatively, the contents of this file may be used under the terms of
25 * either of the GNU General Public License Version 2 or later (the "GPL"),
26 * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
27 * in which case the provisions of the GPL or the LGPL are applicable instead
28 * of those above. If you wish to allow use of your version of this file only
29 * under the terms of either the GPL or the LGPL, and not to allow others to
30 * use your version of this file under the terms of the MPL, indicate your
31 * decision by deleting the provisions above and replace them with the notice
32 * and other provisions required by the GPL or the LGPL. If you do not delete
33 * the provisions above, a recipient may use your version of this file under
34 * the terms of any one of the MPL, the GPL or the LGPL.
36 * ***** END LICENSE BLOCK ***** */
42 * This file contains the declarations for all the HTML specific token types that
43 * our DTDs understand. In fact, the same set of token types is used for XML.
44 * Currently we have tokens for text, comments, start and end tags, entities,
45 * attributes, style, script and skipped content. Whitespace and newlines also
46 * have their own token types, but don't count on them to stay forever.
48 * If you're looking for the html tags, they're in a file called nsHTMLTag.h/cpp.
50 * Most of the token types have a similar API. They have methods to get the type
51 * of token (GetTokenType); those that represent HTML tags also have a method to
52 * get the tag type (GetTypeID). In addition, most have a method (Consume) that
53 * lets the token take part in the parsing process. We've also thrown in a
54 * few standard debugging methods.
61 #include "nsHTMLTags.h"
63 #include "nsScannerString.h"
67 /*******************************************************************
68 * This enum defines the set of token types that we currently support.
69 *******************************************************************/
// Token kinds understood by the HTML DTDs. Numbering starts at 1 with
// eToken_start; every following enumerator takes the next value.
71 enum eHTMLTokenTypes
{
73 eToken_start
=1, eToken_end
, eToken_comment
, eToken_entity
,
74 eToken_whitespace
, eToken_newline
, eToken_text
, eToken_attribute
,
75 eToken_instruction
, eToken_cdatasection
, eToken_doctypeDecl
, eToken_markupDecl
,
// Sentinel closing the range: new token kinds must be added above it.
76 eToken_last
//make sure this stays the last token...
// Free helper that, going by its name, consumes a quoted string.
// Takes a character (aChar), a string passed by non-const reference
// (aString) and the scanner (aScanner); returns an nsresult status.
// NOTE(review): presumably aChar is the quote character and aString
// receives the consumed text -- confirm against the definition.
79 nsresult
ConsumeQuotedString(PRUnichar aChar
,nsString
& aString
,nsScanner
& aScanner
);
// Free helper that, going by its name, consumes attribute text.
// Same parameter shape as ConsumeQuotedString: a character (aChar),
// a string passed by non-const reference (aString) and the scanner
// (aScanner); returns an nsresult status.
// NOTE(review): presumably aString receives the consumed text --
// confirm against the definition.
80 nsresult
ConsumeAttributeText(PRUnichar aChar
,nsString
& aString
,nsScanner
& aScanner
);
// Returns a pointer to constant PRUnichar data for the tag id aTag;
// judging by the name this is the tag's name.
// NOTE(review): ownership/lifetime of the returned buffer is not
// visible in this header -- confirm before storing the pointer.
81 const PRUnichar
* GetTagName(PRInt32 aTag
);
82 //PRInt32 FindEntityIndex(nsString& aString,PRInt32 aCount=-1);
87 * This declares the basic token type used in the HTML DTDs.
88 * @update gess 3/25/98
// Common base (derived from CToken) for every token class declared in
// this file.
90 class CHTMLToken
: public CToken
{
92 virtual ~CHTMLToken();
// Constructs a token for the given HTML tag id.
93 CHTMLToken(eHTMLTags aTag
);
// Base implementation always reports eFormUnknown; the function is
// virtual so subclasses can supply a real state.
95 virtual eContainerInfo
GetContainerInfo(void) const {return eFormUnknown
;}
// Base implementation is a no-op: aInfo is ignored.
96 virtual void SetContainerInfo(eContainerInfo aInfo
) { }
102 * This declares start tokens, which always take the form <xxxx>.
103 * This class also knows how to consume related attributes.
105 * @update gess 3/25/98
// Start-tag token. On top of the shared CHTMLToken interface it adds an
// "empty" flag and tracks container well-formedness in mContainerInfo.
107 class CStartToken
: public CHTMLToken
{
// Three ways to construct: from a tag id (eHTMLTag_unknown when omitted),
// from a string, or from a name plus a tag id.
111 CStartToken(eHTMLTags aTag
=eHTMLTag_unknown
);
112 CStartToken(const nsAString
& aString
);
113 CStartToken(const nsAString
& aName
,eHTMLTags aTag
);
// Consume lets the token take part in parsing; it is handed a character,
// the scanner and a mode value and reports an nsresult.
115 virtual nsresult
Consume(PRUnichar aChar
,nsScanner
& aScanner
,PRInt32 aMode
);
// GetTypeID yields the tag type, GetTokenType the kind of token.
116 virtual PRInt32
GetTypeID(void);
117 virtual PRInt32
GetTokenType(void);
// Getter/setter pair for the empty flag. What counts as "empty" is decided
// by the implementation and is not visible in this header.
119 virtual PRBool
IsEmpty(void);
120 virtual void SetEmpty(PRBool aValue
);
// String accessors. Judging by the names, GetSource fills anOutputString
// and AppendSourceTo appends to it -- TODO confirm in the implementation.
122 virtual const nsSubstring
& GetStringValue();
123 virtual void GetSource(nsString
& anOutputString
);
124 virtual void AppendSourceTo(nsAString
& anOutputString
);
126 // the following info is used to set well-formedness state on start tags...
// Returns the stored container state.
127 virtual eContainerInfo
GetContainerInfo(void) const {return mContainerInfo
;}
// Write-once setter: the new value is stored only while the current state
// is still eFormUnknown; any later call leaves the state untouched.
128 virtual void SetContainerInfo(eContainerInfo aContainerInfo
) {
129 if (eFormUnknown
==mContainerInfo
) {
130 mContainerInfo
=aContainerInfo
;
// True exactly when the stored container state equals eWellFormed.
133 virtual PRBool
IsWellFormed(void) const {
134 return eWellFormed
== mContainerInfo
;
// State read and written by the inline container-info accessors above.
139 eContainerInfo mContainerInfo
;
142 PRPackedBool mAttributed
;
148 * This declares end tokens, which always take the
149 * form </xxxx>. This class also knows how to consume
150 * related attributes.
152 * @update gess 3/25/98
// End-tag token. Exposes the same construction, consumption and string
// accessor surface as the start token, without the empty flag or the
// container-info overrides.
154 class CEndToken
: public CHTMLToken
{
// Construct from a tag id (no default here), from a string, or from a
// name plus a tag id.
158 CEndToken(eHTMLTags aTag
);
159 CEndToken(const nsAString
& aString
);
160 CEndToken(const nsAString
& aName
,eHTMLTags aTag
);
// Consume lets the token take part in parsing; it is handed a character,
// the scanner and a mode value and reports an nsresult.
161 virtual nsresult
Consume(PRUnichar aChar
,nsScanner
& aScanner
,PRInt32 aMode
);
// GetTypeID yields the tag type, GetTokenType the kind of token.
162 virtual PRInt32
GetTypeID(void);
163 virtual PRInt32
GetTokenType(void);
// String accessors. Judging by the names, GetSource fills anOutputString
// and AppendSourceTo appends to it -- TODO confirm in the implementation.
165 virtual const nsSubstring
& GetStringValue();
166 virtual void GetSource(nsString
& anOutputString
);
167 virtual void AppendSourceTo(nsAString
& anOutputString
);
175 * This declares comment tokens. Comments are not usually
176 * thought of as tokens, but we treat them that way
177 * here so that the parser can have a consistent view of all tokens.
180 * @update gess 3/25/98
// Comment token. Keeps two views of the comment: the text alone and the
// whole declaration including its delimiters (see the two members below).
182 class CCommentToken
: public CHTMLToken
{
187 CCommentToken(const nsAString
& aString
);
// Consume lets the token take part in parsing; it is handed a character,
// the scanner and a mode value and reports an nsresult.
188 virtual nsresult
Consume(PRUnichar aChar
,nsScanner
& aScanner
,PRInt32 aMode
);
189 virtual PRInt32
GetTokenType(void);
190 virtual const nsSubstring
& GetStringValue(void);
191 virtual void AppendSourceTo(nsAString
& anOutputString
);
// Two non-virtual comment consumers working directly on the scanner.
// NOTE(review): the names suggest strict vs. quirks parsing rules and that
// Consume picks between them based on aMode -- confirm in the .cpp.
193 nsresult
ConsumeStrictComment(nsScanner
& aScanner
);
194 nsresult
ConsumeQuirksComment(nsScanner
& aScanner
);
197 nsScannerSubstring mComment
; // does not include MDO & MDC
198 nsScannerSubstring mCommentDecl
; // includes MDO & MDC
203 * This class declares entity tokens, which always take
204 * the form &xxxx;. This class also offers a few utility
205 * methods that allow you to easily reduce entities.
207 * @update gess 3/25/98
// Entity token. Besides the usual token interface it offers entity
// consumption and translation helpers, two of them static so they can be
// called without a token instance.
209 class CEntityToken
: public CHTMLToken
{
214 CEntityToken(const nsAString
& aString
);
215 virtual PRInt32
GetTokenType(void);
// Instance overload: translates this token, writing into aString, and
// returns a PRInt32.
// NOTE(review): meaning of the return value is not visible here -- confirm.
216 PRInt32
TranslateToUnicodeStr(nsString
& aString
);
// Consume lets the token take part in parsing; it is handed a character,
// the scanner and a mode value and reports an nsresult.
217 virtual nsresult
Consume(PRUnichar aChar
,nsScanner
& aScanner
,PRInt32 aMode
);
// Static consumer: takes a character, an output string and the scanner;
// needs no token instance.
218 static nsresult
ConsumeEntity(PRUnichar aChar
, nsString
& aString
,
219 nsScanner
& aScanner
);
// Static overload: translates the numeric value aValue into aString.
220 static PRInt32
TranslateToUnicodeStr(PRInt32 aValue
,nsString
& aString
);
// String accessors. Judging by the names, GetSource fills anOutputString
// and AppendSourceTo appends to it -- TODO confirm in the implementation.
222 virtual const nsSubstring
& GetStringValue(void);
223 virtual void GetSource(nsString
& anOutputString
);
224 virtual void AppendSourceTo(nsAString
& anOutputString
);
232 * Whitespace tokens are used where whitespace can be
233 * detected as distinct from text. This allows us to
234 * easily skip leading/trailing whitespace when desired.
236 * @update gess 3/25/98
// Whitespace token; its text is kept in mTextValue.
238 class CWhitespaceToken
: public CHTMLToken
{
243 CWhitespaceToken(const nsAString
& aString
);
// Consume lets the token take part in parsing; it is handed a character,
// the scanner and a mode value and reports an nsresult.
244 virtual nsresult
Consume(PRUnichar aChar
,nsScanner
& aScanner
,PRInt32 aMode
);
245 virtual PRInt32
GetTokenType(void);
246 virtual const nsSubstring
& GetStringValue(void);
// Stored as an nsScannerSharedSubstring, unlike the text and markup-decl
// tokens, which use nsScannerSubstring.
249 nsScannerSharedSubstring mTextValue
;
253 * Text tokens contain the normalized form of html text.
254 * These tokens are guaranteed not to contain entities,
255 * start or end tags, or newlines.
257 * @update gess 3/25/98
// Text token. Adds length/copy helpers, two ways of binding the token to
// its text, and two specialised character-data consumers.
259 class CTextToken
: public CHTMLToken
{
264 CTextToken(const nsAString
& aString
);
// Consume lets the token take part in parsing; it is handed a character,
// the scanner and a mode value and reports an nsresult.
265 virtual nsresult
Consume(PRUnichar aChar
,nsScanner
& aScanner
,PRInt32 aMode
);
266 virtual PRInt32
GetTokenType(void);
// Length of the token's text, and a copy-out into aStr.
267 virtual PRInt32
GetTextLength(void);
268 virtual void CopyTo(nsAString
& aStr
);
269 virtual const nsSubstring
& GetStringValue(void);
// Bind overloads: one takes a scanner plus a start/end iterator pair, the
// other takes a string.
// NOTE(review): presumably the first form references scanner-owned data
// rather than copying it -- confirm in the implementation.
270 virtual void Bind(nsScanner
* aScanner
, nsScannerIterator
& aStart
,
271 nsScannerIterator
& aEnd
);
272 virtual void Bind(const nsAString
& aStr
);
// Consumer for character data that runs up to aEndTagName; aFlushTokens is
// an in/out PRBool reference.
// NOTE(review): the parameter list as shown here skips entries (listing
// lines 275 and 277 are absent) -- check the full signature before editing.
274 nsresult
ConsumeCharacterData(PRBool aIgnoreComments
,
276 const nsAString
& aEndTagName
,
278 PRBool
& aFlushTokens
);
// Consumer for parsed character data, with flags for discarding a first
// newline and for conservative consumption.
// NOTE(review): this declaration is visibly truncated (no closing
// parenthesis, listing lines 282 and 284-287 absent) -- check the full
// signature before editing.
280 nsresult
ConsumeParsedCharacterData(PRBool aDiscardFirstNewline
,
281 PRBool aConservativeConsume
,
283 const nsAString
& aEndTagName
,
// The token's text.
288 nsScannerSubstring mTextValue
;
293 * CDATASection tokens contain raw unescaped text content delimited by "<![CDATA[" and "]]>".
295 * XXX Not really an HTML construct - maybe we need a separation
297 * @update vidur 11/12/98
// CDATA-section token, offering only the basic token interface.
299 class CCDATASectionToken
: public CHTMLToken
{
// Construct from a tag id (eHTMLTag_unknown when omitted) or from a string.
303 CCDATASectionToken(eHTMLTags aTag
= eHTMLTag_unknown
);
304 CCDATASectionToken(const nsAString
& aString
);
// Consume lets the token take part in parsing; it is handed a character,
// the scanner and a mode value and reports an nsresult.
305 virtual nsresult
Consume(PRUnichar aChar
,nsScanner
& aScanner
,PRInt32 aMode
);
306 virtual PRInt32
GetTokenType(void);
307 virtual const nsSubstring
& GetStringValue(void);
315 * Declaration tokens contain raw unescaped text content (not really, but
316 * right now we use this only for view source).
317 * XXX Not really an HTML construct - maybe we need a separation
// Markup-declaration token; its text is kept in mTextValue.
320 class CMarkupDeclToken
: public CHTMLToken
{
325 CMarkupDeclToken(const nsAString
& aString
);
// Consume lets the token take part in parsing; it is handed a character,
// the scanner and a mode value and reports an nsresult.
326 virtual nsresult
Consume(PRUnichar aChar
,nsScanner
& aScanner
,PRInt32 aMode
);
327 virtual PRInt32
GetTokenType(void);
328 virtual const nsSubstring
& GetStringValue(void);
// The declaration's text.
331 nsScannerSubstring mTextValue
;
336 * Attribute tokens are used to contain attribute key/value
337 * pairs wherever they may occur. Typically, they should
338 * occur only in start tokens. However, we may expand that
339 * ability when XML tokens become commonplace.
341 * @update gess 3/25/98
// Attribute token holding a key (mTextKey) and a value (mTextValue).
343 class CAttributeToken
: public CHTMLToken
{
// Construct from a single string, or from a key together with a string.
348 CAttributeToken(const nsAString
& aString
);
349 CAttributeToken(const nsAString
& aKey
, const nsAString
& aString
);
// Destructor with an empty body.
350 ~CAttributeToken() {}
// Consume lets the token take part in parsing; it is handed a character,
// the scanner and a mode value and reports an nsresult.
351 virtual nsresult
Consume(PRUnichar aChar
,nsScanner
& aScanner
,PRInt32 aMode
);
352 virtual PRInt32
GetTokenType(void);
// Inline, non-virtual: returns the key via mTextKey.AsString().
353 const nsSubstring
& GetKey(void) { return mTextKey
.AsString(); }
// Two ways to set the key: from a string, or from a scanner plus a
// start/end iterator pair.
354 virtual void SetKey(const nsAString
& aKey
);
355 virtual void BindKey(nsScanner
* aScanner
, nsScannerIterator
& aStart
,
356 nsScannerIterator
& aEnd
);
// Inline, non-virtual: returns the value via mTextValue.str().
357 const nsSubstring
& GetValue(void) {return mTextValue
.str();}
// String accessors. Judging by the names, GetSource fills anOutputString
// and AppendSourceTo appends to it -- TODO confirm in the implementation.
358 virtual const nsSubstring
& GetStringValue(void);
359 virtual void GetSource(nsString
& anOutputString
);
360 virtual void AppendSourceTo(nsAString
& anOutputString
);
// NOTE(review): presumably set for attributes written as `key=` with
// nothing after the equals sign -- confirm in the tokenizer.
362 PRPackedBool mHasEqualWithoutValue
;
// Value and key use different substring classes (shared vs. plain), hence
// the different accessors (str() vs. AsString()) in GetValue/GetKey.
364 nsScannerSharedSubstring mTextValue
;
365 nsScannerSubstring mTextKey
;
370 * Newline tokens contain, you guessed it, newlines.
371 * They consume newline (CR/LF) either alone or in pairs.
373 * @update gess 3/25/98
// Newline token, offering the basic token interface plus two static
// housekeeping functions.
375 class CNewlineToken
: public CHTMLToken
{
// Consume lets the token take part in parsing; it is handed a character,
// the scanner and a mode value and reports an nsresult.
380 virtual nsresult
Consume(PRUnichar aChar
,nsScanner
& aScanner
,PRInt32 aMode
);
381 virtual PRInt32
GetTokenType(void);
382 virtual const nsSubstring
& GetStringValue(void);
// Static, so they act class-wide rather than on one token.
// NOTE(review): presumably they create and destroy a newline string
// shared by all instances -- confirm in the .cpp.
384 static void AllocNewline();
385 static void FreeNewline();
390 * Instruction tokens represent instructions found in the
391 * input (presumably processing instructions of the form
392 * <?...>; confirm against the tokenizer).
394 * @update gess 3/25/98
// Instruction token, offering only the basic token interface.
396 class CInstructionToken
: public CHTMLToken
{
401 CInstructionToken(const nsAString
& aString
);
// Consume lets the token take part in parsing; it is handed a character,
// the scanner and a mode value and reports an nsresult.
402 virtual nsresult
Consume(PRUnichar aChar
,nsScanner
& aScanner
,PRInt32 aMode
);
403 virtual PRInt32
GetTokenType(void);
404 virtual const nsSubstring
& GetStringValue(void);
412 * This token is generated by the HTML and Expat tokenizers
413 * when they see the doctype declaration ("<!DOCTYPE ... >")
// Token for the doctype declaration. Unlike the other token classes in
// this file it also declares a setter for its string value.
417 class CDoctypeDeclToken
: public CHTMLToken
{
// Construct from a tag id alone, or from a string plus a tag id; the tag
// id is eHTMLTag_unknown when omitted in both forms.
421 CDoctypeDeclToken(eHTMLTags aTag
=eHTMLTag_unknown
);
422 CDoctypeDeclToken(const nsAString
& aString
,eHTMLTags aTag
=eHTMLTag_unknown
);
// Consume lets the token take part in parsing; it is handed a character,
// the scanner and a mode value and reports an nsresult.
423 virtual nsresult
Consume(PRUnichar aChar
,nsScanner
& aScanner
,PRInt32 aMode
);
424 virtual PRInt32
GetTokenType(void);
// Getter and setter for the token's string value.
425 virtual const nsSubstring
& GetStringValue(void);
426 virtual void SetStringValue(const nsAString
& aStr
);