Bug 470455 - test_database_sync_embed_visits.js leaks, r=sdwilsh
[wine-gecko.git] / parser / htmlparser / src / nsHTMLTokens.cpp
blobee553a0f80bc9de7524f9254940f66d6754b8027
1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=2 sw=2 et tw=78: */
3 /* ***** BEGIN LICENSE BLOCK *****
4 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
6 * The contents of this file are subject to the Mozilla Public License Version
7 * 1.1 (the "License"); you may not use this file except in compliance with
8 * the License. You may obtain a copy of the License at
9 * http://www.mozilla.org/MPL/
11 * Software distributed under the License is distributed on an "AS IS" basis,
12 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
13 * for the specific language governing rights and limitations under the
14 * License.
16 * The Original Code is mozilla.org code.
18 * The Initial Developer of the Original Code is
19 * Netscape Communications Corporation.
20 * Portions created by the Initial Developer are Copyright (C) 1998
21 * the Initial Developer. All Rights Reserved.
23 * Contributor(s):
24 * Blake Kaplan <mrbkap@gmail.com>
26 * Alternatively, the contents of this file may be used under the terms of
27 * either of the GNU General Public License Version 2 or later (the "GPL"),
28 * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
29 * in which case the provisions of the GPL or the LGPL are applicable instead
30 * of those above. If you wish to allow use of your version of this file only
31 * under the terms of either the GPL or the LGPL, and not to allow others to
32 * use your version of this file under the terms of the MPL, indicate your
33 * decision by deleting the provisions above and replace them with the notice
34 * and other provisions required by the GPL or the LGPL. If you do not delete
35 * the provisions above, a recipient may use your version of this file under
36 * the terms of any one of the MPL, the GPL or the LGPL.
38 * ***** END LICENSE BLOCK ***** */
40 #include <ctype.h>
41 #include <time.h>
42 #include <stdio.h>
43 #include "nsScanner.h"
44 #include "nsToken.h"
45 #include "nsHTMLTokens.h"
46 #include "prtypes.h"
47 #include "nsDebug.h"
48 #include "nsHTMLTags.h"
49 #include "nsHTMLEntities.h"
50 #include "nsCRT.h"
51 #include "nsReadableUtils.h"
52 #include "nsUnicharUtils.h"
53 #include "nsScanner.h"
56 static const PRUnichar sUserdefined[] = {'u', 's', 'e', 'r', 'd', 'e', 'f',
57 'i', 'n', 'e', 'd', 0};
59 static const PRUnichar kAttributeTerminalChars[] = {
60 PRUnichar('&'), PRUnichar('\t'), PRUnichar('\n'),
61 PRUnichar('\r'), PRUnichar(' '), PRUnichar('>'),
62 PRUnichar(0)
65 static void AppendNCR(nsSubstring& aString, PRInt32 aNCRValue);
66 /**
67 * Consumes an entity from aScanner and expands it into aString.
69 * @param aString The target string to append the entity to.
70 * @param aScanner Controller of underlying input source
71 * @param aIECompatible Controls whether we respect entities with values >
72 * 255 and no terminating semicolon.
73 * @param aFlag If NS_IPARSER_FLAG_VIEW_SOURCE do not reduce entities...
74 * @return error result
76 static nsresult
77 ConsumeEntity(nsScannerSharedSubstring& aString,
78 nsScanner& aScanner,
79 PRBool aIECompatible,
80 PRInt32 aFlag)
82 nsresult result = NS_OK;
84 PRUnichar ch;
85 result = aScanner.Peek(ch, 1);
87 if (NS_SUCCEEDED(result)) {
88 PRUnichar amp = 0;
89 PRInt32 theNCRValue = 0;
90 nsAutoString entity;
92 if (nsCRT::IsAsciiAlpha(ch) && !(aFlag & NS_IPARSER_FLAG_VIEW_SOURCE)) {
93 result = CEntityToken::ConsumeEntity(ch, entity, aScanner);
94 if (NS_SUCCEEDED(result)) {
95 theNCRValue = nsHTMLEntities::EntityToUnicode(entity);
96 PRUnichar theTermChar = entity.Last();
97 // If an entity value is greater than 255 then:
98 // Nav 4.x does not treat it as an entity,
99 // IE treats it as an entity if terminated with a semicolon.
100 // Resembling IE!!
102 nsSubstring &writable = aString.writable();
103 if (theNCRValue < 0 ||
104 (aIECompatible && theNCRValue > 255 && theTermChar != ';')) {
105 // Looks like we're not dealing with an entity
106 writable.Append(kAmpersand);
107 writable.Append(entity);
108 } else {
109 // A valid entity so reduce it.
110 writable.Append(PRUnichar(theNCRValue));
113 } else if (ch == kHashsign && !(aFlag & NS_IPARSER_FLAG_VIEW_SOURCE)) {
114 result = CEntityToken::ConsumeEntity(ch, entity, aScanner);
115 if (NS_SUCCEEDED(result)) {
116 nsSubstring &writable = aString.writable();
117 if (result == NS_HTMLTOKENS_NOT_AN_ENTITY) {
118 // Looked like an entity but it's not
119 aScanner.GetChar(amp);
120 writable.Append(amp);
121 result = NS_OK;
122 } else {
123 PRInt32 err;
124 theNCRValue = entity.ToInteger(&err, kAutoDetect);
125 AppendNCR(writable, theNCRValue);
128 } else {
129 // What we thought as entity is not really an entity...
130 aScanner.GetChar(amp);
131 aString.writable().Append(amp);
135 return result;
139 * This general purpose method is used when you want to
140 * consume attributed text value.
141 * Note: It also reduces entities.
143 * @param aNewlineCount -- the newline count to increment when hitting newlines
144 * @param aScanner -- controller of underlying input source
145 * @param aTerminalChars -- characters that stop consuming attribute.
146 * @param aAllowNewlines -- whether to allow newlines in the value.
147 * XXX it would be nice to roll this info into
148 * aTerminalChars somehow....
149 * @param aIECompatEntities IE treats entities with values > 255 as
150 * entities only if they're terminated with a
151 * semicolon. This is true to follow that behavior
152 * and false to treat all values as entities.
153 * @param aFlag - contains information such as |dtd mode|view mode|doctype|etc...
154 * @return error result
156 static nsresult
157 ConsumeUntil(nsScannerSharedSubstring& aString,
158 PRInt32& aNewlineCount,
159 nsScanner& aScanner,
160 const nsReadEndCondition& aEndCondition,
161 PRBool aAllowNewlines,
162 PRBool aIECompatEntities,
163 PRInt32 aFlag)
165 nsresult result = NS_OK;
166 PRBool done = PR_FALSE;
168 do {
169 result = aScanner.ReadUntil(aString, aEndCondition, PR_FALSE);
170 if (NS_SUCCEEDED(result)) {
171 PRUnichar ch;
172 aScanner.Peek(ch);
173 if (ch == kAmpersand) {
174 result = ConsumeEntity(aString, aScanner, aIECompatEntities, aFlag);
175 } else if (ch == kCR && aAllowNewlines) {
176 aScanner.GetChar(ch);
177 result = aScanner.Peek(ch);
178 if (NS_SUCCEEDED(result)) {
179 nsSubstring &writable = aString.writable();
180 if (ch == kNewLine) {
181 writable.AppendLiteral("\r\n");
182 aScanner.GetChar(ch);
183 } else {
184 writable.Append(PRUnichar('\r'));
186 ++aNewlineCount;
188 } else if (ch == kNewLine && aAllowNewlines) {
189 aScanner.GetChar(ch);
190 aString.writable().Append(PRUnichar('\n'));
191 ++aNewlineCount;
192 } else {
193 done = PR_TRUE;
196 } while (NS_SUCCEEDED(result) && !done);
198 return result;
201 /**************************************************************
202 And now for the token classes...
203 **************************************************************/
206 * Constructor from tag id
208 CHTMLToken::CHTMLToken(eHTMLTags aTag)
209 : CToken(aTag)
214 CHTMLToken::~CHTMLToken()
219 * Constructor from tag id
221 CStartToken::CStartToken(eHTMLTags aTag)
222 : CHTMLToken(aTag)
224 mEmpty = PR_FALSE;
225 mContainerInfo = eFormUnknown;
226 #ifdef DEBUG
227 mAttributed = PR_FALSE;
228 #endif
231 CStartToken::CStartToken(const nsAString& aName)
232 : CHTMLToken(eHTMLTag_unknown)
234 mEmpty = PR_FALSE;
235 mContainerInfo = eFormUnknown;
236 mTextValue.Assign(aName);
237 #ifdef DEBUG
238 mAttributed = PR_FALSE;
239 #endif
242 CStartToken::CStartToken(const nsAString& aName, eHTMLTags aTag)
243 : CHTMLToken(aTag)
245 mEmpty = PR_FALSE;
246 mContainerInfo = eFormUnknown;
247 mTextValue.Assign(aName);
248 #ifdef DEBUG
249 mAttributed = PR_FALSE;
250 #endif
254 * This method returns the typeid (the tag type) for this token.
256 PRInt32
257 CStartToken::GetTypeID()
259 if (eHTMLTag_unknown == mTypeID) {
260 mTypeID = nsHTMLTags::LookupTag(mTextValue);
262 return mTypeID;
265 PRInt32
266 CStartToken::GetTokenType()
268 return eToken_start;
271 void
272 CStartToken::SetEmpty(PRBool aValue)
274 mEmpty = aValue;
277 PRBool
278 CStartToken::IsEmpty()
280 return mEmpty;
284 * Consume the identifier portion of the start tag
286 nsresult
287 CStartToken::Consume(PRUnichar aChar, nsScanner& aScanner, PRInt32 aFlag)
289 // If you're here, we've already Consumed the < char, and are
290 // ready to Consume the rest of the open tag identifier.
291 // Stop consuming as soon as you see a space or a '>'.
292 // NOTE: We don't Consume the tag attributes here, nor do we eat the ">"
294 nsresult result = NS_OK;
295 nsScannerSharedSubstring tagIdent;
297 if (aFlag & NS_IPARSER_FLAG_HTML) {
298 result = aScanner.ReadTagIdentifier(tagIdent);
299 mTypeID = (PRInt32)nsHTMLTags::LookupTag(tagIdent.str());
300 // Save the original tag string if this is user-defined or if we
301 // are viewing source
302 if (eHTMLTag_userdefined == mTypeID ||
303 (aFlag & NS_IPARSER_FLAG_VIEW_SOURCE)) {
304 mTextValue = tagIdent.str();
306 } else {
307 result = aScanner.ReadTagIdentifier(tagIdent);
308 mTextValue = tagIdent.str();
309 mTypeID = nsHTMLTags::LookupTag(mTextValue);
312 if (NS_SUCCEEDED(result) && !(aFlag & NS_IPARSER_FLAG_VIEW_SOURCE)) {
313 result = aScanner.SkipWhitespace(mNewlineCount);
316 if (kEOF == result && !aScanner.IsIncremental()) {
317 // Take what we can get.
318 result = NS_OK;
321 return result;
324 const nsSubstring&
325 CStartToken::GetStringValue()
327 if (eHTMLTag_unknown < mTypeID && mTypeID < eHTMLTag_text) {
328 if (!mTextValue.Length()) {
329 mTextValue.Assign(nsHTMLTags::GetStringValue((nsHTMLTag) mTypeID));
332 return mTextValue;
335 void
336 CStartToken::GetSource(nsString& anOutputString)
338 anOutputString.Truncate();
339 AppendSourceTo(anOutputString);
342 void
343 CStartToken::AppendSourceTo(nsAString& anOutputString)
345 anOutputString.Append(PRUnichar('<'));
347 * Watch out for Bug 15204
349 if (!mTextValue.IsEmpty()) {
350 anOutputString.Append(mTextValue);
351 } else {
352 anOutputString.Append(GetTagName(mTypeID));
355 anOutputString.Append(PRUnichar('>'));
358 CEndToken::CEndToken(eHTMLTags aTag)
359 : CHTMLToken(aTag)
363 CEndToken::CEndToken(const nsAString& aName)
364 : CHTMLToken(eHTMLTag_unknown)
366 mTextValue.Assign(aName);
369 CEndToken::CEndToken(const nsAString& aName, eHTMLTags aTag)
370 : CHTMLToken(aTag)
372 mTextValue.Assign(aName);
375 nsresult
376 CEndToken::Consume(PRUnichar aChar, nsScanner& aScanner, PRInt32 aFlag)
378 nsresult result = NS_OK;
379 nsScannerSharedSubstring tagIdent;
381 if (aFlag & NS_IPARSER_FLAG_HTML) {
382 result = aScanner.ReadTagIdentifier(tagIdent);
384 mTypeID = (PRInt32)nsHTMLTags::LookupTag(tagIdent.str());
385 // Save the original tag string if this is user-defined or if we
386 // are viewing source
387 if (eHTMLTag_userdefined == mTypeID ||
388 (aFlag & NS_IPARSER_FLAG_VIEW_SOURCE)) {
389 mTextValue = tagIdent.str();
391 } else {
392 result = aScanner.ReadTagIdentifier(tagIdent);
393 mTextValue = tagIdent.str();
394 mTypeID = nsHTMLTags::LookupTag(mTextValue);
397 if (NS_SUCCEEDED(result) && !(aFlag & NS_IPARSER_FLAG_VIEW_SOURCE)) {
398 result = aScanner.SkipWhitespace(mNewlineCount);
401 if (kEOF == result && !aScanner.IsIncremental()) {
402 // Take what we can get.
403 result = NS_OK;
406 return result;
411 * Asks the token to determine the <i>HTMLTag type</i> of
412 * the token. This turns around and looks up the tag name
413 * in the tag dictionary.
415 PRInt32
416 CEndToken::GetTypeID()
418 if (eHTMLTag_unknown == mTypeID) {
419 mTypeID = nsHTMLTags::LookupTag(mTextValue);
420 switch (mTypeID) {
421 case eHTMLTag_dir:
422 case eHTMLTag_menu:
423 mTypeID = eHTMLTag_ul;
424 break;
426 default:
427 break;
431 return mTypeID;
434 PRInt32
435 CEndToken::GetTokenType()
437 return eToken_end;
440 const nsSubstring&
441 CEndToken::GetStringValue()
443 if (eHTMLTag_unknown < mTypeID && mTypeID < eHTMLTag_text) {
444 if (!mTextValue.Length()) {
445 mTextValue.Assign(nsHTMLTags::GetStringValue((nsHTMLTag) mTypeID));
448 return mTextValue;
451 void
452 CEndToken::GetSource(nsString& anOutputString)
454 anOutputString.Truncate();
455 AppendSourceTo(anOutputString);
458 void
459 CEndToken::AppendSourceTo(nsAString& anOutputString)
461 anOutputString.AppendLiteral("</");
462 if (!mTextValue.IsEmpty()) {
463 anOutputString.Append(mTextValue);
464 } else {
465 anOutputString.Append(GetTagName(mTypeID));
468 anOutputString.Append(PRUnichar('>'));
471 CTextToken::CTextToken()
472 : CHTMLToken(eHTMLTag_text)
476 CTextToken::CTextToken(const nsAString& aName)
477 : CHTMLToken(eHTMLTag_text)
479 mTextValue.Rebind(aName);
482 PRInt32
483 CTextToken::GetTokenType()
485 return eToken_text;
488 PRInt32
489 CTextToken::GetTextLength()
491 return mTextValue.Length();
494 nsresult
495 CTextToken::Consume(PRUnichar aChar, nsScanner& aScanner, PRInt32 aFlag)
497 static const PRUnichar theTerminalsChars[] =
498 { PRUnichar('\n'), PRUnichar('\r'), PRUnichar('&'), PRUnichar('<'),
499 PRUnichar(0) };
500 static const nsReadEndCondition theEndCondition(theTerminalsChars);
501 nsresult result = NS_OK;
502 PRBool done = PR_FALSE;
503 nsScannerIterator origin, start, end;
505 // Start scanning after the first character, because we know it to
506 // be part of this text token (we wouldn't have come here if it weren't)
507 aScanner.CurrentPosition(origin);
508 start = origin;
509 aScanner.EndReading(end);
511 NS_ASSERTION(start != end, "Calling CTextToken::Consume when already at the "
512 "end of a document is a bad idea.");
514 aScanner.SetPosition(++start);
516 while (NS_OK == result && !done) {
517 result = aScanner.ReadUntil(start, end, theEndCondition, PR_FALSE);
518 if (NS_OK == result) {
519 result = aScanner.Peek(aChar);
521 if (NS_OK == result && (kCR == aChar || kNewLine == aChar)) {
522 switch (aChar) {
523 case kCR:
525 // It's a carriage return. See if this is part of a CR-LF pair (in
526 // which case we need to treat it as one newline). If we're at the
527 // edge of a packet, then leave the CR on the scanner, since it
528 // could still be part of a CR-LF pair. Otherwise, it isn't.
529 PRUnichar theNextChar;
530 result = aScanner.Peek(theNextChar, 1);
532 if (result == kEOF && aScanner.IsIncremental()) {
533 break;
536 if (NS_SUCCEEDED(result)) {
537 // Actually get the carriage return.
538 aScanner.GetChar(aChar);
541 if (kLF == theNextChar) {
542 // If the "\r" is followed by a "\n", don't replace it and let
543 // it be ignored by the layout system.
544 end.advance(2);
545 aScanner.GetChar(theNextChar);
546 } else {
547 // If it is standalone, replace the "\r" with a "\n" so that it
548 // will be considered by the layout system.
549 aScanner.ReplaceCharacter(end, kLF);
550 ++end;
552 ++mNewlineCount;
553 break;
555 case kLF:
556 aScanner.GetChar(aChar);
557 ++end;
558 ++mNewlineCount;
559 break;
561 } else {
562 done = PR_TRUE;
567 // Note: This function is only called from nsHTMLTokenizer::ConsumeText. If
568 // we return an error result from the final buffer, then it is responsible
569 // for turning it into an NS_OK result.
570 aScanner.BindSubstring(mTextValue, origin, end);
572 return result;
576 * Consume as much clear text from scanner as possible.
577 * The scanner is left on the < of the perceived end tag.
579 * @param aChar -- last char consumed from stream
580 * @param aConservativeConsume -- controls our handling of content with no
581 * terminating string.
582 * @param aIgnoreComments -- whether or not we should take comments into
583 * account in looking for the end tag.
584 * @param aScanner -- controller of underlying input source
585 * @param aEndTagname -- the terminal tag name.
586 * @param aFlag -- dtd modes and such.
587 * @param aFlushTokens -- PR_TRUE if we found the terminal tag.
588 * @return error result
590 nsresult
591 CTextToken::ConsumeCharacterData(PRBool aIgnoreComments,
592 nsScanner& aScanner,
593 const nsAString& aEndTagName,
594 PRInt32 aFlag,
595 PRBool& aFlushTokens)
597 nsresult result = NS_OK;
598 nsScannerIterator theStartOffset, theCurrOffset, theTermStrPos,
599 theStartCommentPos, theAltTermStrPos, endPos;
600 PRBool done = PR_FALSE;
601 PRBool theLastIteration = PR_FALSE;
603 aScanner.CurrentPosition(theStartOffset);
604 theCurrOffset = theStartOffset;
605 aScanner.EndReading(endPos);
606 theTermStrPos = theStartCommentPos = theAltTermStrPos = endPos;
608 // ALGORITHM: *** The performance is based on correctness of the document ***
609 // 1. Look for a '<' character. This could be
610 // a) Start of a comment (<!--),
611 // b) Start of the terminal string, or
612 // c) a start of a tag.
613 // We are interested in a) and b). c) is ignored because in CDATA we
614 // don't care for tags.
615 // NOTE: Technically speaking in CDATA we should ignore the comments too!
616 // But for compatibility we don't.
617 // 2. Having the offset, for '<', search for the terminal string from there
618 // on and record its offset.
619 // 3. From the same '<' offset also search for start of a comment '<!--'.
620 // If found search for end comment '-->' between the terminal string and
621 // '<!--'. If you did not find the end comment, then we have a malformed
622 // document, i.e., this section has a prematured terminal string Ex.
623 // <SCRIPT><!-- document.write('</SCRIPT>') //--> </SCRIPT>. But record
624 // terminal string's offset if this is the first premature terminal
625 // string, and update the current offset to the terminal string
626 // (prematured) offset and goto step 1.
627 // 4. Amen...If you found a terminal string and '-->'. Otherwise goto step 1.
628 // 5. If the end of the document is reached and if we still don't have the
629 // condition in step 4. then assume that the prematured terminal string
630 // is the actual terminal string and goto step 1. This will be our last
631 // iteration. If there is no premature terminal string and we're being
632 // conservative in our consumption (aConservativeConsume), then don't
633 // consume anything from the scanner. Otherwise, we consume all the way
634 // until the end.
636 NS_NAMED_LITERAL_STRING(ltslash, "</");
637 const nsString theTerminalString = ltslash + aEndTagName;
639 PRUint32 termStrLen = theTerminalString.Length();
640 while (result == NS_OK && !done) {
641 PRBool found = PR_FALSE;
642 nsScannerIterator gtOffset, ltOffset = theCurrOffset;
643 while (FindCharInReadable(PRUnichar(kLessThan), ltOffset, endPos) &&
644 ((PRUint32)ltOffset.size_forward() >= termStrLen ||
645 Distance(ltOffset, endPos) >= termStrLen)) {
646 // Make a copy of the (presumed) end tag and
647 // do a case-insensitive comparison
649 nsScannerIterator start(ltOffset), end(ltOffset);
650 end.advance(termStrLen);
652 if (CaseInsensitiveFindInReadable(theTerminalString, start, end) &&
653 (end == endPos || (*end == '>' || *end == ' ' ||
654 *end == '\t' || *end == '\n' ||
655 *end == '\r'))) {
656 gtOffset = end;
657 // Note that aIgnoreComments is only not set for <script>. We don't
658 // want to execute scripts that aren't in the form of: <script\s.*>
659 if ((end == endPos && aIgnoreComments) ||
660 FindCharInReadable(PRUnichar(kGreaterThan), gtOffset, endPos)) {
661 found = PR_TRUE;
662 theTermStrPos = start;
664 break;
666 ltOffset.advance(1);
669 if (found && theTermStrPos != endPos) {
670 if (!(aFlag & NS_IPARSER_FLAG_STRICT_MODE) &&
671 !theLastIteration && !aIgnoreComments) {
672 nsScannerIterator endComment(ltOffset);
673 endComment.advance(5);
675 if ((theStartCommentPos == endPos) &&
676 FindInReadable(NS_LITERAL_STRING("<!--"), theCurrOffset,
677 endComment)) {
678 theStartCommentPos = theCurrOffset;
681 if (theStartCommentPos != endPos) {
682 // Search for --> between <!-- and </TERMINALSTRING>.
683 theCurrOffset = theStartCommentPos;
684 nsScannerIterator terminal(theTermStrPos);
685 if (!RFindInReadable(NS_LITERAL_STRING("-->"),
686 theCurrOffset, terminal)) {
687 // If you're here it means that we have a bogus terminal string.
688 // Even though it is bogus, the position of the terminal string
689 // could be helpful in case we hit the rock bottom.
690 if (theAltTermStrPos == endPos) {
691 // But we only want to remember the first bogus terminal string.
692 theAltTermStrPos = theTermStrPos;
695 // We did not find '-->' so keep searching for terminal string.
696 theCurrOffset = theTermStrPos;
697 theCurrOffset.advance(termStrLen);
698 continue;
703 aScanner.BindSubstring(mTextValue, theStartOffset, theTermStrPos);
704 aScanner.SetPosition(ltOffset);
706 // We found </SCRIPT> or </STYLE>...permit flushing -> Ref: Bug 22485
707 aFlushTokens = PR_TRUE;
708 done = PR_TRUE;
709 } else {
710 // We end up here if:
711 // a) when the buffer runs out ot data.
712 // b) when the terminal string is not found.
713 if (!aScanner.IsIncremental()) {
714 if (theAltTermStrPos != endPos) {
715 // If you're here it means that we hit the rock bottom and therefore
716 // switch to plan B, since we have an alternative terminating string.
717 theCurrOffset = theAltTermStrPos;
718 theLastIteration = PR_TRUE;
719 } else {
720 // Oops, We fell all the way down to the end of the document.
721 done = PR_TRUE; // Do this to fix Bug. 35456
722 result = kFakeEndTag;
723 aScanner.BindSubstring(mTextValue, theStartOffset, endPos);
724 aScanner.SetPosition(endPos);
726 } else {
727 result = kEOF;
732 if (result == NS_OK) {
733 mNewlineCount = mTextValue.CountChar(kNewLine);
736 return result;
740 * Consume as much clear text from scanner as possible. Reducing entities.
741 * The scanner is left on the < of the perceived end tag.
743 * @param aChar -- last char consumed from stream
744 * @param aConservativeConsume -- controls our handling of content with no
745 * terminating string.
746 * @param aScanner -- controller of underlying input source
747 * @param aEndTagname -- the terminal tag name.
748 * @param aFlag -- dtd modes and such.
749 * @param aFlushTokens -- PR_TRUE if we found the terminal tag.
750 * @return error result
752 nsresult
753 CTextToken::ConsumeParsedCharacterData(PRBool aDiscardFirstNewline,
754 PRBool aConservativeConsume,
755 nsScanner& aScanner,
756 const nsAString& aEndTagName,
757 PRInt32 aFlag,
758 PRBool& aFound)
760 // This function is fairly straightforward except if there is no terminating
761 // string. If there is, we simply loop through all of the entities, reducing
762 // them as necessary and skipping over non-terminal strings starting with <.
763 // If there is *no* terminal string, then we examine aConservativeConsume.
764 // If we want to be conservative, we backtrack to the first place in the
765 // document that looked like the end of PCDATA (i.e., the first tag). This
766 // is for compatibility and so we don't regress bug 42945. If we are not
767 // conservative, then we consume everything, all the way up to the end of
768 // the document.
770 static const PRUnichar terminalChars[] = {
771 PRUnichar('\r'), PRUnichar('\n'), PRUnichar('&'), PRUnichar('<'),
772 PRUnichar(0)
774 static const nsReadEndCondition theEndCondition(terminalChars);
776 nsScannerIterator currPos, endPos, altEndPos;
777 PRUint32 truncPos = 0;
778 aScanner.CurrentPosition(currPos);
779 aScanner.EndReading(endPos);
781 altEndPos = endPos;
783 nsScannerSharedSubstring theContent;
784 PRUnichar ch = 0;
786 NS_NAMED_LITERAL_STRING(commentStart, "<!--");
787 NS_NAMED_LITERAL_STRING(ltslash, "</");
788 const nsString theTerminalString = ltslash + aEndTagName;
789 PRUint32 termStrLen = theTerminalString.Length();
790 PRUint32 commentStartLen = commentStart.Length();
792 nsresult result = NS_OK;
794 // Note that if we're already at the end of the document, the ConsumeUntil
795 // will fail, and we'll do the right thing.
796 do {
797 result = ConsumeUntil(theContent, mNewlineCount, aScanner,
798 theEndCondition, PR_TRUE, PR_FALSE, aFlag);
800 if (aDiscardFirstNewline &&
801 (NS_SUCCEEDED(result) || !aScanner.IsIncremental()) &&
802 !(aFlag & NS_IPARSER_FLAG_VIEW_SOURCE)) {
803 // Check if the very first character is a newline, and if so discard it.
804 // Note that we don't want to discard it in view source!
805 // Also note that this has to happen here (as opposed to before the
806 // ConsumeUntil) because we have to expand any entities.
807 // XXX It would be nice to be able to do this without calling
808 // writable()!
809 const nsSubstring &firstChunk = theContent.str();
810 if (!firstChunk.IsEmpty()) {
811 PRUint32 where = 0;
812 PRUnichar newline = firstChunk.First();
814 if (newline == kCR || newline == kNewLine) {
815 ++where;
817 if (firstChunk.Length() > 1) {
818 if (newline == kCR && firstChunk.CharAt(1) == kNewLine) {
819 // Handle \r\n = 1 newline.
820 ++where;
822 // Note: \n\r = 2 newlines.
826 if (where != 0) {
827 theContent.writable() = Substring(firstChunk, where);
831 aDiscardFirstNewline = PR_FALSE;
833 if (NS_FAILED(result)) {
834 if (kEOF == result && !aScanner.IsIncremental()) {
835 aFound = PR_TRUE; // this is as good as it gets.
836 result = kFakeEndTag;
838 if (aConservativeConsume && altEndPos != endPos) {
839 // We ran out of room looking for a </title>. Go back to the first
840 // place that looked like a tag and use that as our stopping point.
841 theContent.writable().Truncate(truncPos);
842 aScanner.SetPosition(altEndPos, PR_FALSE, PR_TRUE);
844 // else we take everything we consumed.
845 mTextValue.Rebind(theContent.str());
846 } else {
847 aFound = PR_FALSE;
850 return result;
853 aScanner.CurrentPosition(currPos);
854 aScanner.GetChar(ch); // this character must be '&' or '<'
856 if (ch == kLessThan && altEndPos == endPos) {
857 // Keep this position in case we need it for later.
858 altEndPos = currPos;
859 truncPos = theContent.str().Length();
862 if (Distance(currPos, endPos) >= termStrLen) {
863 nsScannerIterator start(currPos), end(currPos);
864 end.advance(termStrLen);
866 if (CaseInsensitiveFindInReadable(theTerminalString, start, end)) {
867 if (end != endPos && (*end == '>' || *end == ' ' ||
868 *end == '\t' || *end == '\n' ||
869 *end == '\r')) {
870 aFound = PR_TRUE;
871 mTextValue.Rebind(theContent.str());
873 // Note: This SetPosition() is actually going backwards from the
874 // scanner's mCurrentPosition (so we pass aReverse == PR_TRUE). This
875 // is because we call GetChar() above after we get the current
876 // position.
877 aScanner.SetPosition(currPos, PR_FALSE, PR_TRUE);
878 break;
882 // IE only consumes <!-- --> as comments in PCDATA.
883 if (Distance(currPos, endPos) >= commentStartLen) {
884 nsScannerIterator start(currPos), end(currPos);
885 end.advance(commentStartLen);
887 if (CaseInsensitiveFindInReadable(commentStart, start, end)) {
888 CCommentToken consumer; // stack allocated.
890 // CCommentToken expects us to be on the '-'
891 aScanner.SetPosition(currPos.advance(2));
893 // In quirks mode we consume too many things as comments, so pretend
894 // that we're not by modifying aFlag.
895 result = consumer.Consume(*currPos, aScanner,
896 (aFlag & ~NS_IPARSER_FLAG_QUIRKS_MODE) |
897 NS_IPARSER_FLAG_STRICT_MODE);
898 if (kEOF == result) {
899 // This can only happen if we're really out of space.
900 return kEOF;
901 } else if (kNotAComment == result) {
902 // Fall through and consume this as text.
903 aScanner.CurrentPosition(currPos);
904 aScanner.SetPosition(currPos.advance(1));
905 } else {
906 consumer.AppendSourceTo(theContent.writable());
907 mNewlineCount += consumer.GetNewlineCount();
908 continue;
913 result = kEOF;
914 // We did not find the terminal string yet so
915 // include the character that stopped consumption.
916 theContent.writable().Append(ch);
917 } while (currPos != endPos);
919 return result;
922 void
923 CTextToken::CopyTo(nsAString& aStr)
925 nsScannerIterator start, end;
926 mTextValue.BeginReading(start);
927 mTextValue.EndReading(end);
928 CopyUnicodeTo(start, end, aStr);
931 const nsSubstring& CTextToken::GetStringValue()
933 return mTextValue.AsString();
936 void
937 CTextToken::Bind(nsScanner* aScanner, nsScannerIterator& aStart,
938 nsScannerIterator& aEnd)
940 aScanner->BindSubstring(mTextValue, aStart, aEnd);
943 void
944 CTextToken::Bind(const nsAString& aStr)
946 mTextValue.Rebind(aStr);
949 CCDATASectionToken::CCDATASectionToken(eHTMLTags aTag)
950 : CHTMLToken(aTag)
954 CCDATASectionToken::CCDATASectionToken(const nsAString& aName)
955 : CHTMLToken(eHTMLTag_unknown)
957 mTextValue.Assign(aName);
960 PRInt32
961 CCDATASectionToken::GetTokenType()
963 return eToken_cdatasection;
967 * Consume as much marked test from scanner as possible.
968 * Note: This has to handle case: "<![ ! IE 5]>", in addition to "<![..[..]]>"
970 * @param aChar -- last char consumed from stream
971 * @param aScanner -- controller of underlying input source
972 * @return error result
974 nsresult
975 CCDATASectionToken::Consume(PRUnichar aChar, nsScanner& aScanner,
976 PRInt32 aFlag)
978 static const PRUnichar theTerminalsChars[] =
979 { PRUnichar('\r'), PRUnichar('\n'), PRUnichar(']'), PRUnichar(0) };
980 static const nsReadEndCondition theEndCondition(theTerminalsChars);
981 nsresult result = NS_OK;
982 PRBool done = PR_FALSE;
984 while (NS_OK == result && !done) {
985 result = aScanner.ReadUntil(mTextValue, theEndCondition, PR_FALSE);
986 if (NS_OK == result) {
987 result = aScanner.Peek(aChar);
988 if (kCR == aChar && NS_OK == result) {
989 result = aScanner.GetChar(aChar); // Strip off the \r
990 result = aScanner.Peek(aChar); // Then see what's next.
991 if (NS_OK == result) {
992 switch(aChar) {
993 case kCR:
994 result = aScanner.GetChar(aChar); // Strip off the \r
995 mTextValue.AppendLiteral("\n\n");
996 mNewlineCount += 2;
997 break;
999 case kNewLine:
1000 // Which means we saw \r\n, which becomes \n
1001 result = aScanner.GetChar(aChar); // Strip off the \n
1003 // Fall through...
1004 default:
1005 mTextValue.AppendLiteral("\n");
1006 mNewlineCount++;
1007 break;
1010 } else if (kNewLine == aChar) {
1011 result = aScanner.GetChar(aChar);
1012 mTextValue.Append(aChar);
1013 ++mNewlineCount;
1014 } else if (kRightSquareBracket == aChar) {
1015 PRBool canClose = PR_FALSE;
1016 result = aScanner.GetChar(aChar); // Strip off the ]
1017 mTextValue.Append(aChar);
1018 result = aScanner.Peek(aChar); // Then see what's next.
1019 if (NS_OK == result && kRightSquareBracket == aChar) {
1020 result = aScanner.GetChar(aChar); // Strip off the second ]
1021 mTextValue.Append(aChar);
1022 canClose = PR_TRUE;
1025 // The goal here is to not lose data from the page when encountering
1026 // markup like: <![endif]-->. This means that in normal parsing, we
1027 // allow ']' to end the marked section and just drop everything between
1028 // it an the '>'. In view-source mode, we cannot drop things on the
1029 // floor like that. In fact, to make view-source of XML with script in
1030 // CDATA sections at all bearable, we need to somewhat enforce the ']]>'
1031 // terminator for marked sections. So make the tokenization somewhat
1032 // different when in view-source _and_ dealing with a CDATA section.
1033 // XXX We should remember this StringBeginsWith test.
1034 PRBool inCDATA = (aFlag & NS_IPARSER_FLAG_VIEW_SOURCE) &&
1035 StringBeginsWith(mTextValue, NS_LITERAL_STRING("[CDATA["));
1036 if (inCDATA) {
1037 // Consume all right square brackets to catch cases such as:
1038 // <![CDATA[foo]]]>
1039 while (true) {
1040 result = aScanner.Peek(aChar);
1041 if (result != NS_OK || aChar != kRightSquareBracket) {
1042 break;
1045 mTextValue.Append(aChar);
1046 aScanner.GetChar(aChar);
1048 } else {
1049 nsAutoString dummy; // Skip any bad data
1050 result = aScanner.ReadUntil(dummy, kGreaterThan, PR_FALSE);
1052 if (NS_OK == result &&
1053 (!inCDATA || (canClose && kGreaterThan == aChar))) {
1054 result = aScanner.GetChar(aChar); // Strip off the >
1055 done = PR_TRUE;
1057 } else {
1058 done = PR_TRUE;
1063 if (kEOF == result && !aScanner.IsIncremental()) {
1064 // We ran out of space looking for the end of this CDATA section.
1065 // In order to not completely lose the entire section, treat everything
1066 // until the end of the document as part of the CDATA section and let
1067 // the DTD handle it.
1068 mInError = PR_TRUE;
1069 result = NS_OK;
1072 return result;
1075 const nsSubstring&
1076 CCDATASectionToken::GetStringValue()
1078 return mTextValue;
1082 CMarkupDeclToken::CMarkupDeclToken()
1083 : CHTMLToken(eHTMLTag_markupDecl)
1087 CMarkupDeclToken::CMarkupDeclToken(const nsAString& aName)
1088 : CHTMLToken(eHTMLTag_markupDecl)
1090 mTextValue.Rebind(aName);
1093 PRInt32
1094 CMarkupDeclToken::GetTokenType()
1096 return eToken_markupDecl;
/**
 * Consume as much declaration from scanner as possible.
 * Declaration is a markup declaration of ELEMENT, ATTLIST, ENTITY or
 * NOTATION, which can span multiple lines and ends in >.
 *
 * @param aChar -- last char consumed from stream
 * @param aScanner -- controller of underlying input source
 * @return error result
 */
// Scans forward from the current scanner position until an unquoted '>' is
// found, then binds mTextValue to everything from the starting position up
// to (not including) that '>' and leaves the scanner just past it.
// aFlag is not consulted by this function.
// On kEOF, mInError is set; the EOF is turned into NS_OK only when the
// scanner is not incremental.
1108 nsresult
1109 CMarkupDeclToken::Consume(PRUnichar aChar, nsScanner& aScanner,
1110 PRInt32 aFlag)
// Characters at which ReadUntil stops: line breaks (counted), quote
// characters (tracked in `quote`) and '>' (possible terminator).
1112 static const PRUnichar theTerminalsChars[] =
1113 { PRUnichar('\n'), PRUnichar('\r'), PRUnichar('\''), PRUnichar('"'),
1114 PRUnichar('>'),
1115 PRUnichar(0) };
1116 static const nsReadEndCondition theEndCondition(theTerminalsChars);
1117 nsresult result = NS_OK;
1118 PRBool done = PR_FALSE;
// Holds the currently open quote character, or 0 when outside a quoted run.
1119 PRUnichar quote = 0;
// `origin` marks where the bound text begins; `start`/`end` delimit the
// piece examined on each pass of the loop.
1121 nsScannerIterator origin, start, end;
1122 aScanner.CurrentPosition(origin);
1123 start = origin;
1125 while (NS_OK == result && !done) {
1126 aScanner.SetPosition(start);
1127 result = aScanner.ReadUntil(start, end, theEndCondition, PR_FALSE);
1128 if (NS_OK == result) {
1129 result = aScanner.Peek(aChar);
1131 if (NS_OK == result) {
1132 PRUnichar theNextChar = 0;
1133 if (kCR == aChar || kNewLine == aChar) {
1134 result = aScanner.GetChar(aChar); // Strip off the char
1135 result = aScanner.Peek(theNextChar); // Then see what's next.
1137 switch(aChar) {
1138 case kCR:
1139 // result = aScanner.GetChar(aChar);
1140 if (kLF == theNextChar) {
1141 // If the "\r" is followed by a "\n", don't replace it and
1142 // let it be ignored by the layout system
1143 end.advance(2);
1144 result = aScanner.GetChar(theNextChar);
1145 } else {
1146 // If it is standalone, replace the "\r" with a "\n" so that
1147 // it will be considered by the layout system
1148 aScanner.ReplaceCharacter(end, kLF);
1149 ++end;
1151 ++mNewlineCount;
1152 break;
1153 case kLF:
1154 ++end;
1155 ++mNewlineCount;
1156 break;
1157 case '\'':
1158 case '"':
// A quote character either closes the open quote (if it matches), opens a
// new one (if none is open), or is ordinary content inside the other kind.
1159 ++end;
1160 if (quote) {
1161 if (quote == aChar) {
1162 quote = 0;
1164 } else {
1165 quote = aChar;
1167 break;
1168 case kGreaterThan:
// Inside quotes '>' is content; otherwise it ends the declaration.
1169 if (quote) {
1170 ++end;
1171 } else {
1172 start = end;
1173 // Note that start is wrong after this, we just avoid temp var
1174 ++start;
1175 aScanner.SetPosition(start); // Skip the >
1176 done = PR_TRUE;
1178 break;
1179 default:
1180 NS_ABORT_IF_FALSE(0, "should not happen, switch is missing cases?");
1181 break;
1183 start = end;
1184 } else {
1185 done = PR_TRUE;
// `end` still points at the terminating '>' (when one was found), so the
// '>' itself is not part of the bound text.
1189 aScanner.BindSubstring(mTextValue, origin, end);
1191 if (kEOF == result) {
1192 mInError = PR_TRUE;
1193 if (!aScanner.IsIncremental()) {
1194 // Hide this EOF.
1195 result = NS_OK;
1199 return result;
1202 const nsSubstring&
1203 CMarkupDeclToken::GetStringValue()
1205 return mTextValue.AsString();
1209 CCommentToken::CCommentToken()
1210 : CHTMLToken(eHTMLTag_comment)
1214 CCommentToken::CCommentToken(const nsAString& aName)
1215 : CHTMLToken(eHTMLTag_comment)
1217 mComment.Rebind(aName);
1220 void
1221 CCommentToken::AppendSourceTo(nsAString& anOutputString)
1223 AppendUnicodeTo(mCommentDecl, anOutputString);
1226 static PRBool
1227 IsCommentEnd(const nsScannerIterator& aCurrent, const nsScannerIterator& aEnd,
1228 nsScannerIterator& aGt)
1230 nsScannerIterator current = aCurrent;
1231 PRInt32 dashes = 0;
1233 while (current != aEnd && dashes != 2) {
1234 if (*current == kGreaterThan) {
1235 aGt = current;
1236 return PR_TRUE;
1238 if (*current == PRUnichar('-')) {
1239 ++dashes;
1240 } else {
1241 dashes = 0;
1243 ++current;
1246 return PR_FALSE;
// Consumes a comment using the strict rules.
// On success, binds mComment to the comment body and mCommentDecl to the
// whole declaration (from the '<' through the closing '>'), moves the scanner
// past the '>' and returns NS_OK.
// If no end is found: returns kEOF when the scanner is incremental, otherwise
// rewinds the scanner to the '<' and returns kNotAComment.
1249 nsresult
1250 CCommentToken::ConsumeStrictComment(nsScanner& aScanner)
1252 // <!--[... -- ... -- ...]*-->
1253 /*********************************************************
1254 NOTE: This algorithm does a fine job of handling comments
1255 when they're formatted per spec, but if they're not
1256 we don't handle them well.
1257 *********************************************************/
1258 nsScannerIterator end, current, gt, lt;
1259 aScanner.EndReading(end);
1260 aScanner.CurrentPosition(current);
// beginData == end doubles as the "no opening '--' seen" marker tested below.
1262 nsScannerIterator beginData = end;
1264 lt = current;
1265 lt.advance(-2); // <!
// Step back onto the '!' so the "!--" prefix can be checked in one expression.
1267 current.advance(-1);
1269 // Regular comment must start with <!--
1270 if (*current == kExclamation &&
1271 ++current != end && *current == kMinus &&
1272 ++current != end && *current == kMinus &&
1273 ++current != end) {
1274 nsScannerIterator currentEnd = end;
1275 PRBool balancedComment = PR_FALSE;
1276 NS_NAMED_LITERAL_STRING(dashes, "--");
1277 beginData = current;
// The loop body advances `current` by 2 after each successful search for
// "--", which relies on FindInReadable leaving `current` at the start of the
// match.
1279 while (FindInReadable(dashes, current, currentEnd)) {
1280 current.advance(2);
1282 balancedComment = !balancedComment; // We need to match '--' with '--'
1284 if (balancedComment && IsCommentEnd(current, end, gt)) {
1285 // done
1286 current.advance(-2);
1287 // Note: it's ok if beginData == current, (we'll copy an empty string)
1288 // and we need to bind mComment anyway.
1289 aScanner.BindSubstring(mComment, beginData, current);
1290 aScanner.BindSubstring(mCommentDecl, lt, ++gt);
1291 aScanner.SetPosition(gt);
1292 return NS_OK;
1295 // Continue after the last '--'
1296 currentEnd = end;
1300 // If beginData == end, we did not find opening '--'
1301 if (beginData == end) {
1302 // This might have been empty comment: <!>
1303 // Or it could have been something completely bogus like: <!This is foobar>
1304 // Handle both cases below
1305 aScanner.CurrentPosition(current);
1306 beginData = current;
1307 if (FindCharInReadable('>', current, end)) {
1308 aScanner.BindSubstring(mComment, beginData, current);
1309 aScanner.BindSubstring(mCommentDecl, lt, ++current);
1310 aScanner.SetPosition(current);
1311 return NS_OK;
1315 if (aScanner.IsIncremental()) {
1316 // We got here because we saw the beginning of a comment,
1317 // but not yet the end, and we are still loading the page. In that
1318 // case the return value here will cause us to unwind,
1319 // wait for more content, and try again.
1320 // XXX For performance reasons we should cache where we were, and
1321 // continue from there for next call
1322 return kEOF;
1325 // There was no terminating string, parse this comment as text.
1326 aScanner.SetPosition(lt, PR_FALSE, PR_TRUE);
1327 return kNotAComment;
// Consumes a comment using the lenient ("quirks") rules.
// Two forms are handled: the long form that starts with "<!--", and a short
// form that simply runs to the next '>'.
// On success, binds mComment to the comment body and mCommentDecl to the
// whole declaration, moves the scanner past the end and returns NS_OK.
// Returns kEOF when more data is needed; returns kNotAComment (after
// rewinding the scanner to the '<') when the short form has no '>' and the
// scanner is not incremental.
1330 nsresult
1331 CCommentToken::ConsumeQuirksComment(nsScanner& aScanner)
1333 // <![-[-]] ... [[-]-|--!]>
1334 /*********************************************************
1335 NOTE: This algorithm does a fine job of handling comments
1336 commonly used, but it doesn't really consume them
1337 per spec (But then, neither does IE or Nav).
1338 *********************************************************/
1339 nsScannerIterator end, current;
1340 aScanner.EndReading(end);
1341 aScanner.CurrentPosition(current);
// beginLastMinus: position of the last leading '-' (backward checks stop there).
// bestAltCommentEnd: first '>' seen, used as a fallback terminator at EOF.
1342 nsScannerIterator beginData = current,
1343 beginLastMinus = end,
1344 bestAltCommentEnd = end,
1345 lt = current;
1346 lt.advance(-2); // <!
1348 // When we get here, we have always already consumed <!
1349 // Skip over possible leading minuses
1350 if (current != end && *current == kMinus) {
1351 beginLastMinus = current;
1352 ++current;
1353 ++beginData;
1354 if (current != end && *current == kMinus) { // <!--
1355 beginLastMinus = current;
1356 ++current;
1357 ++beginData;
1358 // Long form comment
1360 nsScannerIterator currentEnd = end, gt = end;
1362 // Find the end of the comment
1363 while (FindCharInReadable(kGreaterThan, current, currentEnd)) {
1364 gt = current;
1365 if (bestAltCommentEnd == end) {
1366 bestAltCommentEnd = gt;
// Walk backwards from the '>' to see whether it is preceded by "--" or "--!".
1368 --current;
1369 PRBool goodComment = PR_FALSE;
1370 if (current != beginLastMinus && *current == kMinus) { // ->
1371 --current;
1372 if (current != beginLastMinus && *current == kMinus) { // -->
1373 goodComment = PR_TRUE;
1374 --current;
1376 } else if (current != beginLastMinus && *current == '!') {
1377 --current;
1378 if (current != beginLastMinus && *current == kMinus) {
1379 --current;
1380 if (current != beginLastMinus && *current == kMinus) { // --!>
1381 --current;
1382 goodComment = PR_TRUE;
1385 } else if (current == beginLastMinus) {
// The '>' directly follows the opening dashes: accepted as a comment end.
1386 goodComment = PR_TRUE;
1389 if (goodComment) {
1390 // done
1391 aScanner.BindSubstring(mComment, beginData, ++current);
1392 aScanner.BindSubstring(mCommentDecl, lt, ++gt);
1393 aScanner.SetPosition(gt);
1394 return NS_OK;
1395 } else {
1396 // try again starting after the last '>'
1397 current = ++gt;
1398 currentEnd = end;
1402 if (aScanner.IsIncremental()) {
1403 // We got here because we saw the beginning of a comment,
1404 // but not yet the end, and we are still loading the page. In that
1405 // case the return value here will cause us to unwind,
1406 // wait for more content, and try again.
1407 // XXX For performance reasons we should cache where we were, and
1408 // continue from there for next call
1409 return kEOF;
1412 // If you're here, then we're in a special state.
1413 // The problem at hand is that we've hit the end of the document without
1414 // finding the normal endcomment delimiter "-->". In this case, the
1415 // first thing we try is to see if we found an alternate endcomment
1416 // delimiter ">". If so, rewind just past that, and use everything up
1417 // to that point as your comment. If not, the document has no end
1418 // comment and should be treated as one big comment.
1419 gt = bestAltCommentEnd;
1420 aScanner.BindSubstring(mComment, beginData, gt);
1421 if (gt != end) {
1422 ++gt;
1424 aScanner.BindSubstring(mCommentDecl, lt, gt);
1425 aScanner.SetPosition(gt);
1426 return NS_OK;
1430 // This could be short form of comment
1431 // Find the end of the comment
1432 current = beginData;
1433 if (FindCharInReadable(kGreaterThan, current, end)) {
1434 nsScannerIterator gt = current;
// Trim a trailing "-", "--", "!", "-!" or "--!" from the comment body.
1435 if (current != beginData) {
1436 --current;
1437 if (current != beginData && *current == kMinus) { // ->
1438 --current;
1439 if (current != beginData && *current == kMinus) { // -->
1440 --current;
1442 } else if (current != beginData && *current == '!') { // !>
1443 --current;
1444 if (current != beginData && *current == kMinus) { // -!>
1445 --current;
1446 if (current != beginData && *current == kMinus) { // --!>
1447 --current;
1453 if (current != gt) {
1454 aScanner.BindSubstring(mComment, beginData, ++current);
1455 } else {
1456 // Bind mComment to an empty string (note that if current == gt,
1457 // then current == beginData). We reach this for <!>
1458 aScanner.BindSubstring(mComment, beginData, current);
1460 aScanner.BindSubstring(mCommentDecl, lt, ++gt);
1461 aScanner.SetPosition(gt);
1462 return NS_OK;
1465 if (!aScanner.IsIncremental()) {
1466 // This isn't a comment at all, go back to the < and consume as text.
1467 aScanner.SetPosition(lt, PR_FALSE, PR_TRUE);
1468 return kNotAComment;
1471 // Wait for more data...
1472 return kEOF;
1476 * Consume the identifier portion of the comment.
1477 * Note that we've already eaten the "<!" portion.
1479 * @param aChar -- last char consumed from stream
1480 * @param aScanner -- controller of underlying input source
1481 * @return error result
1483 nsresult
1484 CCommentToken::Consume(PRUnichar aChar, nsScanner& aScanner, PRInt32 aFlag)
1486 nsresult result = PR_TRUE;
1488 if (aFlag & NS_IPARSER_FLAG_STRICT_MODE) {
1489 // Enabling strict comment parsing for Bug 53011 and 2749 contradicts!
1490 result = ConsumeStrictComment(aScanner);
1491 } else {
1492 result = ConsumeQuirksComment(aScanner);
1495 if (NS_SUCCEEDED(result)) {
1496 mNewlineCount = mCommentDecl.CountChar(kNewLine);
1499 return result;
1502 const nsSubstring&
1503 CCommentToken::GetStringValue()
1505 return mComment.AsString();
1508 PRInt32
1509 CCommentToken::GetTokenType()
1511 return eToken_comment;
1514 CNewlineToken::CNewlineToken()
1515 : CHTMLToken(eHTMLTag_newline)
1519 PRInt32
1520 CNewlineToken::GetTokenType()
1522 return eToken_newline;
1525 static nsScannerSubstring* gNewlineStr;
1526 void
1527 CNewlineToken::AllocNewline()
1529 gNewlineStr = new nsScannerSubstring(NS_LITERAL_STRING("\n"));
1532 void
1533 CNewlineToken::FreeNewline()
1535 if (gNewlineStr) {
1536 delete gNewlineStr;
1537 gNewlineStr = nsnull;
1542 * This method retrieves the value of this internal string.
1544 * @return nsString reference to internal string value
1546 const nsSubstring&
1547 CNewlineToken::GetStringValue()
1549 return gNewlineStr->AsString();
1553 * Consume one newline (cr/lf pair).
1555 * @param aChar -- last char consumed from stream
1556 * @param aScanner -- controller of underlying input source
1557 * @return error result
1559 nsresult
1560 CNewlineToken::Consume(PRUnichar aChar, nsScanner& aScanner, PRInt32 aFlag)
1563 * Here's what the HTML spec says about newlines:
1565 * "A line break is defined to be a carriage return (&#x000D;),
1566 * a line feed (&#x000A;), or a carriage return/line feed pair.
1567 * All line breaks constitute white space."
1570 nsresult rv = NS_OK;
1571 if (aChar == kCR) {
1572 PRUnichar theChar;
1573 rv = aScanner.Peek(theChar);
1574 if (theChar == kNewLine) {
1575 rv = aScanner.GetChar(theChar);
1576 } else if (rv == kEOF && !aScanner.IsIncremental()) {
1577 // Make sure we don't lose information about this trailing newline.
1578 rv = NS_OK;
1582 mNewlineCount = 1;
1583 return rv;
1586 CAttributeToken::CAttributeToken()
1587 : CHTMLToken(eHTMLTag_unknown)
1589 mHasEqualWithoutValue = PR_FALSE;
1593 * String based constructor
1595 CAttributeToken::CAttributeToken(const nsAString& aName)
1596 : CHTMLToken(eHTMLTag_unknown)
1598 mTextValue.writable().Assign(aName);
1599 mHasEqualWithoutValue = PR_FALSE;
1603 * construct initializing data to key value pair
1605 CAttributeToken::CAttributeToken(const nsAString& aKey, const nsAString& aName)
1606 : CHTMLToken(eHTMLTag_unknown)
1608 mTextValue.writable().Assign(aName);
1609 mTextKey.Rebind(aKey);
1610 mHasEqualWithoutValue = PR_FALSE;
1613 PRInt32
1614 CAttributeToken::GetTokenType()
1616 return eToken_attribute;
1619 const nsSubstring&
1620 CAttributeToken::GetStringValue()
1622 return mTextValue.str();
1625 void
1626 CAttributeToken::GetSource(nsString& anOutputString)
1628 anOutputString.Truncate();
1629 AppendSourceTo(anOutputString);
1632 void
1633 CAttributeToken::AppendSourceTo(nsAString& anOutputString)
1635 AppendUnicodeTo(mTextKey, anOutputString);
1636 if (mTextValue.str().Length() || mHasEqualWithoutValue) {
1637 anOutputString.AppendLiteral("=");
1639 anOutputString.Append(mTextValue.str());
1640 // anOutputString.AppendLiteral(";");
/*
 * This general purpose method is used when you want to
 * consume a known quoted string.
 */
// Reads the body of a quoted attribute value into aString.
// aChar is the opening quote character (already consumed by the caller);
// the matching closing quote is consumed on success.
// If the closing quote is never found in a non-incremental document, the
// text read so far is discarded and the value is re-read as if it were
// unquoted. In view-source mode that case is reported as
// NS_ERROR_HTMLPARSER_UNTERMINATEDSTRINGLITERAL.
1647 static nsresult
1648 ConsumeQuotedString(PRUnichar aChar,
1649 nsScannerSharedSubstring& aString,
1650 PRInt32& aNewlineCount,
1651 nsScanner& aScanner,
1652 PRInt32 aFlag)
1654 NS_ASSERTION(aChar == kQuote || aChar == kApostrophe,
1655 "char is neither quote nor apostrophe");
1656 // Hold onto this in case this is an unterminated string literal
1657 PRUint32 origLen = aString.str().Length();
// One terminator set per quote style: the closing quote itself, plus '&',
// CR and newline.
1659 static const PRUnichar theTerminalCharsQuote[] = {
1660 PRUnichar(kQuote), PRUnichar('&'), PRUnichar(kCR),
1661 PRUnichar(kNewLine), PRUnichar(0) };
1662 static const PRUnichar theTerminalCharsApostrophe[] = {
1663 PRUnichar(kApostrophe), PRUnichar('&'), PRUnichar(kCR),
1664 PRUnichar(kNewLine), PRUnichar(0) };
1665 static const nsReadEndCondition
1666 theTerminateConditionQuote(theTerminalCharsQuote);
1667 static const nsReadEndCondition
1668 theTerminateConditionApostrophe(theTerminalCharsApostrophe);
1670 // Assume Quote to init to something
1671 const nsReadEndCondition *terminateCondition = &theTerminateConditionQuote;
1672 if (aChar == kApostrophe) {
1673 terminateCondition = &theTerminateConditionApostrophe;
1676 nsresult result = NS_OK;
// Remember where the quoted text starts so the fallback below can rewind.
1677 nsScannerIterator theOffset;
1678 aScanner.CurrentPosition(theOffset);
1680 result = ConsumeUntil(aString, aNewlineCount, aScanner,
1681 *terminateCondition, PR_TRUE, PR_TRUE, aFlag);
1683 if (NS_SUCCEEDED(result)) {
1684 result = aScanner.GetChar(aChar); // aChar should be " or '
1687 // Ref: Bug 35806
1688 // A back up measure when disaster strikes...
1689 // Ex <table> <tr d="><td>hello</td></tr></table>
1690 if (!aString.str().IsEmpty() && aString.str().Last() != aChar &&
1691 !aScanner.IsIncremental() && result == kEOF) {
1692 static const nsReadEndCondition
1693 theAttributeTerminator(kAttributeTerminalChars);
// Drop what was appended by the failed attempt and start over from theOffset.
1694 aString.writable().Truncate(origLen);
1695 aScanner.SetPosition(theOffset, PR_FALSE, PR_TRUE);
1696 result = ConsumeUntil(aString, aNewlineCount, aScanner,
1697 theAttributeTerminator, PR_FALSE, PR_TRUE, aFlag);
1698 if (NS_SUCCEEDED(result) && (aFlag & NS_IPARSER_FLAG_VIEW_SOURCE)) {
1699 // Remember that this string literal was unterminated.
1700 result = NS_ERROR_HTMLPARSER_UNTERMINATEDSTRINGLITERAL;
1703 return result;
1707 * This method is meant to be used by view-source to consume invalid attributes.
1708 * For the purposes of this method, an invalid attribute is an attribute that
1709 * starts with either ', ", or /. We consume all ', ", or / and the following
1710 * whitespace.
1712 * @param aScanner -- the scanner we're reading our data from.
1713 * @param aChar -- the character we're skipping
1714 * @param aCurrent -- the current position that we're looking at.
1715 * @param aNewlineCount -- a count of the newlines we've consumed.
1716 * @return error result.
1718 static nsresult
1719 ConsumeInvalidAttribute(nsScanner& aScanner,
1720 PRUnichar aChar,
1721 nsScannerIterator& aCurrent,
1722 PRInt32& aNewlineCount)
1724 NS_ASSERTION(aChar == kApostrophe || aChar == kQuote || aChar == kForwardSlash,
1725 "aChar must be a quote or apostrophe");
1726 nsScannerIterator end, wsbeg;
1727 aScanner.EndReading(end);
1729 while (aCurrent != end && *aCurrent == aChar) {
1730 ++aCurrent;
1733 aScanner.SetPosition(aCurrent);
1734 return aScanner.ReadWhitespace(wsbeg, aCurrent, aNewlineCount);
1738 * Consume the key and value portions of the attribute.
// Reads one attribute from the scanner: leading whitespace, the key, and an
// optional "= value" part (quoted or unquoted).
// In view-source mode (NS_IPARSER_FLAG_VIEW_SOURCE), whitespace and quote
// characters are kept in mTextKey / mTextValue instead of being skipped.
// Returns NS_ERROR_HTMLPARSER_BADATTRIBUTE when the attribute carries no
// information (see the checks near the end). A kEOF in a non-incremental
// document becomes NS_OK if a key was read.
1740 nsresult
1741 CAttributeToken::Consume(PRUnichar aChar, nsScanner& aScanner, PRInt32 aFlag)
1743 nsresult result;
// wsstart/wsend delimit whitespace seen in view-source mode so it can be
// bound into mTextKey together with the key.
1744 nsScannerIterator wsstart, wsend;
1746 if (aFlag & NS_IPARSER_FLAG_VIEW_SOURCE) {
1747 result = aScanner.ReadWhitespace(wsstart, wsend, mNewlineCount);
1748 if (kEOF == result && wsstart != wsend) {
1749 // Do this here so if this is the final token in the document, we don't
1750 // lose the whitespace.
1751 aScanner.BindSubstring(mTextKey, wsstart, wsend);
1753 } else {
1754 result = aScanner.SkipWhitespace(mNewlineCount);
1757 if (NS_OK == result) {
// Characters that end an attribute key.
1758 static const PRUnichar theTerminalsChars[] =
1759 { PRUnichar(' '), PRUnichar('"'),
1760 PRUnichar('='), PRUnichar('\n'),
1761 PRUnichar('\r'), PRUnichar('\t'),
1762 PRUnichar('>'), PRUnichar('<'),
1763 PRUnichar('\''), PRUnichar('/'),
1764 PRUnichar(0) };
1765 static const nsReadEndCondition theEndCondition(theTerminalsChars);
1767 nsScannerIterator start, end;
1768 result = aScanner.ReadUntil(start, end, theEndCondition, PR_FALSE);
1770 if (!(aFlag & NS_IPARSER_FLAG_VIEW_SOURCE)) {
1771 aScanner.BindSubstring(mTextKey, start, end);
1772 } else if (kEOF == result && wsstart != end) {
1773 // Capture all of the text (from the beginning of the whitespace to the
1774 // end of the document).
1775 aScanner.BindSubstring(mTextKey, wsstart, end);
1778 // Now it's time to Consume the (optional) value...
1779 if (NS_OK == result) {
1780 if (aFlag & NS_IPARSER_FLAG_VIEW_SOURCE) {
1781 result = aScanner.ReadWhitespace(start, wsend, mNewlineCount);
1782 aScanner.BindSubstring(mTextKey, wsstart, wsend);
1783 } else {
1784 result = aScanner.SkipWhitespace(mNewlineCount);
1787 if (NS_OK == result) {
1788 // Skip ahead until you find an equal sign or a '>'...
1789 result = aScanner.Peek(aChar);
1790 if (NS_OK == result) {
1791 if (kEqual == aChar) {
1792 result = aScanner.GetChar(aChar); // Skip the equal sign...
1793 if (NS_OK == result) {
1794 if (aFlag & NS_IPARSER_FLAG_VIEW_SOURCE) {
1795 PRBool haveCR;
1796 result = aScanner.ReadWhitespace(mTextValue, mNewlineCount,
1797 haveCR);
1798 } else {
1799 result = aScanner.SkipWhitespace(mNewlineCount);
1802 if (NS_OK == result) {
1803 result = aScanner.Peek(aChar); // And grab the next char.
1804 if (NS_OK == result) {
1805 if (kQuote == aChar || kApostrophe == aChar) {
// Quoted value: in view-source mode the quote characters themselves are
// appended to the value (opening quote here, closing quote below).
1806 aScanner.GetChar(aChar);
1807 if (aFlag & NS_IPARSER_FLAG_VIEW_SOURCE) {
1808 mTextValue.writable().Append(aChar);
1811 result = ConsumeQuotedString(aChar, mTextValue,
1812 mNewlineCount, aScanner,
1813 aFlag);
1814 if (NS_SUCCEEDED(result) &&
1815 (aFlag & NS_IPARSER_FLAG_VIEW_SOURCE)) {
1816 mTextValue.writable().Append(aChar);
1817 } else if (result ==
1818 NS_ERROR_HTMLPARSER_UNTERMINATEDSTRINGLITERAL) {
1819 result = NS_OK;
1820 mInError = PR_TRUE;
1822 // According to spec. we ( who? ) should ignore linefeeds.
1823 // But look, even the carriage return was getting stripped
1824 // ( wonder why! ) - Ref. to bug 15204. Okay, so the
1825 // spec. told us to ignore linefeeds, but then what about
1826 // bug 47535 ? Should we preserve everything then? Well,
1827 // let's make it so!
1828 } else if (kGreaterThan == aChar) {
// "key=>" : an equal sign directly followed by the end of the tag.
1829 mHasEqualWithoutValue = PR_TRUE;
1830 mInError = PR_TRUE;
1831 } else {
// Unquoted value: read up to the next attribute terminator.
1832 static const nsReadEndCondition
1833 theAttributeTerminator(kAttributeTerminalChars);
1834 result =
1835 ConsumeUntil(mTextValue,
1836 mNewlineCount,
1837 aScanner,
1838 theAttributeTerminator,
1839 PR_FALSE,
1840 PR_TRUE,
1841 aFlag);
1844 if (NS_OK == result) {
1845 if (aFlag & NS_IPARSER_FLAG_VIEW_SOURCE) {
1846 PRBool haveCR;
1847 result = aScanner.ReadWhitespace(mTextValue, mNewlineCount,
1848 haveCR);
1849 } else {
1850 result = aScanner.SkipWhitespace(mNewlineCount);
1853 } else {
1854 // We saw an equal sign but ran out of room looking for a value.
1855 mHasEqualWithoutValue = PR_TRUE;
1856 mInError = PR_TRUE;
1859 } else {
1860 // This is where we have to handle fairly busted content.
1861 // If you're here, it means we saw an attribute name, but couldn't
1862 // find the following equal sign. <tag NAME....
1864 // Doing this right in all cases is <i>REALLY</i> ugly.
1865 // My best guess is to grab the next non-ws char. We know it's not
1866 // '=', so let's see what it is. If it's a '"', then assume we're
1867 // reading from the middle of the value. Try stripping the quote
1868 // and continuing... Note that this code also strips forward
1869 // slashes to handle cases like <tag NAME/>
1870 if (kQuote == aChar || kApostrophe == aChar ||
1871 kForwardSlash == aChar) {
1872 // In XML, a trailing slash isn't an error.
1873 if (kForwardSlash != aChar || !(aFlag & NS_IPARSER_FLAG_XML)) {
1874 mInError = PR_TRUE;
1877 if (!(aFlag & NS_IPARSER_FLAG_VIEW_SOURCE)) {
1878 result = aScanner.SkipOver(aChar); // Strip quote or slash.
1879 if (NS_SUCCEEDED(result)) {
1880 result = aScanner.SkipWhitespace(mNewlineCount);
1882 } else {
1883 // We want to collect whitespace here so that following
1884 // attributes can have the right line number (and for
1885 // parity with the non-view-source code above).
1886 result = ConsumeInvalidAttribute(aScanner, aChar,
1887 wsend, mNewlineCount);
1889 aScanner.BindSubstring(mTextKey, wsstart, wsend);
1890 aScanner.SetPosition(wsend);
1898 if (NS_OK == result) {
1899 if (mTextValue.str().Length() == 0 && mTextKey.Length() == 0 &&
1900 mNewlineCount == 0 && !mHasEqualWithoutValue) {
1901 // This attribute contains no useful information for us, so there is no
1902 // use in keeping it around. Attributes that are otherwise empty, but
1903 // have newlines in them are passed on to the DTD so it can get line
1904 // numbering right.
1905 return NS_ERROR_HTMLPARSER_BADATTRIBUTE;
1910 if (kEOF == result && !aScanner.IsIncremental()) {
1911 // This is our run-of-the mill "don't lose content at the end of a
1912 // document" with a slight twist: we don't want to bother returning an
1913 // empty attribute key, even if this is the end of the document.
1914 if (mTextKey.Length() == 0) {
1915 result = NS_ERROR_HTMLPARSER_BADATTRIBUTE;
1916 } else {
1917 result = NS_OK;
1921 return result;
1924 void
1925 CAttributeToken::SetKey(const nsAString& aKey)
1927 mTextKey.Rebind(aKey);
1930 void
1931 CAttributeToken::BindKey(nsScanner* aScanner,
1932 nsScannerIterator& aStart,
1933 nsScannerIterator& aEnd)
1935 aScanner->BindSubstring(mTextKey, aStart, aEnd);
1938 CWhitespaceToken::CWhitespaceToken()
1939 : CHTMLToken(eHTMLTag_whitespace)
1943 CWhitespaceToken::CWhitespaceToken(const nsAString& aName)
1944 : CHTMLToken(eHTMLTag_whitespace)
1946 mTextValue.writable().Assign(aName);
1949 PRInt32 CWhitespaceToken::GetTokenType()
1951 return eToken_whitespace;
1955 * This general purpose method is used when you want to
1956 * consume an aribrary sequence of whitespace.
1958 * @param aChar -- last char consumed from stream
1959 * @param aScanner -- controller of underlying input source
1960 * @return error result
1962 nsresult
1963 CWhitespaceToken::Consume(PRUnichar aChar, nsScanner& aScanner, PRInt32 aFlag)
1965 // If possible, we'd like to just be a dependent substring starting at
1966 // |aChar|. The scanner has already been advanced, so we need to
1967 // back it up to facilitate this.
1969 nsScannerIterator start;
1970 aScanner.CurrentPosition(start);
1971 aScanner.SetPosition(--start, PR_FALSE, PR_TRUE);
1973 PRBool haveCR;
1975 nsresult result = aScanner.ReadWhitespace(mTextValue, mNewlineCount, haveCR);
1977 if (result == kEOF && !aScanner.IsIncremental()) {
1978 // Oops, we ran off the end, make sure we don't lose the trailing
1979 // whitespace!
1980 result = NS_OK;
1983 if (NS_OK == result && haveCR) {
1984 mTextValue.writable().StripChar(kCR);
1986 return result;
1989 const nsSubstring&
1990 CWhitespaceToken::GetStringValue()
1992 return mTextValue.str();
1995 CEntityToken::CEntityToken()
1996 : CHTMLToken(eHTMLTag_entity)
2000 CEntityToken::CEntityToken(const nsAString& aName)
2001 : CHTMLToken(eHTMLTag_entity)
2003 mTextValue.Assign(aName);
2008 * Consume the rest of the entity. We've already eaten the "&".
2010 * @param aChar -- last char consumed from stream
2011 * @param aScanner -- controller of underlying input source
2012 * @return error result
2014 nsresult
2015 CEntityToken::Consume(PRUnichar aChar, nsScanner& aScanner, PRInt32 aFlag)
2017 nsresult result = ConsumeEntity(aChar, mTextValue, aScanner);
2018 return result;
2021 PRInt32
2022 CEntityToken::GetTokenType()
2024 return eToken_entity;
/*
 * This general purpose method is used when you want to
 * consume an entity &xxxx;. Keep in mind that entities
 * are <i>not</i> reduced inline.
 *
 * @param aChar -- last char consumed from stream
 * @param aScanner -- controller of underlying input source
 * @return error result
 */
// Reads an entity reference into aString (without the leading '&', but
// including a terminating ';' when one is present).
// Three shapes are recognised from aChar and the characters that follow:
//   - aChar == '{'  : a script entity, read up to the balancing '}'
//   - aChar == '#'  : a numeric reference, decimal or hexadecimal ("#x")
//   - otherwise     : a named entity starting with a letter, '_' or ':'
// Returns NS_HTMLTOKENS_NOT_AN_ENTITY when the text cannot be an entity; in
// that case nothing has been consumed from the scanner by this function.
2036 nsresult
2037 CEntityToken::ConsumeEntity(PRUnichar aChar,
2038 nsString& aString,
2039 nsScanner& aScanner)
2041 nsresult result = NS_OK;
2042 if (kLeftBrace == aChar) {
2043 // You're consuming a script entity...
2044 aScanner.GetChar(aChar); // Consume &
2046 PRInt32 rightBraceCount = 0;
2047 PRInt32 leftBraceCount = 0;
// Append characters until the braces seen so far balance out.
2049 do {
2050 result = aScanner.GetChar(aChar);
2052 if (NS_FAILED(result)) {
2053 return result;
2056 aString.Append(aChar);
2057 if (aChar == kRightBrace) {
2058 ++rightBraceCount;
2059 } else if (aChar == kLeftBrace) {
2060 ++leftBraceCount;
2062 } while (leftBraceCount != rightBraceCount);
2063 } else {
2064 PRUnichar theChar = 0;
2065 if (kHashsign == aChar) {
// Look past the '#' to decide between decimal, hexadecimal and "not an
// entity" before consuming anything.
2066 result = aScanner.Peek(theChar, 2);
2068 if (NS_FAILED(result)) {
2069 if (kEOF == result && !aScanner.IsIncremental()) {
2070 // If this is the last buffer then we are certainly
2071 // not dealing with an entity. That is, there are
2072 // no more characters after &#. Bug 188278.
2073 return NS_HTMLTOKENS_NOT_AN_ENTITY;
2075 return result;
2078 if (nsCRT::IsAsciiDigit(theChar)) {
2079 aScanner.GetChar(aChar); // Consume &
2080 aScanner.GetChar(aChar); // Consume #
2081 aString.Assign(aChar);
2082 result = aScanner.ReadNumber(aString, 10);
2083 } else if (theChar == 'x' || theChar == 'X') {
2084 aScanner.GetChar(aChar); // Consume &
2085 aScanner.GetChar(aChar); // Consume #
2086 aScanner.GetChar(theChar); // Consume x
2087 aString.Assign(aChar);
2088 aString.Append(theChar);
2089 result = aScanner.ReadNumber(aString, 16);
2090 } else {
2091 return NS_HTMLTOKENS_NOT_AN_ENTITY;
2093 } else {
2094 result = aScanner.Peek(theChar, 1);
2096 if (NS_FAILED(result)) {
2097 return result;
2100 if (nsCRT::IsAsciiAlpha(theChar) ||
2101 theChar == '_' ||
2102 theChar == ':') {
2103 aScanner.GetChar(aChar); // Consume &
2104 result = aScanner.ReadEntityIdentifier(aString);
2105 } else {
2106 return NS_HTMLTOKENS_NOT_AN_ENTITY;
2111 if (NS_FAILED(result)) {
2112 return result;
2115 result = aScanner.Peek(aChar);
2117 if (NS_FAILED(result)) {
2118 return result;
2121 if (aChar == kSemicolon) {
2122 // Consume semicolon that stopped the scan
2123 aString.Append(aChar);
2124 result = aScanner.GetChar(aChar);
2127 return result;
// Replacement table used by AppendNCR() for numeric character references in
// the range 0x80..0x9f: entry i is the code point substituted for reference
// value 0x80 + i (32 entries). Slots with no replacement hold NOT_USED.
// NOTE(review): the values look like the Windows-1252 assignments for this
// byte range -- confirm before relying on that.
2131 * Map some illegal but commonly used numeric entities into their
2132 * appropriate unicode value.
2134 #define NOT_USED 0xfffd
2136 static const PRUint16 PA_HackTable[] = {
2137 0x20ac, /* EURO SIGN */
2138 NOT_USED,
2139 0x201a, /* SINGLE LOW-9 QUOTATION MARK */
2140 0x0192, /* LATIN SMALL LETTER F WITH HOOK */
2141 0x201e, /* DOUBLE LOW-9 QUOTATION MARK */
2142 0x2026, /* HORIZONTAL ELLIPSIS */
2143 0x2020, /* DAGGER */
2144 0x2021, /* DOUBLE DAGGER */
2145 0x02c6, /* MODIFIER LETTER CIRCUMFLEX ACCENT */
2146 0x2030, /* PER MILLE SIGN */
2147 0x0160, /* LATIN CAPITAL LETTER S WITH CARON */
2148 0x2039, /* SINGLE LEFT-POINTING ANGLE QUOTATION MARK */
2149 0x0152, /* LATIN CAPITAL LIGATURE OE */
2150 NOT_USED,
2151 0x017D, /* LATIN CAPITAL LETTER Z WITH CARON */
2152 NOT_USED,
2153 NOT_USED,
2154 0x2018, /* LEFT SINGLE QUOTATION MARK */
2155 0x2019, /* RIGHT SINGLE QUOTATION MARK */
2156 0x201c, /* LEFT DOUBLE QUOTATION MARK */
2157 0x201d, /* RIGHT DOUBLE QUOTATION MARK */
2158 0x2022, /* BULLET */
2159 0x2013, /* EN DASH */
2160 0x2014, /* EM DASH */
2161 0x02dc, /* SMALL TILDE */
2162 0x2122, /* TRADE MARK SIGN */
2163 0x0161, /* LATIN SMALL LETTER S WITH CARON */
2164 0x203a, /* SINGLE RIGHT-POINTING ANGLE QUOTATION MARK */
2165 0x0153, /* LATIN SMALL LIGATURE OE */
2166 NOT_USED,
2167 0x017E, /* LATIN SMALL LETTER Z WITH CARON */
2168 0x0178 /* LATIN CAPITAL LETTER Y WITH DIAERESIS */
2171 static void
2172 AppendNCR(nsSubstring& aString, PRInt32 aNCRValue)
2174 /* For some illegal, but popular usage */
2175 if (aNCRValue >= 0x0080 && aNCRValue <= 0x009f) {
2176 aNCRValue = PA_HackTable[aNCRValue - 0x0080];
2179 AppendUCS4ToUTF16(ENSURE_VALID_CHAR(aNCRValue), aString);
2183 * This method converts this entity into its underlying
2184 * unicode equivalent.
2186 * @param aString will hold the resulting string value
2187 * @return numeric (unichar) value
2189 PRInt32
2190 CEntityToken::TranslateToUnicodeStr(nsString& aString)
2192 PRInt32 value = 0;
2194 if (mTextValue.Length() > 1) {
2195 PRUnichar theChar0 = mTextValue.CharAt(0);
2197 if (kHashsign == theChar0) {
2198 PRInt32 err = 0;
2200 value = mTextValue.ToInteger(&err, kAutoDetect);
2202 if (0 == err) {
2203 AppendNCR(aString, value);
2205 } else {
2206 value = nsHTMLEntities::EntityToUnicode(mTextValue);
2207 if (-1 < value) {
2208 // We found a named entity...
2209 aString.Assign(PRUnichar(value));
2214 return value;
2218 const
2219 nsSubstring& CEntityToken::GetStringValue()
2221 return mTextValue;
2224 void
2225 CEntityToken::GetSource(nsString& anOutputString)
2227 anOutputString.AppendLiteral("&");
2228 anOutputString += mTextValue;
2229 // Any possible ; is part of our text value.
2232 void
2233 CEntityToken::AppendSourceTo(nsAString& anOutputString)
2235 anOutputString.AppendLiteral("&");
2236 anOutputString += mTextValue;
2237 // Any possible ; is part of our text value.
2240 const PRUnichar*
2241 GetTagName(PRInt32 aTag)
2243 const PRUnichar *result = nsHTMLTags::GetStringValue((nsHTMLTag) aTag);
2245 if (result) {
2246 return result;
2249 if (aTag >= eHTMLTag_userdefined) {
2250 return sUserdefined;
2253 return 0;
2257 CInstructionToken::CInstructionToken()
2258 : CHTMLToken(eHTMLTag_instruction)
2262 CInstructionToken::CInstructionToken(const nsAString& aString)
2263 : CHTMLToken(eHTMLTag_unknown)
2265 mTextValue.Assign(aString);
2268 nsresult
2269 CInstructionToken::Consume(PRUnichar aChar, nsScanner& aScanner, PRInt32 aFlag)
2271 mTextValue.AssignLiteral("<?");
2272 nsresult result = NS_OK;
2273 PRBool done = PR_FALSE;
2275 while (NS_OK == result && !done) {
2276 // Note, this call does *not* consume the >.
2277 result = aScanner.ReadUntil(mTextValue, kGreaterThan, PR_FALSE);
2278 if (NS_SUCCEEDED(result)) {
2279 // In HTML, PIs end with a '>', in XML, they end with a '?>'. Cover both
2280 // cases here.
2281 if (!(aFlag & NS_IPARSER_FLAG_XML) ||
2282 kQuestionMark == mTextValue.Last()) {
2283 // This really is the end of the PI.
2284 done = PR_TRUE;
2286 // Need to append this character no matter what.
2287 aScanner.GetChar(aChar);
2288 mTextValue.Append(aChar);
2292 if (kEOF == result && !aScanner.IsIncremental()) {
2293 // Hide the EOF result because there is no more text coming.
2294 mInError = PR_TRUE;
2295 result = NS_OK;
2298 return result;
2301 PRInt32
2302 CInstructionToken::GetTokenType()
2304 return eToken_instruction;
2307 const nsSubstring&
2308 CInstructionToken::GetStringValue()
2310 return mTextValue;
2313 // Doctype decl token
2315 CDoctypeDeclToken::CDoctypeDeclToken(eHTMLTags aTag)
2316 : CHTMLToken(aTag)
2320 CDoctypeDeclToken::CDoctypeDeclToken(const nsAString& aString, eHTMLTags aTag)
2321 : CHTMLToken(aTag), mTextValue(aString)
2326 * This method consumes a doctype element.
2327 * Note: I'm rewriting this method to seek to the first <, since quotes can
2328 * really screw us up.
2329 * XXX Maybe this should do better in XML or strict mode?
2331 nsresult
2332 CDoctypeDeclToken::Consume(PRUnichar aChar, nsScanner& aScanner, PRInt32 aFlag)
2334 static const PRUnichar terminalChars[] =
2335 { PRUnichar('>'), PRUnichar('<'),
2336 PRUnichar(0)
2338 static const nsReadEndCondition theEndCondition(terminalChars);
2340 nsScannerIterator start, end;
2342 aScanner.CurrentPosition(start);
2343 aScanner.EndReading(end);
2345 nsresult result = aScanner.ReadUntil(start, end, theEndCondition, PR_FALSE);
2347 if (NS_SUCCEEDED(result)) {
2348 PRUnichar ch;
2349 aScanner.Peek(ch);
2350 if (ch == kGreaterThan) {
2351 // Include '>' but not '<' since '<'
2352 // could belong to another tag.
2353 aScanner.GetChar(ch);
2354 end.advance(1);
2355 } else {
2356 NS_ASSERTION(kLessThan == ch,
2357 "Make sure this doctype decl. is really in error.");
2358 mInError = PR_TRUE;
2360 } else if (!aScanner.IsIncremental()) {
2361 // We have reached the document end but haven't
2362 // found either a '<' or a '>'. Therefore use
2363 // whatever we have.
2364 mInError = PR_TRUE;
2365 result = NS_OK;
2368 if (NS_SUCCEEDED(result)) {
2369 start.advance(-2); // Make sure to consume <!
2370 CopyUnicodeTo(start, end, mTextValue);
2373 return result;
2376 PRInt32
2377 CDoctypeDeclToken::GetTokenType()
2379 return eToken_doctypeDecl;
2382 const nsSubstring&
2383 CDoctypeDeclToken::GetStringValue()
2385 return mTextValue;
2388 void
2389 CDoctypeDeclToken::SetStringValue(const nsAString& aStr)
2391 mTextValue.Assign(aStr);