1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=2 sw=2 et tw=78: */
3 /* ***** BEGIN LICENSE BLOCK *****
4 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
6 * The contents of this file are subject to the Mozilla Public License Version
7 * 1.1 (the "License"); you may not use this file except in compliance with
8 * the License. You may obtain a copy of the License at
9 * http://www.mozilla.org/MPL/
11 * Software distributed under the License is distributed on an "AS IS" basis,
12 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
13 * for the specific language governing rights and limitations under the
16 * The Original Code is mozilla.org code.
18 * The Initial Developer of the Original Code is
19 * Netscape Communications Corporation.
20 * Portions created by the Initial Developer are Copyright (C) 1998
21 * the Initial Developer. All Rights Reserved.
24 * Blake Kaplan <mrbkap@gmail.com>
26 * Alternatively, the contents of this file may be used under the terms of
27 * either of the GNU General Public License Version 2 or later (the "GPL"),
28 * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
29 * in which case the provisions of the GPL or the LGPL are applicable instead
30 * of those above. If you wish to allow use of your version of this file only
31 * under the terms of either the GPL or the LGPL, and not to allow others to
32 * use your version of this file under the terms of the MPL, indicate your
33 * decision by deleting the provisions above and replace them with the notice
34 * and other provisions required by the GPL or the LGPL. If you do not delete
35 * the provisions above, a recipient may use your version of this file under
36 * the terms of any one of the MPL, the GPL or the LGPL.
38 * ***** END LICENSE BLOCK ***** */
43 #include "nsScanner.h"
45 #include "nsHTMLTokens.h"
48 #include "nsHTMLTags.h"
49 #include "nsHTMLEntities.h"
51 #include "nsReadableUtils.h"
52 #include "nsUnicharUtils.h"
53 #include "nsScanner.h"
// Zero-terminated PRUnichar spelling of the string "userdefined".
56 static const PRUnichar sUserdefined
[] = {'u', 's', 'e', 'r', 'd', 'e', 'f',
57 'i', 'n', 'e', 'd', 0};
// Character table; judging by the name, these presumably terminate an
// attribute value. Only the first six entries are visible in this excerpt.
59 static const PRUnichar kAttributeTerminalChars
[] = {
60 PRUnichar('&'), PRUnichar('\t'), PRUnichar('\n'),
61 PRUnichar('\r'), PRUnichar(' '), PRUnichar('>'),
// Forward declaration. ConsumeEntity calls this with the integer parsed from
// a '#' entity; the definition is not part of this excerpt.
65 static void AppendNCR(nsSubstring
& aString
, PRInt32 aNCRValue
);
67 * Consumes an entity from aScanner and expands it into aString.
69 * @param aString The target string to append the entity to.
70 * @param aScanner Controller of underlying input source
71 * @param aIECompatible Controls whether we respect entities with values >
72 * 255 and no terminating semicolon.
73 * @param aFlag If NS_IPARSER_FLAG_VIEW_SOURCE do not reduce entities...
74 * @return error result
// Reads one entity from aScanner and appends its expansion (or its literal
// text when it turns out not to be an entity) to aString. Named and numeric
// entities are only reduced when NS_IPARSER_FLAG_VIEW_SOURCE is not set in
// aFlag. Several declarations (ch, amp, entity, err) and the remaining
// parameters are not visible in this excerpt.
77 ConsumeEntity(nsScannerSharedSubstring
& aString
,
82 nsresult result
= NS_OK
;
// Peek with offset 1 -- presumably the character right after the '&'.
85 result
= aScanner
.Peek(ch
, 1);
87 if (NS_SUCCEEDED(result
)) {
89 PRInt32 theNCRValue
= 0;
// Case 1: the entity starts with an ASCII letter.
92 if (nsCRT::IsAsciiAlpha(ch
) && !(aFlag
& NS_IPARSER_FLAG_VIEW_SOURCE
)) {
93 result
= CEntityToken::ConsumeEntity(ch
, entity
, aScanner
);
94 if (NS_SUCCEEDED(result
)) {
95 theNCRValue
= nsHTMLEntities::EntityToUnicode(entity
);
// Last character of the consumed entity text; compared against ';' below.
96 PRUnichar theTermChar
= entity
.Last();
97 // If an entity value is greater than 255 then:
98 // Nav 4.x does not treat it as an entity,
99 // IE treats it as an entity if terminated with a semicolon.
102 nsSubstring
&writable
= aString
.writable();
// A negative lookup result, or (in IE-compatible mode) a value above 255
// that is not terminated by ';', is emitted verbatim as '&' + text.
103 if (theNCRValue
< 0 ||
104 (aIECompatible
&& theNCRValue
> 255 && theTermChar
!= ';')) {
105 // Looks like we're not dealing with an entity
106 writable
.Append(kAmpersand
);
107 writable
.Append(entity
);
109 // A valid entity so reduce it.
110 writable
.Append(PRUnichar(theNCRValue
));
// Case 2: the entity starts with '#'.
113 } else if (ch
== kHashsign
&& !(aFlag
& NS_IPARSER_FLAG_VIEW_SOURCE
)) {
114 result
= CEntityToken::ConsumeEntity(ch
, entity
, aScanner
);
115 if (NS_SUCCEEDED(result
)) {
116 nsSubstring
&writable
= aString
.writable();
117 if (result
== NS_HTMLTOKENS_NOT_AN_ENTITY
) {
118 // Looked like an entity but it's not
119 aScanner
.GetChar(amp
);
120 writable
.Append(amp
);
// Otherwise convert the entity text to an integer and append it via
// AppendNCR.
124 theNCRValue
= entity
.ToInteger(&err
, kAutoDetect
);
125 AppendNCR(writable
, theNCRValue
);
129 // What we thought was an entity is not really an entity...
130 aScanner
.GetChar(amp
);
131 aString
.writable().Append(amp
);
139 * This general-purpose method is used when you want to
140 * consume an attribute's text value.
141 * Note: It also reduces entities.
143 * @param aNewlineCount -- the newline count to increment when hitting newlines
144 * @param aScanner -- controller of underlying input source
145 * @param aTerminalChars -- characters that stop consuming attribute.
146 * @param aAllowNewlines -- whether to allow newlines in the value.
147 * XXX it would be nice to roll this info into
148 * aTerminalChars somehow....
149 * @param aIECompatEntities IE treats entities with values > 255 as
150 * entities only if they're terminated with a
151 * semicolon. This is true to follow that behavior
152 * and false to treat all values as entities.
153 * @param aFlag - contains information such as |dtd mode|view mode|doctype|etc...
154 * @return error result
// Repeatedly reads from the scanner into aString until aEndCondition is hit,
// then handles the stopping character: '&' is handed to ConsumeEntity, and
// CR / LF are copied into aString when aAllowNewlines is set. The loop runs
// while the result is a success code and 'done' is false. The aScanner and
// aFlag parameters, the declaration of 'ch' and the code that sets 'done'
// are not visible in this excerpt.
157 ConsumeUntil(nsScannerSharedSubstring
& aString
,
158 PRInt32
& aNewlineCount
,
160 const nsReadEndCondition
& aEndCondition
,
161 PRBool aAllowNewlines
,
162 PRBool aIECompatEntities
,
165 nsresult result
= NS_OK
;
166 PRBool done
= PR_FALSE
;
169 result
= aScanner
.ReadUntil(aString
, aEndCondition
, PR_FALSE
);
170 if (NS_SUCCEEDED(result
)) {
173 if (ch
== kAmpersand
) {
174 result
= ConsumeEntity(aString
, aScanner
, aIECompatEntities
, aFlag
);
175 } else if (ch
== kCR
&& aAllowNewlines
) {
// Consume the CR, then look at the following character to tell a CR-LF
// pair from a lone CR.
176 aScanner
.GetChar(ch
);
177 result
= aScanner
.Peek(ch
);
178 if (NS_SUCCEEDED(result
)) {
179 nsSubstring
&writable
= aString
.writable();
180 if (ch
== kNewLine
) {
// CR-LF: keep both characters and consume the LF too.
181 writable
.AppendLiteral("\r\n");
182 aScanner
.GetChar(ch
);
// Lone CR: append just the '\r'.
184 writable
.Append(PRUnichar('\r'));
188 } else if (ch
== kNewLine
&& aAllowNewlines
) {
189 aScanner
.GetChar(ch
);
190 aString
.writable().Append(PRUnichar('\n'));
196 } while (NS_SUCCEEDED(result
) && !done
);
201 /**************************************************************
202 And now for the token classes...
203 **************************************************************/
206 * Constructor from tag id
// Base-class constructor taking the tag id; its initializer list and body
// are not visible in this excerpt.
208 CHTMLToken::CHTMLToken(eHTMLTags aTag
)
// Destructor; the body is not visible in this excerpt.
214 CHTMLToken::~CHTMLToken()
219 * Constructor from tag id
// Builds a start token from a tag id. The visible statements reset
// mContainerInfo to eFormUnknown and mAttributed to PR_FALSE.
221 CStartToken::CStartToken(eHTMLTags aTag
)
225 mContainerInfo
= eFormUnknown
;
227 mAttributed
= PR_FALSE
;
// Builds a start token from a tag name only: the base class is initialized
// with eHTMLTag_unknown and the name is copied into mTextValue.
231 CStartToken::CStartToken(const nsAString
& aName
)
232 : CHTMLToken(eHTMLTag_unknown
)
235 mContainerInfo
= eFormUnknown
;
236 mTextValue
.Assign(aName
);
238 mAttributed
= PR_FALSE
;
// Builds a start token from both a tag name and a tag id; the name is copied
// into mTextValue. The base-class initializer is not visible in this excerpt.
242 CStartToken::CStartToken(const nsAString
& aName
, eHTMLTags aTag
)
246 mContainerInfo
= eFormUnknown
;
247 mTextValue
.Assign(aName
);
249 mAttributed
= PR_FALSE
;
254 * This method returns the typeid (the tag type) for this token.
// If mTypeID is still eHTMLTag_unknown, resolve it by looking up mTextValue
// in nsHTMLTags and cache the result in mTypeID. The return statement is not
// visible in this excerpt.
257 CStartToken::GetTypeID()
259 if (eHTMLTag_unknown
== mTypeID
) {
260 mTypeID
= nsHTMLTags::LookupTag(mTextValue
);
// Token-type accessor; the body is not visible in this excerpt.
266 CStartToken::GetTokenType()
// Setter taking a boolean aValue; the body is not visible in this excerpt.
272 CStartToken::SetEmpty(PRBool aValue
)
// Counterpart query to SetEmpty; the body is not visible in this excerpt.
278 CStartToken::IsEmpty()
284 * Consume the identifier portion of the start tag
// Reads the tag identifier of a start tag from aScanner and records it in
// mTypeID / mTextValue. Two paths are visible: with NS_IPARSER_FLAG_HTML set
// the text is only stored for user-defined tags or in view-source mode; on
// the other path the text is always stored and the id is looked up from it.
// Trailing whitespace is skipped unless viewing source.
287 CStartToken::Consume(PRUnichar aChar
, nsScanner
& aScanner
, PRInt32 aFlag
)
289 // If you're here, we've already Consumed the < char, and are
290 // ready to Consume the rest of the open tag identifier.
291 // Stop consuming as soon as you see a space or a '>'.
292 // NOTE: We don't Consume the tag attributes here, nor do we eat the ">"
294 nsresult result
= NS_OK
;
295 nsScannerSharedSubstring tagIdent
;
297 if (aFlag
& NS_IPARSER_FLAG_HTML
) {
298 result
= aScanner
.ReadTagIdentifier(tagIdent
);
299 mTypeID
= (PRInt32
)nsHTMLTags::LookupTag(tagIdent
.str());
300 // Save the original tag string if this is user-defined or if we
301 // are viewing source
302 if (eHTMLTag_userdefined
== mTypeID
||
303 (aFlag
& NS_IPARSER_FLAG_VIEW_SOURCE
)) {
304 mTextValue
= tagIdent
.str();
// Second path (the 'else' line itself is not visible in this excerpt):
// always keep the text and derive the id from it.
307 result
= aScanner
.ReadTagIdentifier(tagIdent
);
308 mTextValue
= tagIdent
.str();
309 mTypeID
= nsHTMLTags::LookupTag(mTextValue
);
// Whitespace after the identifier is skipped (counting newlines into
// mNewlineCount) except in view-source mode.
312 if (NS_SUCCEEDED(result
) && !(aFlag
& NS_IPARSER_FLAG_VIEW_SOURCE
)) {
313 result
= aScanner
.SkipWhitespace(mNewlineCount
);
// EOF on a non-incremental scanner: the handling statement is not visible
// in this excerpt.
316 if (kEOF
== result
&& !aScanner
.IsIncremental()) {
317 // Take what we can get.
// For a known tag id (strictly between eHTMLTag_unknown and eHTMLTag_text)
// with an empty mTextValue, fill mTextValue with the canonical tag name from
// nsHTMLTags. The return statement is not visible in this excerpt.
325 CStartToken::GetStringValue()
327 if (eHTMLTag_unknown
< mTypeID
&& mTypeID
< eHTMLTag_text
) {
328 if (!mTextValue
.Length()) {
329 mTextValue
.Assign(nsHTMLTags::GetStringValue((nsHTMLTag
) mTypeID
));
// Replaces the contents of anOutputString with this token's source text:
// the string is truncated first, then AppendSourceTo does the work.
336 CStartToken::GetSource(nsString
& anOutputString
)
338 anOutputString
.Truncate();
339 AppendSourceTo(anOutputString
);
// Appends "<name>" to anOutputString. The name is mTextValue when it is not
// empty; otherwise GetTagName(mTypeID) is used.
343 CStartToken::AppendSourceTo(nsAString
& anOutputString
)
345 anOutputString
.Append(PRUnichar('<'));
347 * Watch out for Bug 15204
349 if (!mTextValue
.IsEmpty()) {
350 anOutputString
.Append(mTextValue
);
352 anOutputString
.Append(GetTagName(mTypeID
));
355 anOutputString
.Append(PRUnichar('>'));
// Builds an end token from a tag id; the initializer list and body are not
// visible in this excerpt.
358 CEndToken::CEndToken(eHTMLTags aTag
)
// Builds an end token from a tag name only: the base class is initialized
// with eHTMLTag_unknown and the name is copied into mTextValue.
363 CEndToken::CEndToken(const nsAString
& aName
)
364 : CHTMLToken(eHTMLTag_unknown
)
366 mTextValue
.Assign(aName
);
// Builds an end token from both a tag name and a tag id; the name is copied
// into mTextValue. The base-class initializer is not visible in this excerpt.
369 CEndToken::CEndToken(const nsAString
& aName
, eHTMLTags aTag
)
372 mTextValue
.Assign(aName
);
// Reads the tag identifier of an end tag from aScanner and records it in
// mTypeID / mTextValue. The visible logic mirrors CStartToken::Consume:
// with NS_IPARSER_FLAG_HTML set the text is kept only for user-defined tags
// or in view-source mode; on the other path the text is always kept.
376 CEndToken::Consume(PRUnichar aChar
, nsScanner
& aScanner
, PRInt32 aFlag
)
378 nsresult result
= NS_OK
;
379 nsScannerSharedSubstring tagIdent
;
381 if (aFlag
& NS_IPARSER_FLAG_HTML
) {
382 result
= aScanner
.ReadTagIdentifier(tagIdent
);
384 mTypeID
= (PRInt32
)nsHTMLTags::LookupTag(tagIdent
.str());
385 // Save the original tag string if this is user-defined or if we
386 // are viewing source
387 if (eHTMLTag_userdefined
== mTypeID
||
388 (aFlag
& NS_IPARSER_FLAG_VIEW_SOURCE
)) {
389 mTextValue
= tagIdent
.str();
// Second path (the 'else' line itself is not visible in this excerpt):
// always keep the text and derive the id from it.
392 result
= aScanner
.ReadTagIdentifier(tagIdent
);
393 mTextValue
= tagIdent
.str();
394 mTypeID
= nsHTMLTags::LookupTag(mTextValue
);
// Skip whitespace after the identifier (counting newlines into
// mNewlineCount) except in view-source mode.
397 if (NS_SUCCEEDED(result
) && !(aFlag
& NS_IPARSER_FLAG_VIEW_SOURCE
)) {
398 result
= aScanner
.SkipWhitespace(mNewlineCount
);
// EOF on a non-incremental scanner: the handling statement is not visible
// in this excerpt.
401 if (kEOF
== result
&& !aScanner
.IsIncremental()) {
402 // Take what we can get.
411 * Asks the token to determine the <i>HTMLTag type</i> of
412 * the token. This turns around and looks up the tag name
413 * in the tag dictionary.
// If mTypeID is still eHTMLTag_unknown, resolve it by looking up mTextValue
// in nsHTMLTags and cache the result.
416 CEndToken::GetTypeID()
418 if (eHTMLTag_unknown
== mTypeID
) {
419 mTypeID
= nsHTMLTags::LookupTag(mTextValue
);
// NOTE(review): the condition guarding this remap to eHTMLTag_ul is not
// visible in this excerpt -- confirm which tag ids are folded into 'ul'.
423 mTypeID
= eHTMLTag_ul
;
// Token-type accessor; the body is not visible in this excerpt.
435 CEndToken::GetTokenType()
// For a known tag id (strictly between eHTMLTag_unknown and eHTMLTag_text)
// with an empty mTextValue, fill mTextValue with the canonical tag name from
// nsHTMLTags. The return statement is not visible in this excerpt.
441 CEndToken::GetStringValue()
443 if (eHTMLTag_unknown
< mTypeID
&& mTypeID
< eHTMLTag_text
) {
444 if (!mTextValue
.Length()) {
445 mTextValue
.Assign(nsHTMLTags::GetStringValue((nsHTMLTag
) mTypeID
));
// Replaces the contents of anOutputString with this token's source text:
// the string is truncated first, then AppendSourceTo does the work.
452 CEndToken::GetSource(nsString
& anOutputString
)
454 anOutputString
.Truncate();
455 AppendSourceTo(anOutputString
);
// Appends "</name>" to anOutputString. The name is mTextValue when it is not
// empty; otherwise GetTagName(mTypeID) is used.
459 CEndToken::AppendSourceTo(nsAString
& anOutputString
)
461 anOutputString
.AppendLiteral("</");
462 if (!mTextValue
.IsEmpty()) {
463 anOutputString
.Append(mTextValue
);
465 anOutputString
.Append(GetTagName(mTypeID
));
468 anOutputString
.Append(PRUnichar('>'));
// Default constructor: a text token always carries the tag id eHTMLTag_text.
471 CTextToken::CTextToken()
472 : CHTMLToken(eHTMLTag_text
)
// Constructs a text token (tag id eHTMLTag_text) whose mTextValue is rebound
// to aName.
476 CTextToken::CTextToken(const nsAString
& aName
)
477 : CHTMLToken(eHTMLTag_text
)
479 mTextValue
.Rebind(aName
);
// Token-type accessor; the body is not visible in this excerpt.
483 CTextToken::GetTokenType()
// Returns the length of the text held in mTextValue.
489 CTextToken::GetTextLength()
491 return mTextValue
.Length();
// Consumes a run of plain text from aScanner and binds it to mTextValue.
// Scanning stops at the terminal characters listed in theTerminalsChars;
// CR and LF are handled inside the loop (CR-LF pairs are consumed together,
// a standalone CR is rewritten to LF in the scanner buffer). The statements
// that set 'done' and 'start', and the final return, are not visible in this
// excerpt.
495 CTextToken::Consume(PRUnichar aChar
, nsScanner
& aScanner
, PRInt32 aFlag
)
// Terminal set; only the first four entries are visible in this excerpt.
497 static const PRUnichar theTerminalsChars
[] =
498 { PRUnichar('\n'), PRUnichar('\r'), PRUnichar('&'), PRUnichar('<'),
500 static const nsReadEndCondition
theEndCondition(theTerminalsChars
);
501 nsresult result
= NS_OK
;
502 PRBool done
= PR_FALSE
;
503 nsScannerIterator origin
, start
, end
;
505 // Start scanning after the first character, because we know it to
506 // be part of this text token (we wouldn't have come here if it weren't)
507 aScanner
.CurrentPosition(origin
);
509 aScanner
.EndReading(end
);
511 NS_ASSERTION(start
!= end
, "Calling CTextToken::Consume when already at the "
512 "end of a document is a bad idea.");
514 aScanner
.SetPosition(++start
);
516 while (NS_OK
== result
&& !done
) {
517 result
= aScanner
.ReadUntil(start
, end
, theEndCondition
, PR_FALSE
);
518 if (NS_OK
== result
) {
519 result
= aScanner
.Peek(aChar
);
521 if (NS_OK
== result
&& (kCR
== aChar
|| kNewLine
== aChar
)) {
525 // It's a carriage return. See if this is part of a CR-LF pair (in
526 // which case we need to treat it as one newline). If we're at the
527 // edge of a packet, then leave the CR on the scanner, since it
528 // could still be part of a CR-LF pair. Otherwise, it isn't.
529 PRUnichar theNextChar
;
530 result
= aScanner
.Peek(theNextChar
, 1);
532 if (result
== kEOF
&& aScanner
.IsIncremental()) {
536 if (NS_SUCCEEDED(result
)) {
537 // Actually get the carriage return.
538 aScanner
.GetChar(aChar
);
541 if (kLF
== theNextChar
) {
542 // If the "\r" is followed by a "\n", don't replace it and let
543 // it be ignored by the layout system.
545 aScanner
.GetChar(theNextChar
);
547 // If it is standalone, replace the "\r" with a "\n" so that it
548 // will be considered by the layout system.
549 aScanner
.ReplaceCharacter(end
, kLF
);
556 aScanner
.GetChar(aChar
);
567 // Note: This function is only called from nsHTMLTokenizer::ConsumeText. If
568 // we return an error result from the final buffer, then it is responsible
569 // for turning it into an NS_OK result.
// Bind everything from the starting position up to 'end' as this token's
// text.
570 aScanner
.BindSubstring(mTextValue
, origin
, end
);
576 * Consume as much clear text from scanner as possible.
577 * The scanner is left on the < of the perceived end tag.
579 * @param aChar -- last char consumed from stream
580 * @param aConservativeConsume -- controls our handling of content with no
581 * terminating string.
582 * @param aIgnoreComments -- whether or not we should take comments into
583 * account in looking for the end tag.
584 * @param aScanner -- controller of underlying input source
585 * @param aEndTagName -- the terminal tag name.
586 * @param aFlag -- dtd modes and such.
587 * @param aFlushTokens -- PR_TRUE if we found the terminal tag.
588 * @return error result
// Consumes raw character data up to the terminal string "</" + aEndTagName
// (matched case-insensitively) and binds it to mTextValue, setting
// aFlushTokens to PR_TRUE when the terminal string is found. Unless
// aIgnoreComments is set (or strict mode is on), a terminal string that
// falls inside an unclosed "<!--" comment is treated as premature and only
// remembered as a fallback. Some parameters (aScanner, aFlag) and several
// interior lines are not visible in this excerpt.
591 CTextToken::ConsumeCharacterData(PRBool aIgnoreComments
,
593 const nsAString
& aEndTagName
,
595 PRBool
& aFlushTokens
)
597 nsresult result
= NS_OK
;
598 nsScannerIterator theStartOffset
, theCurrOffset
, theTermStrPos
,
599 theStartCommentPos
, theAltTermStrPos
, endPos
;
600 PRBool done
= PR_FALSE
;
601 PRBool theLastIteration
= PR_FALSE
;
603 aScanner
.CurrentPosition(theStartOffset
);
604 theCurrOffset
= theStartOffset
;
605 aScanner
.EndReading(endPos
);
// endPos doubles as the "not found yet" value for all three positions.
606 theTermStrPos
= theStartCommentPos
= theAltTermStrPos
= endPos
;
608 // ALGORITHM: *** The performance is based on correctness of the document ***
609 // 1. Look for a '<' character. This could be
610 // a) Start of a comment (<!--),
611 // b) Start of the terminal string, or
612 // c) a start of a tag.
613 // We are interested in a) and b). c) is ignored because in CDATA we
614 // don't care for tags.
615 // NOTE: Technically speaking in CDATA we should ignore the comments too!
616 // But for compatibility we don't.
617 // 2. Having the offset, for '<', search for the terminal string from there
618 // on and record its offset.
619 // 3. From the same '<' offset also search for start of a comment '<!--'.
620 // If found search for end comment '-->' between the terminal string and
621 // '<!--'. If you did not find the end comment, then we have a malformed
622 // document, i.e., this section has a premature terminal string Ex.
623 // <SCRIPT><!-- document.write('</SCRIPT>') //--> </SCRIPT>. But record
624 // terminal string's offset if this is the first premature terminal
625 // string, and update the current offset to the terminal string
626 // (premature) offset and goto step 1.
627 // 4. Amen...If you found a terminal string and '-->'. Otherwise goto step 1.
628 // 5. If the end of the document is reached and if we still don't have the
629 // condition in step 4. then assume that the premature terminal string
630 // is the actual terminal string and goto step 1. This will be our last
631 // iteration. If there is no premature terminal string and we're being
632 // conservative in our consumption (aConservativeConsume), then don't
633 // consume anything from the scanner. Otherwise, we consume all the way
636 NS_NAMED_LITERAL_STRING(ltslash
, "</");
637 const nsString theTerminalString
= ltslash
+ aEndTagName
;
639 PRUint32 termStrLen
= theTerminalString
.Length();
640 while (result
== NS_OK
&& !done
) {
641 PRBool found
= PR_FALSE
;
642 nsScannerIterator gtOffset
, ltOffset
= theCurrOffset
;
// Walk over every '<' that still has at least termStrLen characters after
// it in the buffer.
643 while (FindCharInReadable(PRUnichar(kLessThan
), ltOffset
, endPos
) &&
644 ((PRUint32
)ltOffset
.size_forward() >= termStrLen
||
645 Distance(ltOffset
, endPos
) >= termStrLen
)) {
646 // Make a copy of the (presumed) end tag and
647 // do a case-insensitive comparison
649 nsScannerIterator
start(ltOffset
), end(ltOffset
);
650 end
.advance(termStrLen
);
// The match must be followed by end-of-buffer or one of the listed
// delimiter characters (the list is truncated in this excerpt).
652 if (CaseInsensitiveFindInReadable(theTerminalString
, start
, end
) &&
653 (end
== endPos
|| (*end
== '>' || *end
== ' ' ||
654 *end
== '\t' || *end
== '\n' ||
657 // Note that aIgnoreComments is only not set for <script>. We don't
658 // want to execute scripts that aren't in the form of: <script\s.*>
659 if ((end
== endPos
&& aIgnoreComments
) ||
660 FindCharInReadable(PRUnichar(kGreaterThan
), gtOffset
, endPos
)) {
662 theTermStrPos
= start
;
669 if (found
&& theTermStrPos
!= endPos
) {
// Comment-aware check: skipped in strict mode, on the last iteration,
// and when aIgnoreComments is set.
670 if (!(aFlag
& NS_IPARSER_FLAG_STRICT_MODE
) &&
671 !theLastIteration
&& !aIgnoreComments
) {
672 nsScannerIterator
endComment(ltOffset
);
673 endComment
.advance(5);
675 if ((theStartCommentPos
== endPos
) &&
676 FindInReadable(NS_LITERAL_STRING("<!--"), theCurrOffset
,
678 theStartCommentPos
= theCurrOffset
;
681 if (theStartCommentPos
!= endPos
) {
682 // Search for --> between <!-- and </TERMINALSTRING>.
683 theCurrOffset
= theStartCommentPos
;
684 nsScannerIterator
terminal(theTermStrPos
);
685 if (!RFindInReadable(NS_LITERAL_STRING("-->"),
686 theCurrOffset
, terminal
)) {
687 // If you're here it means that we have a bogus terminal string.
688 // Even though it is bogus, the position of the terminal string
689 // could be helpful in case we hit the rock bottom.
690 if (theAltTermStrPos
== endPos
) {
691 // But we only want to remember the first bogus terminal string.
692 theAltTermStrPos
= theTermStrPos
;
695 // We did not find '-->' so keep searching for terminal string.
696 theCurrOffset
= theTermStrPos
;
697 theCurrOffset
.advance(termStrLen
);
// Success path: the token text is everything before the terminal string,
// and the scanner is repositioned at ltOffset.
703 aScanner
.BindSubstring(mTextValue
, theStartOffset
, theTermStrPos
);
704 aScanner
.SetPosition(ltOffset
);
706 // We found </SCRIPT> or </STYLE>...permit flushing -> Ref: Bug 22485
707 aFlushTokens
= PR_TRUE
;
710 // We end up here if:
711 // a) when the buffer runs out of data.
712 // b) when the terminal string is not found.
713 if (!aScanner
.IsIncremental()) {
714 if (theAltTermStrPos
!= endPos
) {
715 // If you're here it means that we hit the rock bottom and therefore
716 // switch to plan B, since we have an alternative terminating string.
717 theCurrOffset
= theAltTermStrPos
;
718 theLastIteration
= PR_TRUE
;
720 // Oops, We fell all the way down to the end of the document.
721 done
= PR_TRUE
; // Do this to fix Bug. 35456
722 result
= kFakeEndTag
;
723 aScanner
.BindSubstring(mTextValue
, theStartOffset
, endPos
);
724 aScanner
.SetPosition(endPos
);
// On success, derive the newline count from the bound text.
732 if (result
== NS_OK
) {
733 mNewlineCount
= mTextValue
.CountChar(kNewLine
);
740 * Consume as much clear text from scanner as possible. Reducing entities.
741 * The scanner is left on the < of the perceived end tag.
743 * @param aChar -- last char consumed from stream
744 * @param aConservativeConsume -- controls our handling of content with no
745 * terminating string.
746 * @param aScanner -- controller of underlying input source
747 * @param aEndTagName -- the terminal tag name.
748 * @param aFlag -- dtd modes and such.
749 * @param aFlushTokens -- PR_TRUE if we found the terminal tag.
750 * @return error result
// Consumes parsed character data (entities are reduced through ConsumeUntil)
// up to the terminal string "</" + aEndTagName, accumulating the text in a
// local shared substring that is finally rebound to mTextValue. Optionally
// drops a leading newline, and treats "<!--" comments via a stack-allocated
// CCommentToken. Some parameters (aScanner, aFlag and the flag written
// through 'aFound') and several interior lines, including the opening 'do',
// are not visible in this excerpt.
753 CTextToken::ConsumeParsedCharacterData(PRBool aDiscardFirstNewline
,
754 PRBool aConservativeConsume
,
756 const nsAString
& aEndTagName
,
760 // This function is fairly straightforward except if there is no terminating
761 // string. If there is, we simply loop through all of the entities, reducing
762 // them as necessary and skipping over non-terminal strings starting with <.
763 // If there is *no* terminal string, then we examine aConservativeConsume.
764 // If we want to be conservative, we backtrack to the first place in the
765 // document that looked like the end of PCDATA (i.e., the first tag). This
766 // is for compatibility and so we don't regress bug 42945. If we are not
767 // conservative, then we consume everything, all the way up to the end of
// Terminal set; only the first four entries are visible in this excerpt.
770 static const PRUnichar terminalChars
[] = {
771 PRUnichar('\r'), PRUnichar('\n'), PRUnichar('&'), PRUnichar('<'),
774 static const nsReadEndCondition
theEndCondition(terminalChars
);
776 nsScannerIterator currPos
, endPos
, altEndPos
;
777 PRUint32 truncPos
= 0;
778 aScanner
.CurrentPosition(currPos
);
779 aScanner
.EndReading(endPos
);
783 nsScannerSharedSubstring theContent
;
786 NS_NAMED_LITERAL_STRING(commentStart
, "<!--");
787 NS_NAMED_LITERAL_STRING(ltslash
, "</");
788 const nsString theTerminalString
= ltslash
+ aEndTagName
;
789 PRUint32 termStrLen
= theTerminalString
.Length();
790 PRUint32 commentStartLen
= commentStart
.Length();
792 nsresult result
= NS_OK
;
794 // Note that if we're already at the end of the document, the ConsumeUntil
795 // will fail, and we'll do the right thing.
797 result
= ConsumeUntil(theContent
, mNewlineCount
, aScanner
,
798 theEndCondition
, PR_TRUE
, PR_FALSE
, aFlag
);
800 if (aDiscardFirstNewline
&&
801 (NS_SUCCEEDED(result
) || !aScanner
.IsIncremental()) &&
802 !(aFlag
& NS_IPARSER_FLAG_VIEW_SOURCE
)) {
803 // Check if the very first character is a newline, and if so discard it.
804 // Note that we don't want to discard it in view source!
805 // Also note that this has to happen here (as opposed to before the
806 // ConsumeUntil) because we have to expand any entities.
807 // XXX It would be nice to be able to do this without calling
809 const nsSubstring
&firstChunk
= theContent
.str();
810 if (!firstChunk
.IsEmpty()) {
812 PRUnichar newline
= firstChunk
.First();
814 if (newline
== kCR
|| newline
== kNewLine
) {
817 if (firstChunk
.Length() > 1) {
818 if (newline
== kCR
&& firstChunk
.CharAt(1) == kNewLine
) {
819 // Handle \r\n = 1 newline.
822 // Note: \n\r = 2 newlines.
// Drop the leading newline by keeping the substring from 'where' on
// (the computation of 'where' is not visible in this excerpt).
827 theContent
.writable() = Substring(firstChunk
, where
);
// The first-newline check only applies to the first chunk.
831 aDiscardFirstNewline
= PR_FALSE
;
833 if (NS_FAILED(result
)) {
834 if (kEOF
== result
&& !aScanner
.IsIncremental()) {
835 aFound
= PR_TRUE
; // this is as good as it gets.
836 result
= kFakeEndTag
;
838 if (aConservativeConsume
&& altEndPos
!= endPos
) {
839 // We ran out of room looking for a </title>. Go back to the first
840 // place that looked like a tag and use that as our stopping point.
841 theContent
.writable().Truncate(truncPos
);
842 aScanner
.SetPosition(altEndPos
, PR_FALSE
, PR_TRUE
);
844 // else we take everything we consumed.
845 mTextValue
.Rebind(theContent
.str());
853 aScanner
.CurrentPosition(currPos
);
854 aScanner
.GetChar(ch
); // this character must be '&' or '<'
856 if (ch
== kLessThan
&& altEndPos
== endPos
) {
857 // Keep this position in case we need it for later.
859 truncPos
= theContent
.str().Length();
// Is there room for the terminal string at the current position?
862 if (Distance(currPos
, endPos
) >= termStrLen
) {
863 nsScannerIterator
start(currPos
), end(currPos
);
864 end
.advance(termStrLen
);
866 if (CaseInsensitiveFindInReadable(theTerminalString
, start
, end
)) {
867 if (end
!= endPos
&& (*end
== '>' || *end
== ' ' ||
868 *end
== '\t' || *end
== '\n' ||
871 mTextValue
.Rebind(theContent
.str());
873 // Note: This SetPosition() is actually going backwards from the
874 // scanner's mCurrentPosition (so we pass aReverse == PR_TRUE). This
875 // is because we call GetChar() above after we get the current
877 aScanner
.SetPosition(currPos
, PR_FALSE
, PR_TRUE
);
882 // IE only consumes <!-- --> as comments in PCDATA.
883 if (Distance(currPos
, endPos
) >= commentStartLen
) {
884 nsScannerIterator
start(currPos
), end(currPos
);
885 end
.advance(commentStartLen
);
887 if (CaseInsensitiveFindInReadable(commentStart
, start
, end
)) {
888 CCommentToken consumer
; // stack allocated.
890 // CCommentToken expects us to be on the '-'
891 aScanner
.SetPosition(currPos
.advance(2));
893 // In quirks mode we consume too many things as comments, so pretend
894 // that we're not by modifying aFlag.
895 result
= consumer
.Consume(*currPos
, aScanner
,
896 (aFlag
& ~NS_IPARSER_FLAG_QUIRKS_MODE
) |
897 NS_IPARSER_FLAG_STRICT_MODE
);
898 if (kEOF
== result
) {
899 // This can only happen if we're really out of space.
901 } else if (kNotAComment
== result
) {
902 // Fall through and consume this as text.
903 aScanner
.CurrentPosition(currPos
);
904 aScanner
.SetPosition(currPos
.advance(1));
// A real comment: copy its source text into the content and add its
// newlines to this token's count.
906 consumer
.AppendSourceTo(theContent
.writable());
907 mNewlineCount
+= consumer
.GetNewlineCount();
914 // We did not find the terminal string yet so
915 // include the character that stopped consumption.
916 theContent
.writable().Append(ch
);
917 } while (currPos
!= endPos
);
// Copies the full range of mTextValue (begin to end) into aStr via
// CopyUnicodeTo.
923 CTextToken::CopyTo(nsAString
& aStr
)
925 nsScannerIterator start
, end
;
926 mTextValue
.BeginReading(start
);
927 mTextValue
.EndReading(end
);
928 CopyUnicodeTo(start
, end
, aStr
);
// Returns the token text by calling AsString() on mTextValue.
931 const nsSubstring
& CTextToken::GetStringValue()
933 return mTextValue
.AsString();
// Binds mTextValue to the scanner range [aStart, aEnd) by delegating to
// aScanner->BindSubstring. aScanner is dereferenced without a null check.
937 CTextToken::Bind(nsScanner
* aScanner
, nsScannerIterator
& aStart
,
938 nsScannerIterator
& aEnd
)
940 aScanner
->BindSubstring(mTextValue
, aStart
, aEnd
);
// Rebinds mTextValue to the given string.
944 CTextToken::Bind(const nsAString
& aStr
)
946 mTextValue
.Rebind(aStr
);
// Builds a CDATA-section token from a tag id; the initializer list and body
// are not visible in this excerpt.
949 CCDATASectionToken::CCDATASectionToken(eHTMLTags aTag
)
// Builds a CDATA-section token from a name: the base class is initialized
// with eHTMLTag_unknown and the name is copied into mTextValue.
954 CCDATASectionToken::CCDATASectionToken(const nsAString
& aName
)
955 : CHTMLToken(eHTMLTag_unknown
)
957 mTextValue
.Assign(aName
);
// Identifies this token as eToken_cdatasection.
961 CCDATASectionToken::GetTokenType()
963 return eToken_cdatasection
;
967 * Consume as much marked text from scanner as possible.
968 * Note: This has to handle case: "<![ ! IE 5]>", in addition to "<![..[..]]>"
970 * @param aChar -- last char consumed from stream
971 * @param aScanner -- controller of underlying input source
972 * @return error result
// Consumes a marked section from aScanner into mTextValue. The loop reads up
// to the next CR, LF or ']' and then handles that character: newlines are
// normalized to '\n', and ']' starts the end-of-section handling described
// in the comments below. The aFlag parameter and several interior lines
// (including the ones that set 'done' and 'canClose') are not visible in
// this excerpt.
975 CCDATASectionToken::Consume(PRUnichar aChar
, nsScanner
& aScanner
,
978 static const PRUnichar theTerminalsChars
[] =
979 { PRUnichar('\r'), PRUnichar('\n'), PRUnichar(']'), PRUnichar(0) };
980 static const nsReadEndCondition
theEndCondition(theTerminalsChars
);
981 nsresult result
= NS_OK
;
982 PRBool done
= PR_FALSE
;
984 while (NS_OK
== result
&& !done
) {
985 result
= aScanner
.ReadUntil(mTextValue
, theEndCondition
, PR_FALSE
);
986 if (NS_OK
== result
) {
987 result
= aScanner
.Peek(aChar
);
988 if (kCR
== aChar
&& NS_OK
== result
) {
989 result
= aScanner
.GetChar(aChar
); // Strip off the \r
990 result
= aScanner
.Peek(aChar
); // Then see what's next.
991 if (NS_OK
== result
) {
// The branch conditions between these cases are not visible in this
// excerpt; per the inline comments, a second \r yields two newlines and
// \r\n yields one.
994 result
= aScanner
.GetChar(aChar
); // Strip off the \r
995 mTextValue
.AppendLiteral("\n\n");
1000 // Which means we saw \r\n, which becomes \n
1001 result
= aScanner
.GetChar(aChar
); // Strip off the \n
1005 mTextValue
.AppendLiteral("\n");
1010 } else if (kNewLine
== aChar
) {
1011 result
= aScanner
.GetChar(aChar
);
1012 mTextValue
.Append(aChar
);
1014 } else if (kRightSquareBracket
== aChar
) {
1015 PRBool canClose
= PR_FALSE
;
1016 result
= aScanner
.GetChar(aChar
); // Strip off the ]
1017 mTextValue
.Append(aChar
);
1018 result
= aScanner
.Peek(aChar
); // Then see what's next.
1019 if (NS_OK
== result
&& kRightSquareBracket
== aChar
) {
1020 result
= aScanner
.GetChar(aChar
); // Strip off the second ]
1021 mTextValue
.Append(aChar
);
1025 // The goal here is to not lose data from the page when encountering
1026 // markup like: <![endif]-->. This means that in normal parsing, we
1027 // allow ']' to end the marked section and just drop everything between
1028 // it and the '>'. In view-source mode, we cannot drop things on the
1029 // floor like that. In fact, to make view-source of XML with script in
1030 // CDATA sections at all bearable, we need to somewhat enforce the ']]>'
1031 // terminator for marked sections. So make the tokenization somewhat
1032 // different when in view-source _and_ dealing with a CDATA section.
1033 // XXX We should remember this StringBeginsWith test.
1034 PRBool inCDATA
= (aFlag
& NS_IPARSER_FLAG_VIEW_SOURCE
) &&
1035 StringBeginsWith(mTextValue
, NS_LITERAL_STRING("[CDATA["));
1037 // Consume all right square brackets to catch cases such as:
1040 result
= aScanner
.Peek(aChar
);
1041 if (result
!= NS_OK
|| aChar
!= kRightSquareBracket
) {
1045 mTextValue
.Append(aChar
);
1046 aScanner
.GetChar(aChar
);
1049 nsAutoString dummy
; // Skip any bad data
1050 result
= aScanner
.ReadUntil(dummy
, kGreaterThan
, PR_FALSE
);
// Outside view-source CDATA any '>' closes the section; inside it the '>'
// is only consumed when canClose is set.
1052 if (NS_OK
== result
&&
1053 (!inCDATA
|| (canClose
&& kGreaterThan
== aChar
))) {
1054 result
= aScanner
.GetChar(aChar
); // Strip off the >
1063 if (kEOF
== result
&& !aScanner
.IsIncremental()) {
1064 // We ran out of space looking for the end of this CDATA section.
1065 // In order to not completely lose the entire section, treat everything
1066 // until the end of the document as part of the CDATA section and let
1067 // the DTD handle it.
// String-value accessor; the body is not visible in this excerpt.
1076 CCDATASectionToken::GetStringValue()
// Default constructor: the token carries the tag id eHTMLTag_markupDecl.
1082 CMarkupDeclToken::CMarkupDeclToken()
1083 : CHTMLToken(eHTMLTag_markupDecl
)
// Constructs a markup-declaration token (tag id eHTMLTag_markupDecl) whose
// mTextValue is rebound to aName.
1087 CMarkupDeclToken::CMarkupDeclToken(const nsAString
& aName
)
1088 : CHTMLToken(eHTMLTag_markupDecl
)
1090 mTextValue
.Rebind(aName
);
// Identifies this token as eToken_markupDecl.
1094 CMarkupDeclToken::GetTokenType()
1096 return eToken_markupDecl
;
1100 * Consume as much declaration from scanner as possible.
1101 * Declaration is a markup declaration of ELEMENT, ATTLIST, ENTITY or
1102 * NOTATION, which can span multiple lines and ends in >.
1104 * @param aChar -- last char consumed from stream
1105 * @param aScanner -- controller of underlying input source
1106 * @return error result
// Consumes a markup declaration from aScanner and binds the consumed range
// to mTextValue. The loop stops at newlines and quote characters (the rest
// of the terminal list is not visible here); 'quote' tracks an open quote
// character. The switch that dispatches on the stopping character, the
// remaining parameters and several interior lines are not visible in this
// excerpt.
1109 CMarkupDeclToken::Consume(PRUnichar aChar
, nsScanner
& aScanner
,
1112 static const PRUnichar theTerminalsChars
[] =
1113 { PRUnichar('\n'), PRUnichar('\r'), PRUnichar('\''), PRUnichar('"'),
1116 static const nsReadEndCondition
theEndCondition(theTerminalsChars
);
1117 nsresult result
= NS_OK
;
1118 PRBool done
= PR_FALSE
;
// 0 means "not inside a quoted string".
1119 PRUnichar quote
= 0;
1121 nsScannerIterator origin
, start
, end
;
1122 aScanner
.CurrentPosition(origin
);
1125 while (NS_OK
== result
&& !done
) {
1126 aScanner
.SetPosition(start
);
1127 result
= aScanner
.ReadUntil(start
, end
, theEndCondition
, PR_FALSE
);
1128 if (NS_OK
== result
) {
1129 result
= aScanner
.Peek(aChar
);
1131 if (NS_OK
== result
) {
1132 PRUnichar theNextChar
= 0;
1133 if (kCR
== aChar
|| kNewLine
== aChar
) {
1134 result
= aScanner
.GetChar(aChar
); // Strip off the char
1135 result
= aScanner
.Peek(theNextChar
); // Then see what's next.
1139 // result = aScanner.GetChar(aChar);
1140 if (kLF
== theNextChar
) {
1141 // If the "\r" is followed by a "\n", don't replace it and
1142 // let it be ignored by the layout system
1144 result
= aScanner
.GetChar(theNextChar
);
1146 // If it is standalone, replace the "\r" with a "\n" so that
1147 // it will be considered by the layout system
1148 aScanner
.ReplaceCharacter(end
, kLF
);
// Compares the stopping character with the currently open quote; the
// statements acting on the result are not visible in this excerpt.
1161 if (quote
== aChar
) {
1173 // Note that start is wrong after this, we just avoid temp var
1175 aScanner
.SetPosition(start
); // Skip the >
1180 NS_ABORT_IF_FALSE(0, "should not happen, switch is missing cases?");
// Bind everything consumed since 'origin' as this token's text.
1189 aScanner
.BindSubstring(mTextValue
, origin
, end
);
1191 if (kEOF
== result
) {
1193 if (!aScanner
.IsIncremental()) {
// Returns the token text by calling AsString() on mTextValue.
1203 CMarkupDeclToken::GetStringValue()
1205 return mTextValue
.AsString();
// Default constructor: the token carries the tag id eHTMLTag_comment.
1209 CCommentToken::CCommentToken()
1210 : CHTMLToken(eHTMLTag_comment
)
// String constructor: passes eHTMLTag_comment to the CHTMLToken base and
// rebinds mComment to aName.
1214 CCommentToken::CCommentToken(const nsAString
& aName
)
1215 : CHTMLToken(eHTMLTag_comment
)
1217 mComment
.Rebind(aName
);
// Appends mCommentDecl (bound by the consume routines to the whole comment
// declaration) to anOutputString via AppendUnicodeTo.
1221 CCommentToken::AppendSourceTo(nsAString
& anOutputString
)
1223 AppendUnicodeTo(mCommentDecl
, anOutputString
);
// Helper for the strict comment scan. Walks a copy of aCurrent toward aEnd
// for as long as the dashes counter is not 2, testing each character against
// kGreaterThan and against the dash character.
// NOTE(review): the declaration of dashes, the bodies of both tests and the
// return statements are not visible in this listing; presumably aGt receives
// the position of the greater-than sign - confirm against the full source.
1227 IsCommentEnd(const nsScannerIterator
& aCurrent
, const nsScannerIterator
& aEnd
,
1228 nsScannerIterator
& aGt
)
1230 nsScannerIterator current
= aCurrent
;
1233 while (current
!= aEnd
&& dashes
!= 2) {
1234 if (*current
== kGreaterThan
) {
1238 if (*current
== PRUnichar('-')) {
// Strict-mode comment consumer.
// - lt is moved back two characters so that it covers the <! prefix.
// - A regular comment must open with <!--; after that opener the text is
//   searched for -- pairs with FindInReadable, and balancedComment is toggled
//   on every hit.
// - When a balanced pair is followed by a comment end (IsCommentEnd),
//   mComment is bound to the text between the delimiters, mCommentDecl to the
//   whole declaration up to and including the closing greater-than sign, and
//   the scanner is positioned after it.
// - When no opening -- was found (beginData == end), the first greater-than
//   sign terminates the token.
// - Otherwise an incremental scanner waits for more data, and a
//   non-incremental one is rewound to lt with kNotAComment returned.
// NOTE(review): the return type, the initial assignment of lt, several
// branch bodies and the other return statements are not visible in this
// listing.
1250 CCommentToken::ConsumeStrictComment(nsScanner
& aScanner
)
1252 // <!--[... -- ... -- ...]*-->
1253 /*********************************************************
1254 NOTE: This algorithm does a fine job of handling comments
1255 when they're formatted per spec, but if they're not
1256 we don't handle them well.
1257 *********************************************************/
1258 nsScannerIterator end
, current
, gt
, lt
;
1259 aScanner
.EndReading(end
);
1260 aScanner
.CurrentPosition(current
);
1262 nsScannerIterator beginData
= end
;
1265 lt
.advance(-2); // <!
1267 current
.advance(-1);
1269 // Regular comment must start with <!--
1270 if (*current
== kExclamation
&&
1271 ++current
!= end
&& *current
== kMinus
&&
1272 ++current
!= end
&& *current
== kMinus
&&
1274 nsScannerIterator currentEnd
= end
;
1275 PRBool balancedComment
= PR_FALSE
;
1276 NS_NAMED_LITERAL_STRING(dashes
, "--");
1277 beginData
= current
;
1279 while (FindInReadable(dashes
, current
, currentEnd
)) {
1282 balancedComment
= !balancedComment
; // We need to match '--' with '--'
1284 if (balancedComment
&& IsCommentEnd(current
, end
, gt
)) {
1286 current
.advance(-2);
1287 // Note: it's ok if beginData == current, (we'll copy an empty string)
1288 // and we need to bind mComment anyway.
1289 aScanner
.BindSubstring(mComment
, beginData
, current
);
1290 aScanner
.BindSubstring(mCommentDecl
, lt
, ++gt
);
1291 aScanner
.SetPosition(gt
);
1295 // Continue after the last '--'
1300 // If beginData == end, we did not find opening '--'
1301 if (beginData
== end
) {
1302 // This might have been empty comment: <!>
1303 // Or it could have been something completely bogus like: <!This is foobar>
1304 // Handle both cases below
1305 aScanner
.CurrentPosition(current
);
1306 beginData
= current
;
1307 if (FindCharInReadable('>', current
, end
)) {
1308 aScanner
.BindSubstring(mComment
, beginData
, current
);
1309 aScanner
.BindSubstring(mCommentDecl
, lt
, ++current
);
1310 aScanner
.SetPosition(current
);
1315 if (aScanner
.IsIncremental()) {
1316 // We got here because we saw the beginning of a comment,
1317 // but not yet the end, and we are still loading the page. In that
1318 // case the return value here will cause us to unwind,
1319 // wait for more content, and try again.
1320 // XXX For performance reasons we should cache where we were, and
1321 // continue from there for next call
1325 // There was no terminating string, parse this comment as text.
1326 aScanner
.SetPosition(lt
, PR_FALSE
, PR_TRUE
);
1327 return kNotAComment
;
// Quirks-mode comment consumer; the <! prefix has already been consumed and
// lt is moved back two characters to cover it.
// - Long form (text opens with two minus signs): every greater-than sign is
//   a candidate end. The first candidate is remembered in bestAltCommentEnd.
//   A candidate is accepted (goodComment) when it is preceded by two minus
//   signs, by two minus signs and an exclamation mark, or when the backward
//   walk reaches beginLastMinus. On acceptance mComment, mCommentDecl and the
//   scanner position are updated. If the data runs out on a non-incremental
//   scanner, the token ends at bestAltCommentEnd (see the comment below).
// - Short form: the first greater-than sign ends the comment; trailing
//   minus signs and an optional exclamation mark before it are examined by
//   walking current backwards.
// - If no greater-than sign exists and the scanner is not incremental, the
//   scanner is rewound to lt and kNotAComment is returned.
// NOTE(review): the return type, the declaration and initial assignment of
// lt, the iterator steps between the visible tests and most return
// statements are not visible in this listing.
1331 CCommentToken::ConsumeQuirksComment(nsScanner
& aScanner
)
1333 // <![-[-]] ... [[-]-|--!]>
1334 /*********************************************************
1335 NOTE: This algorithm does a fine job of handling comments
1336 commonly used, but it doesn't really consume them
1337 per spec (But then, neither does IE or Nav).
1338 *********************************************************/
1339 nsScannerIterator end
, current
;
1340 aScanner
.EndReading(end
);
1341 aScanner
.CurrentPosition(current
);
1342 nsScannerIterator beginData
= current
,
1343 beginLastMinus
= end
,
1344 bestAltCommentEnd
= end
,
1346 lt
.advance(-2); // <!
1348 // When we get here, we have always already consumed <!
1349 // Skip over possible leading minuses
1350 if (current
!= end
&& *current
== kMinus
) {
1351 beginLastMinus
= current
;
1354 if (current
!= end
&& *current
== kMinus
) { // <!--
1355 beginLastMinus
= current
;
1358 // Long form comment
1360 nsScannerIterator currentEnd
= end
, gt
= end
;
1362 // Find the end of the comment
1363 while (FindCharInReadable(kGreaterThan
, current
, currentEnd
)) {
1365 if (bestAltCommentEnd
== end
) {
1366 bestAltCommentEnd
= gt
;
1369 PRBool goodComment
= PR_FALSE
;
1370 if (current
!= beginLastMinus
&& *current
== kMinus
) { // ->
1372 if (current
!= beginLastMinus
&& *current
== kMinus
) { // -->
1373 goodComment
= PR_TRUE
;
1376 } else if (current
!= beginLastMinus
&& *current
== '!') {
1378 if (current
!= beginLastMinus
&& *current
== kMinus
) {
1380 if (current
!= beginLastMinus
&& *current
== kMinus
) { // --!>
1382 goodComment
= PR_TRUE
;
1385 } else if (current
== beginLastMinus
) {
1386 goodComment
= PR_TRUE
;
1391 aScanner
.BindSubstring(mComment
, beginData
, ++current
);
1392 aScanner
.BindSubstring(mCommentDecl
, lt
, ++gt
);
1393 aScanner
.SetPosition(gt
);
1396 // try again starting after the last '>'
1402 if (aScanner
.IsIncremental()) {
1403 // We got here because we saw the beginning of a comment,
1404 // but not yet the end, and we are still loading the page. In that
1405 // case the return value here will cause us to unwind,
1406 // wait for more content, and try again.
1407 // XXX For performance reasons we should cache where we were, and
1408 // continue from there for next call
1412 // If you're here, then we're in a special state.
1413 // The problem at hand is that we've hit the end of the document without
1414 // finding the normal endcomment delimiter "-->". In this case, the
1415 // first thing we try is to see if we found an alternate endcomment
1416 // delimiter ">". If so, rewind just past that, and use everything up
1417 // to that point as your comment. If not, the document has no end
1418 // comment and should be treated as one big comment.
1419 gt
= bestAltCommentEnd
;
1420 aScanner
.BindSubstring(mComment
, beginData
, gt
);
1424 aScanner
.BindSubstring(mCommentDecl
, lt
, gt
);
1425 aScanner
.SetPosition(gt
);
1430 // This could be short form of comment
1431 // Find the end of the comment
1432 current
= beginData
;
1433 if (FindCharInReadable(kGreaterThan
, current
, end
)) {
1434 nsScannerIterator gt
= current
;
1435 if (current
!= beginData
) {
1437 if (current
!= beginData
&& *current
== kMinus
) { // ->
1439 if (current
!= beginData
&& *current
== kMinus
) { // -->
1442 } else if (current
!= beginData
&& *current
== '!') { // !>
1444 if (current
!= beginData
&& *current
== kMinus
) { // -!>
1446 if (current
!= beginData
&& *current
== kMinus
) { // --!>
1453 if (current
!= gt
) {
1454 aScanner
.BindSubstring(mComment
, beginData
, ++current
);
1456 // Bind mComment to an empty string (note that if current == gt,
1457 // then current == beginData). We reach this for <!>
1458 aScanner
.BindSubstring(mComment
, beginData
, current
);
1460 aScanner
.BindSubstring(mCommentDecl
, lt
, ++gt
);
1461 aScanner
.SetPosition(gt
);
1465 if (!aScanner
.IsIncremental()) {
1466 // This isn't a comment at all, go back to the < and consume as text.
1467 aScanner
.SetPosition(lt
, PR_FALSE
, PR_TRUE
);
1468 return kNotAComment
;
1471 // Wait for more data...
1476 * Consume the identifier portion of the comment.
1477 * Note that we've already eaten the "<!" portion.
1479 * @param aChar -- last char consumed from stream
1480 * @param aScanner -- controller of underlying input source
1481 * @return error result
// Entry point for comment consumption: dispatches to ConsumeStrictComment
// when NS_IPARSER_FLAG_STRICT_MODE is set in aFlag and to
// ConsumeQuirksComment in the other visible assignment. On success
// mNewlineCount is set to the number of kNewLine characters in mCommentDecl.
// NOTE(review): result is an nsresult but is initialised from the boolean
// constant PR_TRUE; both visible assignments overwrite it before it is
// read, so NS_OK looks like the intended initialiser - confirm.
// NOTE(review): the return type, the else keyword and the return statement
// are not visible in this listing.
1484 CCommentToken::Consume(PRUnichar aChar
, nsScanner
& aScanner
, PRInt32 aFlag
)
1486 nsresult result
= PR_TRUE
;
1488 if (aFlag
& NS_IPARSER_FLAG_STRICT_MODE
) {
1489 // Enabling strict comment parsing for Bug 53011 and 2749 contradicts!
1490 result
= ConsumeStrictComment(aScanner
);
1492 result
= ConsumeQuirksComment(aScanner
);
1495 if (NS_SUCCEEDED(result
)) {
1496 mNewlineCount
= mCommentDecl
.CountChar(kNewLine
);
// Returns the comment text: mComment seen through AsString().
1503 CCommentToken::GetStringValue()
1505 return mComment
.AsString();
// Reports the token type of comment tokens: eToken_comment.
1509 CCommentToken::GetTokenType()
1511 return eToken_comment
;
// Default constructor: passes eHTMLTag_newline to the CHTMLToken base.
1514 CNewlineToken::CNewlineToken()
1515 : CHTMLToken(eHTMLTag_newline
)
// Reports the token type of newline tokens: eToken_newline.
1520 CNewlineToken::GetTokenType()
1522 return eToken_newline
;
// File-scope pointer to the newline substring shared by all newline tokens.
// It is assigned in AllocNewline, reset to nsnull in FreeNewline and
// dereferenced by CNewlineToken::GetStringValue.
1525 static nsScannerSubstring
* gNewlineStr
;
// Allocates the shared newline substring (a single line feed) and stores it
// in gNewlineStr.
1527 CNewlineToken::AllocNewline()
1529 gNewlineStr
= new nsScannerSubstring(NS_LITERAL_STRING("\n"));
// Resets gNewlineStr to nsnull.
// NOTE(review): the statements before the reset (presumably the deletion of
// the substring) are not visible in this listing - confirm.
1533 CNewlineToken::FreeNewline()
1537 gNewlineStr
= nsnull
;
1542 * This method retrieves the value of this internal string.
1544 * @return nsString reference to internal string value
// Returns the shared newline string held in gNewlineStr. The pointer is
// dereferenced without a null check, so AllocNewline must have run first.
1547 CNewlineToken::GetStringValue()
1549 return gNewlineStr
->AsString();
1553 * Consume one newline (cr/lf pair).
1555 * @param aChar -- last char consumed from stream
1556 * @param aScanner -- controller of underlying input source
1557 * @return error result
// Consumes a newline token. In the visible code the next character is
// peeked into theChar; when it is kNewLine it is consumed with GetChar, and
// a kEOF peek result on a non-incremental scanner is handled separately so
// that a trailing newline is not lost.
// NOTE(review): the declaration of theChar, the condition that leads to the
// peek and the return statement are not visible in this listing.
1560 CNewlineToken::Consume(PRUnichar aChar
, nsScanner
& aScanner
, PRInt32 aFlag
)
1563 * Here's what the HTML spec says about newlines:
1565 * "A line break is defined to be a carriage return (
),
1566 * a line feed (
), or a carriage return/line feed pair.
1567 * All line breaks constitute white space."
1570 nsresult rv
= NS_OK
;
1573 rv
= aScanner
.Peek(theChar
);
1574 if (theChar
== kNewLine
) {
1575 rv
= aScanner
.GetChar(theChar
);
1576 } else if (rv
== kEOF
&& !aScanner
.IsIncremental()) {
1577 // Make sure we don't lose information about this trailing newline.
// Default constructor: passes eHTMLTag_unknown to the CHTMLToken base and
// clears mHasEqualWithoutValue.
1586 CAttributeToken::CAttributeToken()
1587 : CHTMLToken(eHTMLTag_unknown
)
1589 mHasEqualWithoutValue
= PR_FALSE
;
1593 * String based constructor
// Constructs an attribute token whose value text (mTextValue) is a copy of
// aName; the key is left untouched and mHasEqualWithoutValue is cleared.
1595 CAttributeToken::CAttributeToken(const nsAString
& aName
)
1596 : CHTMLToken(eHTMLTag_unknown
)
1598 mTextValue
.writable().Assign(aName
);
1599 mHasEqualWithoutValue
= PR_FALSE
;
1603 * construct initializing data to key value pair
// Constructs an attribute token from a key/value pair: aName is copied into
// mTextValue, mTextKey is rebound to aKey, and mHasEqualWithoutValue is
// cleared.
1605 CAttributeToken::CAttributeToken(const nsAString
& aKey
, const nsAString
& aName
)
1606 : CHTMLToken(eHTMLTag_unknown
)
1608 mTextValue
.writable().Assign(aName
);
1609 mTextKey
.Rebind(aKey
);
1610 mHasEqualWithoutValue
= PR_FALSE
;
// Reports the token type of attribute tokens: eToken_attribute.
1614 CAttributeToken::GetTokenType()
1616 return eToken_attribute
;
// Returns the attribute value text (mTextValue.str()), not the key.
1620 CAttributeToken::GetStringValue()
1622 return mTextValue
.str();
// Replaces the contents of anOutputString with the source form of this
// attribute: the string is truncated and then filled by AppendSourceTo.
1626 CAttributeToken::GetSource(nsString
& anOutputString
)
1628 anOutputString
.Truncate();
1629 AppendSourceTo(anOutputString
);
// Appends the source form of this attribute to anOutputString: the key,
// then an equal sign when there is a non-empty value or when an equal sign
// was seen without a value (mHasEqualWithoutValue), then the value text.
1633 CAttributeToken::AppendSourceTo(nsAString
& anOutputString
)
1635 AppendUnicodeTo(mTextKey
, anOutputString
);
1636 if (mTextValue
.str().Length() || mHasEqualWithoutValue
) {
1637 anOutputString
.AppendLiteral("=");
1639 anOutputString
.Append(mTextValue
.str());
1640 // anOutputString.AppendLiteral(";");
1644 * This general purpose method is used when you want to
1645 * consume a known quoted string.
// Consumes a quoted attribute value whose delimiter aChar is asserted to be
// kQuote or kApostrophe. The terminator set (matching delimiter, ampersand,
// CR, newline) is chosen by aChar and handed to ConsumeUntil; on success the
// closing delimiter is fetched with GetChar.
// Recovery path: when aString is non-empty, does not end in aChar, the
// scanner is not incremental and the result is kEOF, aString is truncated
// back to origLen, the scanner is rewound to theOffset and the value is read
// again with kAttributeTerminalChars as terminators. In view-source mode a
// successful re-read is reported as
// NS_ERROR_HTMLPARSER_UNTERMINATEDSTRINGLITERAL.
// NOTE(review): the return type, the last parameter (aFlag, judging by its
// uses) and the return statement are not visible in this listing.
1648 ConsumeQuotedString(PRUnichar aChar
,
1649 nsScannerSharedSubstring
& aString
,
1650 PRInt32
& aNewlineCount
,
1651 nsScanner
& aScanner
,
1654 NS_ASSERTION(aChar
== kQuote
|| aChar
== kApostrophe
,
1655 "char is neither quote nor apostrophe");
1656 // Hold onto this in case this is an unterminated string literal
1657 PRUint32 origLen
= aString
.str().Length();
1659 static const PRUnichar theTerminalCharsQuote
[] = {
1660 PRUnichar(kQuote
), PRUnichar('&'), PRUnichar(kCR
),
1661 PRUnichar(kNewLine
), PRUnichar(0) };
1662 static const PRUnichar theTerminalCharsApostrophe
[] = {
1663 PRUnichar(kApostrophe
), PRUnichar('&'), PRUnichar(kCR
),
1664 PRUnichar(kNewLine
), PRUnichar(0) };
1665 static const nsReadEndCondition
1666 theTerminateConditionQuote(theTerminalCharsQuote
);
1667 static const nsReadEndCondition
1668 theTerminateConditionApostrophe(theTerminalCharsApostrophe
);
1670 // Assume Quote to init to something
1671 const nsReadEndCondition
*terminateCondition
= &theTerminateConditionQuote
;
1672 if (aChar
== kApostrophe
) {
1673 terminateCondition
= &theTerminateConditionApostrophe
;
1676 nsresult result
= NS_OK
;
1677 nsScannerIterator theOffset
;
1678 aScanner
.CurrentPosition(theOffset
);
1680 result
= ConsumeUntil(aString
, aNewlineCount
, aScanner
,
1681 *terminateCondition
, PR_TRUE
, PR_TRUE
, aFlag
);
1683 if (NS_SUCCEEDED(result
)) {
1684 result
= aScanner
.GetChar(aChar
); // aChar should be " or '
1688 // A back up measure when disaster strikes...
1689 // Ex <table> <tr d="><td>hello</td></tr></table>
1690 if (!aString
.str().IsEmpty() && aString
.str().Last() != aChar
&&
1691 !aScanner
.IsIncremental() && result
== kEOF
) {
1692 static const nsReadEndCondition
1693 theAttributeTerminator(kAttributeTerminalChars
);
1694 aString
.writable().Truncate(origLen
);
1695 aScanner
.SetPosition(theOffset
, PR_FALSE
, PR_TRUE
);
1696 result
= ConsumeUntil(aString
, aNewlineCount
, aScanner
,
1697 theAttributeTerminator
, PR_FALSE
, PR_TRUE
, aFlag
);
1698 if (NS_SUCCEEDED(result
) && (aFlag
& NS_IPARSER_FLAG_VIEW_SOURCE
)) {
1699 // Remember that this string literal was unterminated.
1700 result
= NS_ERROR_HTMLPARSER_UNTERMINATEDSTRINGLITERAL
;
1707 * This method is meant to be used by view-source to consume invalid attributes.
1708 * For the purposes of this method, an invalid attribute is an attribute that
1709 * starts with either ', ", or /. We consume all ', ", or / and the following
1712 * @param aScanner -- the scanner we're reading our data from.
1713 * @param aChar -- the character we're skipping
1714 * @param aCurrent -- the current position that we're looking at.
1715 * @param aNewlineCount -- a count of the newlines we've consumed.
1716 * @return error result.
// View-source helper: while aCurrent has not reached the end of the scanner
// data and still points at aChar, the loop runs (its body is not visible
// here); the scanner is then positioned at aCurrent and the following
// whitespace is read with ReadWhitespace, whose result is returned and which
// receives aCurrent and aNewlineCount as out arguments.
// NOTE(review): the assertion accepts kForwardSlash as well, but its message
// only mentions quote and apostrophe - the message looks out of date.
// NOTE(review): the return type, the aChar parameter declaration and the
// loop body are not visible in this listing.
1719 ConsumeInvalidAttribute(nsScanner
& aScanner
,
1721 nsScannerIterator
& aCurrent
,
1722 PRInt32
& aNewlineCount
)
1724 NS_ASSERTION(aChar
== kApostrophe
|| aChar
== kQuote
|| aChar
== kForwardSlash
,
1725 "aChar must be a quote or apostrophe");
1726 nsScannerIterator end
, wsbeg
;
1727 aScanner
.EndReading(end
);
1729 while (aCurrent
!= end
&& *aCurrent
== aChar
) {
1733 aScanner
.SetPosition(aCurrent
);
1734 return aScanner
.ReadWhitespace(wsbeg
, aCurrent
, aNewlineCount
);
1738 * Consume the key and value portions of the attribute.
// Consumes one attribute: leading whitespace, the key, an optional equal
// sign and an optional value.
// - In view-source mode (NS_IPARSER_FLAG_VIEW_SOURCE) whitespace is read
//   rather than skipped so that mTextKey and mTextValue reproduce the source
//   text; otherwise SkipWhitespace is used. Both receive mNewlineCount.
// - The key is read up to one of theTerminalsChars.
// - After an equal sign: a quote or apostrophe starts a quoted value
//   (ConsumeQuotedString); a greater-than sign sets mHasEqualWithoutValue;
//   other text is read with ConsumeUntil and kAttributeTerminalChars.
// - Without an equal sign: a stray quote, apostrophe or forward slash
//   (the slash only when NS_IPARSER_FLAG_XML is not set) is stripped, or in
//   view-source mode collected through ConsumeInvalidAttribute.
// Visible results: NS_ERROR_HTMLPARSER_BADATTRIBUTE when key, value and
// newline count are all empty and no bare equal sign was seen, or when kEOF
// is reached on a non-incremental scanner with an empty key.
// NOTE(review): many lines (the declaration of result, else branches,
// trailing call arguments, closing braces, the final return) are absent from
// this listing.
1741 CAttributeToken::Consume(PRUnichar aChar
, nsScanner
& aScanner
, PRInt32 aFlag
)
1744 nsScannerIterator wsstart
, wsend
;
1746 if (aFlag
& NS_IPARSER_FLAG_VIEW_SOURCE
) {
1747 result
= aScanner
.ReadWhitespace(wsstart
, wsend
, mNewlineCount
);
1748 if (kEOF
== result
&& wsstart
!= wsend
) {
1749 // Do this here so if this is the final token in the document, we don't
1750 // lose the whitespace.
1751 aScanner
.BindSubstring(mTextKey
, wsstart
, wsend
);
1754 result
= aScanner
.SkipWhitespace(mNewlineCount
);
1757 if (NS_OK
== result
) {
1758 static const PRUnichar theTerminalsChars
[] =
1759 { PRUnichar(' '), PRUnichar('"'),
1760 PRUnichar('='), PRUnichar('\n'),
1761 PRUnichar('\r'), PRUnichar('\t'),
1762 PRUnichar('>'), PRUnichar('<'),
1763 PRUnichar('\''), PRUnichar('/'),
1765 static const nsReadEndCondition
theEndCondition(theTerminalsChars
);
1767 nsScannerIterator start
, end
;
1768 result
= aScanner
.ReadUntil(start
, end
, theEndCondition
, PR_FALSE
);
1770 if (!(aFlag
& NS_IPARSER_FLAG_VIEW_SOURCE
)) {
1771 aScanner
.BindSubstring(mTextKey
, start
, end
);
1772 } else if (kEOF
== result
&& wsstart
!= end
) {
1773 // Capture all of the text (from the beginning of the whitespace to the
1774 // end of the document).
1775 aScanner
.BindSubstring(mTextKey
, wsstart
, end
);
1778 // Now it's time to Consume the (optional) value...
1779 if (NS_OK
== result
) {
1780 if (aFlag
& NS_IPARSER_FLAG_VIEW_SOURCE
) {
1781 result
= aScanner
.ReadWhitespace(start
, wsend
, mNewlineCount
);
1782 aScanner
.BindSubstring(mTextKey
, wsstart
, wsend
);
1784 result
= aScanner
.SkipWhitespace(mNewlineCount
);
1787 if (NS_OK
== result
) {
1788 // Skip ahead until you find an equal sign or a '>'...
1789 result
= aScanner
.Peek(aChar
);
1790 if (NS_OK
== result
) {
1791 if (kEqual
== aChar
) {
1792 result
= aScanner
.GetChar(aChar
); // Skip the equal sign...
1793 if (NS_OK
== result
) {
1794 if (aFlag
& NS_IPARSER_FLAG_VIEW_SOURCE
) {
1796 result
= aScanner
.ReadWhitespace(mTextValue
, mNewlineCount
,
1799 result
= aScanner
.SkipWhitespace(mNewlineCount
);
1802 if (NS_OK
== result
) {
1803 result
= aScanner
.Peek(aChar
); // And grab the next char.
1804 if (NS_OK
== result
) {
1805 if (kQuote
== aChar
|| kApostrophe
== aChar
) {
1806 aScanner
.GetChar(aChar
);
1807 if (aFlag
& NS_IPARSER_FLAG_VIEW_SOURCE
) {
1808 mTextValue
.writable().Append(aChar
);
1811 result
= ConsumeQuotedString(aChar
, mTextValue
,
1812 mNewlineCount
, aScanner
,
1814 if (NS_SUCCEEDED(result
) &&
1815 (aFlag
& NS_IPARSER_FLAG_VIEW_SOURCE
)) {
1816 mTextValue
.writable().Append(aChar
);
1817 } else if (result
==
1818 NS_ERROR_HTMLPARSER_UNTERMINATEDSTRINGLITERAL
) {
1822 // According to spec. we ( who? ) should ignore linefeeds.
1823 // But look, even the carriage return was getting stripped
1824 // ( wonder why! ) - Ref. to bug 15204. Okay, so the
1825 // spec. told us to ignore linefeeds, but then what about
1826 // bug 47535 ? Should we preserve everything then? Well,
1827 // let's make it so!
1828 } else if (kGreaterThan
== aChar
) {
1829 mHasEqualWithoutValue
= PR_TRUE
;
1832 static const nsReadEndCondition
1833 theAttributeTerminator(kAttributeTerminalChars
);
1835 ConsumeUntil(mTextValue
,
1838 theAttributeTerminator
,
1844 if (NS_OK
== result
) {
1845 if (aFlag
& NS_IPARSER_FLAG_VIEW_SOURCE
) {
1847 result
= aScanner
.ReadWhitespace(mTextValue
, mNewlineCount
,
1850 result
= aScanner
.SkipWhitespace(mNewlineCount
);
1854 // We saw an equal sign but ran out of room looking for a value.
1855 mHasEqualWithoutValue
= PR_TRUE
;
1860 // This is where we have to handle fairly busted content.
1861 // If you're here, it means we saw an attribute name, but couldn't
1862 // find the following equal sign. <tag NAME....
1864 // Doing this right in all cases is <i>REALLY</i> ugly.
1865 // My best guess is to grab the next non-ws char. We know it's not
1866 // '=', so let's see what it is. If it's a '"', then assume we're
1867 // reading from the middle of the value. Try stripping the quote
1868 // and continuing... Note that this code also strips forward
1869 // slashes to handle cases like <tag NAME/>
1870 if (kQuote
== aChar
|| kApostrophe
== aChar
||
1871 kForwardSlash
== aChar
) {
1872 // In XML, a trailing slash isn't an error.
1873 if (kForwardSlash
!= aChar
|| !(aFlag
& NS_IPARSER_FLAG_XML
)) {
1877 if (!(aFlag
& NS_IPARSER_FLAG_VIEW_SOURCE
)) {
1878 result
= aScanner
.SkipOver(aChar
); // Strip quote or slash.
1879 if (NS_SUCCEEDED(result
)) {
1880 result
= aScanner
.SkipWhitespace(mNewlineCount
);
1883 // We want to collect whitespace here so that following
1884 // attributes can have the right line number (and for
1885 // parity with the non-view-source code above).
1886 result
= ConsumeInvalidAttribute(aScanner
, aChar
,
1887 wsend
, mNewlineCount
);
1889 aScanner
.BindSubstring(mTextKey
, wsstart
, wsend
);
1890 aScanner
.SetPosition(wsend
);
1898 if (NS_OK
== result
) {
1899 if (mTextValue
.str().Length() == 0 && mTextKey
.Length() == 0 &&
1900 mNewlineCount
== 0 && !mHasEqualWithoutValue
) {
1901 // This attribute contains no useful information for us, so there is no
1902 // use in keeping it around. Attributes that are otherwise empty, but
1903 // have newlines in them are passed on to the DTD so it can get line
1905 return NS_ERROR_HTMLPARSER_BADATTRIBUTE
;
1910 if (kEOF
== result
&& !aScanner
.IsIncremental()) {
1911 // This is our run-of-the mill "don't lose content at the end of a
1912 // document" with a slight twist: we don't want to bother returning an
1913 // empty attribute key, even if this is the end of the document.
1914 if (mTextKey
.Length() == 0) {
1915 result
= NS_ERROR_HTMLPARSER_BADATTRIBUTE
;
// Rebinds the attribute key (mTextKey) to aKey.
1925 CAttributeToken::SetKey(const nsAString
& aKey
)
1927 mTextKey
.Rebind(aKey
);
// Binds the attribute key (mTextKey) to the scanner range [aStart, aEnd) by
// delegating to aScanner->BindSubstring. aScanner is dereferenced without a
// null check.
1931 CAttributeToken::BindKey(nsScanner
* aScanner
,
1932 nsScannerIterator
& aStart
,
1933 nsScannerIterator
& aEnd
)
1935 aScanner
->BindSubstring(mTextKey
, aStart
, aEnd
);
// Default constructor: passes eHTMLTag_whitespace to the CHTMLToken base.
1938 CWhitespaceToken::CWhitespaceToken()
1939 : CHTMLToken(eHTMLTag_whitespace
)
// String constructor: passes eHTMLTag_whitespace to the CHTMLToken base and
// copies aName into mTextValue.
1943 CWhitespaceToken::CWhitespaceToken(const nsAString
& aName
)
1944 : CHTMLToken(eHTMLTag_whitespace
)
1946 mTextValue
.writable().Assign(aName
);
// Reports the token type of whitespace tokens: eToken_whitespace.
1949 PRInt32
CWhitespaceToken::GetTokenType()
1951 return eToken_whitespace
;
1955 * This general purpose method is used when you want to
1956 * consume an arbitrary sequence of whitespace.
1958 * @param aChar -- last char consumed from stream
1959 * @param aScanner -- controller of underlying input source
1960 * @return error result
// Consumes a run of whitespace into mTextValue. The scanner is first moved
// back one character so that the run starts at the character already taken
// from the stream, then ReadWhitespace fills mTextValue, mNewlineCount and
// haveCR. kEOF on a non-incremental scanner is handled so that trailing
// whitespace is not lost, and when the read succeeded and haveCR is set,
// carriage returns are stripped from the stored text.
// NOTE(review): the return type, the declaration of haveCR, the body of the
// kEOF branch and the return statement are not visible in this listing.
1963 CWhitespaceToken::Consume(PRUnichar aChar
, nsScanner
& aScanner
, PRInt32 aFlag
)
1965 // If possible, we'd like to just be a dependent substring starting at
1966 // |aChar|. The scanner has already been advanced, so we need to
1967 // back it up to facilitate this.
1969 nsScannerIterator start
;
1970 aScanner
.CurrentPosition(start
);
1971 aScanner
.SetPosition(--start
, PR_FALSE
, PR_TRUE
);
1975 nsresult result
= aScanner
.ReadWhitespace(mTextValue
, mNewlineCount
, haveCR
);
1977 if (result
== kEOF
&& !aScanner
.IsIncremental()) {
1978 // Oops, we ran off the end, make sure we don't lose the trailing
1983 if (NS_OK
== result
&& haveCR
) {
1984 mTextValue
.writable().StripChar(kCR
);
// Returns the whitespace text held in mTextValue (via str()).
1990 CWhitespaceToken::GetStringValue()
1992 return mTextValue
.str();
// Default constructor: passes eHTMLTag_entity to the CHTMLToken base.
1995 CEntityToken::CEntityToken()
1996 : CHTMLToken(eHTMLTag_entity
)
// String constructor: passes eHTMLTag_entity to the CHTMLToken base and
// copies aName into mTextValue.
2000 CEntityToken::CEntityToken(const nsAString
& aName
)
2001 : CHTMLToken(eHTMLTag_entity
)
2003 mTextValue
.Assign(aName
);
2008 * Consume the rest of the entity. We've already eaten the "&".
2010 * @param aChar -- last char consumed from stream
2011 * @param aScanner -- controller of underlying input source
2012 * @return error result
// Consumes the entity text into mTextValue by delegating to ConsumeEntity.
// aFlag is not used in the visible code.
// NOTE(review): the return type and the return statement are not visible in
// this listing.
2015 CEntityToken::Consume(PRUnichar aChar
, nsScanner
& aScanner
, PRInt32 aFlag
)
2017 nsresult result
= ConsumeEntity(aChar
, mTextValue
, aScanner
);
// Reports the token type of entity tokens: eToken_entity.
2022 CEntityToken::GetTokenType()
2024 return eToken_entity
;
2028 * This general purpose method is used when you want to
2029 * consume an entity &xxxx;. Keep in mind that entities
2030 * are <i>not</i> reduced inline.
2032 * @param aChar -- last char consumed from stream
2033 * @param aScanner -- controller of underlying input source
2034 * @return error result
// Consumes the text of an entity into aString. aChar is the character that
// follows the ampersand; the ampersand itself is still unread and each
// branch consumes it with GetChar.
// - kLeftBrace: a script entity; characters are appended until the left and
//   right brace counts match.
// - kHashsign: a numeric reference; the character two positions ahead
//   selects decimal (ReadNumber with base 10) or, for x / X, hexadecimal
//   (base 16). Anything else yields NS_HTMLTOKENS_NOT_AN_ENTITY, as does
//   kEOF on a non-incremental scanner.
// - otherwise: an ASCII letter (plus conditions not visible here) starts a
//   named entity read with ReadEntityIdentifier; other input is not an
//   entity.
// A semicolon that stopped the scan is appended to aString and consumed.
// NOTE(review): the return type, the second parameter (aString, judging by
// its uses), the do-loop header, the brace-count updates and most return
// statements are not visible in this listing.
2037 CEntityToken::ConsumeEntity(PRUnichar aChar
,
2039 nsScanner
& aScanner
)
2041 nsresult result
= NS_OK
;
2042 if (kLeftBrace
== aChar
) {
2043 // You're consuming a script entity...
2044 aScanner
.GetChar(aChar
); // Consume &
2046 PRInt32 rightBraceCount
= 0;
2047 PRInt32 leftBraceCount
= 0;
2050 result
= aScanner
.GetChar(aChar
);
2052 if (NS_FAILED(result
)) {
2056 aString
.Append(aChar
);
2057 if (aChar
== kRightBrace
) {
2059 } else if (aChar
== kLeftBrace
) {
2062 } while (leftBraceCount
!= rightBraceCount
);
2064 PRUnichar theChar
= 0;
2065 if (kHashsign
== aChar
) {
2066 result
= aScanner
.Peek(theChar
, 2);
2068 if (NS_FAILED(result
)) {
2069 if (kEOF
== result
&& !aScanner
.IsIncremental()) {
2070 // If this is the last buffer then we are certainly
2071 // not dealing with an entity. That's, there are
2072 // no more characters after &#. Bug 188278.
2073 return NS_HTMLTOKENS_NOT_AN_ENTITY
;
2078 if (nsCRT::IsAsciiDigit(theChar
)) {
2079 aScanner
.GetChar(aChar
); // Consume &
2080 aScanner
.GetChar(aChar
); // Consume #
2081 aString
.Assign(aChar
);
2082 result
= aScanner
.ReadNumber(aString
, 10);
2083 } else if (theChar
== 'x' || theChar
== 'X') {
2084 aScanner
.GetChar(aChar
); // Consume &
2085 aScanner
.GetChar(aChar
); // Consume #
2086 aScanner
.GetChar(theChar
); // Consume x
2087 aString
.Assign(aChar
);
2088 aString
.Append(theChar
);
2089 result
= aScanner
.ReadNumber(aString
, 16);
2091 return NS_HTMLTOKENS_NOT_AN_ENTITY
;
2094 result
= aScanner
.Peek(theChar
, 1);
2096 if (NS_FAILED(result
)) {
2100 if (nsCRT::IsAsciiAlpha(theChar
) ||
2103 aScanner
.GetChar(aChar
); // Consume &
2104 result
= aScanner
.ReadEntityIdentifier(aString
);
2106 return NS_HTMLTOKENS_NOT_AN_ENTITY
;
2111 if (NS_FAILED(result
)) {
2115 result
= aScanner
.Peek(aChar
);
2117 if (NS_FAILED(result
)) {
2121 if (aChar
== kSemicolon
) {
2122 // Consume semicolon that stopped the scan
2123 aString
.Append(aChar
);
2124 result
= aScanner
.GetChar(aChar
);
2131 * Map some illegal but commonly used numeric entities into their
2132 * appropriate unicode value.
// Replacement table used by AppendNCR for numeric character references in
// the range 0x0080..0x009f; the table is indexed with (value - 0x0080).
// NOT_USED is defined as 0xfffd.
// NOTE(review): that range needs 32 entries but only 27 are visible here;
// the rows missing from this listing (original lines 2138, 2150, 2152, 2153
// and 2166) are presumably NOT_USED placeholders - confirm against the full
// source.
2134 #define NOT_USED 0xfffd
2136 static const PRUint16 PA_HackTable
[] = {
2137 0x20ac, /* EURO SIGN */
2139 0x201a, /* SINGLE LOW-9 QUOTATION MARK */
2140 0x0192, /* LATIN SMALL LETTER F WITH HOOK */
2141 0x201e, /* DOUBLE LOW-9 QUOTATION MARK */
2142 0x2026, /* HORIZONTAL ELLIPSIS */
2143 0x2020, /* DAGGER */
2144 0x2021, /* DOUBLE DAGGER */
2145 0x02c6, /* MODIFIER LETTER CIRCUMFLEX ACCENT */
2146 0x2030, /* PER MILLE SIGN */
2147 0x0160, /* LATIN CAPITAL LETTER S WITH CARON */
2148 0x2039, /* SINGLE LEFT-POINTING ANGLE QUOTATION MARK */
2149 0x0152, /* LATIN CAPITAL LIGATURE OE */
2151 0x017D, /* LATIN CAPITAL LETTER Z WITH CARON */
2154 0x2018, /* LEFT SINGLE QUOTATION MARK */
2155 0x2019, /* RIGHT SINGLE QUOTATION MARK */
2156 0x201c, /* LEFT DOUBLE QUOTATION MARK */
2157 0x201d, /* RIGHT DOUBLE QUOTATION MARK */
2158 0x2022, /* BULLET */
2159 0x2013, /* EN DASH */
2160 0x2014, /* EM DASH */
2161 0x02dc, /* SMALL TILDE */
2162 0x2122, /* TRADE MARK SIGN */
2163 0x0161, /* LATIN SMALL LETTER S WITH CARON */
2164 0x203a, /* SINGLE RIGHT-POINTING ANGLE QUOTATION MARK */
2165 0x0153, /* LATIN SMALL LIGATURE OE */
2167 0x017E, /* LATIN SMALL LETTER Z WITH CARON */
2168 0x0178 /* LATIN CAPITAL LETTER Y WITH DIAERESIS */
// Appends the character for a numeric character reference to aString.
// Values in 0x0080..0x009f are first remapped through PA_HackTable; the
// result is passed through ENSURE_VALID_CHAR and appended as UTF-16 with
// AppendUCS4ToUTF16.
2172 AppendNCR(nsSubstring
& aString
, PRInt32 aNCRValue
)
2174 /* For some illegal, but popular usage */
2175 if (aNCRValue
>= 0x0080 && aNCRValue
<= 0x009f) {
2176 aNCRValue
= PA_HackTable
[aNCRValue
- 0x0080];
2179 AppendUCS4ToUTF16(ENSURE_VALID_CHAR(aNCRValue
), aString
);
2183 * This method converts this entity into its underlying
2184 * unicode equivalent.
2186 * @param aString will hold the resulting string value
2187 * @return numeric (unichar) value
// Translates this entity into aString. Only text longer than one character
// is examined. Text starting with kHashsign is parsed as a number
// (ToInteger with kAutoDetect) and appended through AppendNCR; other text
// is looked up with nsHTMLEntities::EntityToUnicode, and a found named
// entity is assigned to aString as a single PRUnichar.
// NOTE(review): the return type, the declarations of value and err, the
// success checks around both conversions and the return statement are not
// visible in this listing.
2190 CEntityToken::TranslateToUnicodeStr(nsString
& aString
)
2194 if (mTextValue
.Length() > 1) {
2195 PRUnichar theChar0
= mTextValue
.CharAt(0);
2197 if (kHashsign
== theChar0
) {
2200 value
= mTextValue
.ToInteger(&err
, kAutoDetect
);
2203 AppendNCR(aString
, value
);
2206 value
= nsHTMLEntities::EntityToUnicode(mTextValue
);
2208 // We found a named entity...
2209 aString
.Assign(PRUnichar(value
));
// Accessor returning an nsSubstring reference.
// NOTE(review): the body is not visible in this listing; presumably it
// returns mTextValue - confirm against the full source.
2219 nsSubstring
& CEntityToken::GetStringValue()
// Appends the source form of this entity to anOutputString: an ampersand
// followed by mTextValue. The visible code appends without truncating the
// output string first.
2225 CEntityToken::GetSource(nsString
& anOutputString
)
2227 anOutputString
.AppendLiteral("&");
2228 anOutputString
+= mTextValue
;
2229 // Any possible ; is part of our text value.
// Appends the source form of this entity to anOutputString: an ampersand
// followed by mTextValue.
2233 CEntityToken::AppendSourceTo(nsAString
& anOutputString
)
2235 anOutputString
.AppendLiteral("&");
2236 anOutputString
+= mTextValue
;
2237 // Any possible ; is part of our text value.
// Maps a tag id to its name: the id is looked up with
// nsHTMLTags::GetStringValue, and ids at or above eHTMLTag_userdefined
// yield sUserdefined.
// NOTE(review): the return type, the handling of a successful lookup and the
// final return are not visible in this listing.
2241 GetTagName(PRInt32 aTag
)
2243 const PRUnichar
*result
= nsHTMLTags::GetStringValue((nsHTMLTag
) aTag
);
2249 if (aTag
>= eHTMLTag_userdefined
) {
2250 return sUserdefined
;
// Default constructor: passes eHTMLTag_instruction to the CHTMLToken base.
2257 CInstructionToken::CInstructionToken()
2258 : CHTMLToken(eHTMLTag_instruction
)
// String constructor: copies aString into mTextValue.
// NOTE(review): this constructor passes eHTMLTag_unknown to the base class
// while the default constructor passes eHTMLTag_instruction - confirm
// whether the difference is intentional.
2262 CInstructionToken::CInstructionToken(const nsAString
& aString
)
2263 : CHTMLToken(eHTMLTag_unknown
)
2265 mTextValue
.Assign(aString
);
// Consumes a processing instruction. mTextValue starts as the two-character
// opener, then text is read up to (not including) each greater-than sign.
// The instruction ends at that sign unless NS_IPARSER_FLAG_XML is set and
// the text read so far does not end in a question mark; the sign itself is
// consumed and appended in every case. kEOF on a non-incremental scanner is
// hidden because no more text is coming.
// NOTE(review): the return type, the statement that ends the loop and the
// return statement are not visible in this listing.
2269 CInstructionToken::Consume(PRUnichar aChar
, nsScanner
& aScanner
, PRInt32 aFlag
)
2271 mTextValue
.AssignLiteral("<?");
2272 nsresult result
= NS_OK
;
2273 PRBool done
= PR_FALSE
;
2275 while (NS_OK
== result
&& !done
) {
2276 // Note, this call does *not* consume the >.
2277 result
= aScanner
.ReadUntil(mTextValue
, kGreaterThan
, PR_FALSE
);
2278 if (NS_SUCCEEDED(result
)) {
2279 // In HTML, PIs end with a '>', in XML, they end with a '?>'. Cover both
2281 if (!(aFlag
& NS_IPARSER_FLAG_XML
) ||
2282 kQuestionMark
== mTextValue
.Last()) {
2283 // This really is the end of the PI.
2286 // Need to append this character no matter what.
2287 aScanner
.GetChar(aChar
);
2288 mTextValue
.Append(aChar
);
2292 if (kEOF
== result
&& !aScanner
.IsIncremental()) {
2293 // Hide the EOF result because there is no more text coming.
// Reports the token type of instruction tokens: eToken_instruction.
2302 CInstructionToken::GetTokenType()
2304 return eToken_instruction
;
// Accessor for the string value of the instruction token.
// NOTE(review): the return type and body are not visible in this listing;
// presumably it returns mTextValue - confirm against the full source.
2308 CInstructionToken::GetStringValue()
2313 // Doctype decl token
// Constructs a doctype declaration token for the given tag.
// NOTE(review): the initialiser list and body are not visible in this
// listing; presumably aTag is forwarded to CHTMLToken as in the string
// constructor - confirm.
2315 CDoctypeDeclToken::CDoctypeDeclToken(eHTMLTags aTag
)
// Constructs a doctype declaration token: aTag is forwarded to the
// CHTMLToken base and mTextValue is initialised from aString.
2320 CDoctypeDeclToken::CDoctypeDeclToken(const nsAString
& aString
, eHTMLTags aTag
)
2321 : CHTMLToken(aTag
), mTextValue(aString
)
2326 * This method consumes a doctype element.
2327 * Note: I'm rewriting this method to seek to the first <, since quotes can
2328 * really screw us up.
2329 * XXX Maybe this should do better in XML or strict mode?
// Consumes a doctype declaration by reading up to the first greater-than or
// less-than sign (terminalChars). A greater-than sign is consumed and so
// included in the token; a less-than sign is left alone because it could
// belong to another tag. When neither is found and the scanner is not
// incremental, whatever has been read is used. On success start is moved
// back two characters to cover the <! prefix and the range [start, end) is
// copied into mTextValue.
// NOTE(review): the return type, the declaration and peek of ch, the bodies
// of several branches and the return statement are not visible in this
// listing.
2332 CDoctypeDeclToken::Consume(PRUnichar aChar
, nsScanner
& aScanner
, PRInt32 aFlag
)
2334 static const PRUnichar terminalChars
[] =
2335 { PRUnichar('>'), PRUnichar('<'),
2338 static const nsReadEndCondition
theEndCondition(terminalChars
);
2340 nsScannerIterator start
, end
;
2342 aScanner
.CurrentPosition(start
);
2343 aScanner
.EndReading(end
);
2345 nsresult result
= aScanner
.ReadUntil(start
, end
, theEndCondition
, PR_FALSE
);
2347 if (NS_SUCCEEDED(result
)) {
2350 if (ch
== kGreaterThan
) {
2351 // Include '>' but not '<' since '<'
2352 // could belong to another tag.
2353 aScanner
.GetChar(ch
);
2356 NS_ASSERTION(kLessThan
== ch
,
2357 "Make sure this doctype decl. is really in error.");
2360 } else if (!aScanner
.IsIncremental()) {
2361 // We have reached the document end but haven't
2362 // found either a '<' or a '>'. Therefore use
2363 // whatever we have.
2368 if (NS_SUCCEEDED(result
)) {
2369 start
.advance(-2); // Make sure to consume <!
2370 CopyUnicodeTo(start
, end
, mTextValue
);
// Reports the token type of doctype declaration tokens: eToken_doctypeDecl.
2377 CDoctypeDeclToken::GetTokenType()
2379 return eToken_doctypeDecl
;
// Accessor for the string value of the doctype declaration token.
// NOTE(review): the return type and body are not visible in this listing;
// presumably it returns mTextValue - confirm against the full source.
2383 CDoctypeDeclToken::GetStringValue()
// Replaces the text of the doctype declaration: aStr is copied into
// mTextValue.
2389 CDoctypeDeclToken::SetStringValue(const nsAString
& aStr
)
2391 mTextValue
.Assign(aStr
);