parser/htmlparser/src/nsHTMLTokens.cpp

   1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
   2 /* vim: set ts=2 sw=2 et tw=78: */
   3 /* ***** BEGIN LICENSE BLOCK *****
   4  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
   5  *
   6  * The contents of this file are subject to the Mozilla Public License Version
   7  * 1.1 (the "License"); you may not use this file except in compliance with
   8  * the License. You may obtain a copy of the License at
   9  * http://www.mozilla.org/MPL/
  10  *
  11  * Software distributed under the License is distributed on an "AS IS" basis,
  12  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
  13  * for the specific language governing rights and limitations under the
  14  * License.
  15  *
  16  * The Original Code is mozilla.org code.
  17  *
  18  * The Initial Developer of the Original Code is
  19  * Netscape Communications Corporation.
  20  * Portions created by the Initial Developer are Copyright (C) 1998
  21  * the Initial Developer. All Rights Reserved.
  22  *
  23  * Contributor(s):
  24  *   Blake Kaplan <mrbkap@gmail.com>
  25  *
  26  * Alternatively, the contents of this file may be used under the terms of
  27  * either of the GNU General Public License Version 2 or later (the "GPL"),
  28  * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
  29  * in which case the provisions of the GPL or the LGPL are applicable instead
  30  * of those above. If you wish to allow use of your version of this file only
  31  * under the terms of either the GPL or the LGPL, and not to allow others to
  32  * use your version of this file under the terms of the MPL, indicate your
  33  * decision by deleting the provisions above and replace them with the notice
  34  * and other provisions required by the GPL or the LGPL. If you do not delete
  35  * the provisions above, a recipient may use your version of this file under
  36  * the terms of any one of the MPL, the GPL or the LGPL.
  37  *
  38  * ***** END LICENSE BLOCK ***** */
  39
  40 #include <ctype.h>
  41 #include <time.h>
  42 #include <stdio.h>
  43 #include "nsScanner.h"
  44 #include "nsToken.h"
  45 #include "nsHTMLTokens.h"
  46 #include "prtypes.h"
  47 #include "nsDebug.h"
  48 #include "nsHTMLTags.h"
  49 #include "nsHTMLEntities.h"
  50 #include "nsCRT.h"
  51 #include "nsReadableUtils.h"
  52 #include "nsUnicharUtils.h"
  53 #include "nsScanner.h"
  54
  55
  56 static const PRUnichar sUserdefined[] = {'u', 's', 'e', 'r', 'd', 'e', 'f',
  57                                          'i', 'n', 'e', 'd', 0};
  58
  59 static const PRUnichar kAttributeTerminalChars[] = {
  60   PRUnichar('&'), PRUnichar('\t'), PRUnichar('\n'),
  61   PRUnichar('\r'), PRUnichar(' '), PRUnichar('>'),
  62   PRUnichar(0)
  63 };
  64
  65 static void AppendNCR(nsSubstring& aString, PRInt32 aNCRValue);
  66 /**
  67  * Consumes an entity from aScanner and expands it into aString.
  68  *
  69  * @param   aString The target string to append the entity to.
  70  * @param   aScanner Controller of underlying input source
  71  * @param   aIECompatible Controls whether we respect entities with values >
  72  *                        255 and no terminating semicolon.
  73  * @param   aFlag If NS_IPARSER_FLAG_VIEW_SOURCE do not reduce entities...
  74  * @return  error result
  75  */
  76 static nsresult
  77 ConsumeEntity(nsScannerSharedSubstring& aString,
  78               nsScanner& aScanner,
  79               PRBool aIECompatible,
  80               PRInt32 aFlag)
  81 {
  82   nsresult result = NS_OK;
  83
  84   PRUnichar ch;
  85   result = aScanner.Peek(ch, 1);
  86
  87   if (NS_SUCCEEDED(result)) {
  88     PRUnichar amp = 0;
  89     PRInt32 theNCRValue = 0;
  90     nsAutoString entity;
  91
  92     if (nsCRT::IsAsciiAlpha(ch) && !(aFlag & NS_IPARSER_FLAG_VIEW_SOURCE)) {
  93       result = CEntityToken::ConsumeEntity(ch, entity, aScanner);
  94       if (NS_SUCCEEDED(result)) {
  95         theNCRValue = nsHTMLEntities::EntityToUnicode(entity);
  96         PRUnichar theTermChar = entity.Last();
  97         // If an entity value is greater than 255 then:
  98         // Nav 4.x does not treat it as an entity,
  99         // IE treats it as an entity if terminated with a semicolon.
 100         // Resembling IE!!
 101
 102         nsSubstring &writable = aString.writable();
 103         if (theNCRValue < 0 ||
 104             (aIECompatible && theNCRValue > 255 && theTermChar != ';')) {
 105           // Looks like we're not dealing with an entity
 106           writable.Append(kAmpersand);
 107           writable.Append(entity);
 108         } else {
 109           // A valid entity so reduce it.
 110           writable.Append(PRUnichar(theNCRValue));
 111         }
 112       }
 113     } else if (ch == kHashsign && !(aFlag & NS_IPARSER_FLAG_VIEW_SOURCE)) {
 114       result = CEntityToken::ConsumeEntity(ch, entity, aScanner);
 115       if (NS_SUCCEEDED(result)) {
 116         nsSubstring &writable = aString.writable();
 117         if (result == NS_HTMLTOKENS_NOT_AN_ENTITY) {
 118           // Looked like an entity but it's not
 119           aScanner.GetChar(amp);
 120           writable.Append(amp);
 121           result = NS_OK;
 122         } else {
 123           PRInt32 err;
 124           theNCRValue = entity.ToInteger(&err, kAutoDetect);
 125           AppendNCR(writable, theNCRValue);
 126         }
 127       }
 128     } else {
 129       // What we thought as entity is not really an entity...
 130       aScanner.GetChar(amp);
 131       aString.writable().Append(amp);
 132     }
 133   }
 134
 135   return result;
 136 }
 137
 138 /*
 139  *  This general purpose method is used when you want to
 140  *  consume attributed text value.
 141  *  Note: It also reduces entities.
 142  *
 143  *  @param   aNewlineCount -- the newline count to increment when hitting newlines
 144  *  @param   aScanner -- controller of underlying input source
 145  *  @param   aTerminalChars -- characters that stop consuming attribute.
 146  *  @param   aAllowNewlines -- whether to allow newlines in the value.
 147  *                             XXX it would be nice to roll this info into
 148  *                             aTerminalChars somehow....
 149  *  @param   aIECompatEntities IE treats entities with values > 255 as
 150  *                             entities only if they're terminated with a
 151  *                             semicolon. This is true to follow that behavior
 152  *                             and false to treat all values as entities.
 153  *  @param   aFlag - contains information such as |dtd mode|view mode|doctype|etc...
 154  *  @return  error result
 155  */
 156 static nsresult
 157 ConsumeUntil(nsScannerSharedSubstring& aString,
 158              PRInt32& aNewlineCount,
 159              nsScanner& aScanner,
 160              const nsReadEndCondition& aEndCondition,
 161              PRBool aAllowNewlines,
 162              PRBool aIECompatEntities,
 163              PRInt32 aFlag)
 164 {
 165   nsresult result = NS_OK;
 166   PRBool   done = PR_FALSE;
 167
 168   do {
 169     result = aScanner.ReadUntil(aString, aEndCondition, PR_FALSE);
 170     if (NS_SUCCEEDED(result)) {
 171       PRUnichar ch;
 172       aScanner.Peek(ch);
 173       if (ch == kAmpersand) {
 174         result = ConsumeEntity(aString, aScanner, aIECompatEntities, aFlag);
 175       } else if (ch == kCR && aAllowNewlines) {
 176         aScanner.GetChar(ch);
 177         result = aScanner.Peek(ch);
 178         if (NS_SUCCEEDED(result)) {
 179           nsSubstring &writable = aString.writable();
 180           if (ch == kNewLine) {
 181             writable.AppendLiteral("\r\n");
 182             aScanner.GetChar(ch);
 183           } else {
 184             writable.Append(PRUnichar('\r'));
 185           }
 186           ++aNewlineCount;
 187         }
 188       } else if (ch == kNewLine && aAllowNewlines) {
 189         aScanner.GetChar(ch);
 190         aString.writable().Append(PRUnichar('\n'));
 191         ++aNewlineCount;
 192       } else {
 193         done = PR_TRUE;
 194       }
 195     }
 196   } while (NS_SUCCEEDED(result) && !done);
 197
 198   return result;
 199 }
 200
 201 /**************************************************************
 202   And now for the token classes...
 203  **************************************************************/
 204
 205 /**
 206  * Constructor from tag id
 207  */
 208 CHTMLToken::CHTMLToken(eHTMLTags aTag)
 209   : CToken(aTag)
 210 {
 211 }
 212
 213
 214 CHTMLToken::~CHTMLToken()
 215 {
 216 }
 217
 218 /*
 219  * Constructor from tag id
 220  */
 221 CStartToken::CStartToken(eHTMLTags aTag)
 222   : CHTMLToken(aTag)
 223 {
 224   mEmpty = PR_FALSE;
 225   mContainerInfo = eFormUnknown;
 226 #ifdef DEBUG
 227   mAttributed = PR_FALSE;
 228 #endif
 229 }
 230
 231 CStartToken::CStartToken(const nsAString& aName)
 232   : CHTMLToken(eHTMLTag_unknown)
 233 {
 234   mEmpty = PR_FALSE;
 235   mContainerInfo = eFormUnknown;
 236   mTextValue.Assign(aName);
 237 #ifdef DEBUG
 238   mAttributed = PR_FALSE;
 239 #endif
 240 }
 241
 242 CStartToken::CStartToken(const nsAString& aName, eHTMLTags aTag)
 243   : CHTMLToken(aTag)
 244 {
 245   mEmpty = PR_FALSE;
 246   mContainerInfo = eFormUnknown;
 247   mTextValue.Assign(aName);
 248 #ifdef DEBUG
 249   mAttributed = PR_FALSE;
 250 #endif
 251 }
 252
 253 /*
 254  * This method returns the typeid (the tag type) for this token.
 255  */
 256 PRInt32
 257 CStartToken::GetTypeID()
 258 {
 259   if (eHTMLTag_unknown == mTypeID) {
 260     mTypeID = nsHTMLTags::LookupTag(mTextValue);
 261   }
 262   return mTypeID;
 263 }
 264
 265 PRInt32
 266 CStartToken::GetTokenType()
 267 {
 268   return eToken_start;
 269 }
 270
 271 void
 272 CStartToken::SetEmpty(PRBool aValue)
 273 {
 274   mEmpty = aValue;
 275 }
 276
 277 PRBool
 278 CStartToken::IsEmpty()
 279 {
 280   return mEmpty;
 281 }
 282
 283 /*
 284  * Consume the identifier portion of the start tag
 285  */
 286 nsresult
 287 CStartToken::Consume(PRUnichar aChar, nsScanner& aScanner, PRInt32 aFlag)
 288 {
 289   // If you're here, we've already Consumed the < char, and are
 290   // ready to Consume the rest of the open tag identifier.
 291   // Stop consuming as soon as you see a space or a '>'.
 292   // NOTE: We don't Consume the tag attributes here, nor do we eat the ">"
 293
 294   nsresult result = NS_OK;
 295   nsScannerSharedSubstring tagIdent;
 296
 297   if (aFlag & NS_IPARSER_FLAG_HTML) {
 298     result = aScanner.ReadTagIdentifier(tagIdent);
 299     mTypeID = (PRInt32)nsHTMLTags::LookupTag(tagIdent.str());
 300     // Save the original tag string if this is user-defined or if we
 301     // are viewing source
 302     if (eHTMLTag_userdefined == mTypeID ||
 303         (aFlag & NS_IPARSER_FLAG_VIEW_SOURCE)) {
 304       mTextValue = tagIdent.str();
 305     }
 306   } else {
 307     result = aScanner.ReadTagIdentifier(tagIdent);
 308     mTextValue = tagIdent.str();
 309     mTypeID = nsHTMLTags::LookupTag(mTextValue);
 310   }
 311
 312   if (NS_SUCCEEDED(result) && !(aFlag & NS_IPARSER_FLAG_VIEW_SOURCE)) {
 313     result = aScanner.SkipWhitespace(mNewlineCount);
 314   }
 315
 316   if (kEOF == result && !aScanner.IsIncremental()) {
 317     // Take what we can get.
 318     result = NS_OK;
 319   }
 320
 321   return result;
 322 }
 323
 324 const nsSubstring&
 325 CStartToken::GetStringValue()
 326 {
 327   if (eHTMLTag_unknown < mTypeID && mTypeID < eHTMLTag_text) {
 328     if (!mTextValue.Length()) {
 329       mTextValue.Assign(nsHTMLTags::GetStringValue((nsHTMLTag) mTypeID));
 330     }
 331   }
 332   return mTextValue;
 333 }
 334
 335 void
 336 CStartToken::GetSource(nsString& anOutputString)
 337 {
 338   anOutputString.Truncate();
 339   AppendSourceTo(anOutputString);
 340 }
 341
 342 void
 343 CStartToken::AppendSourceTo(nsAString& anOutputString)
 344 {
 345   anOutputString.Append(PRUnichar('<'));
 346   /*
 347    * Watch out for Bug 15204
 348    */
 349   if (!mTextValue.IsEmpty()) {
 350     anOutputString.Append(mTextValue);
 351   } else {
 352     anOutputString.Append(GetTagName(mTypeID));
 353   }
 354
 355   anOutputString.Append(PRUnichar('>'));
 356 }
 357
 358 CEndToken::CEndToken(eHTMLTags aTag)
 359   : CHTMLToken(aTag)
 360 {
 361 }
 362
 363 CEndToken::CEndToken(const nsAString& aName)
 364   : CHTMLToken(eHTMLTag_unknown)
 365 {
 366   mTextValue.Assign(aName);
 367 }
 368
 369 CEndToken::CEndToken(const nsAString& aName, eHTMLTags aTag)
 370   : CHTMLToken(aTag)
 371 {
 372   mTextValue.Assign(aName);
 373 }
 374
 375 nsresult
 376 CEndToken::Consume(PRUnichar aChar, nsScanner& aScanner, PRInt32 aFlag)
 377 {
 378   nsresult result = NS_OK;
 379   nsScannerSharedSubstring tagIdent;
 380
 381   if (aFlag & NS_IPARSER_FLAG_HTML) {
 382     result = aScanner.ReadTagIdentifier(tagIdent);
 383
 384     mTypeID = (PRInt32)nsHTMLTags::LookupTag(tagIdent.str());
 385     // Save the original tag string if this is user-defined or if we
 386     // are viewing source
 387     if (eHTMLTag_userdefined == mTypeID ||
 388         (aFlag & NS_IPARSER_FLAG_VIEW_SOURCE)) {
 389       mTextValue = tagIdent.str();
 390     }
 391   } else {
 392     result = aScanner.ReadTagIdentifier(tagIdent);
 393     mTextValue = tagIdent.str();
 394     mTypeID = nsHTMLTags::LookupTag(mTextValue);
 395   }
 396
 397   if (NS_SUCCEEDED(result) && !(aFlag & NS_IPARSER_FLAG_VIEW_SOURCE)) {
 398     result = aScanner.SkipWhitespace(mNewlineCount);
 399   }
 400
 401   if (kEOF == result && !aScanner.IsIncremental()) {
 402     // Take what we can get.
 403     result = NS_OK;
 404   }
 405
 406   return result;
 407 }
 408
 409
 410 /*
 411  *  Asks the token to determine the <i>HTMLTag type</i> of
 412  *  the token. This turns around and looks up the tag name
 413  *  in the tag dictionary.
 414  */
 415 PRInt32
 416 CEndToken::GetTypeID()
 417 {
 418   if (eHTMLTag_unknown == mTypeID) {
 419     mTypeID = nsHTMLTags::LookupTag(mTextValue);
 420     switch (mTypeID) {
 421       case eHTMLTag_dir:
 422       case eHTMLTag_menu:
 423         mTypeID = eHTMLTag_ul;
 424         break;
 425
 426       default:
 427         break;
 428     }
 429   }
 430
 431   return mTypeID;
 432 }
 433
 434 PRInt32
 435 CEndToken::GetTokenType()
 436 {
 437   return eToken_end;
 438 }
 439
 440 const nsSubstring&
 441 CEndToken::GetStringValue()
 442 {
 443   if (eHTMLTag_unknown < mTypeID && mTypeID < eHTMLTag_text) {
 444     if (!mTextValue.Length()) {
 445       mTextValue.Assign(nsHTMLTags::GetStringValue((nsHTMLTag) mTypeID));
 446     }
 447   }
 448   return mTextValue;
 449 }
 450
 451 void
 452 CEndToken::GetSource(nsString& anOutputString)
 453 {
 454   anOutputString.Truncate();
 455   AppendSourceTo(anOutputString);
 456 }
 457
 458 void
 459 CEndToken::AppendSourceTo(nsAString& anOutputString)
 460 {
 461   anOutputString.AppendLiteral("</");
 462   if (!mTextValue.IsEmpty()) {
 463     anOutputString.Append(mTextValue);
 464   } else {
 465     anOutputString.Append(GetTagName(mTypeID));
 466   }
 467
 468   anOutputString.Append(PRUnichar('>'));
 469 }
 470
 471 CTextToken::CTextToken()
 472   : CHTMLToken(eHTMLTag_text)
 473 {
 474 }
 475
 476 CTextToken::CTextToken(const nsAString& aName)
 477   : CHTMLToken(eHTMLTag_text)
 478 {
 479   mTextValue.Rebind(aName);
 480 }
 481
 482 PRInt32
 483 CTextToken::GetTokenType()
 484 {
 485   return eToken_text;
 486 }
 487
 488 PRInt32
 489 CTextToken::GetTextLength()
 490 {
 491   return mTextValue.Length();
 492 }
 493
 494 nsresult
 495 CTextToken::Consume(PRUnichar aChar, nsScanner& aScanner, PRInt32 aFlag)
 496 {
 497   static const PRUnichar theTerminalsChars[] =
 498     { PRUnichar('\n'), PRUnichar('\r'), PRUnichar('&'), PRUnichar('<'),
 499       PRUnichar(0) };
 500   static const nsReadEndCondition theEndCondition(theTerminalsChars);
 501   nsresult  result = NS_OK;
 502   PRBool    done = PR_FALSE;
 503   nsScannerIterator origin, start, end;
 504
 505   // Start scanning after the first character, because we know it to
 506   // be part of this text token (we wouldn't have come here if it weren't)
 507   aScanner.CurrentPosition(origin);
 508   start = origin;
 509   aScanner.EndReading(end);
 510
 511   NS_ASSERTION(start != end, "Calling CTextToken::Consume when already at the "
 512                              "end of a document is a bad idea.");
 513
 514   aScanner.SetPosition(++start);
 515
 516   while (NS_OK == result && !done) {
 517     result = aScanner.ReadUntil(start, end, theEndCondition, PR_FALSE);
 518     if (NS_OK == result) {
 519       result = aScanner.Peek(aChar);
 520
 521       if (NS_OK == result && (kCR == aChar || kNewLine == aChar)) {
 522         switch (aChar) {
 523           case kCR:
 524           {
 525             // It's a carriage return. See if this is part of a CR-LF pair (in
 526             // which case we need to treat it as one newline). If we're at the
 527             // edge of a packet, then leave the CR on the scanner, since it
 528             // could still be part of a CR-LF pair. Otherwise, it isn't.
 529             PRUnichar theNextChar;
 530             result = aScanner.Peek(theNextChar, 1);
 531
 532             if (result == kEOF && aScanner.IsIncremental()) {
 533               break;
 534             }
 535
 536             if (NS_SUCCEEDED(result)) {
 537               // Actually get the carriage return.
 538               aScanner.GetChar(aChar);
 539             }
 540
 541             if (kLF == theNextChar) {
 542               // If the "\r" is followed by a "\n", don't replace it and let
 543               // it be ignored by the layout system.
 544               end.advance(2);
 545               aScanner.GetChar(theNextChar);
 546             } else {
 547               // If it is standalone, replace the "\r" with a "\n" so that it
 548               // will be considered by the layout system.
 549               aScanner.ReplaceCharacter(end, kLF);
 550               ++end;
 551             }
 552             ++mNewlineCount;
 553             break;
 554           }
 555           case kLF:
 556             aScanner.GetChar(aChar);
 557             ++end;
 558             ++mNewlineCount;
 559             break;
 560         }
 561       } else {
 562         done = PR_TRUE;
 563       }
 564     }
 565   }
 566
 567   // Note: This function is only called from nsHTMLTokenizer::ConsumeText. If
 568   // we return an error result from the final buffer, then it is responsible
 569   // for turning it into an NS_OK result.
 570   aScanner.BindSubstring(mTextValue, origin, end);
 571
 572   return result;
 573 }
 574
 575 /*
 576  *  Consume as much clear text from scanner as possible.
 577  *  The scanner is left on the < of the perceived end tag.
 578  *
 579  *  @param   aIgnoreComments -- when PR_TRUE, "<!--" ... "-->" sections are
 580  *                              not taken into account in looking for the
 581  *                              end tag, and an end tag name that runs right
 582  *                              up to the end of the buffer is accepted
 583  *                              without a closing '>'.
 584  *  @param   aScanner -- controller of underlying input source
 585  *  @param   aEndTagName -- the terminal tag name (without the "</").
 586  *  @param   aFlag -- dtd modes and such.
 587  *  @param   aFlushTokens -- set to PR_TRUE if we found the terminal tag;
      *                           not written otherwise.
 588  *  @return  NS_OK if the terminal tag was found (mTextValue then holds the
      *           text in front of it and mNewlineCount is recomputed);
      *           kFakeEndTag if the scanner is not incremental and the end of
      *           the data was reached without one (mTextValue then holds
      *           everything up to the end); kEOF if the scanner is incremental
      *           and no terminal tag was found (nothing is bound and
      *           SetPosition is not called).
 589  */
 590 nsresult
 591 CTextToken::ConsumeCharacterData(PRBool aIgnoreComments,
 592                                  nsScanner& aScanner,
 593                                  const nsAString& aEndTagName,
 594                                  PRInt32 aFlag,
 595                                  PRBool& aFlushTokens)
 596 {
 597   nsresult result = NS_OK;
 598   nsScannerIterator theStartOffset, theCurrOffset, theTermStrPos,
 599                     theStartCommentPos, theAltTermStrPos, endPos;
 600   PRBool        done = PR_FALSE;
 601   PRBool        theLastIteration = PR_FALSE;
 602
 603   aScanner.CurrentPosition(theStartOffset);
 604   theCurrOffset = theStartOffset;
 605   aScanner.EndReading(endPos);
       // endPos doubles as the "not found yet" marker for these three positions.
 606   theTermStrPos = theStartCommentPos = theAltTermStrPos = endPos;
 607
 608   // ALGORITHM: *** The performance is based on correctness of the document ***
 609   // 1. Look for a '<' character.  This could be
 610   //      a) Start of a comment (<!--),
 611   //      b) Start of the terminal string, or
 612   //      c) a start of a tag.
 613   //    We are interested in a) and b). c) is ignored because in CDATA we
 614   //    don't care for tags.
 615   //    NOTE: Technically speaking in CDATA we should ignore the comments too!
 616   //    But for compatibility we don't.
 617   // 2. Having the offset, for '<', search for the terminal string from there
 618   //    on and record its offset.
 619   // 3. From the same '<' offset also search for start of a comment '<!--'.
 620   //    If found search for end comment '-->' between the terminal string and
 621   //    '<!--'.  If you did not find the end comment, then we have a malformed
 622   //    document, i.e., this section has a premature terminal string Ex.
 623   //    <SCRIPT><!-- document.write('</SCRIPT>') //--> </SCRIPT>. But record
 624   //    terminal string's offset if this is the first premature terminal
 625   //    string, and update the current offset to the terminal string
 626   //    (premature) offset and goto step 1.
 627   // 4. Amen...If you found a terminal string and '-->'. Otherwise goto step 1.
 628   // 5. If the end of the document is reached and if we still don't have the
 629   //    condition in step 4. then assume that the premature terminal string
 630   //    is the actual terminal string and goto step 1. This will be our last
 631   //    iteration. If there is no premature terminal string, we consume all
 632   //    the way until the end and return kFakeEndTag. All of step 5 applies
 633   //    to a non-incremental scanner only; with an incremental one we return
 634   //    kEOF instead and bind nothing.
 635
 636   NS_NAMED_LITERAL_STRING(ltslash, "</");
 637   const nsString theTerminalString = ltslash + aEndTagName;
 638
 639   PRUint32 termStrLen = theTerminalString.Length();
 640   while (result == NS_OK && !done) {
 641     PRBool found = PR_FALSE;
 642     nsScannerIterator gtOffset, ltOffset = theCurrOffset;
         // Visit every '<' that still has at least termStrLen characters
         // after it.
 643     while (FindCharInReadable(PRUnichar(kLessThan), ltOffset, endPos) &&
 644            ((PRUint32)ltOffset.size_forward() >= termStrLen ||
 645             Distance(ltOffset, endPos) >= termStrLen)) {
 646       // Make a copy of the (presumed) end tag and
 647       // do a case-insensitive comparison
 648
 649       nsScannerIterator start(ltOffset), end(ltOffset);
 650       end.advance(termStrLen);
 651
           // The name must be followed by '>', whitespace, or the end of the
           // data to count as the terminal string.
 652       if (CaseInsensitiveFindInReadable(theTerminalString, start, end) &&
 653           (end == endPos || (*end == '>'  || *end == ' '  ||
 654                              *end == '\t' || *end == '\n' ||
 655                              *end == '\r'))) {
 656         gtOffset = end;
 657         // Note that aIgnoreComments is only not set for <script>. We don't
 658         // want to execute scripts that aren't in the form of: <script\s.*>
 659         if ((end == endPos && aIgnoreComments) ||
 660             FindCharInReadable(PRUnichar(kGreaterThan), gtOffset, endPos)) {
 661           found = PR_TRUE;
 662           theTermStrPos = start;
 663         }
 664         break;
 665       }
           // Not the terminal string; resume the search just past this '<'.
 666       ltOffset.advance(1);
 667     }
 668
 669     if (found && theTermStrPos != endPos) {
           // Comments are only examined outside strict mode, when the caller
           // did not ask to ignore them, and not on the fallback iteration.
 670       if (!(aFlag & NS_IPARSER_FLAG_STRICT_MODE) &&
 671           !theLastIteration && !aIgnoreComments) {
 672         nsScannerIterator endComment(ltOffset);
 673         endComment.advance(5);
 674
 675         if ((theStartCommentPos == endPos) &&
 676             FindInReadable(NS_LITERAL_STRING("<!--"), theCurrOffset,
 677                            endComment)) {
 678           theStartCommentPos = theCurrOffset;
 679         }
 680
 681         if (theStartCommentPos != endPos) {
 682           // Search for --> between <!-- and </TERMINALSTRING>.
 683           theCurrOffset = theStartCommentPos;
 684           nsScannerIterator terminal(theTermStrPos);
 685           if (!RFindInReadable(NS_LITERAL_STRING("-->"),
 686                                theCurrOffset, terminal)) {
 687             // If you're here it means that we have a bogus terminal string.
 688             // Even though it is bogus, the position of the terminal string
 689             // could be helpful in case we hit the rock bottom.
 690             if (theAltTermStrPos == endPos) {
 691               // But we only want to remember the first bogus terminal string.
 692               theAltTermStrPos = theTermStrPos;
 693             }
 694
 695             // We did not find '-->' so keep searching for terminal string.
 696             theCurrOffset = theTermStrPos;
 697             theCurrOffset.advance(termStrLen);
 698             continue;
 699           }
 700         }
 701       }
 702
           // Everything in front of the terminal string becomes our text, and
           // the scanner is positioned at ltOffset, the '<' where the terminal
           // string was matched.
 703       aScanner.BindSubstring(mTextValue, theStartOffset, theTermStrPos);
 704       aScanner.SetPosition(ltOffset);
 705
 706       // We found </SCRIPT> or </STYLE>...permit flushing -> Ref: Bug 22485
 707       aFlushTokens = PR_TRUE;
 708       done = PR_TRUE;
 709     } else {
 710       // We end up here if:
 711       // a) when the buffer runs out of data.
 712       // b) when the terminal string is not found.
 713       if (!aScanner.IsIncremental()) {
 714         if (theAltTermStrPos != endPos) {
 715           // If you're here it means that we hit the rock bottom and therefore
 716           // switch to plan B, since we have an alternative terminating string.
 717           theCurrOffset = theAltTermStrPos;
 718           theLastIteration = PR_TRUE;
 719         } else {
 720           // Oops, We fell all the way down to the end of the document.
 721           done = PR_TRUE; // Do this to fix Bug. 35456
 722           result = kFakeEndTag;
 723           aScanner.BindSubstring(mTextValue, theStartOffset, endPos);
 724           aScanner.SetPosition(endPos);
 725         }
 726       } else {
             // Incremental scanner: report kEOF, which also ends the outer
             // loop.
 727         result = kEOF;
 728       }
 729     }
 730   }
 731
       // The newline count is only refreshed when the terminal tag was found;
       // the kFakeEndTag and kEOF paths leave mNewlineCount as it was.
 732   if (result == NS_OK) {
 733     mNewlineCount = mTextValue.CountChar(kNewLine);
 734   }
 735
 736   return result;
 737 }
 738
 739 /*
 740  *  Consume as much clear text from scanner as possible. Reducing entities.
 741  *  The scanner is left on the < of the perceived end tag.
 742  *
 743  *  @param   aChar -- last char consumed from stream
 744  *  @param   aConservativeConsume -- controls our handling of content with no
 745  *                                   terminating string.
 746  *  @param   aScanner -- controller of underlying input source
 747  *  @param   aEndTagname -- the terminal tag name.
 748  *  @param   aFlag -- dtd modes and such.
 749  *  @param   aFlushTokens -- PR_TRUE if we found the terminal tag.
 750  *  @return  error result
 751  */
 752 nsresult
 753 CTextToken::ConsumeParsedCharacterData(PRBool aDiscardFirstNewline,
 754                                        PRBool aConservativeConsume,
 755                                        nsScanner& aScanner,
 756                                        const nsAString& aEndTagName,
 757                                        PRInt32 aFlag,
 758                                        PRBool& aFound)
 759 {
 760   // This function is fairly straightforward except if there is no terminating
 761   // string. If there is, we simply loop through all of the entities, reducing
 762   // them as necessary and skipping over non-terminal strings starting with <.
 763   // If there is *no* terminal string, then we examine aConservativeConsume.
 764   // If we want to be conservative, we backtrack to the first place in the
 765   // document that looked like the end of PCDATA (i.e., the first tag). This
 766   // is for compatibility and so we don't regress bug 42945. If we are not
 767   // conservative, then we consume everything, all the way up to the end of
 768   // the document.
 769
 770   static const PRUnichar terminalChars[] = {
 771     PRUnichar('\r'), PRUnichar('\n'), PRUnichar('&'), PRUnichar('<'),
 772     PRUnichar(0)
 773   };
 774   static const nsReadEndCondition theEndCondition(terminalChars);
 775
 776   nsScannerIterator currPos, endPos, altEndPos;
 777   PRUint32 truncPos = 0;
 778   aScanner.CurrentPosition(currPos);
 779   aScanner.EndReading(endPos);
 780
 781   altEndPos = endPos;
 782
 783   nsScannerSharedSubstring theContent;
 784   PRUnichar ch = 0;
 785
 786   NS_NAMED_LITERAL_STRING(commentStart, "<!--");
 787   NS_NAMED_LITERAL_STRING(ltslash, "</");
 788   const nsString theTerminalString = ltslash + aEndTagName;
 789   PRUint32 termStrLen = theTerminalString.Length();
 790   PRUint32 commentStartLen = commentStart.Length();
 791
 792   nsresult result = NS_OK;
 793
 794   // Note that if we're already at the end of the document, the ConsumeUntil
 795   // will fail, and we'll do the right thing.
 796   do {
 797     result = ConsumeUntil(theContent, mNewlineCount, aScanner,
 798                           theEndCondition, PR_TRUE, PR_FALSE, aFlag);
 799
 800     if (aDiscardFirstNewline &&
 801         (NS_SUCCEEDED(result) || !aScanner.IsIncremental()) &&
 802         !(aFlag & NS_IPARSER_FLAG_VIEW_SOURCE)) {
 803       // Check if the very first character is a newline, and if so discard it.
 804       // Note that we don't want to discard it in view source!
 805       // Also note that this has to happen here (as opposed to before the
 806       // ConsumeUntil) because we have to expand any entities.
 807       // XXX It would be nice to be able to do this without calling
 808       // writable()!
 809       const nsSubstring &firstChunk = theContent.str();
 810       if (!firstChunk.IsEmpty()) {
 811         PRUint32 where = 0;
 812         PRUnichar newline = firstChunk.First();
 813
 814         if (newline == kCR || newline == kNewLine) {
 815           ++where;
 816
 817           if (firstChunk.Length() > 1) {
 818             if (newline == kCR && firstChunk.CharAt(1) == kNewLine) {
 819               // Handle \r\n = 1 newline.
 820               ++where;
 821             }
 822             // Note: \n\r = 2 newlines.
 823           }
 824         }
 825
 826         if (where != 0) {
 827           theContent.writable() = Substring(firstChunk, where);
 828         }
 829       }
 830     }
 831     aDiscardFirstNewline = PR_FALSE;
 832
 833     if (NS_FAILED(result)) {
 834       if (kEOF == result && !aScanner.IsIncremental()) {
 835         aFound = PR_TRUE; // this is as good as it gets.
 836         result = kFakeEndTag;
 837
 838         if (aConservativeConsume && altEndPos != endPos) {
 839           // We ran out of room looking for a </title>. Go back to the first
 840           // place that looked like a tag and use that as our stopping point.
 841           theContent.writable().Truncate(truncPos);
 842           aScanner.SetPosition(altEndPos, PR_FALSE, PR_TRUE);
 843         }
 844         // else we take everything we consumed.
 845         mTextValue.Rebind(theContent.str());
 846       } else {
 847         aFound = PR_FALSE;
 848       }
 849
 850       return result;
 851     }
 852
 853     aScanner.CurrentPosition(currPos);
 854     aScanner.GetChar(ch); // this character must be '&' or '<'
 855
 856     if (ch == kLessThan && altEndPos == endPos) {
 857       // Keep this position in case we need it for later.
 858       altEndPos = currPos;
 859       truncPos = theContent.str().Length();
 860     }
 861
 862     if (Distance(currPos, endPos) >= termStrLen) {
 863       nsScannerIterator start(currPos), end(currPos);
 864       end.advance(termStrLen);
 865
 866       if (CaseInsensitiveFindInReadable(theTerminalString, start, end)) {
 867         if (end != endPos && (*end == '>'  || *end == ' '  ||
 868                               *end == '\t' || *end == '\n' ||
 869                               *end == '\r')) {
 870           aFound = PR_TRUE;
 871           mTextValue.Rebind(theContent.str());
 872
 873           // Note: This SetPosition() is actually going backwards from the
 874           // scanner's mCurrentPosition (so we pass aReverse == PR_TRUE). This
 875           // is because we call GetChar() above after we get the current
 876           // position.
 877           aScanner.SetPosition(currPos, PR_FALSE, PR_TRUE);
 878           break;
 879         }
 880       }
 881     }
 882     // IE only consumes <!-- --> as comments in PCDATA.
 883     if (Distance(currPos, endPos) >= commentStartLen) {
 884       nsScannerIterator start(currPos), end(currPos);
 885       end.advance(commentStartLen);
 886
 887       if (CaseInsensitiveFindInReadable(commentStart, start, end)) {
 888         CCommentToken consumer; // stack allocated.
 889
 890         // CCommentToken expects us to be on the '-'
 891         aScanner.SetPosition(currPos.advance(2));
 892
 893         // In quirks mode we consume too many things as comments, so pretend
 894         // that we're not by modifying aFlag.
 895         result = consumer.Consume(*currPos, aScanner,
 896                                   (aFlag & ~NS_IPARSER_FLAG_QUIRKS_MODE) |
 897                                    NS_IPARSER_FLAG_STRICT_MODE);
 898         if (kEOF == result) {
 899           // This can only happen if we're really out of space.
 900           return kEOF;
 901         } else if (kNotAComment == result) {
 902           // Fall through and consume this as text.
 903           aScanner.CurrentPosition(currPos);
 904           aScanner.SetPosition(currPos.advance(1));
 905         } else {
 906           consumer.AppendSourceTo(theContent.writable());
 907           mNewlineCount += consumer.GetNewlineCount();
 908           continue;
 909         }
 910       }
 911     }
 912
 913     result = kEOF;
 914     // We did not find the terminal string yet so
 915     // include the character that stopped consumption.
 916     theContent.writable().Append(ch);
 917   } while (currPos != endPos);
 918
 919   return result;
 920 }
 921
 922 void
 923 CTextToken::CopyTo(nsAString& aStr)
 924 {
 925   nsScannerIterator start, end;
 926   mTextValue.BeginReading(start);
 927   mTextValue.EndReading(end);
 928   CopyUnicodeTo(start, end, aStr);
 929 }
 930
 931 const nsSubstring& CTextToken::GetStringValue()
 932 {
 933   return mTextValue.AsString();
 934 }
 935
 936 void
 937 CTextToken::Bind(nsScanner* aScanner, nsScannerIterator& aStart,
 938                  nsScannerIterator& aEnd)
 939 {
 940   aScanner->BindSubstring(mTextValue, aStart, aEnd);
 941 }
 942
 943 void
 944 CTextToken::Bind(const nsAString& aStr)
 945 {
 946   mTextValue.Rebind(aStr);
 947 }
 948
 949 CCDATASectionToken::CCDATASectionToken(eHTMLTags aTag)
 950   : CHTMLToken(aTag)
 951 {
 952 }
 953
 954 CCDATASectionToken::CCDATASectionToken(const nsAString& aName)
 955   : CHTMLToken(eHTMLTag_unknown)
 956 {
 957   mTextValue.Assign(aName);
 958 }
 959
 960 PRInt32
 961 CCDATASectionToken::GetTokenType()
 962 {
 963   return eToken_cdatasection;
 964 }
 965
 966 /*
 967  *  Consume as much marked test from scanner as possible.
 968  *  Note: This has to handle case: "<![ ! IE 5]>", in addition to "<![..[..]]>"
 969  *
 970  *  @param   aChar -- last char consumed from stream
 971  *  @param   aScanner -- controller of underlying input source
 972  *  @return  error result
 973  */
 974 nsresult
 975 CCDATASectionToken::Consume(PRUnichar aChar, nsScanner& aScanner,
 976                             PRInt32 aFlag)
 977 {
 978   static const PRUnichar theTerminalsChars[] =
 979   { PRUnichar('\r'), PRUnichar('\n'), PRUnichar(']'), PRUnichar(0) };
 980   static const nsReadEndCondition theEndCondition(theTerminalsChars);
 981   nsresult  result = NS_OK;
 982   PRBool    done = PR_FALSE;
 983
 984   while (NS_OK == result && !done) {
 985     result = aScanner.ReadUntil(mTextValue, theEndCondition, PR_FALSE);
 986     if (NS_OK == result) {
 987       result = aScanner.Peek(aChar);
 988       if (kCR == aChar && NS_OK == result) {
 989         result = aScanner.GetChar(aChar); // Strip off the \r
 990         result = aScanner.Peek(aChar);    // Then see what's next.
 991         if (NS_OK == result) {
 992           switch(aChar) {
 993             case kCR:
 994               result = aScanner.GetChar(aChar); // Strip off the \r
 995               mTextValue.AppendLiteral("\n\n");
 996               mNewlineCount += 2;
 997               break;
 998
 999             case kNewLine:
1000               // Which means we saw \r\n, which becomes \n
1001               result = aScanner.GetChar(aChar); // Strip off the \n
1002
1003               // Fall through...
1004             default:
1005               mTextValue.AppendLiteral("\n");
1006               mNewlineCount++;
1007               break;
1008           }
1009         }
1010       } else if (kNewLine == aChar) {
1011         result = aScanner.GetChar(aChar);
1012         mTextValue.Append(aChar);
1013         ++mNewlineCount;
1014       } else if (kRightSquareBracket == aChar) {
1015         PRBool canClose = PR_FALSE;
1016         result = aScanner.GetChar(aChar); // Strip off the ]
1017         mTextValue.Append(aChar);
1018         result = aScanner.Peek(aChar);    // Then see what's next.
1019         if (NS_OK == result && kRightSquareBracket == aChar) {
1020           result = aScanner.GetChar(aChar); // Strip off the second ]
1021           mTextValue.Append(aChar);
1022           canClose = PR_TRUE;
1023         }
1024
1025         // The goal here is to not lose data from the page when encountering
1026         // markup like: <![endif]-->.  This means that in normal parsing, we
1027         // allow ']' to end the marked section and just drop everything between
1028         // it an the '>'.  In view-source mode, we cannot drop things on the
1029         // floor like that.  In fact, to make view-source of XML with script in
1030         // CDATA sections at all bearable, we need to somewhat enforce the ']]>'
1031         // terminator for marked sections.  So make the tokenization somewhat
1032         // different when in view-source _and_ dealing with a CDATA section.
1033         // XXX We should remember this StringBeginsWith test.
1034         PRBool inCDATA = (aFlag & NS_IPARSER_FLAG_VIEW_SOURCE) &&
1035           StringBeginsWith(mTextValue, NS_LITERAL_STRING("[CDATA["));
1036         if (inCDATA) {
1037           // Consume all right square brackets to catch cases such as:
1038           // <![CDATA[foo]]]>
1039           while (true) {
1040             result = aScanner.Peek(aChar);
1041             if (result != NS_OK || aChar != kRightSquareBracket) {
1042               break;
1043             }
1044
1045             mTextValue.Append(aChar);
1046             aScanner.GetChar(aChar);
1047           }
1048         } else {
1049           nsAutoString dummy; // Skip any bad data
1050           result = aScanner.ReadUntil(dummy, kGreaterThan, PR_FALSE);
1051         }
1052         if (NS_OK == result &&
1053             (!inCDATA || (canClose && kGreaterThan == aChar))) {
1054           result = aScanner.GetChar(aChar); // Strip off the >
1055           done = PR_TRUE;
1056         }
1057       } else {
1058         done = PR_TRUE;
1059       }
1060     }
1061   }
1062
1063   if (kEOF == result && !aScanner.IsIncremental()) {
1064     // We ran out of space looking for the end of this CDATA section.
1065     // In order to not completely lose the entire section, treat everything
1066     // until the end of the document as part of the CDATA section and let
1067     // the DTD handle it.
1068     mInError = PR_TRUE;
1069     result = NS_OK;
1070   }
1071
1072   return result;
1073 }
1074
1075 const nsSubstring&
1076 CCDATASectionToken::GetStringValue()
1077 {
1078   return mTextValue;
1079 }
1080
1081
1082 CMarkupDeclToken::CMarkupDeclToken()
1083   : CHTMLToken(eHTMLTag_markupDecl)
1084 {
1085 }
1086
1087 CMarkupDeclToken::CMarkupDeclToken(const nsAString& aName)
1088   : CHTMLToken(eHTMLTag_markupDecl)
1089 {
1090   mTextValue.Rebind(aName);
1091 }
1092
1093 PRInt32
1094 CMarkupDeclToken::GetTokenType()
1095 {
1096   return eToken_markupDecl;
1097 }
1098
1099 /*
1100  *  Consume as much declaration from scanner as possible.
1101  *  Declaration is a markup declaration of ELEMENT, ATTLIST, ENTITY or
1102  *  NOTATION, which can span multiple lines and ends in >.
1103  *
1104  *  @param   aChar -- last char consumed from stream
1105  *  @param   aScanner -- controller of underlying input source
1106  *  @return  error result
1107  */
1108 nsresult
1109 CMarkupDeclToken::Consume(PRUnichar aChar, nsScanner& aScanner,
1110                           PRInt32 aFlag)
1111 {
1112   static const PRUnichar theTerminalsChars[] =
1113     { PRUnichar('\n'), PRUnichar('\r'), PRUnichar('\''), PRUnichar('"'),
1114       PRUnichar('>'),
1115       PRUnichar(0) };
1116   static const nsReadEndCondition theEndCondition(theTerminalsChars);
1117   nsresult  result = NS_OK;
1118   PRBool    done = PR_FALSE;
1119   PRUnichar quote = 0;
1120
1121   nsScannerIterator origin, start, end;
1122   aScanner.CurrentPosition(origin);
1123   start = origin;
1124
1125   while (NS_OK == result && !done) {
1126     aScanner.SetPosition(start);
1127     result = aScanner.ReadUntil(start, end, theEndCondition, PR_FALSE);
1128     if (NS_OK == result) {
1129       result = aScanner.Peek(aChar);
1130
1131       if (NS_OK == result) {
1132         PRUnichar theNextChar = 0;
1133         if (kCR == aChar || kNewLine == aChar) {
1134           result = aScanner.GetChar(aChar); // Strip off the char
1135           result = aScanner.Peek(theNextChar); // Then see what's next.
1136         }
1137         switch(aChar) {
1138           case kCR:
1139             // result = aScanner.GetChar(aChar);
1140             if (kLF == theNextChar) {
1141               // If the "\r" is followed by a "\n", don't replace it and
1142               // let it be ignored by the layout system
1143               end.advance(2);
1144               result = aScanner.GetChar(theNextChar);
1145             } else {
1146               // If it standalone, replace the "\r" with a "\n" so that
1147               // it will be considered by the layout system
1148               aScanner.ReplaceCharacter(end, kLF);
1149               ++end;
1150             }
1151             ++mNewlineCount;
1152             break;
1153           case kLF:
1154             ++end;
1155             ++mNewlineCount;
1156             break;
1157           case '\'':
1158           case '"':
1159             ++end;
1160             if (quote) {
1161               if (quote == aChar) {
1162                 quote = 0;
1163               }
1164             } else {
1165               quote = aChar;
1166             }
1167             break;
1168           case kGreaterThan:
1169             if (quote) {
1170               ++end;
1171             } else {
1172               start = end;
1173               // Note that start is wrong after this, we just avoid temp var
1174               ++start;
1175               aScanner.SetPosition(start); // Skip the >
1176               done = PR_TRUE;
1177             }
1178             break;
1179           default:
1180             NS_ABORT_IF_FALSE(0, "should not happen, switch is missing cases?");
1181             break;
1182         }
1183         start = end;
1184       } else {
1185         done = PR_TRUE;
1186       }
1187     }
1188   }
1189   aScanner.BindSubstring(mTextValue, origin, end);
1190
1191   if (kEOF == result) {
1192     mInError = PR_TRUE;
1193     if (!aScanner.IsIncremental()) {
1194       // Hide this EOF.
1195       result = NS_OK;
1196     }
1197   }
1198
1199   return result;
1200 }
1201
1202 const nsSubstring&
1203 CMarkupDeclToken::GetStringValue()
1204 {
1205   return mTextValue.AsString();
1206 }
1207
1208
1209 CCommentToken::CCommentToken()
1210   : CHTMLToken(eHTMLTag_comment)
1211 {
1212 }
1213
1214 CCommentToken::CCommentToken(const nsAString& aName)
1215   : CHTMLToken(eHTMLTag_comment)
1216 {
1217   mComment.Rebind(aName);
1218 }
1219
1220 void
1221 CCommentToken::AppendSourceTo(nsAString& anOutputString)
1222 {
1223   AppendUnicodeTo(mCommentDecl, anOutputString);
1224 }
1225
1226 static PRBool
1227 IsCommentEnd(const nsScannerIterator& aCurrent, const nsScannerIterator& aEnd,
1228              nsScannerIterator& aGt)
1229 {
1230   nsScannerIterator current = aCurrent;
1231   PRInt32 dashes = 0;
1232
1233   while (current != aEnd && dashes != 2) {
1234     if (*current == kGreaterThan) {
1235       aGt = current;
1236       return PR_TRUE;
1237     }
1238     if (*current == PRUnichar('-')) {
1239       ++dashes;
1240     } else {
1241       dashes = 0;
1242     }
1243     ++current;
1244   }
1245
1246   return PR_FALSE;
1247 }
1248
1249 nsresult
1250 CCommentToken::ConsumeStrictComment(nsScanner& aScanner)
1251 {
1252   // <!--[... -- ... -- ...]*-->
1253   /*********************************************************
1254     NOTE: This algorithm does a fine job of handling comments
1255           when they're formatted per spec, but if they're not
1256           we don't handle them well.
1257    *********************************************************/
1258   nsScannerIterator end, current, gt, lt;
1259   aScanner.EndReading(end);
1260   aScanner.CurrentPosition(current);
1261
1262   nsScannerIterator beginData = end;
1263
1264   lt = current;
1265   lt.advance(-2); // <!
1266
1267   current.advance(-1);
1268
1269   // Regular comment must start with <!--
1270   if (*current == kExclamation &&
1271       ++current != end && *current == kMinus &&
1272       ++current != end && *current == kMinus &&
1273       ++current != end) {
1274     nsScannerIterator currentEnd = end;
1275     PRBool balancedComment = PR_FALSE;
1276     NS_NAMED_LITERAL_STRING(dashes, "--");
1277     beginData = current;
1278
1279     while (FindInReadable(dashes, current, currentEnd)) {
1280       current.advance(2);
1281
1282       balancedComment = !balancedComment; // We need to match '--' with '--'
1283
1284       if (balancedComment && IsCommentEnd(current, end, gt)) {
1285         // done
1286         current.advance(-2);
1287         // Note: it's ok if beginData == current, (we'll copy an empty string)
1288         // and we need to bind mComment anyway.
1289         aScanner.BindSubstring(mComment, beginData, current);
1290         aScanner.BindSubstring(mCommentDecl, lt, ++gt);
1291         aScanner.SetPosition(gt);
1292         return NS_OK;
1293       }
1294
1295       // Continue after the last '--'
1296       currentEnd = end;
1297     }
1298   }
1299
1300   // If beginData == end, we did not find opening '--'
1301   if (beginData == end) {
1302     // This might have been empty comment: <!>
1303     // Or it could have been something completely bogus like: <!This is foobar>
1304     // Handle both cases below
1305     aScanner.CurrentPosition(current);
1306     beginData = current;
1307     if (FindCharInReadable('>', current, end)) {
1308       aScanner.BindSubstring(mComment, beginData, current);
1309       aScanner.BindSubstring(mCommentDecl, lt, ++current);
1310       aScanner.SetPosition(current);
1311       return NS_OK;
1312     }
1313   }
1314
1315   if (aScanner.IsIncremental()) {
1316     // We got here because we saw the beginning of a comment,
1317     // but not yet the end, and we are still loading the page. In that
1318     // case the return value here will cause us to unwind,
1319     // wait for more content, and try again.
1320     // XXX For performance reasons we should cache where we were, and
1321     //     continue from there for next call
1322     return kEOF;
1323   }
1324
1325   // There was no terminating string, parse this comment as text.
1326   aScanner.SetPosition(lt, PR_FALSE, PR_TRUE);
1327   return kNotAComment;
1328 }
1329
1330 nsresult
1331 CCommentToken::ConsumeQuirksComment(nsScanner& aScanner)
1332 {
1333   // <![-[-]] ... [[-]-|--!]>
1334   /*********************************************************
1335     NOTE: This algorithm does a fine job of handling comments
1336           commonly used, but it doesn't really consume them
1337           per spec (But then, neither does IE or Nav).
1338    *********************************************************/
1339   nsScannerIterator end, current;
1340   aScanner.EndReading(end);
1341   aScanner.CurrentPosition(current);
1342   nsScannerIterator beginData = current,
1343                     beginLastMinus = end,
1344                     bestAltCommentEnd = end,
1345                     lt = current;
1346   lt.advance(-2); // <!
1347
1348   // When we get here, we have always already consumed <!
1349   // Skip over possible leading minuses
1350   if (current != end && *current == kMinus) {
1351     beginLastMinus = current;
1352     ++current;
1353     ++beginData;
1354     if (current != end && *current == kMinus) { // <!--
1355       beginLastMinus = current;
1356       ++current;
1357       ++beginData;
1358       // Long form comment
1359
1360       nsScannerIterator currentEnd = end, gt = end;
1361
1362       // Find the end of the comment
1363       while (FindCharInReadable(kGreaterThan, current, currentEnd)) {
1364         gt = current;
1365         if (bestAltCommentEnd == end) {
1366           bestAltCommentEnd = gt;
1367         }
1368         --current;
1369         PRBool goodComment = PR_FALSE;
1370         if (current != beginLastMinus && *current == kMinus) { // ->
1371           --current;
1372           if (current != beginLastMinus && *current == kMinus) { // -->
1373             goodComment = PR_TRUE;
1374             --current;
1375           }
1376         } else if (current != beginLastMinus && *current == '!') {
1377           --current;
1378           if (current != beginLastMinus && *current == kMinus) {
1379             --current;
1380             if (current != beginLastMinus && *current == kMinus) { // --!>
1381               --current;
1382               goodComment = PR_TRUE;
1383             }
1384           }
1385         } else if (current == beginLastMinus) {
1386           goodComment = PR_TRUE;
1387         }
1388
1389         if (goodComment) {
1390           // done
1391           aScanner.BindSubstring(mComment, beginData, ++current);
1392           aScanner.BindSubstring(mCommentDecl, lt, ++gt);
1393           aScanner.SetPosition(gt);
1394           return NS_OK;
1395         } else {
1396           // try again starting after the last '>'
1397           current = ++gt;
1398           currentEnd = end;
1399         }
1400       }
1401
1402       if (aScanner.IsIncremental()) {
1403         // We got here because we saw the beginning of a comment,
1404         // but not yet the end, and we are still loading the page. In that
1405         // case the return value here will cause us to unwind,
1406         // wait for more content, and try again.
1407         // XXX For performance reasons we should cache where we were, and
1408         //     continue from there for next call
1409         return kEOF;
1410       }
1411
1412       // If you're here, then we're in a special state.
1413       // The problem at hand is that we've hit the end of the document without
1414       // finding the normal endcomment delimiter "-->".  In this case, the
1415       // first thing we try is to see if we found an alternate endcomment
1416       // delimiter ">".  If so, rewind just pass that, and use everything up
1417       // to that point as your comment.  If not, the document has no end
1418       // comment and should be treated as one big comment.
1419       gt = bestAltCommentEnd;
1420       aScanner.BindSubstring(mComment, beginData, gt);
1421       if (gt != end) {
1422         ++gt;
1423       }
1424       aScanner.BindSubstring(mCommentDecl, lt, gt);
1425       aScanner.SetPosition(gt);
1426       return NS_OK;
1427     }
1428   }
1429
1430   // This could be short form of comment
1431   // Find the end of the comment
1432   current = beginData;
1433   if (FindCharInReadable(kGreaterThan, current, end)) {
1434     nsScannerIterator gt = current;
1435     if (current != beginData) {
1436       --current;
1437       if (current != beginData && *current == kMinus) { // ->
1438         --current;
1439         if (current != beginData && *current == kMinus) { // -->
1440           --current;
1441         }
1442       } else if (current != beginData && *current == '!') { // !>
1443         --current;
1444         if (current != beginData && *current == kMinus) { // -!>
1445           --current;
1446           if (current != beginData && *current == kMinus) { // --!>
1447             --current;
1448           }
1449         }
1450       }
1451     }
1452
1453     if (current != gt) {
1454       aScanner.BindSubstring(mComment, beginData, ++current);
1455     } else {
1456       // Bind mComment to an empty string (note that if current == gt,
1457       // then current == beginData). We reach this for <!>
1458       aScanner.BindSubstring(mComment, beginData, current);
1459     }
1460     aScanner.BindSubstring(mCommentDecl, lt, ++gt);
1461     aScanner.SetPosition(gt);
1462     return NS_OK;
1463   }
1464
1465   if (!aScanner.IsIncremental()) {
1466     // This isn't a comment at all, go back to the < and consume as text.
1467     aScanner.SetPosition(lt, PR_FALSE, PR_TRUE);
1468     return kNotAComment;
1469   }
1470
1471   // Wait for more data...
1472   return kEOF;
1473 }
1474
1475 /*
1476  *  Consume the identifier portion of the comment.
1477  *  Note that we've already eaten the "<!" portion.
1478  *
1479  *  @param   aChar -- last char consumed from stream
1480  *  @param   aScanner -- controller of underlying input source
1481  *  @return  error result
1482  */
1483 nsresult
1484 CCommentToken::Consume(PRUnichar aChar, nsScanner& aScanner, PRInt32 aFlag)
1485 {
1486   nsresult result = PR_TRUE;
1487
1488   if (aFlag & NS_IPARSER_FLAG_STRICT_MODE) {
1489     // Enabling strict comment parsing for Bug 53011 and 2749 contradicts!
1490     result = ConsumeStrictComment(aScanner);
1491   } else {
1492     result = ConsumeQuirksComment(aScanner);
1493   }
1494
1495   if (NS_SUCCEEDED(result)) {
1496     mNewlineCount = mCommentDecl.CountChar(kNewLine);
1497   }
1498
1499   return result;
1500 }
1501
1502 const nsSubstring&
1503 CCommentToken::GetStringValue()
1504 {
1505   return mComment.AsString();
1506 }
1507
1508 PRInt32
1509 CCommentToken::GetTokenType()
1510 {
1511   return eToken_comment;
1512 }
1513
1514 CNewlineToken::CNewlineToken()
1515   : CHTMLToken(eHTMLTag_newline)
1516 {
1517 }
1518
1519 PRInt32
1520 CNewlineToken::GetTokenType()
1521 {
1522   return eToken_newline;
1523 }
1524
1525 static nsScannerSubstring* gNewlineStr;
1526 void
1527 CNewlineToken::AllocNewline()
1528 {
1529   gNewlineStr = new nsScannerSubstring(NS_LITERAL_STRING("\n"));
1530 }
1531
1532 void
1533 CNewlineToken::FreeNewline()
1534 {
1535   if (gNewlineStr) {
1536     delete gNewlineStr;
1537     gNewlineStr = nsnull;
1538   }
1539 }
1540
1541 /**
1542  *  This method retrieves the value of this internal string.
1543  *
1544  *  @return nsString reference to internal string value
1545  */
1546 const nsSubstring&
1547 CNewlineToken::GetStringValue()
1548 {
1549   return gNewlineStr->AsString();
1550 }
1551
1552 /*
1553  * Consume one newline (cr/lf pair).
1554  *
1555  *  @param   aChar -- last char consumed from stream
1556  *  @param   aScanner -- controller of underlying input source
1557  *  @return  error result
1558  */
1559 nsresult
1560 CNewlineToken::Consume(PRUnichar aChar, nsScanner& aScanner, PRInt32 aFlag)
1561 {
1562   /*
1563    * Here's what the HTML spec says about newlines:
1564    *
1565    * "A line break is defined to be a carriage return (&#x000D;),
1566    * a line feed (&#x000A;), or a carriage return/line feed pair.
1567    * All line breaks constitute white space."
1568    */
1569
1570   nsresult rv = NS_OK;
1571   if (aChar == kCR) {
1572     PRUnichar theChar;
1573     rv = aScanner.Peek(theChar);
1574     if (theChar == kNewLine) {
1575       rv = aScanner.GetChar(theChar);
1576     } else if (rv == kEOF && !aScanner.IsIncremental()) {
1577       // Make sure we don't lose information about this trailing newline.
1578       rv = NS_OK;
1579     }
1580   }
1581
1582   mNewlineCount = 1;
1583   return rv;
1584 }
1585
1586 CAttributeToken::CAttributeToken()
1587   : CHTMLToken(eHTMLTag_unknown)
1588 {
1589   mHasEqualWithoutValue = PR_FALSE;
1590 }
1591
1592 /*
1593  * String based constructor
1594  */
1595 CAttributeToken::CAttributeToken(const nsAString& aName)
1596   : CHTMLToken(eHTMLTag_unknown)
1597 {
1598   mTextValue.writable().Assign(aName);
1599   mHasEqualWithoutValue = PR_FALSE;
1600 }
1601
1602 /*
1603  *  construct initializing data to key value pair
1604  */
1605 CAttributeToken::CAttributeToken(const nsAString& aKey, const nsAString& aName)
1606   : CHTMLToken(eHTMLTag_unknown)
1607 {
1608   mTextValue.writable().Assign(aName);
1609   mTextKey.Rebind(aKey);
1610   mHasEqualWithoutValue = PR_FALSE;
1611 }
1612
1613 PRInt32
1614 CAttributeToken::GetTokenType()
1615 {
1616   return eToken_attribute;
1617 }
1618
1619 const nsSubstring&
1620 CAttributeToken::GetStringValue()
1621 {
1622   return mTextValue.str();
1623 }
1624
1625 void
1626 CAttributeToken::GetSource(nsString& anOutputString)
1627 {
1628   anOutputString.Truncate();
1629   AppendSourceTo(anOutputString);
1630 }
1631
1632 void
1633 CAttributeToken::AppendSourceTo(nsAString& anOutputString)
1634 {
1635   AppendUnicodeTo(mTextKey, anOutputString);
1636   if (mTextValue.str().Length() || mHasEqualWithoutValue) {
1637     anOutputString.AppendLiteral("=");
1638   }
1639   anOutputString.Append(mTextValue.str());
1640   // anOutputString.AppendLiteral(";");
1641 }
1642
1643 /*
1644  * This general purpose method is used when you want to
1645  * consume a known quoted string.
1646  */
1647 static nsresult
1648 ConsumeQuotedString(PRUnichar aChar,
1649                     nsScannerSharedSubstring& aString,
1650                     PRInt32& aNewlineCount,
1651                     nsScanner& aScanner,
1652                     PRInt32 aFlag)
1653 {
1654   NS_ASSERTION(aChar == kQuote || aChar == kApostrophe,
1655                "char is neither quote nor apostrophe");
1656   // Hold onto this in case this is an unterminated string literal
1657   PRUint32 origLen = aString.str().Length();
1658
1659   static const PRUnichar theTerminalCharsQuote[] = {
1660     PRUnichar(kQuote), PRUnichar('&'), PRUnichar(kCR),
1661     PRUnichar(kNewLine), PRUnichar(0) };
1662   static const PRUnichar theTerminalCharsApostrophe[] = {
1663     PRUnichar(kApostrophe), PRUnichar('&'), PRUnichar(kCR),
1664     PRUnichar(kNewLine), PRUnichar(0) };
1665   static const nsReadEndCondition
1666     theTerminateConditionQuote(theTerminalCharsQuote);
1667   static const nsReadEndCondition
1668     theTerminateConditionApostrophe(theTerminalCharsApostrophe);
1669
1670   // Assume Quote to init to something
1671   const nsReadEndCondition *terminateCondition = &theTerminateConditionQuote;
1672   if (aChar == kApostrophe) {
1673     terminateCondition = &theTerminateConditionApostrophe;
1674   }
1675
1676   nsresult result = NS_OK;
1677   nsScannerIterator theOffset;
1678   aScanner.CurrentPosition(theOffset);
1679
1680   result = ConsumeUntil(aString, aNewlineCount, aScanner,
1681                       *terminateCondition, PR_TRUE, PR_TRUE, aFlag);
1682
1683   if (NS_SUCCEEDED(result)) {
1684     result = aScanner.GetChar(aChar); // aChar should be " or '
1685   }
1686
1687   // Ref: Bug 35806
1688   // A back up measure when disaster strikes...
1689   // Ex <table> <tr d="><td>hello</td></tr></table>
1690   if (!aString.str().IsEmpty() && aString.str().Last() != aChar &&
1691       !aScanner.IsIncremental() && result == kEOF) {
1692     static const nsReadEndCondition
1693       theAttributeTerminator(kAttributeTerminalChars);
1694     aString.writable().Truncate(origLen);
1695     aScanner.SetPosition(theOffset, PR_FALSE, PR_TRUE);
1696     result = ConsumeUntil(aString, aNewlineCount, aScanner,
1697                           theAttributeTerminator, PR_FALSE, PR_TRUE, aFlag);
1698     if (NS_SUCCEEDED(result) && (aFlag & NS_IPARSER_FLAG_VIEW_SOURCE)) {
1699       // Remember that this string literal was unterminated.
1700       result = NS_ERROR_HTMLPARSER_UNTERMINATEDSTRINGLITERAL;
1701     }
1702   }
1703   return result;
1704 }
1705
1706 /*
1707  * This method is meant to be used by view-source to consume invalid attributes.
1708  * For the purposes of this method, an invalid attribute is an attribute that
1709  * starts with either ', ", or /. We consume all ', ", or / and the following
1710  * whitespace.
1711  *
1712  * @param aScanner -- the scanner we're reading our data from.
1713  * @param aChar -- the character we're skipping
1714  * @param aCurrent -- the current position that we're looking at.
1715  * @param aNewlineCount -- a count of the newlines we've consumed.
1716  * @return error result.
1717  */
1718 static nsresult
1719 ConsumeInvalidAttribute(nsScanner& aScanner,
1720                         PRUnichar aChar,
1721                         nsScannerIterator& aCurrent,
1722                         PRInt32& aNewlineCount)
1723 {
1724   NS_ASSERTION(aChar == kApostrophe || aChar == kQuote || aChar == kForwardSlash,
1725                "aChar must be a quote or apostrophe");
1726   nsScannerIterator end, wsbeg;
1727   aScanner.EndReading(end);
1728
1729   while (aCurrent != end && *aCurrent == aChar) {
1730     ++aCurrent;
1731   }
1732
1733   aScanner.SetPosition(aCurrent);
1734   return aScanner.ReadWhitespace(wsbeg, aCurrent, aNewlineCount);
1735 }
1736
1737 /*
1738  * Consume the key and value portions of the attribute.
1739  */
1740 nsresult
1741 CAttributeToken::Consume(PRUnichar aChar, nsScanner& aScanner, PRInt32 aFlag)
1742 {
1743   nsresult result;
1744   nsScannerIterator wsstart, wsend;
1745
1746   if (aFlag & NS_IPARSER_FLAG_VIEW_SOURCE) {
1747     result = aScanner.ReadWhitespace(wsstart, wsend, mNewlineCount);
1748     if (kEOF == result && wsstart != wsend) {
1749       // Do this here so if this is the final token in the document, we don't
1750       // lose the whitespace.
1751       aScanner.BindSubstring(mTextKey, wsstart, wsend);
1752     }
1753   } else {
1754     result = aScanner.SkipWhitespace(mNewlineCount);
1755   }
1756
1757   if (NS_OK == result) {
1758     static const PRUnichar theTerminalsChars[] =
1759     { PRUnichar(' '), PRUnichar('"'),
1760       PRUnichar('='), PRUnichar('\n'),
1761       PRUnichar('\r'), PRUnichar('\t'),
1762       PRUnichar('>'), PRUnichar('<'),
1763       PRUnichar('\''), PRUnichar('/'),
1764       PRUnichar(0) };
1765     static const nsReadEndCondition theEndCondition(theTerminalsChars);
1766
1767     nsScannerIterator start, end;
1768     result = aScanner.ReadUntil(start, end, theEndCondition, PR_FALSE);
1769
1770     if (!(aFlag & NS_IPARSER_FLAG_VIEW_SOURCE)) {
1771       aScanner.BindSubstring(mTextKey, start, end);
1772     } else if (kEOF == result && wsstart != end) {
1773       // Capture all of the text (from the beginning of the whitespace to the
1774       // end of the document).
1775       aScanner.BindSubstring(mTextKey, wsstart, end);
1776     }
1777
1778     // Now it's time to Consume the (optional) value...
1779     if (NS_OK == result) {
1780       if (aFlag & NS_IPARSER_FLAG_VIEW_SOURCE) {
1781         result = aScanner.ReadWhitespace(start, wsend, mNewlineCount);
1782         aScanner.BindSubstring(mTextKey, wsstart, wsend);
1783       } else {
1784         result = aScanner.SkipWhitespace(mNewlineCount);
1785       }
1786
1787       if (NS_OK == result) {
1788         // Skip ahead until you find an equal sign or a '>'...
1789         result = aScanner.Peek(aChar);
1790         if (NS_OK == result) {
1791           if (kEqual == aChar) {
1792             result = aScanner.GetChar(aChar);  // Skip the equal sign...
1793             if (NS_OK == result) {
1794               if (aFlag & NS_IPARSER_FLAG_VIEW_SOURCE) {
1795                 PRBool haveCR;
1796                 result = aScanner.ReadWhitespace(mTextValue, mNewlineCount,
1797                                                  haveCR);
1798               } else {
1799                 result = aScanner.SkipWhitespace(mNewlineCount);
1800               }
1801
1802               if (NS_OK == result) {
1803                 result = aScanner.Peek(aChar);  // And grab the next char.
1804                 if (NS_OK == result) {
1805                   if (kQuote == aChar || kApostrophe == aChar) {
1806                     aScanner.GetChar(aChar);
1807                     if (aFlag & NS_IPARSER_FLAG_VIEW_SOURCE) {
1808                       mTextValue.writable().Append(aChar);
1809                     }
1810
1811                     result = ConsumeQuotedString(aChar, mTextValue,
1812                                                  mNewlineCount, aScanner,
1813                                                  aFlag);
1814                     if (NS_SUCCEEDED(result) &&
1815                         (aFlag & NS_IPARSER_FLAG_VIEW_SOURCE)) {
1816                       mTextValue.writable().Append(aChar);
1817                     } else if (result ==
1818                                 NS_ERROR_HTMLPARSER_UNTERMINATEDSTRINGLITERAL) {
1819                       result = NS_OK;
1820                       mInError = PR_TRUE;
1821                     }
1822                     // According to spec. we ( who? ) should ignore linefeeds.
1823                     // But look, even the carriage return was getting stripped
1824                     // ( wonder why! ) - Ref. to bug 15204.  Okay, so the
1825                     // spec. told us to ignore linefeeds, bug then what about
1826                     // bug 47535 ? Should we preserve everything then?  Well,
1827                     // let's make it so!
1828                   } else if (kGreaterThan == aChar) {
1829                     mHasEqualWithoutValue = PR_TRUE;
1830                     mInError = PR_TRUE;
1831                   } else {
1832                     static const nsReadEndCondition
1833                       theAttributeTerminator(kAttributeTerminalChars);
1834                     result =
1835                       ConsumeUntil(mTextValue,
1836                                    mNewlineCount,
1837                                    aScanner,
1838                                    theAttributeTerminator,
1839                                    PR_FALSE,
1840                                    PR_TRUE,
1841                                    aFlag);
1842                   }
1843                 }
1844                 if (NS_OK == result) {
1845                   if (aFlag & NS_IPARSER_FLAG_VIEW_SOURCE) {
1846                     PRBool haveCR;
1847                     result = aScanner.ReadWhitespace(mTextValue, mNewlineCount,
1848                                                      haveCR);
1849                   } else {
1850                     result = aScanner.SkipWhitespace(mNewlineCount);
1851                   }
1852                 }
1853               } else {
1854                 // We saw an equal sign but ran out of room looking for a value.
1855                 mHasEqualWithoutValue = PR_TRUE;
1856                 mInError = PR_TRUE;
1857               }
1858             }
1859           } else {
1860             // This is where we have to handle fairly busted content.
1861             // If you're here, it means we saw an attribute name, but couldn't
1862             // find the following equal sign.  <tag NAME....
1863
1864             // Doing this right in all cases is <i>REALLY</i> ugly.
1865             // My best guess is to grab the next non-ws char. We know it's not
1866             // '=', so let's see what it is. If it's a '"', then assume we're
1867             // reading from the middle of the value. Try stripping the quote
1868             // and continuing...  Note that this code also strips forward
1869             // slashes to handle cases like <tag NAME/>
1870             if (kQuote == aChar || kApostrophe == aChar ||
1871                 kForwardSlash == aChar) {
1872               // In XML, a trailing slash isn't an error.
1873               if (kForwardSlash != aChar || !(aFlag & NS_IPARSER_FLAG_XML)) {
1874                 mInError = PR_TRUE;
1875               }
1876
1877               if (!(aFlag & NS_IPARSER_FLAG_VIEW_SOURCE)) {
1878                 result = aScanner.SkipOver(aChar); // Strip quote or slash.
1879                 if (NS_SUCCEEDED(result)) {
1880                   result = aScanner.SkipWhitespace(mNewlineCount);
1881                 }
1882               } else {
1883                 // We want to collect whitespace here so that following
1884                 // attributes can have the right line number (and for
1885                 // parity with the non-view-source code above).
1886                 result = ConsumeInvalidAttribute(aScanner, aChar,
1887                                                  wsend, mNewlineCount);
1888
1889                 aScanner.BindSubstring(mTextKey, wsstart, wsend);
1890                 aScanner.SetPosition(wsend);
1891               }
1892             }
1893           }
1894         }
1895       }
1896     }
1897
1898     if (NS_OK == result) {
1899       if (mTextValue.str().Length() == 0 && mTextKey.Length() == 0 &&
1900           mNewlineCount == 0 && !mHasEqualWithoutValue) {
1901         // This attribute contains no useful information for us, so there is no
1902         // use in keeping it around. Attributes that are otherwise empty, but
1903         // have newlines in them are passed on the the DTD so it can get line
1904         // numbering right.
1905         return NS_ERROR_HTMLPARSER_BADATTRIBUTE;
1906       }
1907     }
1908   }
1909
1910   if (kEOF == result && !aScanner.IsIncremental()) {
1911     // This is our run-of-the mill "don't lose content at the end of a
1912     // document" with a slight twist: we don't want to bother returning an
1913     // empty attribute key, even if this is the end of the document.
1914     if (mTextKey.Length() == 0) {
1915       result = NS_ERROR_HTMLPARSER_BADATTRIBUTE;
1916     } else {
1917       result = NS_OK;
1918     }
1919   }
1920
1921   return result;
1922 }
1923
1924 void
1925 CAttributeToken::SetKey(const nsAString& aKey)
1926 {
1927   mTextKey.Rebind(aKey);
1928 }
1929
1930 void
1931 CAttributeToken::BindKey(nsScanner* aScanner,
1932                          nsScannerIterator& aStart,
1933                          nsScannerIterator& aEnd)
1934 {
1935   aScanner->BindSubstring(mTextKey, aStart, aEnd);
1936 }
1937
1938 CWhitespaceToken::CWhitespaceToken()
1939   : CHTMLToken(eHTMLTag_whitespace)
1940 {
1941 }
1942
1943 CWhitespaceToken::CWhitespaceToken(const nsAString& aName)
1944   : CHTMLToken(eHTMLTag_whitespace)
1945 {
1946   mTextValue.writable().Assign(aName);
1947 }
1948
1949 PRInt32 CWhitespaceToken::GetTokenType()
1950 {
1951   return eToken_whitespace;
1952 }
1953
1954 /*
1955  * This general purpose method is used when you want to
1956  * consume an aribrary sequence of whitespace.
1957  *
1958  *  @param   aChar -- last char consumed from stream
1959  *  @param   aScanner -- controller of underlying input source
1960  *  @return  error result
1961  */
1962 nsresult
1963 CWhitespaceToken::Consume(PRUnichar aChar, nsScanner& aScanner, PRInt32 aFlag)
1964 {
1965   // If possible, we'd like to just be a dependent substring starting at
1966   // |aChar|.  The scanner has already been advanced, so we need to
1967   // back it up to facilitate this.
1968
1969   nsScannerIterator start;
1970   aScanner.CurrentPosition(start);
1971   aScanner.SetPosition(--start, PR_FALSE, PR_TRUE);
1972
1973   PRBool haveCR;
1974
1975   nsresult result = aScanner.ReadWhitespace(mTextValue, mNewlineCount, haveCR);
1976
1977   if (result == kEOF && !aScanner.IsIncremental()) {
1978     // Oops, we ran off the end, make sure we don't lose the trailing
1979     // whitespace!
1980     result = NS_OK;
1981   }
1982
1983   if (NS_OK == result && haveCR) {
1984     mTextValue.writable().StripChar(kCR);
1985   }
1986   return result;
1987 }
1988
1989 const nsSubstring&
1990 CWhitespaceToken::GetStringValue()
1991 {
1992   return mTextValue.str();
1993 }
1994
1995 CEntityToken::CEntityToken()
1996   : CHTMLToken(eHTMLTag_entity)
1997 {
1998 }
1999
2000 CEntityToken::CEntityToken(const nsAString& aName)
2001   : CHTMLToken(eHTMLTag_entity)
2002 {
2003   mTextValue.Assign(aName);
2004 }
2005
2006
2007 /*
2008  *  Consume the rest of the entity. We've already eaten the "&".
2009  *
2010  *  @param   aChar -- last char consumed from stream
2011  *  @param   aScanner -- controller of underlying input source
2012  *  @return  error result
2013  */
2014 nsresult
2015 CEntityToken::Consume(PRUnichar aChar, nsScanner& aScanner, PRInt32 aFlag)
2016 {
2017   nsresult result = ConsumeEntity(aChar, mTextValue, aScanner);
2018   return result;
2019 }
2020
2021 PRInt32
2022 CEntityToken::GetTokenType()
2023 {
2024   return eToken_entity;
2025 }
2026
2027 /*
2028  * This general purpose method is used when you want to
2029  * consume an entity &xxxx;. Keep in mind that entities
2030  * are <i>not</i> reduced inline.
2031  *
2032  * @param   aChar -- last char consumed from stream
2033  * @param   aScanner -- controller of underlying input source
2034  * @return  error result
2035  */
2036 nsresult
2037 CEntityToken::ConsumeEntity(PRUnichar aChar,
2038                             nsString& aString,
2039                             nsScanner& aScanner)
2040 {
2041   nsresult result = NS_OK;
2042   if (kLeftBrace == aChar) {
2043     // You're consuming a script entity...
2044     aScanner.GetChar(aChar); // Consume &
2045
2046     PRInt32 rightBraceCount = 0;
2047     PRInt32 leftBraceCount  = 0;
2048
2049     do {
2050       result = aScanner.GetChar(aChar);
2051
2052       if (NS_FAILED(result)) {
2053         return result;
2054       }
2055
2056       aString.Append(aChar);
2057       if (aChar == kRightBrace) {
2058         ++rightBraceCount;
2059       } else if (aChar == kLeftBrace) {
2060         ++leftBraceCount;
2061       }
2062     } while (leftBraceCount != rightBraceCount);
2063   } else {
2064     PRUnichar theChar = 0;
2065     if (kHashsign == aChar) {
2066       result = aScanner.Peek(theChar, 2);
2067
2068       if (NS_FAILED(result)) {
2069         if (kEOF == result && !aScanner.IsIncremental()) {
2070           // If this is the last buffer then we are certainly
2071           // not dealing with an entity. That's, there are
2072           // no more characters after &#. Bug 188278.
2073           return NS_HTMLTOKENS_NOT_AN_ENTITY;
2074         }
2075         return result;
2076       }
2077
2078       if (nsCRT::IsAsciiDigit(theChar)) {
2079         aScanner.GetChar(aChar); // Consume &
2080         aScanner.GetChar(aChar); // Consume #
2081         aString.Assign(aChar);
2082         result = aScanner.ReadNumber(aString, 10);
2083       } else if (theChar == 'x' || theChar == 'X') {
2084         aScanner.GetChar(aChar);   // Consume &
2085         aScanner.GetChar(aChar);   // Consume #
2086         aScanner.GetChar(theChar); // Consume x
2087         aString.Assign(aChar);
2088         aString.Append(theChar);
2089         result = aScanner.ReadNumber(aString, 16);
2090       } else {
2091         return NS_HTMLTOKENS_NOT_AN_ENTITY;
2092       }
2093     } else {
2094       result = aScanner.Peek(theChar, 1);
2095
2096       if (NS_FAILED(result)) {
2097         return result;
2098       }
2099
2100       if (nsCRT::IsAsciiAlpha(theChar) ||
2101         theChar == '_' ||
2102         theChar == ':') {
2103         aScanner.GetChar(aChar); // Consume &
2104         result = aScanner.ReadEntityIdentifier(aString);
2105       } else {
2106         return NS_HTMLTOKENS_NOT_AN_ENTITY;
2107       }
2108     }
2109   }
2110
2111   if (NS_FAILED(result)) {
2112     return result;
2113   }
2114
2115   result = aScanner.Peek(aChar);
2116
2117   if (NS_FAILED(result)) {
2118     return result;
2119   }
2120
2121   if (aChar == kSemicolon) {
2122     // Consume semicolon that stopped the scan
2123     aString.Append(aChar);
2124     result = aScanner.GetChar(aChar);
2125   }
2126
2127   return result;
2128 }
2129
2130 /**
2131  * Map some illegal but commonly used numeric entities into their
2132  * appropriate unicode value.
2133  */
2134 #define NOT_USED 0xfffd
2135
2136 static const PRUint16 PA_HackTable[] = {
2137         0x20ac,  /* EURO SIGN */
2138         NOT_USED,
2139         0x201a,  /* SINGLE LOW-9 QUOTATION MARK */
2140         0x0192,  /* LATIN SMALL LETTER F WITH HOOK */
2141         0x201e,  /* DOUBLE LOW-9 QUOTATION MARK */
2142         0x2026,  /* HORIZONTAL ELLIPSIS */
2143         0x2020,  /* DAGGER */
2144         0x2021,  /* DOUBLE DAGGER */
2145         0x02c6,  /* MODIFIER LETTER CIRCUMFLEX ACCENT */
2146         0x2030,  /* PER MILLE SIGN */
2147         0x0160,  /* LATIN CAPITAL LETTER S WITH CARON */
2148         0x2039,  /* SINGLE LEFT-POINTING ANGLE QUOTATION MARK */
2149         0x0152,  /* LATIN CAPITAL LIGATURE OE */
2150         NOT_USED,
2151         0x017D,  /* LATIN CAPITAL LETTER Z WITH CARON */
2152         NOT_USED,
2153         NOT_USED,
2154         0x2018,  /* LEFT SINGLE QUOTATION MARK */
2155         0x2019,  /* RIGHT SINGLE QUOTATION MARK */
2156         0x201c,  /* LEFT DOUBLE QUOTATION MARK */
2157         0x201d,  /* RIGHT DOUBLE QUOTATION MARK */
2158         0x2022,  /* BULLET */
2159         0x2013,  /* EN DASH */
2160         0x2014,  /* EM DASH */
2161         0x02dc,  /* SMALL TILDE */
2162         0x2122,  /* TRADE MARK SIGN */
2163         0x0161,  /* LATIN SMALL LETTER S WITH CARON */
2164         0x203a,  /* SINGLE RIGHT-POINTING ANGLE QUOTATION MARK */
2165         0x0153,  /* LATIN SMALL LIGATURE OE */
2166         NOT_USED,
2167         0x017E,  /* LATIN SMALL LETTER Z WITH CARON */
2168         0x0178   /* LATIN CAPITAL LETTER Y WITH DIAERESIS */
2169 };
2170
2171 static void
2172 AppendNCR(nsSubstring& aString, PRInt32 aNCRValue)
2173 {
2174   /* For some illegal, but popular usage */
2175   if (aNCRValue >= 0x0080 && aNCRValue <= 0x009f) {
2176     aNCRValue = PA_HackTable[aNCRValue - 0x0080];
2177   }
2178
2179   AppendUCS4ToUTF16(ENSURE_VALID_CHAR(aNCRValue), aString);
2180 }
2181
2182 /*
2183  * This method converts this entity into its underlying
2184  * unicode equivalent.
2185  *
2186  *  @param   aString will hold the resulting string value
2187  *  @return  numeric (unichar) value
2188  */
2189 PRInt32
2190 CEntityToken::TranslateToUnicodeStr(nsString& aString)
2191 {
2192   PRInt32 value = 0;
2193
2194   if (mTextValue.Length() > 1) {
2195     PRUnichar theChar0 = mTextValue.CharAt(0);
2196
2197     if (kHashsign == theChar0) {
2198       PRInt32 err = 0;
2199
2200       value = mTextValue.ToInteger(&err, kAutoDetect);
2201
2202       if (0 == err) {
2203         AppendNCR(aString, value);
2204       }
2205     } else {
2206       value = nsHTMLEntities::EntityToUnicode(mTextValue);
2207       if (-1 < value) {
2208         // We found a named entity...
2209         aString.Assign(PRUnichar(value));
2210       }
2211     }
2212   }
2213
2214   return value;
2215 }
2216
2217
2218 const
2219 nsSubstring& CEntityToken::GetStringValue()
2220 {
2221   return mTextValue;
2222 }
2223
2224 void
2225 CEntityToken::GetSource(nsString& anOutputString)
2226 {
2227   anOutputString.AppendLiteral("&");
2228   anOutputString += mTextValue;
2229   // Any possible ; is part of our text value.
2230 }
2231
2232 void
2233 CEntityToken::AppendSourceTo(nsAString& anOutputString)
2234 {
2235   anOutputString.AppendLiteral("&");
2236   anOutputString += mTextValue;
2237   // Any possible ; is part of our text value.
2238 }
2239
2240 const PRUnichar*
2241 GetTagName(PRInt32 aTag)
2242 {
2243   const PRUnichar *result = nsHTMLTags::GetStringValue((nsHTMLTag) aTag);
2244
2245   if (result) {
2246     return result;
2247   }
2248
2249   if (aTag >= eHTMLTag_userdefined) {
2250     return sUserdefined;
2251   }
2252
2253   return 0;
2254 }
2255
2256
2257 CInstructionToken::CInstructionToken()
2258   : CHTMLToken(eHTMLTag_instruction)
2259 {
2260 }
2261
2262 CInstructionToken::CInstructionToken(const nsAString& aString)
2263   : CHTMLToken(eHTMLTag_unknown)
2264 {
2265   mTextValue.Assign(aString);
2266 }
2267
2268 nsresult
2269 CInstructionToken::Consume(PRUnichar aChar, nsScanner& aScanner, PRInt32 aFlag)
2270 {
2271   mTextValue.AssignLiteral("<?");
2272   nsresult result = NS_OK;
2273   PRBool done = PR_FALSE;
2274
2275   while (NS_OK == result && !done) {
2276     // Note, this call does *not* consume the >.
2277     result = aScanner.ReadUntil(mTextValue, kGreaterThan, PR_FALSE);
2278     if (NS_SUCCEEDED(result)) {
2279       // In HTML, PIs end with a '>', in XML, they end with a '?>'. Cover both
2280       // cases here.
2281       if (!(aFlag & NS_IPARSER_FLAG_XML) ||
2282           kQuestionMark == mTextValue.Last()) {
2283         // This really is the end of the PI.
2284         done = PR_TRUE;
2285       }
2286       // Need to append this character no matter what.
2287       aScanner.GetChar(aChar);
2288       mTextValue.Append(aChar);
2289     }
2290   }
2291
2292   if (kEOF == result && !aScanner.IsIncremental()) {
2293     // Hide the EOF result because there is no more text coming.
2294     mInError = PR_TRUE;
2295     result = NS_OK;
2296   }
2297
2298   return result;
2299 }
2300
2301 PRInt32
2302 CInstructionToken::GetTokenType()
2303 {
2304   return eToken_instruction;
2305 }
2306
2307 const nsSubstring&
2308 CInstructionToken::GetStringValue()
2309 {
2310   return mTextValue;
2311 }
2312
2313 // Doctype decl token
2314
2315 CDoctypeDeclToken::CDoctypeDeclToken(eHTMLTags aTag)
2316   : CHTMLToken(aTag)
2317 {
2318 }
2319
2320 CDoctypeDeclToken::CDoctypeDeclToken(const nsAString& aString, eHTMLTags aTag)
2321   : CHTMLToken(aTag), mTextValue(aString)
2322 {
2323 }
2324
2325 /**
2326  *  This method consumes a doctype element.
2327  *  Note: I'm rewriting this method to seek to the first <, since quotes can
2328  *  really screw us up.
2329  *  XXX Maybe this should do better in XML or strict mode?
2330  */
2331 nsresult
2332 CDoctypeDeclToken::Consume(PRUnichar aChar, nsScanner& aScanner, PRInt32 aFlag)
2333 {
2334   static const PRUnichar terminalChars[] =
2335   { PRUnichar('>'), PRUnichar('<'),
2336     PRUnichar(0)
2337   };
2338   static const nsReadEndCondition theEndCondition(terminalChars);
2339
2340   nsScannerIterator start, end;
2341
2342   aScanner.CurrentPosition(start);
2343   aScanner.EndReading(end);
2344
2345   nsresult result = aScanner.ReadUntil(start, end, theEndCondition, PR_FALSE);
2346
2347   if (NS_SUCCEEDED(result)) {
2348     PRUnichar ch;
2349     aScanner.Peek(ch);
2350     if (ch == kGreaterThan) {
2351       // Include '>' but not '<' since '<'
2352       // could belong to another tag.
2353       aScanner.GetChar(ch);
2354       end.advance(1);
2355     } else {
2356       NS_ASSERTION(kLessThan == ch,
2357                    "Make sure this doctype decl. is really in error.");
2358       mInError = PR_TRUE;
2359     }
2360   } else if (!aScanner.IsIncremental()) {
2361     // We have reached the document end but haven't
2362     // found either a '<' or a '>'. Therefore use
2363     // whatever we have.
2364     mInError = PR_TRUE;
2365     result = NS_OK;
2366   }
2367
2368   if (NS_SUCCEEDED(result)) {
2369     start.advance(-2); // Make sure to consume <!
2370     CopyUnicodeTo(start, end, mTextValue);
2371   }
2372
2373   return result;
2374 }
2375
2376 PRInt32
2377 CDoctypeDeclToken::GetTokenType()
2378 {
2379   return eToken_doctypeDecl;
2380 }
2381
2382 const nsSubstring&
2383 CDoctypeDeclToken::GetStringValue()
2384 {
2385   return mTextValue;
2386 }
2387
2388 void
2389 CDoctypeDeclToken::SetStringValue(const nsAString& aStr)
2390 {
2391   mTextValue.Assign(aStr);
2392 }