parser/htmlparser/src/nsScanner.cpp

   1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
   2 /* vim: set ts=2 sw=2 et tw=78: */
   3 /* ***** BEGIN LICENSE BLOCK *****
   4  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
   5  *
   6  * The contents of this file are subject to the Mozilla Public License Version
   7  * 1.1 (the "License"); you may not use this file except in compliance with
   8  * the License. You may obtain a copy of the License at
   9  * http://www.mozilla.org/MPL/
  10  *
  11  * Software distributed under the License is distributed on an "AS IS" basis,
  12  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
  13  * for the specific language governing rights and limitations under the
  14  * License.
  15  *
  16  * The Original Code is mozilla.org code.
  17  *
  18  * The Initial Developer of the Original Code is
  19  * Netscape Communications Corporation.
  20  * Portions created by the Initial Developer are Copyright (C) 1998
  21  * the Initial Developer. All Rights Reserved.
  22  *
  23  * Contributor(s):
  24  *
  25  * Alternatively, the contents of this file may be used under the terms of
  26  * either of the GNU General Public License Version 2 or later (the "GPL"),
  27  * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
  28  * in which case the provisions of the GPL or the LGPL are applicable instead
  29  * of those above. If you wish to allow use of your version of this file only
  30  * under the terms of either the GPL or the LGPL, and not to allow others to
  31  * use your version of this file under the terms of the MPL, indicate your
  32  * decision by deleting the provisions above and replace them with the notice
  33  * and other provisions required by the GPL or the LGPL. If you do not delete
  34  * the provisions above, a recipient may use your version of this file under
  35  * the terms of any one of the MPL, the GPL or the LGPL.
  36  *
  37  * ***** END LICENSE BLOCK ***** */
  38
  39 //#define __INCREMENTAL 1
  40
  41 #include "nsScanner.h"
  42 #include "nsDebug.h"
  43 #include "nsIServiceManager.h"
  44 #include "nsICharsetConverterManager.h"
  45 #include "nsICharsetAlias.h"
  46 #include "nsReadableUtils.h"
  47 #include "nsIInputStream.h"
  48 #include "nsILocalFile.h"
  49 #include "nsNetUtil.h"
  50 #include "nsUTF8Utils.h" // for LossyConvertEncoding
  51 #include "nsCRT.h"
  52 #include "nsParser.h"
  53
  54 // We replace NUL characters with this character.
     // The read routines in this file (e.g. ReadTagIdentifier and ReadUntil)
     // hand it to ReplaceCharacter() when they meet a '\0' in the buffer.
  55 static PRUnichar sInvalid = UCS2_REPLACEMENT_CHAR;
  56
  57 nsReadEndCondition::nsReadEndCondition(const PRUnichar* aTerminateChars) :
  58   mChars(aTerminateChars), mFilter(PRUnichar(~0)) // All bits set
  59 {
  60   // Build filter that will be used to filter out characters with
  61   // bits that none of the terminal chars have. This works very well
  62   // because terminal chars often have only the last 4-6 bits set and
  63   // normal ascii letters have bit 7 set. Other letters have even higher
  64   // bits set.
  65
  66   // Calculate filter
  67   const PRUnichar *current = aTerminateChars;
  68   PRUnichar terminalChar = *current;
  69   while (terminalChar) {
  70     mFilter &= ~terminalChar;
  71     ++current;
  72     terminalChar = *current;
  73   }
  74 }
  75
  76 #ifdef __INCREMENTAL
  77 const int   kBufsize=1;
  78 #else
  79 const int   kBufsize=64;
  80 #endif
  81
  82 /**
  83  *  Use this constructor if you want i/o to be based on
  84  *  a single string you hand in during construction.
  85  *  This short cut was added for Javascript.
  86  *
  87  *  @update  gess 5/12/98
  88  *  @param   aMode represents the parser mode (nav, other)
  89  *  @return
  90  */
  91 nsScanner::nsScanner(const nsAString& anHTMLString, const nsACString& aCharset,
  92                      PRInt32 aSource)
  93   : mParser(nsnull)
  94 {
  95   MOZ_COUNT_CTOR(nsScanner);
  96
  97   mSlidingBuffer = nsnull;
  98   mCountRemaining = 0;
  99   mFirstNonWhitespacePosition = -1;
 100   if (AppendToBuffer(anHTMLString)) {
 101     mSlidingBuffer->BeginReading(mCurrentPosition);
 102   } else {
 103     /* XXX see hack below, re: bug 182067 */
 104     memset(&mCurrentPosition, 0, sizeof(mCurrentPosition));
 105     mEndPosition = mCurrentPosition;
 106   }
 107   mMarkPosition = mCurrentPosition;
 108   mIncremental = PR_FALSE;
 109   mUnicodeDecoder = 0;
 110   mCharsetSource = kCharsetUninitialized;
 111 }
 112
 113 /**
 114  *  Use this constructor if you want i/o to be based on strings
 115  *  the scanner receives. If you pass a null filename, you
 116  *  can still provide data to the scanner via append.
 117  *
 118  *  @update  gess 5/12/98
 119  *  @param   aFilename --
 120  *  @return
 121  */
 122 nsScanner::nsScanner(nsString& aFilename,PRBool aCreateStream,
 123                      const nsACString& aCharset, PRInt32 aSource)
 124   : mFilename(aFilename), mParser(nsnull)
 125 {
 126   MOZ_COUNT_CTOR(nsScanner);
 127   NS_ASSERTION(!aCreateStream, "This is always true.");
 128
 129   mSlidingBuffer = nsnull;
 130
 131   // XXX This is a big hack.  We need to initialize the iterators to something.
 132   // What matters is that mCurrentPosition == mEndPosition, so that our methods
 133   // believe that we are at EOF (see bug 182067).  We null out mCurrentPosition
 134   // so that we have some hope of catching null pointer dereferences associated
 135   // with this hack. --darin
 136   memset(&mCurrentPosition, 0, sizeof(mCurrentPosition));
 137   mMarkPosition = mCurrentPosition;
 138   mEndPosition = mCurrentPosition;
 139
 140   mIncremental = PR_TRUE;
 141   mFirstNonWhitespacePosition = -1;
 142   mCountRemaining = 0;
 143
 144   mUnicodeDecoder = 0;
 145   mCharsetSource = kCharsetUninitialized;
 146   SetDocumentCharset(aCharset, aSource);
 147 }
 148
 149 nsresult nsScanner::SetDocumentCharset(const nsACString& aCharset , PRInt32 aSource)
 150 {
 151   if (aSource < mCharsetSource) // priority is lower the the current one , just
 152     return NS_OK;
 153
 154   nsICharsetAlias* calias = nsParser::GetCharsetAliasService();
 155   NS_ASSERTION(calias, "Must have the charset alias service!");
 156
 157   nsresult res = NS_OK;
 158   if (!mCharset.IsEmpty())
 159   {
 160     PRBool same;
 161     res = calias->Equals(aCharset, mCharset, &same);
 162     if(NS_SUCCEEDED(res) && same)
 163     {
 164       return NS_OK; // no difference, don't change it
 165     }
 166   }
 167
 168   // different, need to change it
 169   nsCString charsetName;
 170   res = calias->GetPreferred(aCharset, charsetName);
 171
 172   if(NS_FAILED(res) && (mCharsetSource == kCharsetUninitialized))
 173   {
 174      // failed - unknown alias , fallback to ISO-8859-1
 175     mCharset.AssignLiteral("ISO-8859-1");
 176   }
 177   else
 178   {
 179     mCharset.Assign(charsetName);
 180   }
 181
 182   mCharsetSource = aSource;
 183
 184   NS_ASSERTION(nsParser::GetCharsetConverterManager(),
 185                "Must have the charset converter manager!");
 186
 187   return nsParser::GetCharsetConverterManager()->
 188     GetUnicodeDecoderRaw(mCharset.get(), getter_AddRefs(mUnicodeDecoder));
 189 }
 190
 191
 192 /**
 193  *  default destructor
 194  *
 195  *  @update  gess 3/25/98
 196  *  @param
 197  *  @return
 198  */
 199 nsScanner::~nsScanner() {
 200
 201   if (mSlidingBuffer) {
 202     delete mSlidingBuffer;
 203   }
 204
 205   MOZ_COUNT_DTOR(nsScanner);
 206 }
 207
 208 /**
 209  *  Resets current offset position of input stream to marked position.
 210  *  This allows us to back up to this point if the need should arise,
 211  *  such as when tokenization gets interrupted.
 212  *  NOTE: IT IS REALLY BAD FORM TO CALL RELEASE WITHOUT CALLING MARK FIRST!
 213  *
 214  *  @update  gess 5/12/98
 215  *  @param
 216  *  @return
 217  */
 218 void nsScanner::RewindToMark(void){
 219   if (mSlidingBuffer) {
 220     mCountRemaining += (Distance(mMarkPosition, mCurrentPosition));
 221     mCurrentPosition = mMarkPosition;
 222   }
 223 }
 224
 225
 226 /**
 227  *  Records current offset position in input stream. This allows us
 228  *  to back up to this point if the need should arise, such as when
 229  *  tokenization gets interrupted.
 230  *
 231  *  @update  gess 7/29/98
 232  *  @param
 233  *  @return
 234  */
 235 PRInt32 nsScanner::Mark() {
 236   PRInt32 distance = 0;
 237   if (mSlidingBuffer) {
 238     nsScannerIterator oldStart;
 239     mSlidingBuffer->BeginReading(oldStart);
 240
 241     distance = Distance(oldStart, mCurrentPosition);
 242
 243     mSlidingBuffer->DiscardPrefix(mCurrentPosition);
 244     mSlidingBuffer->BeginReading(mCurrentPosition);
 245     mMarkPosition = mCurrentPosition;
 246   }
 247
 248   return distance;
 249 }
 250
 251 /**
 252  * Insert data to our underlying input buffer as
 253  * if it were read from an input stream.
 254  *
 255  * @update  harishd 01/12/99
 256  * @return  error code
 257  */
 258 PRBool nsScanner::UngetReadable(const nsAString& aBuffer) {
 259   if (!mSlidingBuffer) {
 260     return PR_FALSE;
 261   }
 262
 263   mSlidingBuffer->UngetReadable(aBuffer,mCurrentPosition);
 264   mSlidingBuffer->BeginReading(mCurrentPosition); // Insertion invalidated our iterators
 265   mSlidingBuffer->EndReading(mEndPosition);
 266
 267   PRUint32 length = aBuffer.Length();
 268   mCountRemaining += length; // Ref. bug 117441
 269   return PR_TRUE;
 270 }
 271
 272 /**
 273  * Append data to our underlying input buffer as
 274  * if it were read from an input stream.
 275  *
 276  * @update  gess4/3/98
 277  * @return  error code
 278  */
 279 nsresult nsScanner::Append(const nsAString& aBuffer) {
 280   if (!AppendToBuffer(aBuffer))
 281     return NS_ERROR_OUT_OF_MEMORY;
 282   return NS_OK;
 283 }
 284
 285 /**
 286  *
 287  *
 288  *  @update  gess 5/21/98
 289  *  @param
 290  *  @return
 291  */
 292 nsresult nsScanner::Append(const char* aBuffer, PRUint32 aLen,
 293                            nsIRequest *aRequest)
 294 {
 295   nsresult res=NS_OK;
 296   PRUnichar *unichars, *start;
 297   if (mUnicodeDecoder) {
 298     PRInt32 unicharBufLen = 0;
 299     mUnicodeDecoder->GetMaxLength(aBuffer, aLen, &unicharBufLen);
 300     nsScannerString::Buffer* buffer = nsScannerString::AllocBuffer(unicharBufLen + 1);
 301     NS_ENSURE_TRUE(buffer,NS_ERROR_OUT_OF_MEMORY);
 302     start = unichars = buffer->DataStart();
 303
 304     PRInt32 totalChars = 0;
 305     PRInt32 unicharLength = unicharBufLen;
 306     do {
 307       PRInt32 srcLength = aLen;
 308       res = mUnicodeDecoder->Convert(aBuffer, &srcLength, unichars, &unicharLength);
 309
 310       totalChars += unicharLength;
 311       // Continuation of failure case
 312       if(NS_FAILED(res)) {
 313         // if we failed, we consume one byte, replace it with U+FFFD
 314         // and try the conversion again.
 315
 316         // This is only needed because some decoders don't follow the
 317         // nsIUnicodeDecoder contract: they return a failure when *aDestLength
 318         // is 0 rather than the correct NS_OK_UDEC_MOREOUTPUT.  See bug 244177
 319         if ((unichars + unicharLength) >= buffer->DataEnd()) {
 320           NS_ERROR("Unexpected end of destination buffer");
 321           break;
 322         }
 323
 324         unichars[unicharLength++] = (PRUnichar)0xFFFD;
 325         unichars = unichars + unicharLength;
 326         unicharLength = unicharBufLen - (++totalChars);
 327
 328         mUnicodeDecoder->Reset();
 329
 330         if(((PRUint32) (srcLength + 1)) > aLen) {
 331           srcLength = aLen;
 332         }
 333         else {
 334           ++srcLength;
 335         }
 336
 337         aBuffer += srcLength;
 338         aLen -= srcLength;
 339       }
 340     } while (NS_FAILED(res) && (aLen > 0));
 341
 342     buffer->SetDataLength(totalChars);
 343     // Don't propagate return code of unicode decoder
 344     // since it doesn't reflect on our success or failure
 345     // - Ref. bug 87110
 346     res = NS_OK;
 347     if (!AppendToBuffer(buffer, aRequest))
 348       res = NS_ERROR_OUT_OF_MEMORY;
 349   }
 350   else {
 351     NS_WARNING("No decoder found.");
 352     res = NS_ERROR_FAILURE;
 353   }
 354
 355   return res;
 356 }
 357
 358 /**
 359  *  retrieve next char from scanners internal input stream
 360  *
 361  *  @update  gess 3/25/98
 362  *  @param
 363  *  @return  error code reflecting read status
 364  */
 365 nsresult nsScanner::GetChar(PRUnichar& aChar) {
 366   if (!mSlidingBuffer || mCurrentPosition == mEndPosition) {
 367     aChar = 0;
 368     return kEOF;
 369   }
 370
 371   aChar = *mCurrentPosition++;
 372   --mCountRemaining;
 373
 374   return NS_OK;
 375 }
 376
 377
 378 /**
 379  *  peek ahead to consume next char from scanner's internal
 380  *  input buffer
 381  *
 382  *  @update  gess 3/25/98
 383  *  @param
 384  *  @return
 385  */
 386 nsresult nsScanner::Peek(PRUnichar& aChar, PRUint32 aOffset) {
 387   aChar = 0;
 388
 389   if (!mSlidingBuffer || mCurrentPosition == mEndPosition) {
 390     return kEOF;
 391   }
 392
 393   if (aOffset > 0) {
 394     if (mCountRemaining <= aOffset)
 395       return kEOF;
 396
 397     nsScannerIterator pos = mCurrentPosition;
 398     pos.advance(aOffset);
 399     aChar=*pos;
 400   }
 401   else {
 402     aChar=*mCurrentPosition;
 403   }
 404
 405   return NS_OK;
 406 }
 407
 408 nsresult nsScanner::Peek(nsAString& aStr, PRInt32 aNumChars, PRInt32 aOffset)
 409 {
 410   if (!mSlidingBuffer || mCurrentPosition == mEndPosition) {
 411     return kEOF;
 412   }
 413
 414   nsScannerIterator start, end;
 415
 416   start = mCurrentPosition;
 417
 418   if ((PRInt32)mCountRemaining <= aOffset) {
 419     return kEOF;
 420   }
 421
 422   if (aOffset > 0) {
 423     start.advance(aOffset);
 424   }
 425
 426   if (mCountRemaining < PRUint32(aNumChars + aOffset)) {
 427     end = mEndPosition;
 428   }
 429   else {
 430     end = start;
 431     end.advance(aNumChars);
 432   }
 433
 434   CopyUnicodeTo(start, end, aStr);
 435
 436   return NS_OK;
 437 }
 438
 439
 440 /**
 441  *  Skip whitespace on scanner input stream
 442  *
 443  *  @update  gess 3/25/98
 444  *  @param
 445  *  @return  error status
 446  */
 447 nsresult nsScanner::SkipWhitespace(PRInt32& aNewlinesSkipped) {
 448
 449   if (!mSlidingBuffer) {
 450     return kEOF;
 451   }
 452
 453   PRUnichar theChar = 0;
 454   nsresult  result = Peek(theChar);
 455
 456   if (NS_FAILED(result)) {
 457     return result;
 458   }
 459
 460   nsScannerIterator current = mCurrentPosition;
 461   PRBool    done = PR_FALSE;
 462   PRBool    skipped = PR_FALSE;
 463
 464   while (!done && current != mEndPosition) {
 465     switch(theChar) {
 466       case '\n':
 467       case '\r': ++aNewlinesSkipped;
 468       case ' ' :
 469       case '\t':
 470         {
 471           skipped = PR_TRUE;
 472           PRUnichar thePrevChar = theChar;
 473           theChar = (++current != mEndPosition) ? *current : '\0';
 474           if ((thePrevChar == '\r' && theChar == '\n') ||
 475               (thePrevChar == '\n' && theChar == '\r')) {
 476             theChar = (++current != mEndPosition) ? *current : '\0'; // CRLF == LFCR => LF
 477           }
 478         }
 479         break;
 480       default:
 481         done = PR_TRUE;
 482         break;
 483     }
 484   }
 485
 486   if (skipped) {
 487     SetPosition(current);
 488     if (current == mEndPosition) {
 489       result = kEOF;
 490     }
 491   }
 492
 493   return result;
 494 }
 495
 496 /**
 497  *  Skip over chars as long as they equal given char
 498  *
 499  *  @update  gess 3/25/98
 500  *  @param
 501  *  @return  error code
 502  */
 503 nsresult nsScanner::SkipOver(PRUnichar aSkipChar){
 504
 505   if (!mSlidingBuffer) {
 506     return kEOF;
 507   }
 508
 509   PRUnichar ch=0;
 510   nsresult   result=NS_OK;
 511
 512   while(NS_OK==result) {
 513     result=Peek(ch);
 514     if(NS_OK == result) {
 515       if(ch!=aSkipChar) {
 516         break;
 517       }
 518       GetChar(ch);
 519     }
 520     else break;
 521   } //while
 522   return result;
 523
 524 }
 525
 526 #if 0
     // Compiled out by the "#if 0" above. Both overloads look for an embedded
     // NUL character in aString and test whether it is the last character,
     // but the innermost branch is empty, so they have no effect even if
     // re-enabled. The only references to them in the read routines are
     // commented-out "//DoErrTest(aString);" calls.
 527 void DoErrTest(nsString& aString) {
 528   PRInt32 pos=aString.FindChar(0);
 529   if(kNotFound<pos) {
 530     if(aString.Length()-1!=pos) {
 531     }
 532   }
 533 }
 534
 535 void DoErrTest(nsCString& aString) {
 536   PRInt32 pos=aString.FindChar(0);
 537   if(kNotFound<pos) {
 538     if(aString.Length()-1!=pos) {
 539     }
 540   }
 541 }
 542 #endif
 543
 544 /**
 545  *  Consume characters until you run into space, a '<', a '>', or a '/'.
 546  *
 547  *  @param   aString - receives new data from stream
 548  *  @return  error code
 549  */
 550 nsresult nsScanner::ReadTagIdentifier(nsScannerSharedSubstring& aString) {
 551
 552   if (!mSlidingBuffer) {
 553     return kEOF;
 554   }
 555
 556   PRUnichar         theChar=0;
 557   nsresult          result=Peek(theChar);
 558   nsScannerIterator current, end;
 559   PRBool            found=PR_FALSE;
 560
 561   current = mCurrentPosition;
 562   end = mEndPosition;
 563
 564   // Loop until we find an illegal character. Everything is then appended
 565   // later.
 566   while(current != end && !found) {
 567     theChar=*current;
 568
 569     switch(theChar) {
 570       case '\n':
 571       case '\r':
 572       case ' ' :
 573       case '\t':
 574       case '\v':
 575       case '\f':
 576       case '<':
 577       case '>':
 578       case '/':
 579         found = PR_TRUE;
 580         break;
 581
 582       case '\0':
 583         ReplaceCharacter(current, sInvalid);
 584         break;
 585
 586       default:
 587         break;
 588     }
 589
 590     if (!found) {
 591       ++current;
 592     }
 593   }
 594
 595   // Don't bother appending nothing.
 596   if (current != mCurrentPosition) {
 597     AppendUnicodeTo(mCurrentPosition, current, aString);
 598   }
 599
 600   SetPosition(current);
 601   if (current == end) {
 602     result = kEOF;
 603   }
 604
 605   //DoErrTest(aString);
 606
 607   return result;
 608 }
 609
 610 /**
 611  *  Consume characters until you run into a char that's not valid in an
 612  *  entity name
 613  *
 614  *  @param   aString - receives new data from stream
 615  *  @return  error code
 616  */
 617 nsresult nsScanner::ReadEntityIdentifier(nsString& aString) {
 618
 619   if (!mSlidingBuffer) {
 620     return kEOF;
 621   }
 622
 623   PRUnichar         theChar=0;
 624   nsresult          result=Peek(theChar);
 625   nsScannerIterator origin, current, end;
 626   PRBool            found=PR_FALSE;
 627
 628   origin = mCurrentPosition;
 629   current = mCurrentPosition;
 630   end = mEndPosition;
 631
 632   while(current != end) {
 633
 634     theChar=*current;
 635     if(theChar) {
 636       found=PR_FALSE;
 637       switch(theChar) {
 638         case '_':
 639         case '-':
 640         case '.':
 641           // Don't allow ':' in entity names.  See bug 23791
 642           found = PR_TRUE;
 643           break;
 644         default:
 645           found = ('a'<=theChar && theChar<='z') ||
 646                   ('A'<=theChar && theChar<='Z') ||
 647                   ('0'<=theChar && theChar<='9');
 648           break;
 649       }
 650
 651       if(!found) {
 652         AppendUnicodeTo(mCurrentPosition, current, aString);
 653         break;
 654       }
 655     }
 656     ++current;
 657   }
 658
 659   SetPosition(current);
 660   if (current == end) {
 661     AppendUnicodeTo(origin, current, aString);
 662     return kEOF;
 663   }
 664
 665   //DoErrTest(aString);
 666
 667   return result;
 668 }
 669
 670 /**
 671  *  Consume digits
 672  *
 673  *  @param   aString - should contain digits
 674  *  @return  error code
 675  */
 676 nsresult nsScanner::ReadNumber(nsString& aString,PRInt32 aBase) {
 677
 678   if (!mSlidingBuffer) {
 679     return kEOF;
 680   }
 681
 682   NS_ASSERTION(aBase == 10 || aBase == 16,"base value not supported");
 683
 684   PRUnichar         theChar=0;
 685   nsresult          result=Peek(theChar);
 686   nsScannerIterator origin, current, end;
 687
 688   origin = mCurrentPosition;
 689   current = origin;
 690   end = mEndPosition;
 691
 692   PRBool done = PR_FALSE;
 693   while(current != end) {
 694     theChar=*current;
 695     if(theChar) {
 696       done = (theChar < '0' || theChar > '9') &&
 697              ((aBase == 16)? (theChar < 'A' || theChar > 'F') &&
 698                              (theChar < 'a' || theChar > 'f')
 699                              :PR_TRUE);
 700       if(done) {
 701         AppendUnicodeTo(origin, current, aString);
 702         break;
 703       }
 704     }
 705     ++current;
 706   }
 707
 708   SetPosition(current);
 709   if (current == end) {
 710     AppendUnicodeTo(origin, current, aString);
 711     return kEOF;
 712   }
 713
 714   //DoErrTest(aString);
 715
 716   return result;
 717 }
 718
 719 /**
 720  *  Consume characters until you find the terminal char
 721  *
 722  *  @update  gess 3/25/98
 723  *  @param   aString receives new data from stream
 724  *  @param   addTerminal tells us whether to append terminal to aString
 725  *  @return  error code
 726  */
 727 nsresult nsScanner::ReadWhitespace(nsScannerSharedSubstring& aString,
 728                                    PRInt32& aNewlinesSkipped,
 729                                    PRBool& aHaveCR) {
 730
 731   aHaveCR = PR_FALSE;
 732
 733   if (!mSlidingBuffer) {
 734     return kEOF;
 735   }
 736
 737   PRUnichar theChar = 0;
 738   nsresult  result = Peek(theChar);
 739
 740   if (NS_FAILED(result)) {
 741     return result;
 742   }
 743
 744   nsScannerIterator origin, current, end;
 745   PRBool done = PR_FALSE;
 746
 747   origin = mCurrentPosition;
 748   current = origin;
 749   end = mEndPosition;
 750
 751   PRBool haveCR = PR_FALSE;
 752
 753   while(!done && current != end) {
 754     switch(theChar) {
 755       case '\n':
 756       case '\r':
 757         {
 758           ++aNewlinesSkipped;
 759           PRUnichar thePrevChar = theChar;
 760           theChar = (++current != end) ? *current : '\0';
 761           if ((thePrevChar == '\r' && theChar == '\n') ||
 762               (thePrevChar == '\n' && theChar == '\r')) {
 763             theChar = (++current != end) ? *current : '\0'; // CRLF == LFCR => LF
 764             haveCR = PR_TRUE;
 765           } else if (thePrevChar == '\r') {
 766             // Lone CR becomes CRLF; callers should know to remove extra CRs
 767             AppendUnicodeTo(origin, current, aString);
 768             aString.writable().Append(PRUnichar('\n'));
 769             origin = current;
 770             haveCR = PR_TRUE;
 771           }
 772         }
 773         break;
 774       case ' ' :
 775       case '\t':
 776         theChar = (++current != end) ? *current : '\0';
 777         break;
 778       default:
 779         done = PR_TRUE;
 780         AppendUnicodeTo(origin, current, aString);
 781         break;
 782     }
 783   }
 784
 785   SetPosition(current);
 786   if (current == end) {
 787     AppendUnicodeTo(origin, current, aString);
 788     result = kEOF;
 789   }
 790
 791   aHaveCR = haveCR;
 792   return result;
 793 }
 794
 795 //XXXbz callers of this have to manage their lone '\r' themselves if they want
 796 //it to work.  Good thing they're all in view-source and it deals.
 797 nsresult nsScanner::ReadWhitespace(nsScannerIterator& aStart,
 798                                    nsScannerIterator& aEnd,
 799                                    PRInt32& aNewlinesSkipped) {
 800
 801   if (!mSlidingBuffer) {
 802     return kEOF;
 803   }
 804
 805   PRUnichar theChar = 0;
 806   nsresult  result = Peek(theChar);
 807
 808   if (NS_FAILED(result)) {
 809     return result;
 810   }
 811
 812   nsScannerIterator origin, current, end;
 813   PRBool done = PR_FALSE;
 814
 815   origin = mCurrentPosition;
 816   current = origin;
 817   end = mEndPosition;
 818
 819   while(!done && current != end) {
 820     switch(theChar) {
 821       case '\n':
 822       case '\r': ++aNewlinesSkipped;
 823       case ' ' :
 824       case '\t':
 825         {
 826           PRUnichar thePrevChar = theChar;
 827           theChar = (++current != end) ? *current : '\0';
 828           if ((thePrevChar == '\r' && theChar == '\n') ||
 829               (thePrevChar == '\n' && theChar == '\r')) {
 830             theChar = (++current != end) ? *current : '\0'; // CRLF == LFCR => LF
 831           }
 832         }
 833         break;
 834       default:
 835         done = PR_TRUE;
 836         aStart = origin;
 837         aEnd = current;
 838         break;
 839     }
 840   }
 841
 842   SetPosition(current);
 843   if (current == end) {
 844     aStart = origin;
 845     aEnd = current;
 846     result = kEOF;
 847   }
 848
 849   return result;
 850 }
 851
 852 /**
 853  *  Consume characters until you encounter one contained in given
 854  *  input set.
 855  *
 856  *  @update  gess 3/25/98
 857  *  @param   aString will contain the result of this method
 858  *  @param   aTerminalSet is an ordered string that contains
 859  *           the set of INVALID characters
 860  *  @return  error code
 861  */
 862 nsresult nsScanner::ReadUntil(nsAString& aString,
 863                               const nsReadEndCondition& aEndCondition,
 864                               PRBool addTerminal)
 865 {
 866   if (!mSlidingBuffer) {
 867     return kEOF;
 868   }
 869
 870   nsScannerIterator origin, current;
 871   const PRUnichar* setstart = aEndCondition.mChars;
 872   const PRUnichar* setcurrent;
 873
 874   origin = mCurrentPosition;
 875   current = origin;
 876
 877   PRUnichar         theChar=0;
 878   nsresult          result=Peek(theChar);
 879
 880   if (NS_FAILED(result)) {
 881     return result;
 882   }
 883
 884   while (current != mEndPosition) {
 885     theChar = *current;
 886     if (theChar == '\0') {
 887       ReplaceCharacter(current, sInvalid);
 888       theChar = sInvalid;
 889     }
 890
 891     // Filter out completely wrong characters
 892     // Check if all bits are in the required area
 893     if(!(theChar & aEndCondition.mFilter)) {
 894       // They were. Do a thorough check.
 895
 896       setcurrent = setstart;
 897       while (*setcurrent) {
 898         if (*setcurrent == theChar) {
 899           if(addTerminal)
 900             ++current;
 901           AppendUnicodeTo(origin, current, aString);
 902           SetPosition(current);
 903
 904           //DoErrTest(aString);
 905
 906           return NS_OK;
 907         }
 908         ++setcurrent;
 909       }
 910     }
 911
 912     ++current;
 913   }
 914
 915   // If we are here, we didn't find any terminator in the string and
 916   // current = mEndPosition
 917   SetPosition(current);
 918   AppendUnicodeTo(origin, current, aString);
 919   return kEOF;
 920 }
 921
 922 nsresult nsScanner::ReadUntil(nsScannerSharedSubstring& aString,
 923                               const nsReadEndCondition& aEndCondition,
 924                               PRBool addTerminal)
 925 {
 926   if (!mSlidingBuffer) {
 927     return kEOF;
 928   }
 929
 930   nsScannerIterator origin, current;
 931   const PRUnichar* setstart = aEndCondition.mChars;
 932   const PRUnichar* setcurrent;
 933
 934   origin = mCurrentPosition;
 935   current = origin;
 936
 937   PRUnichar         theChar=0;
 938   nsresult          result=Peek(theChar);
 939
 940   if (NS_FAILED(result)) {
 941     return result;
 942   }
 943
 944   while (current != mEndPosition) {
 945     theChar = *current;
 946     if (theChar == '\0') {
 947       ReplaceCharacter(current, sInvalid);
 948       theChar = sInvalid;
 949     }
 950
 951     // Filter out completely wrong characters
 952     // Check if all bits are in the required area
 953     if(!(theChar & aEndCondition.mFilter)) {
 954       // They were. Do a thorough check.
 955
 956       setcurrent = setstart;
 957       while (*setcurrent) {
 958         if (*setcurrent == theChar) {
 959           if(addTerminal)
 960             ++current;
 961           AppendUnicodeTo(origin, current, aString);
 962           SetPosition(current);
 963
 964           //DoErrTest(aString);
 965
 966           return NS_OK;
 967         }
 968         ++setcurrent;
 969       }
 970     }
 971
 972     ++current;
 973   }
 974
 975   // If we are here, we didn't find any terminator in the string and
 976   // current = mEndPosition
 977   SetPosition(current);
 978   AppendUnicodeTo(origin, current, aString);
 979   return kEOF;
 980 }
 981
 982 nsresult nsScanner::ReadUntil(nsScannerIterator& aStart,
 983                               nsScannerIterator& aEnd,
 984                               const nsReadEndCondition &aEndCondition,
 985                               PRBool addTerminal)
 986 {
 987   if (!mSlidingBuffer) {
 988     return kEOF;
 989   }
 990
 991   nsScannerIterator origin, current;
 992   const PRUnichar* setstart = aEndCondition.mChars;
 993   const PRUnichar* setcurrent;
 994
 995   origin = mCurrentPosition;
 996   current = origin;
 997
 998   PRUnichar         theChar=0;
 999   nsresult          result=Peek(theChar);
1000
1001   if (NS_FAILED(result)) {
1002     aStart = aEnd = current;
1003     return result;
1004   }
1005
1006   while (current != mEndPosition) {
1007     if (theChar == '\0') {
1008       ReplaceCharacter(current, sInvalid);
1009       theChar = sInvalid;
1010     }
1011
1012     // Filter out completely wrong characters
1013     // Check if all bits are in the required area
1014     if(!(theChar & aEndCondition.mFilter)) {
1015       // They were. Do a thorough check.
1016       setcurrent = setstart;
1017       while (*setcurrent) {
1018         if (*setcurrent == theChar) {
1019           if(addTerminal)
1020             ++current;
1021           aStart = origin;
1022           aEnd = current;
1023           SetPosition(current);
1024
1025           return NS_OK;
1026         }
1027         ++setcurrent;
1028       }
1029     }
1030
1031     ++current;
1032     theChar = *current;
1033   }
1034
1035   // If we are here, we didn't find any terminator in the string and
1036   // current = mEndPosition
1037   SetPosition(current);
1038   aStart = origin;
1039   aEnd = current;
1040   return kEOF;
1041 }
1042
1043 /**
1044  *  Consumes chars until you see the given terminalChar
1045  *
1046  *  @update  gess 3/25/98
1047  *  @param
1048  *  @return  error code
1049  */
1050 nsresult nsScanner::ReadUntil(nsAString& aString,
1051                               PRUnichar aTerminalChar,
1052                               PRBool addTerminal)
1053 {
1054   if (!mSlidingBuffer) {
1055     return kEOF;
1056   }
1057
1058   nsScannerIterator origin, current;
1059
1060   origin = mCurrentPosition;
1061   current = origin;
1062
1063   PRUnichar theChar;
1064   nsresult result = Peek(theChar);
1065
1066   if (NS_FAILED(result)) {
1067     return result;
1068   }
1069
1070   while (current != mEndPosition) {
1071     if (theChar == '\0') {
1072       ReplaceCharacter(current, sInvalid);
1073       theChar = sInvalid;
1074     }
1075
1076     if (aTerminalChar == theChar) {
1077       if(addTerminal)
1078         ++current;
1079       AppendUnicodeTo(origin, current, aString);
1080       SetPosition(current);
1081       return NS_OK;
1082     }
1083     ++current;
1084     theChar = *current;
1085   }
1086
1087   // If we are here, we didn't find any terminator in the string and
1088   // current = mEndPosition
1089   AppendUnicodeTo(origin, current, aString);
1090   SetPosition(current);
1091   return kEOF;
1092
1093 }
1094
1095 void nsScanner::BindSubstring(nsScannerSubstring& aSubstring, const nsScannerIterator& aStart, const nsScannerIterator& aEnd)
1096 {
1097   aSubstring.Rebind(*mSlidingBuffer, aStart, aEnd);
1098 }
1099
1100 void nsScanner::CurrentPosition(nsScannerIterator& aPosition)
1101 {
1102   aPosition = mCurrentPosition;
1103 }
1104
1105 void nsScanner::EndReading(nsScannerIterator& aPosition)
1106 {
1107   aPosition = mEndPosition;
1108 }
1109
1110 void nsScanner::SetPosition(nsScannerIterator& aPosition, PRBool aTerminate, PRBool aReverse)
1111 {
1112   if (mSlidingBuffer) {
1113 #ifdef DEBUG
1114     PRUint32 origRemaining = mCountRemaining;
1115 #endif
1116
1117     if (aReverse) {
1118       mCountRemaining += (Distance(aPosition, mCurrentPosition));
1119     }
1120     else {
1121       mCountRemaining -= (Distance(mCurrentPosition, aPosition));
1122     }
1123
1124     NS_ASSERTION((mCountRemaining >= origRemaining && aReverse) ||
1125                  (mCountRemaining <= origRemaining && !aReverse),
1126                  "Improper use of nsScanner::SetPosition. Make sure to set the"
1127                  " aReverse parameter correctly");
1128
1129     mCurrentPosition = aPosition;
1130     if (aTerminate && (mCurrentPosition == mEndPosition)) {
1131       mMarkPosition = mCurrentPosition;
1132       mSlidingBuffer->DiscardPrefix(mCurrentPosition);
1133     }
1134   }
1135 }
1136
1137 void nsScanner::ReplaceCharacter(nsScannerIterator& aPosition,
1138                                  PRUnichar aChar)
1139 {
1140   if (mSlidingBuffer) {
1141     mSlidingBuffer->ReplaceCharacter(aPosition, aChar);
1142   }
1143 }
1144
1145 PRBool nsScanner::AppendToBuffer(nsScannerString::Buffer* aBuf,
1146                                  nsIRequest *aRequest)
1147 {
1148   if (nsParser::sParserDataListeners && mParser &&
1149       NS_FAILED(mParser->DataAdded(Substring(aBuf->DataStart(),
1150                                              aBuf->DataEnd()), aRequest))) {
1151     // Don't actually append on failure.
1152
1153     return mSlidingBuffer != nsnull;
1154   }
1155
1156   if (!mSlidingBuffer) {
1157     mSlidingBuffer = new nsScannerString(aBuf);
1158     if (!mSlidingBuffer)
1159       return PR_FALSE;
1160     mSlidingBuffer->BeginReading(mCurrentPosition);
1161     mMarkPosition = mCurrentPosition;
1162     mSlidingBuffer->EndReading(mEndPosition);
1163     mCountRemaining = aBuf->DataLength();
1164   }
1165   else {
1166     mSlidingBuffer->AppendBuffer(aBuf);
1167     if (mCurrentPosition == mEndPosition) {
1168       mSlidingBuffer->BeginReading(mCurrentPosition);
1169     }
1170     mSlidingBuffer->EndReading(mEndPosition);
1171     mCountRemaining += aBuf->DataLength();
1172   }
1173
1174   if (mFirstNonWhitespacePosition == -1) {
1175     nsScannerIterator iter(mCurrentPosition);
1176     nsScannerIterator end(mEndPosition);
1177
1178     while (iter != end) {
1179       if (!nsCRT::IsAsciiSpace(*iter)) {
1180         mFirstNonWhitespacePosition = Distance(mCurrentPosition, iter);
1181
1182         break;
1183       }
1184
1185       ++iter;
1186     }
1187   }
1188   return PR_TRUE;
1189 }
1190
1191 /**
1192  *  call this to copy bytes out of the scanner that have not yet been consumed
1193  *  by the tokenization process.
1194  *
1195  *  @update  gess 5/12/98
1196  *  @param   aCopyBuffer is where the scanner buffer will be copied to
1197  *  @return  nada
1198  */
1199 void nsScanner::CopyUnusedData(nsString& aCopyBuffer) {
1200   if (!mSlidingBuffer) {
1201     aCopyBuffer.Truncate();
1202     return;
1203   }
1204
1205   nsScannerIterator start, end;
1206   start = mCurrentPosition;
1207   end = mEndPosition;
1208
1209   CopyUnicodeTo(start, end, aCopyBuffer);
1210 }
1211
1212 /**
1213  *  Retrieve the name of the file that the scanner is reading from.
1214  *  In some cases, it's just a given name, because the scanner isn't
1215  *  really reading from a file.
1216  *
1217  *  @update  gess 5/12/98
1218  *  @return
1219  */
1220 nsString& nsScanner::GetFilename(void) {
1221   return mFilename;
1222 }
1223
1224 /**
1225  *  Conduct self test. Actually, selftesting for this class
1226  *  occurs in the parser selftest.
1227  *
1228  *  @update  gess 3/25/98
1229  *  @param
1230  *  @return
1231  */
1232
1233 void nsScanner::SelfTest(void) {
1234 #ifdef _DEBUG
1235 #endif
1236 }
1237
1238
1239