Filters/HtmlAgilityPack/HtmlDocument.cs

   1 // HtmlAgilityPack V1.0 - Simon Mourier <simonm@microsoft.com>
   2
   3 /*
   4 Copyright (C) 2003 Simon Mourier <simonm@microsoft.com>
   5 All rights reserved.
   6
   7 Redistribution and use in source and binary forms, with or without
   8 modification, are permitted provided that the following conditions
   9 are met:
  10 1. Redistributions of source code must retain the above copyright
  11    notice, this list of conditions and the following disclaimer.
  12 2. Redistributions in binary form must reproduce the above copyright
  13    notice, this list of conditions and the following disclaimer in the
  14    documentation and/or other materials provided with the distribution.
  15 3. The name of the author may not be used to endorse or promote products
  16    derived from this software without specific prior written permission.
  17
  18 THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  19 IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  20 OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  21 IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  22 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  23 NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  24 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  25 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  26 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  27 THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  28 */
  29
  30 using System;
  31 using System.IO;
  32 using System.Text;
  33 using System.Diagnostics;
  34 using System.Collections;
  35 using System.Text.RegularExpressions;
  36 using System.Xml;
  37 using System.Xml.XPath;
  38
  39
  40 // Legend: SLIM=Comment added describing changes to original HtmlAgilityPack
  41 //              to reduce memory consumption
  42 // Once the parser is free of bugs, the comments will be taken out
  43 namespace HtmlAgilityPack
  44 {
  45         /// <summary>
  46         /// Represents the type of parsing error.
  47         /// </summary>
  48         public enum HtmlParseErrorCode
  49         {
  50                 /// <summary>
  51                 /// A tag was not closed.
  52                 /// </summary>
  53                 TagNotClosed,
  54
  55                 /// <summary>
  56                 /// A tag was not opened.
  57                 /// </summary>
  58                 TagNotOpened,
  59
  60                 /// <summary>
  61                 /// There is a charset mismatch between stream and declared (META) encoding.
  62                 /// </summary>
  63                 CharsetMismatch,
  64
  65                 /// <summary>
  66                 /// An end tag was not required.
  67                 /// </summary>
  68                 EndTagNotRequired,
  69
  70                 /// <summary>
  71                 /// An end tag is invalid at this position.
  72                 /// </summary>
  73                 EndTagInvalidHere
  74         }
  75
  76         /// <summary>
  77         /// Represents a parsing error found during document parsing.
  78         /// </summary>
  79         public class HtmlParseError
  80         {
  81                 private HtmlParseErrorCode _code;
  82                 private int _line;
  83                 private int _linePosition;
  84                 private int _streamPosition;
  85                 private string _sourceText;
  86                 private string _reason;
  87
  88                 internal HtmlParseError(
  89                         HtmlParseErrorCode code,
  90                         int line,
  91                         int linePosition,
  92                         int streamPosition,
  93                         string sourceText,
  94                         string reason)
  95                 {
  96                         _code = code;
  97                         _line = line;
  98                         _linePosition = linePosition;
  99                         _streamPosition = streamPosition;
 100                         _sourceText = sourceText;
 101                         _reason = reason;
 102                 }
 103
 104                 /// <summary>
 105                 /// Gets the type of error.
 106                 /// </summary>
 107                 public HtmlParseErrorCode Code
 108                 {
 109                         get
 110                         {
 111                                 return _code;
 112                         }
 113                 }
 114
 115                 /// <summary>
 116                 /// Gets the line number of this error in the document.
 117                 /// </summary>
 118                 public int Line
 119                 {
 120                         get
 121                         {
 122                                 return _line;
 123                         }
 124                 }
 125
 126                 /// <summary>
 127                 /// Gets the column number of this error in the document.
 128                 /// </summary>
 129                 public int LinePosition
 130                 {
 131                         get
 132                         {
 133                                 return _linePosition;
 134                         }
 135                 }
 136
 137                 /// <summary>
 138                 /// Gets the absolstream position of this error in the document, relative to the start of the document.
 139                 /// </summary>
 140                 public int StreamPosition
 141                 {
 142                         get
 143                         {
 144                                 return _streamPosition;
 145                         }
 146                 }
 147
 148                 /// <summary>
 149                 /// Gets the the full text of the line containing the error.
 150                 /// </summary>
 151                 public string SourceText
 152                 {
 153                         get
 154                         {
 155                                 return _sourceText;
 156                                 }
 157                 }
 158
 159                 /// <summary>
 160                 /// Gets a description for the error.
 161                 /// </summary>
 162                 public string Reason
 163                 {
 164                         get
 165                         {
 166                                 return _reason;
 167                         }
 168                 }
 169         }
 170
 171
 172         abstract class StreamAsArray {
 173                 public abstract bool Eof (int index);
 174                 public abstract char this [int index] { get;}
 175                 public abstract string Substring (int startindex, int length);
 176                 public abstract int FullLength { get;}
 177         }
 178
 179         // SLIM: creating this class to wrap around a textreader
 180         //       to emulate ReadToEnd () behaviour
 181         class ImplStreamAsArray : StreamAsArray {
 182                 private StreamReader _reader;
 183                 private int _length;
 184                 private int _position;
 185                 private bool _eof;
 186                 private char[] _buf_previous; // could have used only one array
 187                 private char[] _buf_current; // but, this is cleaner
 188                 private int _block_size;
 189
 190                 public ImplStreamAsArray (StreamReader r)
 191                 {
 192                         _reader = r;
 193                         _length = 0;
 194                         _position = 0;
 195                         _eof = false;
 196
 197                         _block_size = 1024;
 198                         _buf_previous = new char [_block_size];
 199                         _buf_current = new char [_block_size];
 200
 201                         Read (true);
 202                 }
 203
 204                 private void Read (bool initial)
 205                 {
 206                         if ( !initial) {
 207                                 Array.Copy (_buf_current, _buf_previous, _block_size);
 208                                 _position += _block_size;
 209                         }
 210                         HtmlDocument.Debug ("Debug: Read in buffer at:" + _position);
 211
 212                         int num_read = _reader.Read (_buf_current, 0, _block_size);
 213                         if (num_read < _block_size) {
 214                                 _eof = true;
 215                                 _length = _position + num_read;
 216                         }
 217                         HtmlDocument.Debug ("[" + new string (_buf_current, 0, num_read) + "]");
 218                 }
 219
 220                 public override bool Eof (int index) {
 221                         if (_eof)
 222                                 return (index == _length);
 223                         else {
 224                                 if (index >= _position + _block_size &&
 225                                     index < _position + _block_size + _block_size)
 226                                         Read (false);
 227                                 if (_eof)
 228                                         return (index == _length);
 229                                 else
 230                                         return false;
 231                         }
 232                 }
 233
 234                 public override char this[int index] {
 235                         get {
 236                                 if (index >= _position &&
 237                                     index < _position + _block_size)
 238                                         return _buf_current [index % _block_size];
 239                                 if (index >= _position - _block_size &&
 240                                     index < _position)
 241                                         return _buf_previous [ index % _block_size];
 242                                 if (index >= _position + _block_size &&
 243                                     index < _position + _block_size + _block_size) {
 244                                         Read (false);
 245                                         return _buf_current [index % _block_size];
 246                                 }
 247                                 return OutOfBandRead (index, 1) [0];
 248                         }
 249                 }
 250
 251                 // evil function ... you get what you pay for!
 252                 private string OutOfBandRead (int startindex, int length)
 253                 {
 254                         HtmlDocument.Debug ("Out of band read! From " + startindex + " to " + (startindex + length - 1));
 255                         ResetPosition (startindex);
 256                         // ahh.. now we are at the correct place
 257                         // create a buffer of required length
 258                         // who cares if the buffer size does not align well
 259                         // with page boundary
 260                         char[] temp_buf = new char [length];
 261                         int num_read = _reader.Read (temp_buf, 0, length);
 262                         if (num_read < length) {
 263                                 // Shouldnt occur!!!
 264                                 _eof = true;
 265                                 _length = startindex + num_read;
 266                         }
 267                         // discard data and reset stream position
 268                         int t = (_eof ? _length :_position + _block_size);
 269                         ResetPosition (t);
 270                         return new String (temp_buf);
 271                 }
 272
 273                 // streamreader does not allow seeking
 274                 // seek on its basestream does not reflect the position
 275                 // of the reader - it is governed by the buffer size
 276                 // of the underlying stream
 277                 // :( so, read character by character from beginning ...
 278                 private void ResetPosition (int pos)
 279                 {
 280                         _reader.DiscardBufferedData ();
 281                         _reader.BaseStream.Position = 0;
 282                         // read in chunks of block_size
 283                         int n1 = pos / _block_size;
 284                         int n2 = pos % _block_size;
 285                         char[] tmp = new char [_block_size];
 286                         // yo ho... start reading till we have reach pos
 287                         // hopefully, reader will buffer itself, so we can be mean and get one char at a time
 288                         for (int i = 0; i < n1; ++i)
 289                                 _reader.Read (tmp, 0, _block_size);
 290                         for (int i = 0; i < n2; ++i)
 291                                 _reader.Read ();
 292                         tmp = null;
 293                 }
 294
 295                 public override string Substring (int startindex, int length)
 296                 {
 297                         if (length == 0) {
 298                                 HtmlDocument.Debug ("substring:" + startindex + " " + length + " " + _position + ":");
 299                                 return String.Empty;
 300                         }
 301                         if (length > _block_size || startindex < _position - _block_size) {
 302                                 return OutOfBandRead (startindex, length);
 303                         }
 304                         while (startindex + length - 1 >= _position + _block_size) {
 305                                 Read (false);
 306                         }
 307                         string substr;
 308                         if (startindex < _position) {
 309                                 int len_1 = _position - startindex;
 310                                 if (length < len_1)
 311                                         substr = new String (_buf_previous, _block_size - len_1, length);
 312                                 else {
 313                                         substr = new String (_buf_previous, _block_size - len_1, len_1);
 314                                         substr += new String (_buf_current, 0, length - len_1);
 315                                 }
 316                         } else {
 317                                 substr = new String (_buf_current, startindex - _position, length);
 318                         }
 319                         return substr;
 320                 }
 321
 322                 // FIXME: Is this costly ?
 323                 public override int FullLength {
 324                         get {
 325                                 return (int)_reader.BaseStream.Length;
 326                         }
 327                 }
 328         }
 329
 330         // A dummy StreamAsArray wrapper around a string
 331         class DummyStreamAsArray : StreamAsArray {
 332                 private string _base_string;
 333                 private int _length;
 334
 335                 public DummyStreamAsArray(string str)
 336                 {
 337                         _base_string = str;
 338                         _length = str.Length;
 339                 }
 340
 341                 public override bool Eof(int index)
 342                 {
 343                         return (index >= _length);
 344                 }
 345
 346                 public new char this[int index] {
 347                         get { return _base_string [index]; }
 348                 }
 349
 350                 public override string Substring (int startindex, int length)
 351                 {
 352                         return _base_string.Substring (startindex, length);
 353                 }
 354
 355                 public override int FullLength {
 356                         get { return _length; }
 357                 }
 358         }
 359
 360         /// <summary>
 361         /// Represents a complete HTML document.
 362         /// </summary>
 363         public class HtmlDocument: IXPathNavigable
 364         {
 365                 // SLIM: Make the parser event driven
 366                 // callback for FilterHtml
 367                 // return value is a way for the callback to signal to continue or stop parsing
 368                 public delegate bool NodeHandler (HtmlNode node);
 369                 public NodeHandler ReportNode;
 370                 // misnomer ... should be called event_driven_mode
 371                 private bool _streammode = false;
 372                 private bool _stop_parsing = false;
 373
 374                 internal static readonly string HtmlExceptionRefNotChild = "Reference node must be a child of this node";
 375                 internal static readonly string HtmlExceptionUseIdAttributeFalse = "You need to set UseIdAttribute property to true to enable this feature";
 376
 377                 internal Hashtable _openednodes;
 378                 internal Hashtable _lastnodes = new Hashtable();
 379                 internal Hashtable _nodesid;
 380                 private HtmlNode _documentnode;
 381                 //SLIM: internal string _text;
 382                 internal StreamAsArray _text;
 383                 private HtmlNode _currentnode;
 384                 private HtmlNode _lastparentnode;
 385                 private HtmlAttribute _currentattribute;
 386                 private int _index;
 387                 private int _line;
 388                 private int _lineposition, _maxlineposition;
 389                 private int _c;
 390                 private bool _fullcomment;
 391                 private System.Text.Encoding _streamencoding;
 392                 private System.Text.Encoding _declaredencoding;
 393                 private ArrayList _parseerrors = new ArrayList();
 394                 private ParseState _state, _oldstate;
 395                 private Crc32 _crc32 = null;
 396                 private bool _onlyDetectEncoding = false;
 397                 private int _pcdata_quote_char = '\0';
 398
 399                 private static bool _debug = false;
 400                 internal static void Debug (string s)
 401                 {
 402                         if (_debug)
 403                                 Console.WriteLine (s);
 404                 }
 405
 406                 // public props
 407
 408                 /// <summary>
 409                 /// Defines if a checksum must be computed for the document while parsing. Default is false.
 410                 /// </summary>
 411                 public bool OptionComputeChecksum = false;
 412
 413                 /// <summary>
 414                 /// Defines if declared encoding must be read from the document.
 415                 /// Declared encoding is determined using the meta http-equiv="content-type" content="text/html;charset=XXXXX" html node.
 416                 /// Default is true.
 417                 /// </summary>
 418                 public bool OptionReadEncoding = true;
 419
 420
 421                 /// <summary>
 422                 /// Defines if non closed nodes will be checked at the end of parsing. Default is true.
 423                 /// </summary>
 424                 public bool OptionCheckSyntax = true;
 425
 426                 /// <summary>
 427                 /// Defines if the 'id' attribute must be specifically used. Default is true.
 428                 /// </summary>
 429                 public bool OptionUseIdAttribute = true;
 430
 431                 /// <summary>
 432                 /// Defines if empty nodes must be written as closed during output. Default is false.
 433                 /// </summary>
 434                 public bool OptionWriteEmptyNodes = false;
 435
 436                 /// <summary>
 437                 /// Defines if output must conform to XML, instead of HTML.
 438                 /// </summary>
 439                 public bool OptionOutputAsXml = false;
 440
 441                 /// <summary>
 442                 /// Defines if name must be output in uppercase. Default is false.
 443                 /// </summary>
 444                 public bool OptionOutputUpperCase = false;
 445
 446                 /// <summary>
 447                 /// Defines if attribute value output must be optimized (not bound with double quotes if it is possible). Default is false.
 448                 /// </summary>
 449                 public bool OptionOutputOptimizeAttributeValues = false;
 450
 451                 /// <summary>
 452                 /// Adds Debugging attributes to node. Default is false.
 453                 /// </summary>
 454                 public bool OptionAddDebuggingAttributes = false;
 455
 456                 /// <summary>
 457                 /// Defines if source text must be extracted while parsing errors.
 458                 /// If the document has a lot of errors, or cascading errors, parsing performance can be dramatically affected if set to true.
 459                 /// Default is false.
 460                 /// </summary>
 461                 public bool OptionExtractErrorSourceText = false; // turning this on can dramatically slow performance if a lot of errors are detected
 462
 463                 /// <summary>
 464                 /// Defines if closing for non closed nodes must be done at the end or directly in the document.
 465                 /// Setting this to true can actually change how browsers render the page. Default is false.
 466                 /// </summary>
 467                 public bool OptionAutoCloseOnEnd = false; // close errors at the end
 468
 469                 /// <summary>
 470                 /// Defines if LI, TR, TH, TD tags must be partially fixed when nesting errors are detected. Default is false.
 471                 /// </summary>
 472                 public bool OptionFixNestedTags = false; // fix li, tr, th, td tags
 473
 474                 /// <summary>
 475                 /// Defines the maximum length of source text or parse errors. Default is 100.
 476                 /// </summary>
 477                 public int OptionExtractErrorSourceTextMaxLength = 100;
 478
 479                 /// <summary>
 480                 /// Defines the default stream encoding to use. Default is System.Text.Encoding.Default.
 481                 /// </summary>
 482                 // From http://www.w3.org/TR/REC-html40/charset.html
 483                 // The HTTP protocol ([RFC2616], section 3.7.1) mentions ISO-8859-1 as a default character encoding when the "charset" parameter is absent from the "Content-Type" header field.
 484                 // So, however we are still using UTF-8 for some unknown reason
 485                 //FIXME: Fix the default encoding!
 486                 public System.Text.Encoding OptionDefaultStreamEncoding = Encoding.UTF8;
 487
 488                 /// <summary>
 489                 /// Gets a list of parse errors found in the document.
 490                 /// </summary>
 491                 public ArrayList ParseErrors
 492                 {
 493                         get
 494                         {
 495                                 return _parseerrors;
 496                         }
 497                 }
 498
 499                 /// <summary>
 500                 /// Gets the document's stream encoding.
 501                 /// </summary>
 502                 public System.Text.Encoding StreamEncoding
 503                 {
 504                         get
 505                         {
 506                                 return _streamencoding;
 507                         }
 508                 }
 509
 510                 /// <summary>
 511                 /// Gets the document's declared encoding.
 512                 /// Declared encoding is determined using the meta http-equiv="content-type" content="text/html;charset=XXXXX" html node.
 513                 /// </summary>
 514                 public System.Text.Encoding DeclaredEncoding
 515                 {
 516                         get
 517                         {
 518                                 return _declaredencoding;
 519                         }
 520                 }
 521
 522                 /// <summary>
 523                 /// Creates an instance of an HTML document.
 524                 /// </summary>
 525                 public HtmlDocument()
 526                 {
 527                         _documentnode = CreateNode(HtmlNodeType.Document, 0);
 528                 }
 529
 530                 internal HtmlNode GetXmlDeclaration()
 531                 {
 532                         if (!_documentnode.HasChildNodes)
 533                         {
 534                                 return null;
 535                         }
 536
 537                         foreach(HtmlNode node in _documentnode._childnodes)
 538                         {
 539                                 if (node.Name == "?xml") // it's ok, names are case sensitive
 540                                 {
 541                                         return node;
 542                                 }
 543                         }
 544                         return null;
 545                 }
 546
 547                 /// <summary>
 548                 /// Applies HTML encoding to a specified string.
 549                 /// </summary>
 550                 /// <param name="html">The input string to encode. May not be null.</param>
 551                 /// <returns>The encoded string.</returns>
 552                 public static string HtmlEncode(string html)
 553                 {
 554                         if (html == null)
 555                         {
 556                                 throw new ArgumentNullException("html");
 557                         }
 558                         // replace & by &amp; but only once!
 559                         Regex rx = new Regex("&(?!(amp;)|(lt;)|(gt;)|(quot;))", RegexOptions.IgnoreCase);
 560                         return rx.Replace(html, "&amp;").Replace("<", "&lt;").Replace(">", "&gt;").Replace("\"", "&quot;");
 561                 }
 562
 563                 /// <summary>
 564                 /// Detects the encoding of an HTML stream.
 565                 /// </summary>
 566                 /// <param name="stream">The input stream. May not be null.</param>
 567                 /// <returns>The detected encoding.</returns>
 568                 public Encoding DetectEncoding(Stream stream)
 569                 {
 570                         if (stream == null)
 571                         {
 572                                 throw new ArgumentNullException("stream");
 573                         }
 574                         return DetectEncoding(new StreamReader(stream));
 575                 }
 576
 577                 /// <summary>
 578                 /// Detects the encoding of an HTML file.
 579                 /// </summary>
 580                 /// <param name="path">Path for the file containing the HTML document to detect. May not be null.</param>
 581                 /// <returns>The detected encoding.</returns>
 582                 public Encoding DetectEncoding(string path)
 583                 {
 584                         if (path == null)
 585                         {
 586                                 throw new ArgumentNullException("path");
 587                         }
 588                         StreamReader sr = new StreamReader(path, OptionDefaultStreamEncoding);
 589                         Encoding encoding = DetectEncoding(sr);
 590                         sr.Close();
 591                         return encoding;
 592                 }
 593
 594                 /// <summary>
 595                 /// Detects the encoding of an HTML text.
 596                 /// </summary>
 597                 /// <param name="html">The input html text. May not be null.</param>
 598                 /// <returns>The detected encoding.</returns>
 599                 public Encoding DetectEncodingHtml(string html)
 600                 {
 601                         if (html == null)
 602                         {
 603                                 throw new ArgumentNullException("html");
 604                         }
 605                         StringReader sr = new StringReader(html);
 606                         Encoding encoding = DetectEncoding(sr);
 607                         sr.Close();
 608                         return encoding;
 609                 }
 610
 611                 /// <summary>
 612                 /// Detects the encoding of an HTML text provided on a TextReader.
 613                 /// </summary>
 614                 /// <param name="reader">The TextReader used to feed the HTML. May not be null.</param>
 615                 /// <returns>The detected encoding.</returns>
 616                 public Encoding DetectEncoding(TextReader reader)
 617                 {
 618                         if (reader == null)
 619                         {
 620                                 throw new ArgumentNullException("reader");
 621                         }
 622                         _onlyDetectEncoding = true;
 623                         if (OptionCheckSyntax)
 624                         {
 625                                 _openednodes = new Hashtable();
 626                         }
 627                         else
 628                         {
 629                                 _openednodes = null;
 630                         }
 631
 632                         if (OptionUseIdAttribute)
 633                         {
 634                                 _nodesid = new Hashtable();
 635                         }
 636                         else
 637                         {
 638                                 _nodesid = null;
 639                         }
 640
 641                         StreamReader sr = reader as StreamReader;
 642                         if (sr != null)
 643                         {
 644                                 _streamencoding = sr.CurrentEncoding;
 645                                 _text = new ImplStreamAsArray (sr);
 646                         }
 647                         else
 648                         {
 649                                 _streamencoding = null;
 650                                 // Expensive, but cannot avoid since TextReader doesnt have any length of the underlying data
 651                                 _text = new DummyStreamAsArray (reader.ReadToEnd());
 652                         }
 653                         _declaredencoding = null;
 654
 655                         // SLIM: _text = reader.ReadToEnd();
 656                         _documentnode = CreateNode(HtmlNodeType.Document, 0);
 657
 658                         // this is a hack, but it allows us not to muck with the original parsing code
 659                         try
 660                         {
 661                                 Parse();
 662                         }
 663                         catch(EncodingFoundException ex)
 664                         {
 665                                 _lastnodes.Clear();
 666                                 return ex.Encoding;
 667                         }
 668                         return null;
 669                 }
 670
 671                 /// <summary>
 672                 /// Loads an HTML document from a stream.
 673                 /// </summary>
 674                 /// <param name="stream">The input stream.</param>
 675                 public void Load(Stream stream)
 676                 {
 677                         Load(new StreamReader(stream, OptionDefaultStreamEncoding));
 678                 }
 679
 680                 /// <summary>
 681                 /// Loads an HTML document from a stream.
 682                 /// </summary>
 683                 /// <param name="stream">The input stream.</param>
 684                 /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
 685                 public void Load(Stream stream, bool detectEncodingFromByteOrderMarks)
 686                 {
 687                         Load(new StreamReader(stream, detectEncodingFromByteOrderMarks));
 688                 }
 689
 690                 /// <summary>
 691                 /// Loads an HTML document from a stream.
 692                 /// </summary>
 693                 /// <param name="stream">The input stream.</param>
 694                 /// <param name="encoding">The character encoding to use.</param>
 695                 public void Load(Stream stream, Encoding encoding)
 696                 {
 697                         Load(new StreamReader(stream, encoding));
 698                 }
 699
 700                 /// <summary>
 701                 /// Loads an HTML document from a stream.
 702                 /// </summary>
 703                 /// <param name="stream">The input stream.</param>
 704                 /// <param name="encoding">The character encoding to use.</param>
 705                 /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
 706                 public void Load(Stream stream, Encoding encoding, bool detectEncodingFromByteOrderMarks)
 707                 {
 708                         Load(new StreamReader(stream, encoding, detectEncodingFromByteOrderMarks));
 709                 }
 710
 711                 /// <summary>
 712                 /// Loads an HTML document from a stream.
 713                 /// </summary>
 714                 /// <param name="stream">The input stream.</param>
 715                 /// <param name="encoding">The character encoding to use.</param>
 716                 /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
 717                 /// <param name="buffersize">The minimum buffer size.</param>
 718                 public void Load(Stream stream, Encoding encoding, bool detectEncodingFromByteOrderMarks, int buffersize)
 719                 {
 720                         Load(new StreamReader(stream, encoding, detectEncodingFromByteOrderMarks, buffersize));
 721                 }
 722
 723                 /// <summary>
 724                 /// Loads an HTML document from a file.
 725                 /// </summary>
 726                 /// <param name="path">The complete file path to be read. May not be null.</param>
 727                 public void Load(string path)
 728                 {
 729                         if (path == null)
 730                         {
 731                                 throw new ArgumentNullException("path");
 732                         }
 733                         StreamReader sr = new StreamReader(path, OptionDefaultStreamEncoding);
 734                         Load(sr);
 735                         sr.Close();
 736                 }
 737
 738                 /// <summary>
 739                 /// Loads an HTML document from a file.
 740                 /// </summary>
 741                 /// <param name="path">The complete file path to be read. May not be null.</param>
 742                 /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
 743                 public void Load(string path, bool detectEncodingFromByteOrderMarks)
 744                 {
 745                         if (path == null)
 746                         {
 747                                 throw new ArgumentNullException("path");
 748                         }
 749                         StreamReader sr = new StreamReader(path, detectEncodingFromByteOrderMarks);
 750                         Load(sr);
 751                         sr.Close();
 752                 }
 753
 754                 /// <summary>
 755                 /// Loads an HTML document from a file.
 756                 /// </summary>
 757                 /// <param name="path">The complete file path to be read. May not be null.</param>
 758                 /// <param name="encoding">The character encoding to use. May not be null.</param>
 759                 public void Load(string path, Encoding encoding)
 760                 {
 761                         if (path == null)
 762                         {
 763                                 throw new ArgumentNullException("path");
 764                         }
 765                         if (encoding == null)
 766                         {
 767                                 throw new ArgumentNullException("encoding");
 768                         }
 769                         StreamReader sr = new StreamReader(path, encoding);
 770                         Load(sr);
 771                         sr.Close();
 772                 }
 773
 774                 /// <summary>
 775                 /// Loads an HTML document from a file.
 776                 /// </summary>
 777                 /// <param name="path">The complete file path to be read. May not be null.</param>
 778                 /// <param name="encoding">The character encoding to use. May not be null.</param>
 779                 /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
 780                 public void Load(string path, Encoding encoding, bool detectEncodingFromByteOrderMarks)
 781                 {
 782                         if (path == null)
 783                         {
 784                                 throw new ArgumentNullException("path");
 785                         }
 786                         if (encoding == null)
 787                         {
 788                                 throw new ArgumentNullException("encoding");
 789                         }
 790                         StreamReader sr = new StreamReader(path, encoding, detectEncodingFromByteOrderMarks);
 791                         Load(sr);
 792                         sr.Close();
 793                 }
 794
 795                 /// <summary>
 796                 /// Loads an HTML document from a file.
 797                 /// </summary>
 798                 /// <param name="path">The complete file path to be read. May not be null.</param>
 799                 /// <param name="encoding">The character encoding to use. May not be null.</param>
 800                 /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
 801                 /// <param name="buffersize">The minimum buffer size.</param>
 802                 public void Load(string path, Encoding encoding, bool detectEncodingFromByteOrderMarks, int buffersize)
 803                 {
 804                         if (path == null)
 805                         {
 806                                 throw new ArgumentNullException("path");
 807                         }
 808                         if (encoding == null)
 809                         {
 810                                 throw new ArgumentNullException("encoding");
 811                         }
 812                         StreamReader sr = new StreamReader(path, encoding, detectEncodingFromByteOrderMarks, buffersize);
 813                         Load(sr);
 814                         sr.Close();
 815                 }
 816
 817                 /// <summary>
 818                 /// Loads the HTML document from the specified string.
 819                 /// </summary>
 820                 /// <param name="html">String containing the HTML document to load. May not be null.</param>
 821                 public void LoadHtml(string html)
 822                 {
 823                         if (html == null)
 824                         {
 825                                 throw new ArgumentNullException("html");
 826                         }
 827                         StringReader sr = new StringReader(html);
 828                         Load(sr);
 829                         sr.Close();
 830                 }
 831
 832                 /// <summary>
 833                 /// Detects the encoding of an HTML document from a file first, and then loads the file.
 834                 /// </summary>
 835                 /// <param name="path">The complete file path to be read.</param>
 836                 public void DetectEncodingAndLoad(string path)
 837                 {
 838                         DetectEncodingAndLoad(path, true);
 839                 }
 840
 841                 /// <summary>
 842                 /// Detects the encoding of an HTML document from a file first, and then loads the file.
 843                 /// </summary>
 844                 /// <param name="path">The complete file path to be read. May not be null.</param>
 845                 /// <param name="detectEncoding">true to detect encoding, false otherwise.</param>
 846                 public void DetectEncodingAndLoad(string path, bool detectEncoding)
 847                 {
 848                         if (path == null)
 849                         {
 850                                 throw new ArgumentNullException("path");
 851                         }
 852                         System.Text.Encoding enc;
 853                         if (detectEncoding)
 854                         {
 855                                 enc = DetectEncoding(path);
 856                         }
 857                         else
 858                         {
 859                                 enc = null;
 860                         }
 861
 862                         if (enc == null)
 863                         {
 864                                 Load(path);
 865                         }
 866                         else
 867                         {
 868                                 Load(path, enc);
 869                         }
 870                 }
 871
 872                 /// <summary>
 873                 /// Loads the HTML document from the specified TextReader.
 874                 /// </summary>
 875                 /// <param name="reader">The TextReader used to feed the HTML data into the document. May not be null.</param>
 876                 public void Load(TextReader reader)
 877                 {
 878                         // all Load methods pass down to this one
 879                         if (reader == null)
 880                         {
 881                                 throw new ArgumentNullException("reader");
 882                         }
 883
 884                         _onlyDetectEncoding = false;
 885
 886                         if (OptionCheckSyntax)
 887                         {
 888                                 _openednodes = new Hashtable();
 889                         }
 890                         else
 891                         {
 892                                 _openednodes = null;
 893                         }
 894
 895                         if (OptionUseIdAttribute)
 896                         {
 897                                 _nodesid = new Hashtable();
 898                         }
 899                         else
 900                         {
 901                                 _nodesid = null;
 902                         }
 903
 904                         StreamReader sr = reader as StreamReader;
 905                         if (sr != null)
 906                         {
 907                                 try
 908                                 {
 909                                     // trigger bom read if needed
 910                                     sr.Peek();
 911                                 }
 912                                 catch
 913                                 {
 914                                     // void on purpose
 915                                 }
 916                                 _streamencoding = sr.CurrentEncoding;
 917                                 _text = new ImplStreamAsArray (sr);
 918                         }
 919                         else
 920                         {
 921                                 _streamencoding = null;
 922                                 // Expensive, but cannot avoid since TextReader doesnt have any length of the underlying data
 923                                 _text = new DummyStreamAsArray (reader.ReadToEnd());
 924                         }
 925                         _declaredencoding = null;
 926
 927                         // SLIM: _text = reader.ReadToEnd();
 928                         _documentnode = CreateNode(HtmlNodeType.Document, 0);
 929                         Parse();
 930
 931                         if (OptionCheckSyntax)
 932                         {
 933                                 foreach(HtmlNode node in _openednodes.Values)
 934                                 {
 935                                         if (!node._starttag)    // already reported
 936                                         {
 937                                                 continue;
 938                                         }
 939
 940                                         string html;
 941                                         if (OptionExtractErrorSourceText)
 942                                         {
 943                                                 html = node.OuterHtml;
 944                                                 if (html.Length > OptionExtractErrorSourceTextMaxLength)
 945                                                 {
 946                                                         html = html.Substring(0, OptionExtractErrorSourceTextMaxLength);
 947                                                 }
 948                                         }
 949                                         else
 950                                         {
 951                                                 html = string.Empty;
 952                                         }
 953                                         AddError(
 954                                                 HtmlParseErrorCode.TagNotClosed,
 955                                                 node._line, node._lineposition,
 956                                                 node._streamposition, html,
 957                                                 "End tag </" + node.Name + "> was not found");
 958                                 }
 959
 960                                 // we don't need this anymore
 961                                 _openednodes.Clear();
 962                         }
 963                 }
 964
 965                 internal System.Text.Encoding GetOutEncoding()
 966                 {
 967                         // when unspecified, use the stream encoding first
 968                         if (_declaredencoding != null)
 969                         {
 970                                 return _declaredencoding;
 971                         }
 972                         else
 973                         {
 974                                 if (_streamencoding != null)
 975                                 {
 976                                         return _streamencoding;
 977                                 }
 978                         }
 979                         return OptionDefaultStreamEncoding;
 980                 }
 981
 982
 983                 /// <summary>
 984                 /// Gets the document's output encoding.
 985                 /// </summary>
 986                 public System.Text.Encoding Encoding
 987                 {
 988                         get
 989                         {
 990                                 return GetOutEncoding();
 991                         }
 992                 }
 993
 994                 /// <summary>
 995                 /// Saves the HTML document to the specified stream.
 996                 /// </summary>
 997                 /// <param name="outStream">The stream to which you want to save.</param>
 998                 public void Save(Stream outStream)
 999                 {
1000                         StreamWriter sw = new StreamWriter(outStream, GetOutEncoding());
1001                         Save(sw);
1002                 }
1003
1004                 /// <summary>
1005                 /// Saves the HTML document to the specified stream.
1006                 /// </summary>
1007                 /// <param name="outStream">The stream to which you want to save. May not be null.</param>
1008                 /// <param name="encoding">The character encoding to use. May not be null.</param>
1009                 public void Save(Stream outStream, System.Text.Encoding encoding)
1010                 {
1011                         if (outStream == null)
1012                         {
1013                                 throw new ArgumentNullException("outStream");
1014                         }
1015                         if (encoding == null)
1016                         {
1017                                 throw new ArgumentNullException("encoding");
1018                         }
1019                         StreamWriter sw = new StreamWriter(outStream, encoding);
1020                         Save(sw);
1021                 }
1022
1023                 /// <summary>
1024                 /// Saves the mixed document to the specified file.
1025                 /// </summary>
1026                 /// <param name="filename">The location of the file where you want to save the document.</param>
1027                 public void Save(string filename)
1028                 {
1029                         StreamWriter sw = new StreamWriter(filename, false, GetOutEncoding());
1030                         Save(sw);
1031                         sw.Close();
1032                 }
1033
1034                 /// <summary>
1035                 /// Saves the mixed document to the specified file.
1036                 /// </summary>
1037                 /// <param name="filename">The location of the file where you want to save the document. May not be null.</param>
1038                 /// <param name="encoding">The character encoding to use. May not be null.</param>
1039                 public void Save(string filename, System.Text.Encoding encoding)
1040                 {
1041                         if (filename == null)
1042                         {
1043                                 throw new ArgumentNullException("filename");
1044                         }
1045                         if (encoding == null)
1046                         {
1047                                 throw new ArgumentNullException("encoding");
1048                         }
1049                         StreamWriter sw = new StreamWriter(filename, false, encoding);
1050                         Save(sw);
1051                         sw.Close();
1052                 }
1053
1054                 /// <summary>
1055                 /// Saves the HTML document to the specified StreamWriter.
1056                 /// </summary>
1057                 /// <param name="writer">The StreamWriter to which you want to save.</param>
1058                 public void Save(StreamWriter writer)
1059                 {
1060                         Save((TextWriter)writer);
1061                 }
1062
1063                 /// <summary>
1064                 /// Saves the HTML document to the specified TextWriter.
1065                 /// </summary>
1066                 /// <param name="writer">The TextWriter to which you want to save. May not be null.</param>
1067                 public void Save(TextWriter writer)
1068                 {
1069                         if (writer == null)
1070                         {
1071                                 throw new ArgumentNullException("writer");
1072                         }
1073                         DocumentNode.WriteTo(writer);
1074                 }
1075
1076                 /// <summary>
1077                 /// Saves the HTML document to the specified XmlWriter.
1078                 /// </summary>
1079                 /// <param name="writer">The XmlWriter to which you want to save.</param>
1080                 public void Save(XmlWriter writer)
1081                 {
1082                         DocumentNode.WriteTo(writer);
1083                         writer.Flush();
1084                 }
1085
1086                 /// <summary>
1087                 /// Creates a new XPathNavigator object for navigating this HTML document.
1088                 /// </summary>
1089                 /// <returns>An XPathNavigator object. The XPathNavigator is positioned on the root of the document.</returns>
1090                 public XPathNavigator CreateNavigator()
1091                 {
1092                         return new HtmlNodeNavigator(this, _documentnode);
1093                 }
1094
1095                 internal void SetIdForNode(HtmlNode node, string id)
1096                 {
1097                         if (!OptionUseIdAttribute)
1098                         {
1099                                 return;
1100                         }
1101
1102                         if ((_nodesid == null) || (id == null))
1103                         {
1104                                 return;
1105                         }
1106
1107                         if (node == null)
1108                         {
1109                                 _nodesid.Remove(id.ToLower());
1110                         }
1111                         else
1112                         {
1113                                 _nodesid[id.ToLower()] = node;
1114                         }
1115                 }
1116
1117                 /// <summary>
1118                 /// Gets the HTML node with the specified 'id' attribute value.
1119                 /// </summary>
1120                 /// <param name="id">The attribute id to match. May not be null.</param>
1121                 /// <returns>The HTML node with the matching id or null if not found.</returns>
1122                 public HtmlNode GetElementbyId(string id)
1123                 {
1124                         if (id == null)
1125                         {
1126                                 throw new ArgumentNullException("id");
1127                         }
1128                         if (_nodesid == null)
1129                         {
1130                                 throw new Exception(HtmlExceptionUseIdAttributeFalse);
1131                         }
1132
1133                         return _nodesid[id.ToLower()] as HtmlNode;
1134                 }
1135
1136                 /// <summary>
1137                 /// Creates an HTML element node with the specified name.
1138                 /// </summary>
1139                 /// <param name="name">The qualified name of the element. May not be null.</param>
1140                 /// <returns>The new HTML node.</returns>
1141                 public HtmlNode CreateElement(string name)
1142                 {
1143                         if (name == null)
1144                         {
1145                                 throw new ArgumentNullException("name");
1146                         }
1147                         HtmlNode node = CreateNode(HtmlNodeType.Element);
1148                         node._name = name;
1149                         return node;
1150                 }
1151
1152                 /// <summary>
1153                 /// Creates an HTML comment node.
1154                 /// </summary>
1155                 /// <returns>The new HTML comment node.</returns>
1156                 public HtmlCommentNode CreateComment()
1157                 {
1158                         return (HtmlCommentNode)CreateNode(HtmlNodeType.Comment);
1159                 }
1160
1161                 /// <summary>
1162                 /// Creates an HTML comment node with the specified comment text.
1163                 /// </summary>
1164                 /// <param name="comment">The comment text. May not be null.</param>
1165                 /// <returns>The new HTML comment node.</returns>
1166                 public HtmlCommentNode CreateComment(string comment)
1167                 {
1168                         if (comment == null)
1169                         {
1170                                 throw new ArgumentNullException("comment");
1171                         }
1172                         HtmlCommentNode c = CreateComment();
1173                         c.Comment = comment;
1174                         return c;
1175                 }
1176
1177                 /// <summary>
1178                 /// Creates an HTML text node.
1179                 /// </summary>
1180                 /// <returns>The new HTML text node.</returns>
1181                 public HtmlTextNode CreateTextNode()
1182                 {
1183                         return (HtmlTextNode)CreateNode(HtmlNodeType.Text);
1184                 }
1185
1186                 /// <summary>
1187                 /// Creates an HTML text node with the specified text.
1188                 /// </summary>
1189                 /// <param name="text">The text of the node. May not be null.</param>
1190                 /// <returns>The new HTML text node.</returns>
1191                 public HtmlTextNode CreateTextNode(string text)
1192                 {
1193                         if (text == null)
1194                         {
1195                                 throw new ArgumentNullException("text");
1196                         }
1197                         HtmlTextNode t = CreateTextNode();
1198                         t.Text = text;
1199                         return t;
1200                 }
1201
1202                 internal HtmlNode CreateNode(HtmlNodeType type)
1203                 {
1204                         return CreateNode(type, -1);
1205                 }
1206
1207                 internal HtmlNode CreateNode(HtmlNodeType type, int index)
1208                 {
1209                         switch (type)
1210                         {
1211                                 case HtmlNodeType.Comment:
1212                                         return new HtmlCommentNode(this, index);
1213
1214                                 case HtmlNodeType.Text:
1215                                         return new HtmlTextNode(this, index);
1216
1217                                 default:
1218                                         return new HtmlNode(type, this, index);
1219                         }
1220                 }
1221
1222                 internal HtmlAttribute CreateAttribute()
1223                 {
1224                         return new HtmlAttribute(this);
1225                 }
1226
1227                 /// <summary>
1228                 /// Creates an HTML attribute with the specified name.
1229                 /// </summary>
1230                 /// <param name="name">The name of the attribute. May not be null.</param>
1231                 /// <returns>The new HTML attribute.</returns>
1232                 public HtmlAttribute CreateAttribute(string name)
1233                 {
1234                         if (name == null)
1235                         {
1236                                 throw new ArgumentNullException("name");
1237                         }
1238                         HtmlAttribute att = CreateAttribute();
1239                         att.Name = name;
1240                         return att;
1241                 }
1242
1243                 /// <summary>
1244                 /// Creates an HTML attribute with the specified name.
1245                 /// </summary>
1246                 /// <param name="name">The name of the attribute. May not be null.</param>
1247                 /// <param name="value">The value of the attribute.</param>
1248                 /// <returns>The new HTML attribute.</returns>
1249                 public HtmlAttribute CreateAttribute(string name, string value)
1250                 {
1251                         if (name == null)
1252                         {
1253                                 throw new ArgumentNullException("name");
1254                         }
1255                         HtmlAttribute att = CreateAttribute(name);
1256                         att.Value = value;
1257                         return att;
1258                 }
1259
1260                 /// <summary>
1261                 /// Gets the root node of the document.
1262                 /// </summary>
1263                 public HtmlNode DocumentNode
1264                 {
1265                         get
1266                         {
1267                                 return _documentnode;
1268                         }
1269                 }
1270
1271                 /// <summary>
1272                 /// Gets the document CRC32 checksum if OptionComputeChecksum was set to true before parsing, 0 otherwise.
1273                 /// </summary>
1274                 public int CheckSum
1275                 {
1276                         get
1277                         {
1278                                 if (_crc32 == null)
1279                                 {
1280                                         return 0;
1281                                 }
1282                                 else
1283                                 {
1284                                         return (int)_crc32.CheckSum;
1285                                 }
1286                         }
1287                 }
1288
1289                 public bool StreamMode
1290                 {
1291                         get
1292                         {
1293                                 return _streammode;
1294                         }
1295                         set
1296                         {
1297                                 _streammode = value;
1298                         }
1299                 }
1300
1301                 private HtmlParseError AddError(
1302                                 HtmlParseErrorCode code,
1303                                 int line,
1304                                 int linePosition,
1305                                 int streamPosition,
1306                                 string sourceText,
1307                                 string reason)
1308                         {
1309                         HtmlParseError err = new HtmlParseError(code, line, linePosition, streamPosition, sourceText, reason);
1310                         _parseerrors.Add(err);
1311                         return err;
1312                 }
1313
1314                 private enum ParseState
1315                 {
                             // States of the character-level state machine driven by Parse().
                             // The notes below describe how Parse() and NewCheck() use each state.

                             // Outside any tag; only '<' (handled by NewCheck) is significant.
1316                         Text,
                             // Set by NewCheck right after a '<'; the next character decides how the
                             // node name start is pushed ('/' versus anything else).
1317                         WhichTag,
                             // Reading a node name, which ends at whitespace, '/' or '>'.
1318                         Tag,
                             // Inside a tag, between attributes: whitespace is skipped, '/' or '?'
                             // leads to EmptyTag, '>' ends the node, anything else starts a name.
1319                         BetweenAttributes,
                             // After '/' or '?' inside a tag: '>' ends the node, any other character
                             // returns to BetweenAttributes.
1320                         EmptyTag,
                             // Reading an attribute name, which ends at whitespace, '=' or '>'.
1321                         AttributeName,
                             // Whitespace seen after an attribute name; waiting for '='.
1322                         AttributeBeforeEquals,
                             // '=' seen; waiting for the value (a quote starts a quoted value).
1323                         AttributeAfterEquals,
                             // Reading an unquoted attribute value, which ends at whitespace or '>'.
1324                         AttributeValue,
                             // Inside a node opened by "<!"; ends at '>' (at "-->" for "<!--" comments).
1325                         Comment,
                             // Reading a quoted attribute value; ends at the matching quote character.
1326                         QuotedAttributeValue,
                             // Inside "<% ... %>"; "%>" resumes a state derived from _oldstate.
1327                         ServerSideCode,
                             // Inside a quoted string within PcData; ends at the matching quote that
                             // is not preceded by a backslash.
1328                         PcDataQuote,
                             // Raw content of the current node, scanned for its closing tag
                             // ("</" + node name). NOTE(review): this state is entered by code
                             // outside the section shown here - confirm where.
1329                         PcData
1330                 }
1331
1332                 // Advances the parser by one character: _c (the character just read) is added
1333                 // to the checksum when one is being computed, _index moves forward, and the
1334                 // line/column counters are updated.
1335                 private void IncrementPosition()
1336                 {
1337                         // REVIEW: should we add some checksum code in DecrementPosition too?
1338                         if (_crc32 != null)
1339                                 _crc32.AddToCRC32(_c);
1340
1341                         _index++;
1342                         // keep the column we are leaving so DecrementPosition can restore it
1343                         _maxlineposition = _lineposition;
1344                         if (_c != '\n')
1345                         {
1346                                 _lineposition++;
1347                                 return;
1348                         }
1349                         _line++;        // line feed: the next character is column 1 of a new line
1350                         _lineposition = 1;
1351                 }
1352
1353                 // Steps the parser back by one character. When already at column 1 it moves
1354                 // to the previous line, at the column saved in _maxlineposition.
1355                 private void DecrementPosition()
1356                 {
1357                         _index--;
1358                         if (_lineposition != 1)
1359                         {
1360                                 _lineposition--;
1361                                 return;
1362                         }
1363                         _line--;
1364                         _lineposition = _maxlineposition;
1365                 }
1366
1367                 private void Parse()
1368                 {
                             // Runs the character-by-character state machine over _text. Per-parse
                             // state is reset first; the loop then reads one character at a time and
                             // reports node and attribute boundaries to the Push* helpers (defined
                             // outside this method) until _stop_parsing is set or _text is exhausted.

                             // quote character that opened the current quoted attribute value
1369                         int lastquote = 0;
1370                         if (OptionComputeChecksum)
1371                         {
1372                                 _crc32 = new Crc32();
1373                         }
1374
1375                         _lastnodes = new Hashtable();
1376                         _c = 0;
1377                         _fullcomment = false;
1378                         _parseerrors = new ArrayList();
1379                         _line = 1;
1380                         _lineposition = 1;
1381                         _maxlineposition = 1;
1382
1383                         _state = ParseState.Text;
1384                         _oldstate = _state;
1385                         _documentnode._innerlength = _text.FullLength;
1386                         _documentnode._outerlength = _text.FullLength;
1387
1388                         _lastparentnode = _documentnode;
1389                         _currentnode = CreateNode(HtmlNodeType.Text, 0);
1390                         _currentattribute = null;
1391
1392                         _index = 0;
1393                         PushNodeStart(HtmlNodeType.Text, 0);
1394                         // SLIM: while (_index<_text.Length)
1395                         while (! _stop_parsing && ! _text.Eof (_index))
1396                         {
                                     // Read the current character, then advance. From here on _index
                                     // points at the NEXT character; the current one is at _index-1.
1397                                 _c = _text[_index];
1398                                 IncrementPosition();
1399
                                     // In most states NewCheck() runs first; it returns true when the
                                     // character was a '<' and has been fully handled.
1400                                 switch(_state)
1401                                 {
1402                                         case ParseState.Text:
1403                                                 if (NewCheck())
1404                                                         continue;
1405                                                 break;
1406
1407                                         case ParseState.WhichTag:
1408                                                 if (NewCheck())
1409                                                         continue;
1410                                                 if (_c == '/')
1411                                                 {
1412                                                         PushNodeNameStart(false, _index);
1413                                                 }
1414                                                 else
1415                                                 {
                                                             // not a '/': the name starts at this character, so
                                                             // step back and let the Tag state read it again
1416                                                         PushNodeNameStart(true, _index-1);
1417                                                         DecrementPosition();
1418                                                 }
1419                                                 _state = ParseState.Tag;
1420                                                 break;
1421
                                             // NOTE(review): the "_state != ..." checks after PushNodeNameEnd /
                                             // PushNodeEnd below imply those helpers can change _state themselves;
                                             // when they do, the new state is left in place - confirm in the helpers.
1422                                         case ParseState.Tag:
1423                                                 if (NewCheck())
1424                                                         continue;
1425                                                 if (IsWhiteSpace(_c))
1426                                                 {
1427                                                         PushNodeNameEnd(_index-1);
1428                                                         if (_state != ParseState.Tag)
1429                                                                 continue;
1430                                                         _state = ParseState.BetweenAttributes;
1431                                                         continue;
1432                                                 }
1433                                                 if (_c == '/')
1434                                                 {
1435                                                         PushNodeNameEnd(_index-1);
1436                                                         if (_state != ParseState.Tag)
1437                                                                 continue;
1438                                                         _state = ParseState.EmptyTag;
1439                                                         continue;
1440                                                 }
1441                                                 if (_c == '>')
1442                                                 {
1443                                                         PushNodeNameEnd(_index-1);
1444                                                         if (_state != ParseState.Tag)
1445                                                                 continue;
1446                                                         PushNodeEnd(_index, false);
1447                                                         if (_state != ParseState.Tag)
1448                                                                 continue;
1449                                                         _state = ParseState.Text;
1450                                                         PushNodeStart(HtmlNodeType.Text, _index);
1451                                                 }
1452                                                 break;
1453
1454                                         case ParseState.BetweenAttributes:
1455                                                 if (NewCheck())
1456                                                         continue;
1457
1458                                                 if (IsWhiteSpace(_c))
1459                                                         continue;
1460
1461                                                 if ((_c == '/') || (_c == '?'))
1462                                                 {
1463                                                         _state = ParseState.EmptyTag;
1464                                                         continue;
1465                                                 }
1466
1467                                                 if (_c == '>')
1468                                                 {
1469                                                         PushNodeEnd(_index, false);
1470                                                         if (_state != ParseState.BetweenAttributes)
1471                                                                 continue;
1472                                                         _state = ParseState.Text;
1473                                                         PushNodeStart(HtmlNodeType.Text, _index);
1474                                                         continue;
1475                                                 }
1476
                                                     // any other character is the first one of an attribute name
1477                                                 PushAttributeNameStart(_index-1);
1478                                                 _state = ParseState.AttributeName;
1479                                                 break;
1480
1481                                         case ParseState.EmptyTag:
1482                                                 if (NewCheck())
1483                                                         continue;
1484
1485                                                 if (_c == '>')
1486                                                 {
1487                                                         PushNodeEnd(_index, true);
1488                                                         if (_state != ParseState.EmptyTag)
1489                                                                 continue;
1490                                                         _state = ParseState.Text;
1491                                                         PushNodeStart(HtmlNodeType.Text, _index);
1492                                                         continue;
1493                                                 }
                                                     // the '/' or '?' was not followed by '>': carry on inside the tag
1494                                                 _state = ParseState.BetweenAttributes;
1495                                                 break;
1496
1497                                         case ParseState.AttributeName:
1498                                                 if (NewCheck())
1499                                                         continue;
1500
1501                                                 if (IsWhiteSpace(_c))
1502                                                 {
1503                                                         PushAttributeNameEnd(_index-1);
1504                                                         _state = ParseState.AttributeBeforeEquals;
1505                                                         continue;
1506                                                 }
1507                                                 if (_c == '=')
1508                                                 {
1509                                                         PushAttributeNameEnd(_index-1);
1510                                                         _state = ParseState.AttributeAfterEquals;
1511                                                         continue;
1512                                                 }
1513                                                 if (_c == '>')
1514                                                 {
1515                                                         PushAttributeNameEnd(_index-1);
1516                                                         PushNodeEnd(_index, false);
1517                                                         if (_state != ParseState.AttributeName)
1518                                                                 continue;
1519                                                         _state = ParseState.Text;
1520                                                         PushNodeStart(HtmlNodeType.Text, _index);
1521                                                         continue;
1522                                                 }
1523                                                 break;
1524
1525                                         case ParseState.AttributeBeforeEquals:
1526                                                 if (NewCheck())
1527                                                         continue;
1528
1529                                                 if (IsWhiteSpace(_c))
1530                                                         continue;
1531                                                 if (_c == '>')
1532                                                 {
1533                                                         PushNodeEnd(_index, false);
1534                                                         if (_state != ParseState.AttributeBeforeEquals)
1535                                                                 continue;
1536                                                         _state = ParseState.Text;
1537                                                         PushNodeStart(HtmlNodeType.Text, _index);
1538                                                         continue;
1539                                                 }
1540                                                 if (_c == '=')
1541                                                 {
1542                                                         _state = ParseState.AttributeAfterEquals;
1543                                                         continue;
1544                                                 }
1545                                                 // no equals, no whitespace, it's a new attribute starting
                                                     // (step back so BetweenAttributes reads this character again)
1546                                                 _state = ParseState.BetweenAttributes;
1547                                                 DecrementPosition();
1548                                                 break;
1549
1550                                         case ParseState.AttributeAfterEquals:
1551                                                 if (NewCheck())
1552                                                         continue;
1553
1554                                                 if (IsWhiteSpace(_c))
1555                                                         continue;
1556
1557                                                 if ((_c == '\'') || (_c == '"'))
1558                                                 {
                                                             // quoted value: it starts after the quote, and the quote
                                                             // character is remembered so the matching one ends it
1559                                                         _state = ParseState.QuotedAttributeValue;
1560                                                         PushAttributeValueStart(_index);
1561                                                         lastquote = _c;
1562                                                         continue;
1563                                                 }
1564                                                 if (_c == '>')
1565                                                 {
1566                                                         PushNodeEnd(_index, false);
1567                                                         if (_state != ParseState.AttributeAfterEquals)
1568                                                                 continue;
1569                                                         _state = ParseState.Text;
1570                                                         PushNodeStart(HtmlNodeType.Text, _index);
1571                                                         continue;
1572                                                 }
                                                     // unquoted value: it starts at the current character
1573                                                 PushAttributeValueStart(_index-1);
1574                                                 _state = ParseState.AttributeValue;
1575                                                 break;
1576
1577                                         case ParseState.AttributeValue:
1578                                                 if (NewCheck())
1579                                                         continue;
1580
1581                                                 if (IsWhiteSpace(_c))
1582                                                 {
1583                                                         PushAttributeValueEnd(_index-1);
1584                                                         _state = ParseState.BetweenAttributes;
1585                                                         continue;
1586                                                 }
1587
1588                                                 if (_c == '>')
1589                                                 {
1590                                                         PushAttributeValueEnd(_index-1);
1591                                                         PushNodeEnd(_index, false);
1592                                                         if (_state != ParseState.AttributeValue)
1593                                                                 continue;
1594                                                         _state = ParseState.Text;
1595                                                         PushNodeStart(HtmlNodeType.Text, _index);
1596                                                         continue;
1597                                                 }
1598                                                 break;
1599
                                             // NewCheck() is not called in this state: inside quotes a '<' only
                                             // matters when it begins "<%".
1600                                         case ParseState.QuotedAttributeValue:
1601                                                 if (_c == lastquote)
1602                                                 {
1603                                                         PushAttributeValueEnd(_index-1);
1604                                                         _state = ParseState.BetweenAttributes;
1605                                                         continue;
1606                                                 }
1607                                                 if (_c == '<')
1608                                                 {
1609                                                         //SLIM: if (_index<_text.Length)
1610                                                         if (!_text.Eof (_index))
1611                                                         {
1612                                                                 if (_text[_index] == '%')
1613                                                                 {
1614                                                                         _oldstate = _state;
1615                                                                         _state = ParseState.ServerSideCode;
1616                                                                         continue;
1617                                                                 }
1618                                                         }
1619                                                 }
1620                                                 break;
1621
1622                                         case ParseState.Comment:
1623                                                 if (_c == '>')
1624                                                 {
1625                                                         if (_fullcomment)
1626                                                         {
                                                                     // a "<!--" comment only ends at "-->": the two
                                                                     // characters before this '>' must both be '-'
1627                                                                 if ((_text[_index-2] != '-') ||
1628                                                                         (_text[_index-3] != '-'))
1629                                                                 {
1630                                                                         continue;
1631                                                                 }
1632                                                         }
1633                                                         PushNodeEnd(_index, false);
1634                                                         _state = ParseState.Text;
1635                                                         PushNodeStart(HtmlNodeType.Text, _index);
1636                                                         continue;
1637                                                 }
1638                                                 break;
1639
1640                                         case ParseState.ServerSideCode:
                                                     // look for the closing "%>"
1641                                                 if (_c == '%')
1642                                                 {
1643                                                         //SLIM: if (_index<_text.Length)
1644                                                         if (! _text.Eof (_index))
1645                                                         {
1646                                                                 if (_text[_index] == '>')
1647                                                                 {
                                                                             // resume according to the state saved when the
                                                                             // "<%" was met
1648                                                                         switch(_oldstate)
1649                                                                         {
1650                                                                                 case ParseState.AttributeAfterEquals:
1651                                                                                         _state = ParseState.AttributeValue;
1652                                                                                         break;
1653
1654                                                                                 case ParseState.BetweenAttributes:
1655                                                                                         PushAttributeNameEnd(_index+1);
1656                                                                                         _state = ParseState.BetweenAttributes;
1657                                                                                         break;
1658
1659                                                                                 default:
1660                                                                                         _state = _oldstate;
1661                                                                                         break;
1662                                                                         }
                                                                             // consume the '>' of "%>"
1663                                                                         IncrementPosition();
1664                                                                 }
1665                                                         }
1666                                                 }
1667                                                 break;
1668
1669                                         // handle <script>a="</script>"</script>
1670                                         case ParseState.PcDataQuote:
                                                     // the quoted run ends at the same quote character, unless the
                                                     // character just before it is a backslash
1671                                                 if ((_c == _pcdata_quote_char) && (_text [_index - 2] != '\\')) {
1672                                                         _pcdata_quote_char = '\0';
1673                                                         _state = ParseState.PcData;
1674                                                 }
1675                                                 break;
1676
1677                                         case ParseState.PcData:
                                                     // NOTE(review): this Substring runs before the end-of-buffer check
                                                     // below; whether it is evaluated depends on how Debug is declared
                                                     // (not visible here) - confirm it cannot run past the end of _text.
1678                                                 Debug ("PCDATA " + _currentnode.Name + " " + _text.Substring(_index-1,  _currentnode._namelength+2));
                                                     // a quote opens a quoted run, during which the closing tag is not
                                                     // searched for
1679                                                 if (_c == '\"' || _c == '\''){
1680                                                         _pcdata_quote_char = _c;
1681                                                         _state = ParseState.PcDataQuote;
1682                                                         break;
1683                                                 }
1684                                                 // look for </tag + 1 char
1685
1686                                                 // check buffer end
1687                                                 //SLIM: if ((_currentnode._namelength+3)<=(_text.Length-(_index-1)))
1688                                                 if (! _text.Eof (_currentnode._namelength + _index + 1))
1689                                                 {
                                                             // case-insensitive match of "</name", starting at the
                                                             // current character (_index-1)
1690                                                         if (string.Compare(_text.Substring(_index-1, _currentnode._namelength+2),
1691                                                                 "</" + _currentnode.Name, true) == 0)
1692                                                         {
                                                                     // c is the character right after "</name"
1693                                                                 int c = _text[_index-1 + 2 + _currentnode.Name.Length];
1694                                                                 if ((c == '>') || (IsWhiteSpace(c)))
1695                                                                 {
1696                                                                         // add the script as a text node
                                                                             // (it runs from the end of the current node's outer
                                                                             // span up to, but not including, the "</")
1697                                                                         HtmlNode script = CreateNode(HtmlNodeType.Text,
1698                                                                                 _currentnode._outerstartindex + _currentnode._outerlength);
1699                                                                         script._outerlength = _index-1 - script._outerstartindex;
                                                                             // in stream mode the node goes to the ReportNode
                                                                             // callback; a false result stops the parse loop
1700                                                                         if (_streammode && ReportNode != null)
1701                                                                                 _stop_parsing = ! ReportNode (script);
1702                                                                         else
1703                                                                                 _currentnode.AppendChild(script);
1704                                                                         Debug ("Found script: [" + script.InnerText + "]");
1705
                                                                             // start the closing tag: the node begins at '<', its
                                                                             // name two characters later; the extra
                                                                             // IncrementPosition() consumes the '/'
1706                                                                         PushNodeStart(HtmlNodeType.Element, _index-1);
1707                                                                         PushNodeNameStart(false, _index-1 +2);
1708                                                                         _state = ParseState.Tag;
1709                                                                         IncrementPosition();
1710                                                                 }
1711                                                         }
1712                                                 }
1713                                                 break;
1714                                 }
1715                         }
1716
1717                         // finish the current work
1718                         if (_currentnode._namestartindex > 0)
1719                         {
1720                                 PushNodeNameEnd(_index);
1721                         }
1722                         PushNodeEnd(_index, false);
1723
1724                         // we don't need this anymore
1725                         _lastnodes.Clear();
1726                 }
1727
1728                 // Reacts to a '<' held in _c. "<%" switches to ServerSideCode (remembering the
1729                 // state to return to in _oldstate); otherwise the node in progress is ended and
1730                 // a comment ("<!") or an element node is started.
1731                 //
1732                 // Returns false, changing nothing, when _c is not '<'; returns true in every
1733                 // other case, which callers in Parse() use to skip to the next character.
1734                 private bool NewCheck()
1735                 {
1736                         if (_c != '<')
1737                         {
1738                                 return false;
1739                         }
1740
1741                         // _index already points at the character that follows the '<'
1742                         //SLIM: if (_index<_text.Length)
1743                         if (!_text.Eof(_index) && _text[_index] == '%')
1744                         {
1745                                 switch (_state)
1746                                 {
1747                                         case ParseState.AttributeAfterEquals:
1748                                                 PushAttributeValueStart(_index-1);
1749                                                 break;
1750
1751                                         case ParseState.BetweenAttributes:
1752                                                 PushAttributeNameStart(_index-1);
1753                                                 break;
1754
1755                                         case ParseState.WhichTag:
1756                                                 PushNodeNameStart(true, _index-1);
1757                                                 _state = ParseState.Tag;
1758                                                 break;
1759                                 }
1760                                 _oldstate = _state;
1761                                 _state = ParseState.ServerSideCode;
1762                                 return true;
1763                         }
1764
1765                         // any other '<' ends the node in progress
1766                         PushNodeEnd(_index-1, true);
1767                         _state = ParseState.WhichTag;
1768
1769                         //SLIM: if ((_index-1) <= (_text.Length-2))
1770                         if (!_text.Eof(_index) && _text[_index] == '!')
1771                         {
1772                                 PushNodeStart(HtmlNodeType.Comment, _index-1);
1773                                 PushNodeNameStart(true, _index);
1774                                 PushNodeNameEnd(_index+1);
1775                                 _state = ParseState.Comment;
1776
1777                                 // "<!--" marks a full comment, which Parse() only closes on "-->".
1778                                 // The flag is assigned unconditionally: when fewer than two characters
1779                                 // follow the '!', it must not keep the value left by an earlier comment.
1780                                 //SLIM: if (_index<(_text.Length-2))
1781                                 _fullcomment = !_text.Eof(_index + 2) &&
1782                                         (_text[_index+1] == '-') &&
1783                                         (_text[_index+2] == '-');
1784                                 return true;
1785                         }
1786
1787                         PushNodeStart(HtmlNodeType.Element, _index-1);
1788                         return true;
1789                 }
1790
1791                 private void ReadDocumentEncoding(HtmlNode node)
1792                 {
1793                         if (!OptionReadEncoding)
1794                                 return;
1795                         // format is
1796                         // <meta http-equiv="content-type" content="text/html;charset=iso-8859-1" />
1797
1798                         // when we append a child, we are in node end, so attributes are already populated
1799                         if (node._namelength == 4)      // quick check, avoids string alloc
1800                         {
1801                                 // only these nodes can occur before meta
1802                                 // if we started seeing any other node, we will never see a meta node
1803                                 if (node.NodeType == HtmlNodeType.Element &&
1804                                              (node.Name != "head" && node.Name != "script" &&
1805                                               node.Name != "style" && node.Name != "title" &&
1806                                               node.Name != "head" && node.Name != "link" &&
1807                                               node.Name != "html" && node.Name != "meta")) {
1808                                             _declaredencoding = null;
1809                                             if (_onlyDetectEncoding)
1810                                                     throw new EncodingFoundException (null);
1811                                             else
1812                                                     return;
1813                                             // FIXME: Should also handle declaredencoding mismatch with detected
1814                                             // encoding, as done below. None of the current filters run in error
1815                                             // detection mode currently, so its not needed now.
1816                                 }
1817                                 else if (node.Name == "meta") // all nodes names are lowercase
1818                                 {
1819                                         HtmlAttribute att = node.Attributes["http-equiv"];
1820                                         if (att != null)
1821                                         {
1822                                                 if (string.Compare(att.Value, "content-type", true) == 0)
1823                                                 {
1824                                                         HtmlAttribute content = node.Attributes["content"];
1825                                                         if (content != null)
1826                                                         {
1827                                                                 string charset = NameValuePairList.GetNameValuePairsValue(content.Value, "charset");
1828                                                                 if (charset != null)
1829                                                                 {
1830                                                                         _declaredencoding = Encoding.GetEncoding(charset);
1831                                                                         if (_onlyDetectEncoding)
1832                                                                         {
1833                                                                                 throw new EncodingFoundException(_declaredencoding);
1834                                                                         }
1835
1836                                                                         if (_streamencoding != null)
1837                                                                         {
1838                                                                                 if (_declaredencoding.WindowsCodePage != _streamencoding.WindowsCodePage)
1839                                                                                 {
1840                                                                                         AddError(
1841                                                                                                 HtmlParseErrorCode.CharsetMismatch,
1842                                                                                                 _line, _lineposition,
1843                                                                                                 _index, node.OuterHtml,
1844                                                                                                 "Encoding mismatch between StreamEncoding: " +
1845                                                                                                 _streamencoding.WebName + " and DeclaredEncoding: " + _declaredencoding.WebName);
1846                                                                                 }
1847                                                                         }
1848                                                                 }
1849                                                         }
1850                                                 }
1851                                         }
1852                                 }
1853                         }
1854                 }
1855
1856                 private void PushAttributeNameStart(int index)
1857                 {
1858                         _currentattribute = CreateAttribute();
1859                         _currentattribute._namestartindex = index;
1860                         _currentattribute._line = _line;
1861                         _currentattribute._lineposition = _lineposition;
1862                         _currentattribute._streamposition = index;
1863                 }
1864
1865                 private void PushAttributeNameEnd(int index)
1866                 {
1867                         _currentattribute._namelength = index - _currentattribute._namestartindex;
1868                         _currentnode.Attributes.Append(_currentattribute);
1869                 }
1870
1871                 private void PushAttributeValueStart(int index)
1872                 {
1873                         _currentattribute._valuestartindex = index;
1874                 }
1875
1876                 private void PushAttributeValueEnd(int index)
1877                 {
1878                         _currentattribute._valuelength = index - _currentattribute._valuestartindex;
1879                 }
1880
1881                 private void PushNodeStart(HtmlNodeType type, int index)
1882                 {
1883                         _currentnode = CreateNode(type, index);
1884                         _currentnode._line = _line;
1885                         _currentnode._lineposition = _lineposition;
1886                         if (type == HtmlNodeType.Element)
1887                         {
1888                                 _currentnode._lineposition--;
1889                         }
1890                         _currentnode._streamposition = index;
1891                 }
1892
1893                 private void PushNodeEnd(int index, bool close)
1894                 {
1895                         _currentnode._outerlength = index - _currentnode._outerstartindex;
1896
1897                         //SLIM: inform caller
1898                         if (_streammode && ReportNode != null)
1899                                 _stop_parsing = ! ReportNode (_currentnode);
1900
1901                         if (_debug) {
1902                                 if (_currentnode._nodetype == HtmlNodeType.Text)
1903                                         Debug ("Text:" + _currentnode.InnerText);
1904                                 else
1905                                         Debug ((_currentnode.StartTag ? "Start-" : "End-") + _currentnode.Name);
1906                         }
1907                         if ((_currentnode._nodetype == HtmlNodeType.Text) ||
1908                                 (_currentnode._nodetype == HtmlNodeType.Comment))
1909                         {
1910                                 // forget about void nodes
1911                                 if (_currentnode._outerlength>0)
1912                                 {
1913                                         _currentnode._innerlength = _currentnode._outerlength;
1914                                         _currentnode._innerstartindex = _currentnode._outerstartindex;
1915                                         // SLIM: no need to append child in stream mode
1916                                         // SLIM: whatever the caller needs to do, tell it to do now
1917                                         if (!_streammode && _lastparentnode != null)
1918                                         {
1919                                            _lastparentnode.AppendChild(_currentnode);
1920                                         }
1921                                 }
1922                         }
1923                         else
1924                         {
1925                                 if ((_currentnode._starttag) && (_lastparentnode != _currentnode))
1926                                 {
1927                                         // add to parent node
1928                                         // SLIM: no need to append child in stream mode
1929                                         // SLIM: whatever the caller needs to do, tell it to do now
1930                                         if (!_streammode && _lastparentnode != null)
1931                                         {
1932                                            _lastparentnode.AppendChild(_currentnode);
1933                                         }
1934
1935                                         ReadDocumentEncoding(_currentnode);
1936
1937                                         // remember last node of this kind
1938                                         // SLIM: we still to store _currentnode to help other tags in the same level
1939                                         HtmlNode prev = (HtmlNode)_lastnodes[_currentnode.Name];
1940                                         _currentnode._prevwithsamename = prev;
1941                                         _lastnodes[_currentnode.Name] = _currentnode;
1942
1943                                         // change parent?
1944                                         if ((_currentnode.NodeType == HtmlNodeType.Document) ||
1945                                                 (_currentnode.NodeType == HtmlNodeType.Element))
1946                                         {
1947                                                 _lastparentnode = _currentnode;
1948                                         }
1949
1950                                         if (HtmlNode.IsCDataElement(CurrentNodeName()))
1951                                         {
1952                                                 _state = ParseState.PcData;
1953                                                 return;
1954                                         }
1955
1956                                         if ((HtmlNode.IsClosedElement(_currentnode.Name)) ||
1957                                                 (HtmlNode.IsEmptyElement(_currentnode.Name)))
1958                                         {
1959                                                 close = true;
1960                                         }
1961                                 }
1962                         }
1963
1964                         if ((close) || (!_currentnode._starttag))
1965                         {
1966                                 CloseCurrentNode();
1967                                 if ((_currentnode._nodetype == HtmlNodeType.Text) ||
1968                                     (_currentnode._nodetype == HtmlNodeType.Comment))
1969                                         _currentnode = null;
1970                         }
1971                 }
1972
1973                 private void PushNodeNameStart(bool starttag, int index)
1974                 {
1975                         _currentnode._starttag = starttag;
1976                         _currentnode._namestartindex = index;
1977                 }
1978
1979                 private string[] GetResetters(string name)
1980                 {
1981                         switch (name)
1982                         {
1983                                 case "li":
1984                                         return new string[]{"ul"};
1985
1986                                 case "tr":
1987                                         return new string[]{"table"};
1988
1989                                 case "th":
1990                                 case "td":
1991                                         return new string[]{"tr", "table"};
1992
1993                                 default:
1994                                         return null;
1995                         }
1996                 }
1997
1998                 private void FixNestedTags()
1999                 {
2000                         // we are only interested by start tags, not closing tags
2001                         if (!_currentnode._starttag)
2002                                 return;
2003
2004                         string name = CurrentNodeName().ToLower();
2005                         FixNestedTag(name, GetResetters(name));
2006                 }
2007
2008                 private void FixNestedTag(string name, string[] resetters)
2009                 {
2010                         if (resetters == null)
2011                                 return;
2012
2013                         HtmlNode prev;
2014
2015                         // if we find a previous unclosed same name node, without a resetter node between, we must close it
2016                         prev = (HtmlNode)_lastnodes[name];
2017                         if ((prev != null) && (!prev.Closed))
2018                         {
2019
2020                                 // try to find a resetter node, if found, we do nothing
2021                                 if (FindResetterNodes(prev, resetters))
2022                                 {
2023                                         return;
2024                                 }
2025
2026                                 // ok we need to close the prev now
2027                                 // create a fake closer node
2028                                 HtmlNode close = new HtmlNode(prev.NodeType, this, -1);
2029                                 close._endnode = close;
2030                                 prev.CloseNode(close);
2031
2032                         }
2033                 }
2034
2035                 private bool FindResetterNodes(HtmlNode node, string[] names)
2036                 {
2037                         if (names == null)
2038                         {
2039                                 return false;
2040                         }
2041                         for(int i=0;i<names.Length;i++)
2042                         {
2043                                 if (FindResetterNode(node, names[i]) != null)
2044                                 {
2045                                         return true;
2046                                 }
2047                         }
2048                         return false;
2049                 }
2050
2051                 private HtmlNode FindResetterNode(HtmlNode node, string name)
2052                 {
2053                         HtmlNode resetter = (HtmlNode)_lastnodes[name];
2054                         if (resetter == null)
2055                                 return null;
2056                         if (resetter.Closed)
2057                         {
2058                                 return null;
2059                         }
2060                         if (resetter._streamposition<node._streamposition)
2061                         {
2062                                 return null;
2063                         }
2064                         return resetter;
2065                 }
2066
2067                 private void PushNodeNameEnd(int index)
2068                 {
2069                         _currentnode._namelength = index - _currentnode._namestartindex;
2070                         if (OptionFixNestedTags)
2071                         {
2072                                 FixNestedTags();
2073                         }
2074                 }
2075
2076                 private void CloseCurrentNode()
2077                 {
2078                         if (_currentnode.Closed) // text or document are by def closed
2079                                 return;
2080
2081                         bool error = false;
2082
2083                         // find last node of this kind
2084                         HtmlNode prev = (HtmlNode)_lastnodes[_currentnode.Name];
2085                         if (prev == null)
2086                         {
2087                                 if (HtmlNode.IsClosedElement(_currentnode.Name))
2088                                 {
2089                                         // </br> will be seen as <br>
2090                                         _currentnode.CloseNode(_currentnode);
2091
2092                                         // add to parent node
2093                                         if (_lastparentnode != null)
2094                                         {
2095                                                 HtmlNode foundNode = null;
2096                                                 Stack futureChild = new Stack();
2097                                                 for (HtmlNode node = _lastparentnode.LastChild; node != null; node = node.PreviousSibling)
2098                                                 {
2099                                                         if ((node.Name == _currentnode.Name) && (! node.HasChildNodes))
2100                                                         {
2101                                                                 foundNode = node;
2102                                                                 break;
2103                                                         }
2104                                                         futureChild.Push(node);
2105                                                 }
2106                                                 if (foundNode != null)
2107                                                 {
2108                                                         HtmlNode node = null;
2109                                                         while(futureChild.Count != 0)
2110                                                         {
2111                                                                 node = (HtmlNode)futureChild.Pop();
2112                                                                 _lastparentnode.RemoveChild(node);
2113                                                                 foundNode.AppendChild(node);
2114                                                         }
2115                                                 }
2116                                                 else
2117                                                 {
2118                                                         _lastparentnode.AppendChild(_currentnode);
2119                                                 }
2120
2121                                         }
2122                                 }
2123                                 else
2124                                 {
2125                                         // node has no parent
2126                                         // node is not a closed node
2127
2128                                         if (HtmlNode.CanOverlapElement(_currentnode.Name))
2129                                         {
2130                                                 // this is a hack: add it as a text node
2131                                                 HtmlNode closenode = CreateNode(HtmlNodeType.Text, _currentnode._outerstartindex);
2132                                                 closenode._outerlength = _currentnode._outerlength;
2133                                                 ((HtmlTextNode)closenode).Text = ((HtmlTextNode)closenode).Text.ToLower();
2134                                                 if (_lastparentnode != null)
2135                                                 {
2136                                                         _lastparentnode.AppendChild(closenode);
2137                                                 }
2138
2139                                         }
2140                                         else
2141                                         {
2142                                                 if (HtmlNode.IsEmptyElement(_currentnode.Name))
2143                                                 {
2144                                                         AddError(
2145                                                                 HtmlParseErrorCode.EndTagNotRequired,
2146                                                                 _currentnode._line, _currentnode._lineposition,
2147                                                                 _currentnode._streamposition, _currentnode.OuterHtml,
2148                                                                 "End tag </" + _currentnode.Name + "> is not required");
2149                                                 }
2150                                                 else
2151                                                 {
2152                                                         // node cannot overlap, node is not empty
2153                                                         AddError(
2154                                                                 HtmlParseErrorCode.TagNotOpened,
2155                                                                 _currentnode._line, _currentnode._lineposition,
2156                                                                 _currentnode._streamposition, _currentnode.OuterHtml,
2157                                                                 "Start tag <" + _currentnode.Name + "> was not found");
2158                                                         error = true;
2159                                                 }
2160                                         }
2161                                 }
2162                         }
2163                         else
2164                         {
2165                                 if (OptionFixNestedTags)
2166                                 {
2167                                         if (FindResetterNodes(prev, GetResetters(_currentnode.Name)))
2168                                         {
2169                                                 AddError(
2170                                                         HtmlParseErrorCode.EndTagInvalidHere,
2171                                                         _currentnode._line, _currentnode._lineposition,
2172                                                         _currentnode._streamposition, _currentnode.OuterHtml,
2173                                                         "End tag </" + _currentnode.Name + "> invalid here");
2174                                                 error = true;
2175                                         }
2176                                 }
2177
2178                                 if (!error)
2179                                 {
2180                                         _lastnodes[_currentnode.Name] = prev._prevwithsamename;
2181                                         prev.CloseNode(_currentnode);
2182                                 }
2183                         }
2184
2185
2186                         // we close this node, get grandparent
2187                         if (!error)
2188                         {
2189                                 if ((_lastparentnode != null) &&
2190                                         ((!HtmlNode.IsClosedElement(_currentnode.Name)) ||
2191                                         (_currentnode._starttag)))
2192                                 {
2193                                         UpdateLastParentNode();
2194                                 }
2195                         }
2196                 }
2197
2198                 internal void UpdateLastParentNode()
2199                 {
2200                         do
2201                         {
2202                                 if (_lastparentnode.Closed)
2203                                 {
2204                                         _lastparentnode = _lastparentnode.ParentNode;
2205                                 }
2206                         }
2207                         while ((_lastparentnode != null) && (_lastparentnode.Closed));
2208                         if (_lastparentnode == null)
2209                         {
2210                                 _lastparentnode = _documentnode;
2211                         }
2212                 }
2213
2214                 private string CurrentAttributeName()
2215                 {
2216                         return _text.Substring(_currentattribute._namestartindex, _currentattribute._namelength);
2217                 }
2218
2219                 private string CurrentAttributeValue()
2220                 {
2221                         return _text.Substring(_currentattribute._valuestartindex, _currentattribute._valuelength);
2222                 }
2223
2224                 private string CurrentNodeName()
2225                 {
2226                         return _text.Substring(_currentnode._namestartindex, _currentnode._namelength);
2227                 }
2228
2229                 private string CurrentNodeOuter()
2230                 {
2231                         return _text.Substring(_currentnode._outerstartindex, _currentnode._outerlength);
2232                 }
2233
2234                 private string CurrentNodeInner()
2235                 {
2236                         return _text.Substring(_currentnode._innerstartindex, _currentnode._innerlength);
2237                 }
2238
2239                 /// <summary>
2240                 /// Determines if the specified character is considered as a whitespace character.
2241                 /// </summary>
2242                 /// <param name="c">The character to check.</param>
2243                 /// <returns>true if if the specified character is considered as a whitespace character.</returns>
2244                 public static bool IsWhiteSpace(int c)
2245                 {
2246                         if ((c == 10) || (c == 13) || (c == 32) || (c == 9))
2247                         {
2248                                 return true;
2249                         }
2250                         return false;
2251                 }
2252
2253         }
2254
2255         internal class EncodingFoundException: Exception
2256         {
2257                 private Encoding _encoding;
2258
2259                 internal EncodingFoundException(Encoding encoding)
2260                 {
2261                         _encoding = encoding;
2262                 }
2263
2264                 internal Encoding Encoding
2265                 {
2266                         get
2267                         {
2268                                 return _encoding;
2269                         }
2270                 }
2271         }
2272 }