Filters/HtmlAgilityPack/HtmlDocument.cs

   1 // HtmlAgilityPack V1.0 - Simon Mourier <simonm@microsoft.com>
   2
   3 /*
   4 Copyright (C) 2003 Simon Mourier <simonm@microsoft.com>
   5 All rights reserved.
   6
   7 Redistribution and use in source and binary forms, with or without
   8 modification, are permitted provided that the following conditions
   9 are met:
  10 1. Redistributions of source code must retain the above copyright
  11    notice, this list of conditions and the following disclaimer.
  12 2. Redistributions in binary form must reproduce the above copyright
  13    notice, this list of conditions and the following disclaimer in the
  14    documentation and/or other materials provided with the distribution.
  15 3. The name of the author may not be used to endorse or promote products
  16    derived from this software without specific prior written permission.
  17
  18 THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  19 IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  20 OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  21 IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  22 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  23 NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  24 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  25 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  26 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  27 THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  28 */
  29
  30 using System;
  31 using System.IO;
  32 using System.Text;
  33 using System.Diagnostics;
  34 using System.Collections;
  35 using System.Text.RegularExpressions;
  36 using System.Xml;
  37 using System.Xml.XPath;
  38
  39
  40 // Legend: SLIM=Comment added describing changes to original HtmlAgilityPack
  41 //              to reduce memory consumption
  42 // Once the parser is free of bugs, the comments will be taken out
  43 namespace HtmlAgilityPack
  44 {
  45         /// <summary>
  46         /// Represents the type of parsing error.
  47         /// </summary>
  48         public enum HtmlParseErrorCode
  49         {
  50                 /// <summary>
  51                 /// A tag was not closed.
  52                 /// </summary>
  53                 TagNotClosed,
  54
  55                 /// <summary>
  56                 /// A tag was not opened.
  57                 /// </summary>
  58                 TagNotOpened,
  59
  60                 /// <summary>
  61                 /// There is a charset mismatch between stream and declared (META) encoding.
  62                 /// </summary>
  63                 CharsetMismatch,
  64
  65                 /// <summary>
  66                 /// An end tag was not required.
  67                 /// </summary>
  68                 EndTagNotRequired,
  69
  70                 /// <summary>
  71                 /// An end tag is invalid at this position.
  72                 /// </summary>
  73                 EndTagInvalidHere
  74         }
  75
  76         /// <summary>
  77         /// Represents a parsing error found during document parsing.
  78         /// </summary>
  79         public class HtmlParseError
  80         {
  81                 private HtmlParseErrorCode _code;
  82                 private int _line;
  83                 private int _linePosition;
  84                 private int _streamPosition;
  85                 private string _sourceText;
  86                 private string _reason;
  87
  88                 internal HtmlParseError(
  89                         HtmlParseErrorCode code,
  90                         int line,
  91                         int linePosition,
  92                         int streamPosition,
  93                         string sourceText,
  94                         string reason)
  95                 {
  96                         _code = code;
  97                         _line = line;
  98                         _linePosition = linePosition;
  99                         _streamPosition = streamPosition;
 100                         _sourceText = sourceText;
 101                         _reason = reason;
 102                 }
 103
 104                 /// <summary>
 105                 /// Gets the type of error.
 106                 /// </summary>
 107                 public HtmlParseErrorCode Code
 108                 {
 109                         get
 110                         {
 111                                 return _code;
 112                         }
 113                 }
 114
 115                 /// <summary>
 116                 /// Gets the line number of this error in the document.
 117                 /// </summary>
 118                 public int Line
 119                 {
 120                         get
 121                         {
 122                                 return _line;
 123                         }
 124                 }
 125
 126                 /// <summary>
 127                 /// Gets the column number of this error in the document.
 128                 /// </summary>
 129                 public int LinePosition
 130                 {
 131                         get
 132                         {
 133                                 return _linePosition;
 134                         }
 135                 }
 136
 137                 /// <summary>
 138                 /// Gets the absolstream position of this error in the document, relative to the start of the document.
 139                 /// </summary>
 140                 public int StreamPosition
 141                 {
 142                         get
 143                         {
 144                                 return _streamPosition;
 145                         }
 146                 }
 147
 148                 /// <summary>
 149                 /// Gets the the full text of the line containing the error.
 150                 /// </summary>
 151                 public string SourceText
 152                 {
 153                         get
 154                         {
 155                                 return _sourceText;
 156                                 }
 157                 }
 158
 159                 /// <summary>
 160                 /// Gets a description for the error.
 161                 /// </summary>
 162                 public string Reason
 163                 {
 164                         get
 165                         {
 166                                 return _reason;
 167                         }
 168                 }
 169         }
 170
 171
 172         abstract class StreamAsArray {
 173                 public abstract bool Eof (int index);
 174                 public abstract char this [int index] { get;}
 175                 public abstract string Substring (int startindex, int length);
 176                 public abstract int FullLength { get;}
 177         }
 178
 179         // SLIM: creating this class to wrap around a textreader
 180         //       to emulate ReadToEnd () behaviour
 181         class ImplStreamAsArray : StreamAsArray {
 182                 private StreamReader _reader;
 183                 private int _length;
 184                 private int _position;
 185                 private bool _eof;
 186                 private char[] _buf_previous; // could have used only one array
 187                 private char[] _buf_current; // but, this is cleaner
 188                 private int _block_size;
 189
 190                 public ImplStreamAsArray (StreamReader r)
 191                 {
 192                         _reader = r;
 193                         _length = 0;
 194                         _position = 0;
 195                         _eof = false;
 196
 197                         _block_size = 1024;
 198                         _buf_previous = new char [_block_size];
 199                         _buf_current = new char [_block_size];
 200
 201                         Read (true);
 202                 }
 203
 204                 private void Read (bool initial)
 205                 {
 206                         if ( !initial) {
 207                                 Array.Copy (_buf_current, _buf_previous, _block_size);
 208                                 _position += _block_size;
 209                         }
 210                         HtmlDocument.Debug ("Debug: Read in buffer at:" + _position);
 211
 212                         int num_read = _reader.Read (_buf_current, 0, _block_size);
 213                         if (num_read < _block_size) {
 214                                 _eof = true;
 215                                 _length = _position + num_read;
 216                         }
 217                         HtmlDocument.Debug ("[" + new string (_buf_current, 0, num_read) + "]");
 218                 }
 219
 220                 public override bool Eof (int index) {
 221                         if (_eof)
 222                                 return (index == _length);
 223                         else {
 224                                 if (index >= _position + _block_size &&
 225                                     index < _position + _block_size + _block_size)
 226                                         Read (false);
 227                                 if (_eof)
 228                                         return (index == _length);
 229                                 else
 230                                         return false;
 231                         }
 232                 }
 233
 234                 public override char this[int index] {
 235                         get {
 236                                 if (index >= _position &&
 237                                     index < _position + _block_size)
 238                                         return _buf_current [index % _block_size];
 239                                 if (index >= _position - _block_size &&
 240                                     index < _position)
 241                                         return _buf_previous [ index % _block_size];
 242                                 if (index >= _position + _block_size &&
 243                                     index < _position + _block_size + _block_size) {
 244                                         Read (false);
 245                                         return _buf_current [index % _block_size];
 246                                 }
 247                                 Console.WriteLine ("EXCEPTION!!!");
 248                                 throw new Exception (String.Format ("{0} is out of current bounds:[{1}-{2}] and further than read-ahead",
 249                                                                     index,
 250                                                                     _position - _block_size,
 251                                                                     _position + _block_size - 1));
 252                         }
 253                 }
 254
 255                 // evil function ... you get what you pay for!
 256                 private string OutOfBandRead (int startindex, int length)
 257                 {
 258                         HtmlDocument.Debug ("Out of band read! From " + startindex + " to " + (startindex + length - 1));
 259                         ResetPosition (startindex);
 260                         // ahh.. now we are at the correct place
 261                         // create a buffer of required length
 262                         // who cares if the buffer size does not align well
 263                         // with page boundary
 264                         char[] temp_buf = new char [length];
 265                         int num_read = _reader.Read (temp_buf, 0, length);
 266                         if (num_read < length) {
 267                                 // Shouldnt occur!!!
 268                                 _eof = true;
 269                                 _length = startindex + num_read;
 270                         }
 271                         // discard data and reset stream position
 272                         int t = (_eof ? _length :_position + _block_size);
 273                         ResetPosition (t);
 274                         return new String (temp_buf);
 275                 }
 276
 277                 // streamreader does not allow seeking
 278                 // seek on its basestream does not reflect the position
 279                 // of the reader - it is governed by the buffer size
 280                 // of the underlying stream
 281                 // :( so, read character by character from beginning ...
 282                 private void ResetPosition (int pos)
 283                 {
 284                         _reader.DiscardBufferedData ();
 285                         _reader.BaseStream.Position = 0;
 286                         // read in chunks of block_size
 287                         int n1 = pos / _block_size;
 288                         int n2 = pos % _block_size;
 289                         char[] tmp = new char [_block_size];
 290                         // yo ho... start reading till we have reach pos
 291                         // hopefully, reader will buffer itself, so we can be mean and get one char at a time
 292                         for (int i = 0; i < n1; ++i)
 293                                 _reader.Read (tmp, 0, _block_size);
 294                         for (int i = 0; i < n2; ++i)
 295                                 _reader.Read ();
 296                         tmp = null;
 297                 }
 298
 299                 public override string Substring (int startindex, int length)
 300                 {
 301                         if (length == 0) {
 302                                 HtmlDocument.Debug ("substring:" + startindex + " " + length + " " + _position + ":");
 303                                 return String.Empty;
 304                         }
 305                         if (length > _block_size || startindex < _position - _block_size) {
 306                                 return OutOfBandRead (startindex, length);
 307                         }
 308                         if (startindex + length - 1 >= _position + _block_size) {
 309                                 Read (false);
 310                         }
 311                         string substr;
 312                         if (startindex < _position) {
 313                                 int len_1 = _position - startindex;
 314                                 if (length < len_1)
 315                                         substr = new String (_buf_previous, _block_size - len_1, length);
 316                                 else {
 317                                         substr = new String (_buf_previous, _block_size - len_1, len_1);
 318                                         substr += new String (_buf_current, 0, length - len_1);
 319                                 }
 320                         } else {
 321                                 substr = new String (_buf_current, startindex - _position, length);
 322                         }
 323                         return substr;
 324                 }
 325
 326                 // FIXME: Is this costly ?
 327                 public override int FullLength {
 328                         get {
 329                                 return (int)_reader.BaseStream.Length;
 330                         }
 331                 }
 332         }
 333
 334         // A dummy StreamAsArray wrapper around a string
 335         class DummyStreamAsArray : StreamAsArray {
 336                 private string _base_string;
 337                 private int _length;
 338
 339                 public DummyStreamAsArray(string str)
 340                 {
 341                         _base_string = str;
 342                         _length = str.Length;
 343                 }
 344
 345                 public override bool Eof(int index)
 346                 {
 347                         return (index >= _length);
 348                 }
 349
 350                 public new char this[int index] {
 351                         get { return _base_string [index]; }
 352                 }
 353
 354                 public override string Substring (int startindex, int length)
 355                 {
 356                         return _base_string.Substring (startindex, length);
 357                 }
 358
 359                 public override int FullLength {
 360                         get { return _length; }
 361                 }
 362         }
 363
 364         /// <summary>
 365         /// Represents a complete HTML document.
 366         /// </summary>
 367         public class HtmlDocument: IXPathNavigable
 368         {
 369                 // SLIM: Make the parser event driven
 370                 // callback for FilterHtml
 371                 // return value is a way for the callback to signal to continue or stop parsing
 372                 public delegate bool NodeHandler (HtmlNode node);
 373                 public NodeHandler ReportNode;
 374                 // misnomer ... should be called event_driven_mode
 375                 private bool _streammode = false;
 376                 private bool _stop_parsing = false;
 377
 378                 internal static readonly string HtmlExceptionRefNotChild = "Reference node must be a child of this node";
 379                 internal static readonly string HtmlExceptionUseIdAttributeFalse = "You need to set UseIdAttribute property to true to enable this feature";
 380
 381                 internal Hashtable _openednodes;
 382                 internal Hashtable _lastnodes = new Hashtable();
 383                 internal Hashtable _nodesid;
 384                 private HtmlNode _documentnode;
 385                 //SLIM: internal string _text;
 386                 internal StreamAsArray _text;
 387                 private HtmlNode _currentnode;
 388                 private HtmlNode _lastparentnode;
 389                 private HtmlAttribute _currentattribute;
 390                 private int _index;
 391                 private int _line;
 392                 private int _lineposition, _maxlineposition;
 393                 private int _c;
 394                 private bool _fullcomment;
 395                 private System.Text.Encoding _streamencoding;
 396                 private System.Text.Encoding _declaredencoding;
 397                 private ArrayList _parseerrors = new ArrayList();
 398                 private ParseState _state, _oldstate;
 399                 private Crc32 _crc32 = null;
 400                 private bool _onlyDetectEncoding = false;
 401                 private int _pcdata_quote_char = '\0';
 402
 403                 private static bool _debug = false;
 404                 internal static void Debug (string s)
 405                 {
 406                         if (_debug)
 407                                 Console.WriteLine (s);
 408                 }
 409
 410                 // public props
 411
 412                 /// <summary>
 413                 /// Defines if a checksum must be computed for the document while parsing. Default is false.
 414                 /// </summary>
 415                 public bool OptionComputeChecksum = false;
 416
 417                 /// <summary>
 418                 /// Defines if declared encoding must be read from the document.
 419                 /// Declared encoding is determined using the meta http-equiv="content-type" content="text/html;charset=XXXXX" html node.
 420                 /// Default is true.
 421                 /// </summary>
 422                 public bool OptionReadEncoding = true;
 423
 424
 425                 /// <summary>
 426                 /// Defines if non closed nodes will be checked at the end of parsing. Default is true.
 427                 /// </summary>
 428                 public bool OptionCheckSyntax = true;
 429
 430                 /// <summary>
 431                 /// Defines if the 'id' attribute must be specifically used. Default is true.
 432                 /// </summary>
 433                 public bool OptionUseIdAttribute = true;
 434
 435                 /// <summary>
 436                 /// Defines if empty nodes must be written as closed during output. Default is false.
 437                 /// </summary>
 438                 public bool OptionWriteEmptyNodes = false;
 439
 440                 /// <summary>
 441                 /// Defines if output must conform to XML, instead of HTML.
 442                 /// </summary>
 443                 public bool OptionOutputAsXml = false;
 444
 445                 /// <summary>
 446                 /// Defines if name must be output in uppercase. Default is false.
 447                 /// </summary>
 448                 public bool OptionOutputUpperCase = false;
 449
 450                 /// <summary>
 451                 /// Defines if attribute value output must be optimized (not bound with double quotes if it is possible). Default is false.
 452                 /// </summary>
 453                 public bool OptionOutputOptimizeAttributeValues = false;
 454
 455                 /// <summary>
 456                 /// Adds Debugging attributes to node. Default is false.
 457                 /// </summary>
 458                 public bool OptionAddDebuggingAttributes = false;
 459
 460                 /// <summary>
 461                 /// Defines if source text must be extracted while parsing errors.
 462                 /// If the document has a lot of errors, or cascading errors, parsing performance can be dramatically affected if set to true.
 463                 /// Default is false.
 464                 /// </summary>
 465                 public bool OptionExtractErrorSourceText = false; // turning this on can dramatically slow performance if a lot of errors are detected
 466
 467                 /// <summary>
 468                 /// Defines if closing for non closed nodes must be done at the end or directly in the document.
 469                 /// Setting this to true can actually change how browsers render the page. Default is false.
 470                 /// </summary>
 471                 public bool OptionAutoCloseOnEnd = false; // close errors at the end
 472
 473                 /// <summary>
 474                 /// Defines if LI, TR, TH, TD tags must be partially fixed when nesting errors are detected. Default is false.
 475                 /// </summary>
 476                 public bool OptionFixNestedTags = false; // fix li, tr, th, td tags
 477
 478                 /// <summary>
 479                 /// Defines the maximum length of source text or parse errors. Default is 100.
 480                 /// </summary>
 481                 public int OptionExtractErrorSourceTextMaxLength = 100;
 482
 483                 /// <summary>
  484                 /// Defines the default stream encoding to use. Default is System.Text.Encoding.UTF8.
 485                 /// </summary>
 486                 // From http://www.w3.org/TR/REC-html40/charset.html
 487                 // The HTTP protocol ([RFC2616], section 3.7.1) mentions ISO-8859-1 as a default character encoding when the "charset" parameter is absent from the "Content-Type" header field.
 488                 // So, however we are still using UTF-8 for some unknown reason
 489                 //FIXME: Fix the default encoding!
 490                 public System.Text.Encoding OptionDefaultStreamEncoding = Encoding.UTF8;
 491
 492                 /// <summary>
 493                 /// Gets a list of parse errors found in the document.
 494                 /// </summary>
 495                 public ArrayList ParseErrors
 496                 {
 497                         get
 498                         {
 499                                 return _parseerrors;
 500                         }
 501                 }
 502
 503                 /// <summary>
 504                 /// Gets the document's stream encoding.
 505                 /// </summary>
 506                 public System.Text.Encoding StreamEncoding
 507                 {
 508                         get
 509                         {
 510                                 return _streamencoding;
 511                         }
 512                 }
 513
 514                 /// <summary>
 515                 /// Gets the document's declared encoding.
 516                 /// Declared encoding is determined using the meta http-equiv="content-type" content="text/html;charset=XXXXX" html node.
 517                 /// </summary>
 518                 public System.Text.Encoding DeclaredEncoding
 519                 {
 520                         get
 521                         {
 522                                 return _declaredencoding;
 523                         }
 524                 }
 525
 526                 /// <summary>
 527                 /// Creates an instance of an HTML document.
 528                 /// </summary>
 529                 public HtmlDocument()
 530                 {
 531                         _documentnode = CreateNode(HtmlNodeType.Document, 0);
 532                 }
 533
 534                 internal HtmlNode GetXmlDeclaration()
 535                 {
 536                         if (!_documentnode.HasChildNodes)
 537                         {
 538                                 return null;
 539                         }
 540
 541                         foreach(HtmlNode node in _documentnode._childnodes)
 542                         {
 543                                 if (node.Name == "?xml") // it's ok, names are case sensitive
 544                                 {
 545                                         return node;
 546                                 }
 547                         }
 548                         return null;
 549                 }
 550
 551                 /// <summary>
 552                 /// Applies HTML encoding to a specified string.
 553                 /// </summary>
 554                 /// <param name="html">The input string to encode. May not be null.</param>
 555                 /// <returns>The encoded string.</returns>
 556                 public static string HtmlEncode(string html)
 557                 {
 558                         if (html == null)
 559                         {
 560                                 throw new ArgumentNullException("html");
 561                         }
 562                         // replace & by &amp; but only once!
 563                         Regex rx = new Regex("&(?!(amp;)|(lt;)|(gt;)|(quot;))", RegexOptions.IgnoreCase);
 564                         return rx.Replace(html, "&amp;").Replace("<", "&lt;").Replace(">", "&gt;").Replace("\"", "&quot;");
 565                 }
 566
 567                 /// <summary>
 568                 /// Detects the encoding of an HTML stream.
 569                 /// </summary>
 570                 /// <param name="stream">The input stream. May not be null.</param>
 571                 /// <returns>The detected encoding.</returns>
 572                 public Encoding DetectEncoding(Stream stream)
 573                 {
 574                         if (stream == null)
 575                         {
 576                                 throw new ArgumentNullException("stream");
 577                         }
 578                         return DetectEncoding(new StreamReader(stream));
 579                 }
 580
 581                 /// <summary>
 582                 /// Detects the encoding of an HTML file.
 583                 /// </summary>
 584                 /// <param name="path">Path for the file containing the HTML document to detect. May not be null.</param>
 585                 /// <returns>The detected encoding.</returns>
 586                 public Encoding DetectEncoding(string path)
 587                 {
 588                         if (path == null)
 589                         {
 590                                 throw new ArgumentNullException("path");
 591                         }
 592                         StreamReader sr = new StreamReader(path, OptionDefaultStreamEncoding);
 593                         Encoding encoding = DetectEncoding(sr);
 594                         sr.Close();
 595                         return encoding;
 596                 }
 597
 598                 /// <summary>
 599                 /// Detects the encoding of an HTML text.
 600                 /// </summary>
 601                 /// <param name="html">The input html text. May not be null.</param>
 602                 /// <returns>The detected encoding.</returns>
 603                 public Encoding DetectEncodingHtml(string html)
 604                 {
 605                         if (html == null)
 606                         {
 607                                 throw new ArgumentNullException("html");
 608                         }
 609                         StringReader sr = new StringReader(html);
 610                         Encoding encoding = DetectEncoding(sr);
 611                         sr.Close();
 612                         return encoding;
 613                 }
 614
 615                 /// <summary>
 616                 /// Detects the encoding of an HTML text provided on a TextReader.
 617                 /// </summary>
 618                 /// <param name="reader">The TextReader used to feed the HTML. May not be null.</param>
 619                 /// <returns>The detected encoding.</returns>
 620                 public Encoding DetectEncoding(TextReader reader)
 621                 {
 622                         if (reader == null)
 623                         {
 624                                 throw new ArgumentNullException("reader");
 625                         }
 626                         _onlyDetectEncoding = true;
 627                         if (OptionCheckSyntax)
 628                         {
 629                                 _openednodes = new Hashtable();
 630                         }
 631                         else
 632                         {
 633                                 _openednodes = null;
 634                         }
 635
 636                         if (OptionUseIdAttribute)
 637                         {
 638                                 _nodesid = new Hashtable();
 639                         }
 640                         else
 641                         {
 642                                 _nodesid = null;
 643                         }
 644
 645                         StreamReader sr = reader as StreamReader;
 646                         if (sr != null)
 647                         {
 648                                 _streamencoding = sr.CurrentEncoding;
 649                                 _text = new ImplStreamAsArray (sr);
 650                         }
 651                         else
 652                         {
 653                                 _streamencoding = null;
 654                                 // Expensive, but cannot avoid since TextReader doesnt have any length of the underlying data
 655                                 _text = new DummyStreamAsArray (reader.ReadToEnd());
 656                         }
 657                         _declaredencoding = null;
 658
 659                         // SLIM: _text = reader.ReadToEnd();
 660                         _documentnode = CreateNode(HtmlNodeType.Document, 0);
 661
 662                         // this is a hack, but it allows us not to muck with the original parsing code
 663                         try
 664                         {
 665                                 Parse();
 666                         }
 667                         catch(EncodingFoundException ex)
 668                         {
 669                                 _lastnodes.Clear();
 670                                 return ex.Encoding;
 671                         }
 672                         return null;
 673                 }
 674
 675                 /// <summary>
 676                 /// Loads an HTML document from a stream.
 677                 /// </summary>
 678                 /// <param name="stream">The input stream.</param>
 679                 public void Load(Stream stream)
 680                 {
 681                         Load(new StreamReader(stream, OptionDefaultStreamEncoding));
 682                 }
 683
 684                 /// <summary>
 685                 /// Loads an HTML document from a stream.
 686                 /// </summary>
 687                 /// <param name="stream">The input stream.</param>
 688                 /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
 689                 public void Load(Stream stream, bool detectEncodingFromByteOrderMarks)
 690                 {
 691                         Load(new StreamReader(stream, detectEncodingFromByteOrderMarks));
 692                 }
 693
 694                 /// <summary>
 695                 /// Loads an HTML document from a stream.
 696                 /// </summary>
 697                 /// <param name="stream">The input stream.</param>
 698                 /// <param name="encoding">The character encoding to use.</param>
 699                 public void Load(Stream stream, Encoding encoding)
 700                 {
 701                         Load(new StreamReader(stream, encoding));
 702                 }
 703
 704                 /// <summary>
 705                 /// Loads an HTML document from a stream.
 706                 /// </summary>
 707                 /// <param name="stream">The input stream.</param>
 708                 /// <param name="encoding">The character encoding to use.</param>
 709                 /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
 710                 public void Load(Stream stream, Encoding encoding, bool detectEncodingFromByteOrderMarks)
 711                 {
 712                         Load(new StreamReader(stream, encoding, detectEncodingFromByteOrderMarks));
 713                 }
 714
 715                 /// <summary>
 716                 /// Loads an HTML document from a stream.
 717                 /// </summary>
 718                 /// <param name="stream">The input stream.</param>
 719                 /// <param name="encoding">The character encoding to use.</param>
 720                 /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
 721                 /// <param name="buffersize">The minimum buffer size.</param>
 722                 public void Load(Stream stream, Encoding encoding, bool detectEncodingFromByteOrderMarks, int buffersize)
 723                 {
 724                         Load(new StreamReader(stream, encoding, detectEncodingFromByteOrderMarks, buffersize));
 725                 }
 726
 727                 /// <summary>
 728                 /// Loads an HTML document from a file.
 729                 /// </summary>
 730                 /// <param name="path">The complete file path to be read. May not be null.</param>
 731                 public void Load(string path)
 732                 {
 733                         if (path == null)
 734                         {
 735                                 throw new ArgumentNullException("path");
 736                         }
 737                         StreamReader sr = new StreamReader(path, OptionDefaultStreamEncoding);
 738                         Load(sr);
 739                         sr.Close();
 740                 }
 741
 742                 /// <summary>
 743                 /// Loads an HTML document from a file.
 744                 /// </summary>
 745                 /// <param name="path">The complete file path to be read. May not be null.</param>
 746                 /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
 747                 public void Load(string path, bool detectEncodingFromByteOrderMarks)
 748                 {
 749                         if (path == null)
 750                         {
 751                                 throw new ArgumentNullException("path");
 752                         }
 753                         StreamReader sr = new StreamReader(path, detectEncodingFromByteOrderMarks);
 754                         Load(sr);
 755                         sr.Close();
 756                 }
 757
 758                 /// <summary>
 759                 /// Loads an HTML document from a file.
 760                 /// </summary>
 761                 /// <param name="path">The complete file path to be read. May not be null.</param>
 762                 /// <param name="encoding">The character encoding to use. May not be null.</param>
 763                 public void Load(string path, Encoding encoding)
 764                 {
 765                         if (path == null)
 766                         {
 767                                 throw new ArgumentNullException("path");
 768                         }
 769                         if (encoding == null)
 770                         {
 771                                 throw new ArgumentNullException("encoding");
 772                         }
 773                         StreamReader sr = new StreamReader(path, encoding);
 774                         Load(sr);
 775                         sr.Close();
 776                 }
 777
 778                 /// <summary>
 779                 /// Loads an HTML document from a file.
 780                 /// </summary>
 781                 /// <param name="path">The complete file path to be read. May not be null.</param>
 782                 /// <param name="encoding">The character encoding to use. May not be null.</param>
 783                 /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
 784                 public void Load(string path, Encoding encoding, bool detectEncodingFromByteOrderMarks)
 785                 {
 786                         if (path == null)
 787                         {
 788                                 throw new ArgumentNullException("path");
 789                         }
 790                         if (encoding == null)
 791                         {
 792                                 throw new ArgumentNullException("encoding");
 793                         }
 794                         StreamReader sr = new StreamReader(path, encoding, detectEncodingFromByteOrderMarks);
 795                         Load(sr);
 796                         sr.Close();
 797                 }
 798
 799                 /// <summary>
 800                 /// Loads an HTML document from a file.
 801                 /// </summary>
 802                 /// <param name="path">The complete file path to be read. May not be null.</param>
 803                 /// <param name="encoding">The character encoding to use. May not be null.</param>
 804                 /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
 805                 /// <param name="buffersize">The minimum buffer size.</param>
 806                 public void Load(string path, Encoding encoding, bool detectEncodingFromByteOrderMarks, int buffersize)
 807                 {
 808                         if (path == null)
 809                         {
 810                                 throw new ArgumentNullException("path");
 811                         }
 812                         if (encoding == null)
 813                         {
 814                                 throw new ArgumentNullException("encoding");
 815                         }
 816                         StreamReader sr = new StreamReader(path, encoding, detectEncodingFromByteOrderMarks, buffersize);
 817                         Load(sr);
 818                         sr.Close();
 819                 }
 820
 821                 /// <summary>
 822                 /// Loads the HTML document from the specified string.
 823                 /// </summary>
 824                 /// <param name="html">String containing the HTML document to load. May not be null.</param>
 825                 public void LoadHtml(string html)
 826                 {
 827                         if (html == null)
 828                         {
 829                                 throw new ArgumentNullException("html");
 830                         }
 831                         StringReader sr = new StringReader(html);
 832                         Load(sr);
 833                         sr.Close();
 834                 }
 835
 836                 /// <summary>
 837                 /// Detects the encoding of an HTML document from a file first, and then loads the file.
 838                 /// </summary>
 839                 /// <param name="path">The complete file path to be read.</param>
 840                 public void DetectEncodingAndLoad(string path)
 841                 {
 842                         DetectEncodingAndLoad(path, true);
 843                 }
 844
 845                 /// <summary>
 846                 /// Detects the encoding of an HTML document from a file first, and then loads the file.
 847                 /// </summary>
 848                 /// <param name="path">The complete file path to be read. May not be null.</param>
 849                 /// <param name="detectEncoding">true to detect encoding, false otherwise.</param>
 850                 public void DetectEncodingAndLoad(string path, bool detectEncoding)
 851                 {
 852                         if (path == null)
 853                         {
 854                                 throw new ArgumentNullException("path");
 855                         }
 856                         System.Text.Encoding enc;
 857                         if (detectEncoding)
 858                         {
 859                                 enc = DetectEncoding(path);
 860                         }
 861                         else
 862                         {
 863                                 enc = null;
 864                         }
 865
 866                         if (enc == null)
 867                         {
 868                                 Load(path);
 869                         }
 870                         else
 871                         {
 872                                 Load(path, enc);
 873                         }
 874                 }
 875
 876                 /// <summary>
 877                 /// Loads the HTML document from the specified TextReader.
 878                 /// </summary>
 879                 /// <param name="reader">The TextReader used to feed the HTML data into the document. May not be null.</param>
 880                 public void Load(TextReader reader)
 881                 {
 882                         // all Load methods pass down to this one
 883                         if (reader == null)
 884                         {
 885                                 throw new ArgumentNullException("reader");
 886                         }
 887
 888                         _onlyDetectEncoding = false;
 889
 890                         if (OptionCheckSyntax)
 891                         {
 892                                 _openednodes = new Hashtable();
 893                         }
 894                         else
 895                         {
 896                                 _openednodes = null;
 897                         }
 898
 899                         if (OptionUseIdAttribute)
 900                         {
 901                                 _nodesid = new Hashtable();
 902                         }
 903                         else
 904                         {
 905                                 _nodesid = null;
 906                         }
 907
 908                         StreamReader sr = reader as StreamReader;
 909                         if (sr != null)
 910                         {
 911                                 try
 912                                 {
 913                                     // trigger bom read if needed
 914                                     sr.Peek();
 915                                 }
 916                                 catch
 917                                 {
 918                                     // void on purpose
 919                                 }
 920                                 _streamencoding = sr.CurrentEncoding;
 921                                 _text = new ImplStreamAsArray (sr);
 922                         }
 923                         else
 924                         {
 925                                 _streamencoding = null;
 926                                 // Expensive, but cannot avoid since TextReader doesnt have any length of the underlying data
 927                                 _text = new DummyStreamAsArray (reader.ReadToEnd());
 928                         }
 929                         _declaredencoding = null;
 930
 931                         // SLIM: _text = reader.ReadToEnd();
 932                         _documentnode = CreateNode(HtmlNodeType.Document, 0);
 933                         Parse();
 934
 935                         if (OptionCheckSyntax)
 936                         {
 937                                 foreach(HtmlNode node in _openednodes.Values)
 938                                 {
 939                                         if (!node._starttag)    // already reported
 940                                         {
 941                                                 continue;
 942                                         }
 943
 944                                         string html;
 945                                         if (OptionExtractErrorSourceText)
 946                                         {
 947                                                 html = node.OuterHtml;
 948                                                 if (html.Length > OptionExtractErrorSourceTextMaxLength)
 949                                                 {
 950                                                         html = html.Substring(0, OptionExtractErrorSourceTextMaxLength);
 951                                                 }
 952                                         }
 953                                         else
 954                                         {
 955                                                 html = string.Empty;
 956                                         }
 957                                         AddError(
 958                                                 HtmlParseErrorCode.TagNotClosed,
 959                                                 node._line, node._lineposition,
 960                                                 node._streamposition, html,
 961                                                 "End tag </" + node.Name + "> was not found");
 962                                 }
 963
 964                                 // we don't need this anymore
 965                                 _openednodes.Clear();
 966                         }
 967                 }
 968
 969                 internal System.Text.Encoding GetOutEncoding()
 970                 {
 971                         // when unspecified, use the stream encoding first
 972                         if (_declaredencoding != null)
 973                         {
 974                                 return _declaredencoding;
 975                         }
 976                         else
 977                         {
 978                                 if (_streamencoding != null)
 979                                 {
 980                                         return _streamencoding;
 981                                 }
 982                         }
 983                         return OptionDefaultStreamEncoding;
 984                 }
 985
 986
 987                 /// <summary>
 988                 /// Gets the document's output encoding.
 989                 /// </summary>
 990                 public System.Text.Encoding Encoding
 991                 {
 992                         get
 993                         {
 994                                 return GetOutEncoding();
 995                         }
 996                 }
 997
 998                 /// <summary>
 999                 /// Saves the HTML document to the specified stream.
1000                 /// </summary>
1001                 /// <param name="outStream">The stream to which you want to save.</param>
1002                 public void Save(Stream outStream)
1003                 {
1004                         StreamWriter sw = new StreamWriter(outStream, GetOutEncoding());
1005                         Save(sw);
1006                 }
1007
1008                 /// <summary>
1009                 /// Saves the HTML document to the specified stream.
1010                 /// </summary>
1011                 /// <param name="outStream">The stream to which you want to save. May not be null.</param>
1012                 /// <param name="encoding">The character encoding to use. May not be null.</param>
1013                 public void Save(Stream outStream, System.Text.Encoding encoding)
1014                 {
1015                         if (outStream == null)
1016                         {
1017                                 throw new ArgumentNullException("outStream");
1018                         }
1019                         if (encoding == null)
1020                         {
1021                                 throw new ArgumentNullException("encoding");
1022                         }
1023                         StreamWriter sw = new StreamWriter(outStream, encoding);
1024                         Save(sw);
1025                 }
1026
1027                 /// <summary>
1028                 /// Saves the mixed document to the specified file.
1029                 /// </summary>
1030                 /// <param name="filename">The location of the file where you want to save the document.</param>
1031                 public void Save(string filename)
1032                 {
1033                         StreamWriter sw = new StreamWriter(filename, false, GetOutEncoding());
1034                         Save(sw);
1035                         sw.Close();
1036                 }
1037
1038                 /// <summary>
1039                 /// Saves the mixed document to the specified file.
1040                 /// </summary>
1041                 /// <param name="filename">The location of the file where you want to save the document. May not be null.</param>
1042                 /// <param name="encoding">The character encoding to use. May not be null.</param>
1043                 public void Save(string filename, System.Text.Encoding encoding)
1044                 {
1045                         if (filename == null)
1046                         {
1047                                 throw new ArgumentNullException("filename");
1048                         }
1049                         if (encoding == null)
1050                         {
1051                                 throw new ArgumentNullException("encoding");
1052                         }
1053                         StreamWriter sw = new StreamWriter(filename, false, encoding);
1054                         Save(sw);
1055                         sw.Close();
1056                 }
1057
1058                 /// <summary>
1059                 /// Saves the HTML document to the specified StreamWriter.
1060                 /// </summary>
1061                 /// <param name="writer">The StreamWriter to which you want to save.</param>
1062                 public void Save(StreamWriter writer)
1063                 {
1064                         Save((TextWriter)writer);
1065                 }
1066
1067                 /// <summary>
1068                 /// Saves the HTML document to the specified TextWriter.
1069                 /// </summary>
1070                 /// <param name="writer">The TextWriter to which you want to save. May not be null.</param>
1071                 public void Save(TextWriter writer)
1072                 {
1073                         if (writer == null)
1074                         {
1075                                 throw new ArgumentNullException("writer");
1076                         }
1077                         DocumentNode.WriteTo(writer);
1078                 }
1079
1080                 /// <summary>
1081                 /// Saves the HTML document to the specified XmlWriter.
1082                 /// </summary>
1083                 /// <param name="writer">The XmlWriter to which you want to save.</param>
1084                 public void Save(XmlWriter writer)
1085                 {
1086                         DocumentNode.WriteTo(writer);
1087                         writer.Flush();
1088                 }
1089
1090                 /// <summary>
1091                 /// Creates a new XPathNavigator object for navigating this HTML document.
1092                 /// </summary>
1093                 /// <returns>An XPathNavigator object. The XPathNavigator is positioned on the root of the document.</returns>
1094                 public XPathNavigator CreateNavigator()
1095                 {
1096                         return new HtmlNodeNavigator(this, _documentnode);
1097                 }
1098
1099                 internal void SetIdForNode(HtmlNode node, string id)
1100                 {
1101                         if (!OptionUseIdAttribute)
1102                         {
1103                                 return;
1104                         }
1105
1106                         if ((_nodesid == null) || (id == null))
1107                         {
1108                                 return;
1109                         }
1110
1111                         if (node == null)
1112                         {
1113                                 _nodesid.Remove(id.ToLower());
1114                         }
1115                         else
1116                         {
1117                                 _nodesid[id.ToLower()] = node;
1118                         }
1119                 }
1120
1121                 /// <summary>
1122                 /// Gets the HTML node with the specified 'id' attribute value.
1123                 /// </summary>
1124                 /// <param name="id">The attribute id to match. May not be null.</param>
1125                 /// <returns>The HTML node with the matching id or null if not found.</returns>
1126                 public HtmlNode GetElementbyId(string id)
1127                 {
1128                         if (id == null)
1129                         {
1130                                 throw new ArgumentNullException("id");
1131                         }
1132                         if (_nodesid == null)
1133                         {
1134                                 throw new Exception(HtmlExceptionUseIdAttributeFalse);
1135                         }
1136
1137                         return _nodesid[id.ToLower()] as HtmlNode;
1138                 }
1139
1140                 /// <summary>
1141                 /// Creates an HTML element node with the specified name.
1142                 /// </summary>
1143                 /// <param name="name">The qualified name of the element. May not be null.</param>
1144                 /// <returns>The new HTML node.</returns>
1145                 public HtmlNode CreateElement(string name)
1146                 {
1147                         if (name == null)
1148                         {
1149                                 throw new ArgumentNullException("name");
1150                         }
1151                         HtmlNode node = CreateNode(HtmlNodeType.Element);
1152                         node._name = name;
1153                         return node;
1154                 }
1155
1156                 /// <summary>
1157                 /// Creates an HTML comment node.
1158                 /// </summary>
1159                 /// <returns>The new HTML comment node.</returns>
1160                 public HtmlCommentNode CreateComment()
1161                 {
1162                         return (HtmlCommentNode)CreateNode(HtmlNodeType.Comment);
1163                 }
1164
1165                 /// <summary>
1166                 /// Creates an HTML comment node with the specified comment text.
1167                 /// </summary>
1168                 /// <param name="comment">The comment text. May not be null.</param>
1169                 /// <returns>The new HTML comment node.</returns>
1170                 public HtmlCommentNode CreateComment(string comment)
1171                 {
1172                         if (comment == null)
1173                         {
1174                                 throw new ArgumentNullException("comment");
1175                         }
1176                         HtmlCommentNode c = CreateComment();
1177                         c.Comment = comment;
1178                         return c;
1179                 }
1180
1181                 /// <summary>
1182                 /// Creates an HTML text node.
1183                 /// </summary>
1184                 /// <returns>The new HTML text node.</returns>
1185                 public HtmlTextNode CreateTextNode()
1186                 {
1187                         return (HtmlTextNode)CreateNode(HtmlNodeType.Text);
1188                 }
1189
1190                 /// <summary>
1191                 /// Creates an HTML text node with the specified text.
1192                 /// </summary>
1193                 /// <param name="text">The text of the node. May not be null.</param>
1194                 /// <returns>The new HTML text node.</returns>
1195                 public HtmlTextNode CreateTextNode(string text)
1196                 {
1197                         if (text == null)
1198                         {
1199                                 throw new ArgumentNullException("text");
1200                         }
1201                         HtmlTextNode t = CreateTextNode();
1202                         t.Text = text;
1203                         return t;
1204                 }
1205
1206                 internal HtmlNode CreateNode(HtmlNodeType type)
1207                 {
1208                         return CreateNode(type, -1);
1209                 }
1210
1211                 internal HtmlNode CreateNode(HtmlNodeType type, int index)
1212                 {
1213                         switch (type)
1214                         {
1215                                 case HtmlNodeType.Comment:
1216                                         return new HtmlCommentNode(this, index);
1217
1218                                 case HtmlNodeType.Text:
1219                                         return new HtmlTextNode(this, index);
1220
1221                                 default:
1222                                         return new HtmlNode(type, this, index);
1223                         }
1224                 }
1225
1226                 internal HtmlAttribute CreateAttribute()
1227                 {
1228                         return new HtmlAttribute(this);
1229                 }
1230
1231                 /// <summary>
1232                 /// Creates an HTML attribute with the specified name.
1233                 /// </summary>
1234                 /// <param name="name">The name of the attribute. May not be null.</param>
1235                 /// <returns>The new HTML attribute.</returns>
1236                 public HtmlAttribute CreateAttribute(string name)
1237                 {
1238                         if (name == null)
1239                         {
1240                                 throw new ArgumentNullException("name");
1241                         }
1242                         HtmlAttribute att = CreateAttribute();
1243                         att.Name = name;
1244                         return att;
1245                 }
1246
1247                 /// <summary>
1248                 /// Creates an HTML attribute with the specified name.
1249                 /// </summary>
1250                 /// <param name="name">The name of the attribute. May not be null.</param>
1251                 /// <param name="value">The value of the attribute.</param>
1252                 /// <returns>The new HTML attribute.</returns>
1253                 public HtmlAttribute CreateAttribute(string name, string value)
1254                 {
1255                         if (name == null)
1256                         {
1257                                 throw new ArgumentNullException("name");
1258                         }
1259                         HtmlAttribute att = CreateAttribute(name);
1260                         att.Value = value;
1261                         return att;
1262                 }
1263
1264                 /// <summary>
1265                 /// Gets the root node of the document.
1266                 /// </summary>
1267                 public HtmlNode DocumentNode
1268                 {
1269                         get
1270                         {
1271                                 return _documentnode;
1272                         }
1273                 }
1274
1275                 /// <summary>
1276                 /// Gets the document CRC32 checksum if OptionComputeChecksum was set to true before parsing, 0 otherwise.
1277                 /// </summary>
1278                 public int CheckSum
1279                 {
1280                         get
1281                         {
1282                                 if (_crc32 == null)
1283                                 {
1284                                         return 0;
1285                                 }
1286                                 else
1287                                 {
1288                                         return (int)_crc32.CheckSum;
1289                                 }
1290                         }
1291                 }
1292
1293                 public bool StreamMode
1294                 {
1295                         get
1296                         {
1297                                 return _streammode;
1298                         }
1299                         set
1300                         {
1301                                 _streammode = value;
1302                         }
1303                 }
1304
1305                 private HtmlParseError AddError(
1306                                 HtmlParseErrorCode code,
1307                                 int line,
1308                                 int linePosition,
1309                                 int streamPosition,
1310                                 string sourceText,
1311                                 string reason)
1312                         {
1313                         HtmlParseError err = new HtmlParseError(code, line, linePosition, streamPosition, sourceText, reason);
1314                         _parseerrors.Add(err);
1315                         return err;
1316                 }
1317
                // States used by the HTML parser.
                // NOTE(review): the code that switches between these states is outside this
                // excerpt; the per-member notes are inferred from the member names only -
                // confirm against Parse().
                private enum ParseState
                {
                        Text,                           // presumably: plain text outside of markup
                        WhichTag,                       // presumably: deciding what kind of tag follows
                        Tag,                            // presumably: inside a tag name
                        BetweenAttributes,              // presumably: inside a tag, between attributes
                        EmptyTag,                       // presumably: a self-closing tag
                        AttributeName,                  // presumably: inside an attribute name
                        AttributeBeforeEquals,          // presumably: after a name, before '='
                        AttributeAfterEquals,           // presumably: after '=', before the value
                        AttributeValue,                 // presumably: inside an unquoted value
                        Comment,                        // presumably: inside a comment
                        QuotedAttributeValue,           // presumably: inside a quoted value
                        ServerSideCode,                 // presumably: inside a server-side code block
                        PcDataQuote,                    // presumably: inside a quoted string within PCDATA
                        PcData                          // presumably: inside PCDATA content
                }
1335
1336                 private void IncrementPosition()
1337                 {
1338                         if (_crc32 != null)
1339                         {
1340                                 // REVIEW: should we add some checksum code in DecrementPosition too?
1341                                 _crc32.AddToCRC32(_c);
1342                         }
1343
1344                         _index++;
1345                         _maxlineposition = _lineposition;
1346                         if (_c == 10)
1347                         {
1348                                 _lineposition = 1;
1349                                 _line++;
1350                         }
1351                         else
1352                         {
1353                                 _lineposition++;
1354                         }
1355                 }
1356
1357                 private void DecrementPosition()
1358                 {
1359                         _index--;
1360                         if (_lineposition == 1)
1361                         {
1362                                 _lineposition = _maxlineposition;
1363                                 _line--;
1364                         }
1365                         else
1366                         {
1367                                 _lineposition--;
1368                         }
1369                 }
1370
1371                 private void Parse()
1372                 {
1373                         int lastquote = 0;
1374                         if (OptionComputeChecksum)
1375                         {
1376                                 _crc32 = new Crc32();
1377                         }
1378
1379                         _lastnodes = new Hashtable();
1380                         _c = 0;
1381                         _fullcomment = false;
1382                         _parseerrors = new ArrayList();
1383                         _line = 1;
1384                         _lineposition = 1;
1385                         _maxlineposition = 1;
1386
1387                         _state = ParseState.Text;
1388                         _oldstate = _state;
1389                         _documentnode._innerlength = _text.FullLength;
1390                         _documentnode._outerlength = _text.FullLength;
1391
1392                         _lastparentnode = _documentnode;
1393                         _currentnode = CreateNode(HtmlNodeType.Text, 0);
1394                         _currentattribute = null;
1395
1396                         _index = 0;
1397                         PushNodeStart(HtmlNodeType.Text, 0);
1398                         // SLIM: while (_index<_text.Length)
1399                         while (! _stop_parsing && ! _text.Eof (_index))
1400                         {
1401                                 _c = _text[_index];
1402                                 IncrementPosition();
1403
1404                                 switch(_state)
1405                                 {
1406                                         case ParseState.Text:
1407                                                 if (NewCheck())
1408                                                         continue;
1409                                                 break;
1410
1411                                         case ParseState.WhichTag:
1412                                                 if (NewCheck())
1413                                                         continue;
1414                                                 if (_c == '/')
1415                                                 {
1416                                                         PushNodeNameStart(false, _index);
1417                                                 }
1418                                                 else
1419                                                 {
1420                                                         PushNodeNameStart(true, _index-1);
1421                                                         DecrementPosition();
1422                                                 }
1423                                                 _state = ParseState.Tag;
1424                                                 break;
1425
1426                                         case ParseState.Tag:
1427                                                 if (NewCheck())
1428                                                         continue;
1429                                                 if (IsWhiteSpace(_c))
1430                                                 {
1431                                                         PushNodeNameEnd(_index-1);
1432                                                         if (_state != ParseState.Tag)
1433                                                                 continue;
1434                                                         _state = ParseState.BetweenAttributes;
1435                                                         continue;
1436                                                 }
1437                                                 if (_c == '/')
1438                                                 {
1439                                                         PushNodeNameEnd(_index-1);
1440                                                         if (_state != ParseState.Tag)
1441                                                                 continue;
1442                                                         _state = ParseState.EmptyTag;
1443                                                         continue;
1444                                                 }
1445                                                 if (_c == '>')
1446                                                 {
1447                                                         PushNodeNameEnd(_index-1);
1448                                                         if (_state != ParseState.Tag)
1449                                                                 continue;
1450                                                         PushNodeEnd(_index, false);
1451                                                         if (_state != ParseState.Tag)
1452                                                                 continue;
1453                                                         _state = ParseState.Text;
1454                                                         PushNodeStart(HtmlNodeType.Text, _index);
1455                                                 }
1456                                                 break;
1457
1458                                         case ParseState.BetweenAttributes:
1459                                                 if (NewCheck())
1460                                                         continue;
1461
1462                                                 if (IsWhiteSpace(_c))
1463                                                         continue;
1464
1465                                                 if ((_c == '/') || (_c == '?'))
1466                                                 {
1467                                                         _state = ParseState.EmptyTag;
1468                                                         continue;
1469                                                 }
1470
1471                                                 if (_c == '>')
1472                                                 {
1473                                                         PushNodeEnd(_index, false);
1474                                                         if (_state != ParseState.BetweenAttributes)
1475                                                                 continue;
1476                                                         _state = ParseState.Text;
1477                                                         PushNodeStart(HtmlNodeType.Text, _index);
1478                                                         continue;
1479                                                 }
1480
1481                                                 PushAttributeNameStart(_index-1);
1482                                                 _state = ParseState.AttributeName;
1483                                                 break;
1484
1485                                         case ParseState.EmptyTag:
1486                                                 if (NewCheck())
1487                                                         continue;
1488
1489                                                 if (_c == '>')
1490                                                 {
1491                                                         PushNodeEnd(_index, true);
1492                                                         if (_state != ParseState.EmptyTag)
1493                                                                 continue;
1494                                                         _state = ParseState.Text;
1495                                                         PushNodeStart(HtmlNodeType.Text, _index);
1496                                                         continue;
1497                                                 }
1498                                                 _state = ParseState.BetweenAttributes;
1499                                                 break;
1500
1501                                         case ParseState.AttributeName:
1502                                                 if (NewCheck())
1503                                                         continue;
1504
1505                                                 if (IsWhiteSpace(_c))
1506                                                 {
1507                                                         PushAttributeNameEnd(_index-1);
1508                                                         _state = ParseState.AttributeBeforeEquals;
1509                                                         continue;
1510                                                 }
1511                                                 if (_c == '=')
1512                                                 {
1513                                                         PushAttributeNameEnd(_index-1);
1514                                                         _state = ParseState.AttributeAfterEquals;
1515                                                         continue;
1516                                                 }
1517                                                 if (_c == '>')
1518                                                 {
1519                                                         PushAttributeNameEnd(_index-1);
1520                                                         PushNodeEnd(_index, false);
1521                                                         if (_state != ParseState.AttributeName)
1522                                                                 continue;
1523                                                         _state = ParseState.Text;
1524                                                         PushNodeStart(HtmlNodeType.Text, _index);
1525                                                         continue;
1526                                                 }
1527                                                 break;
1528
1529                                         case ParseState.AttributeBeforeEquals:
1530                                                 if (NewCheck())
1531                                                         continue;
1532
1533                                                 if (IsWhiteSpace(_c))
1534                                                         continue;
1535                                                 if (_c == '>')
1536                                                 {
1537                                                         PushNodeEnd(_index, false);
1538                                                         if (_state != ParseState.AttributeBeforeEquals)
1539                                                                 continue;
1540                                                         _state = ParseState.Text;
1541                                                         PushNodeStart(HtmlNodeType.Text, _index);
1542                                                         continue;
1543                                                 }
1544                                                 if (_c == '=')
1545                                                 {
1546                                                         _state = ParseState.AttributeAfterEquals;
1547                                                         continue;
1548                                                 }
1549                                                 // no equals, no whitespace, it's a new attrribute starting
1550                                                 _state = ParseState.BetweenAttributes;
1551                                                 DecrementPosition();
1552                                                 break;
1553
1554                                         case ParseState.AttributeAfterEquals:
1555                                                 if (NewCheck())
1556                                                         continue;
1557
1558                                                 if (IsWhiteSpace(_c))
1559                                                         continue;
1560
1561                                                 if ((_c == '\'') || (_c == '"'))
1562                                                 {
1563                                                         _state = ParseState.QuotedAttributeValue;
1564                                                         PushAttributeValueStart(_index);
1565                                                         lastquote = _c;
1566                                                         continue;
1567                                                 }
1568                                                 if (_c == '>')
1569                                                 {
1570                                                         PushNodeEnd(_index, false);
1571                                                         if (_state != ParseState.AttributeAfterEquals)
1572                                                                 continue;
1573                                                         _state = ParseState.Text;
1574                                                         PushNodeStart(HtmlNodeType.Text, _index);
1575                                                         continue;
1576                                                 }
1577                                                 PushAttributeValueStart(_index-1);
1578                                                 _state = ParseState.AttributeValue;
1579                                                 break;
1580
1581                                         case ParseState.AttributeValue:
1582                                                 if (NewCheck())
1583                                                         continue;
1584
1585                                                 if (IsWhiteSpace(_c))
1586                                                 {
1587                                                         PushAttributeValueEnd(_index-1);
1588                                                         _state = ParseState.BetweenAttributes;
1589                                                         continue;
1590                                                 }
1591
1592                                                 if (_c == '>')
1593                                                 {
1594                                                         PushAttributeValueEnd(_index-1);
1595                                                         PushNodeEnd(_index, false);
1596                                                         if (_state != ParseState.AttributeValue)
1597                                                                 continue;
1598                                                         _state = ParseState.Text;
1599                                                         PushNodeStart(HtmlNodeType.Text, _index);
1600                                                         continue;
1601                                                 }
1602                                                 break;
1603
1604                                         case ParseState.QuotedAttributeValue:
1605                                                 if (_c == lastquote)
1606                                                 {
1607                                                         PushAttributeValueEnd(_index-1);
1608                                                         _state = ParseState.BetweenAttributes;
1609                                                         continue;
1610                                                 }
1611                                                 if (_c == '<')
1612                                                 {
1613                                                         //SLIM: if (_index<_text.Length)
1614                                                         if (!_text.Eof (_index))
1615                                                         {
1616                                                                 if (_text[_index] == '%')
1617                                                                 {
1618                                                                         _oldstate = _state;
1619                                                                         _state = ParseState.ServerSideCode;
1620                                                                         continue;
1621                                                                 }
1622                                                         }
1623                                                 }
1624                                                 break;
1625
1626                                         case ParseState.Comment:
1627                                                 if (_c == '>')
1628                                                 {
1629                                                         if (_fullcomment)
1630                                                         {
1631                                                                 if ((_text[_index-2] != '-') ||
1632                                                                         (_text[_index-3] != '-'))
1633                                                                 {
1634                                                                         continue;
1635                                                                 }
1636                                                         }
1637                                                         PushNodeEnd(_index, false);
1638                                                         _state = ParseState.Text;
1639                                                         PushNodeStart(HtmlNodeType.Text, _index);
1640                                                         continue;
1641                                                 }
1642                                                 break;
1643
1644                                         case ParseState.ServerSideCode:
1645                                                 if (_c == '%')
1646                                                 {
1647                                                         //SLIM: if (_index<_text.Length)
1648                                                         if (! _text.Eof (_index))
1649                                                         {
1650                                                                 if (_text[_index] == '>')
1651                                                                 {
1652                                                                         switch(_oldstate)
1653                                                                         {
1654                                                                                 case ParseState.AttributeAfterEquals:
1655                                                                                         _state = ParseState.AttributeValue;
1656                                                                                         break;
1657
1658                                                                                 case ParseState.BetweenAttributes:
1659                                                                                         PushAttributeNameEnd(_index+1);
1660                                                                                         _state = ParseState.BetweenAttributes;
1661                                                                                         break;
1662
1663                                                                                 default:
1664                                                                                         _state = _oldstate;
1665                                                                                         break;
1666                                                                         }
1667                                                                         IncrementPosition();
1668                                                                 }
1669                                                         }
1670                                                 }
1671                                                 break;
1672
1673                                         // handle <script>a="</script>"</script>
1674                                         case ParseState.PcDataQuote:
1675                                                 if ((_c == _pcdata_quote_char) && (_text [_index - 2] != '\\')) {
1676                                                         _pcdata_quote_char = '\0';
1677                                                         _state = ParseState.PcData;
1678                                                 }
1679                                                 break;
1680
1681                                         case ParseState.PcData:
1682                                                 Debug ("PCDATA " + _currentnode.Name + " " + _text.Substring(_index-1,  _currentnode._namelength+2));
1683                                                 if (_c == '\"' || _c == '\''){
1684                                                         _pcdata_quote_char = _c;
1685                                                         _state = ParseState.PcDataQuote;
1686                                                         break;
1687                                                 }
1688                                                 // look for </tag + 1 char
1689
1690                                                 // check buffer end
1691                                                 //SLIM: if ((_currentnode._namelength+3)<=(_text.Length-(_index-1)))
1692                                                 if (! _text.Eof (_currentnode._namelength + _index + 1))
1693                                                 {
1694                                                         if (string.Compare(_text.Substring(_index-1, _currentnode._namelength+2),
1695                                                                 "</" + _currentnode.Name, true) == 0)
1696                                                         {
1697                                                                 int c = _text[_index-1 + 2 + _currentnode.Name.Length];
1698                                                                 if ((c == '>') || (IsWhiteSpace(c)))
1699                                                                 {
1700                                                                         // add the script as a text node
1701                                                                         HtmlNode script = CreateNode(HtmlNodeType.Text,
1702                                                                                 _currentnode._outerstartindex + _currentnode._outerlength);
1703                                                                         script._outerlength = _index-1 - script._outerstartindex;
1704                                                                         if (_streammode && ReportNode != null)
1705                                                                                 _stop_parsing = ! ReportNode (script);
1706                                                                         else
1707                                                                                 _currentnode.AppendChild(script);
1708                                                                         Debug ("Found script: [" + script.InnerText + "]");
1709
1710                                                                         PushNodeStart(HtmlNodeType.Element, _index-1);
1711                                                                         PushNodeNameStart(false, _index-1 +2);
1712                                                                         _state = ParseState.Tag;
1713                                                                         IncrementPosition();
1714                                                                 }
1715                                                         }
1716                                                 }
1717                                                 break;
1718                                 }
1719                         }
1720
1721                         // finish the current work
1722                         if (_currentnode._namestartindex > 0)
1723                         {
1724                                 PushNodeNameEnd(_index);
1725                         }
1726                         PushNodeEnd(_index, false);
1727
1728                         // we don't need this anymore
1729                         _lastnodes.Clear();
1730                 }
1731
1732                 private bool NewCheck()
1733                 {
1734                         if (_c != '<')
1735                         {
1736                                 return false;
1737                         }
1738                         //SLIM: if (_index<_text.Length)
1739                         if (! _text.Eof (_index))
1740                         {
1741                                 if (_text[_index] == '%')
1742                                 {
1743                                         switch(_state)
1744                                         {
1745                                                 case ParseState.AttributeAfterEquals:
1746                             PushAttributeValueStart(_index-1);
1747                                                         break;
1748
1749                                                 case ParseState.BetweenAttributes:
1750                                                         PushAttributeNameStart(_index-1);
1751                                                         break;
1752
1753                                                 case ParseState.WhichTag:
1754                                                         PushNodeNameStart(true, _index-1);
1755                                                         _state = ParseState.Tag;
1756                                                         break;
1757                                         }
1758                                         _oldstate = _state;
1759                                         _state = ParseState.ServerSideCode;
1760                                         return true;
1761                                 }
1762                         }
1763
1764                         PushNodeEnd(_index-1, true);
1765                         _state = ParseState.WhichTag;
1766                         //SLIM: if ((_index-1) <= (_text.Length-2))
1767                         if (!_text.Eof (_index))
1768                         {
1769                                 if (_text[_index] == '!')
1770                                 {
1771                                         PushNodeStart(HtmlNodeType.Comment, _index-1);
1772                                         PushNodeNameStart(true, _index);
1773                                         PushNodeNameEnd(_index+1);
1774                                         _state = ParseState.Comment;
1775                                         //SLIM: if (_index<(_text.Length-2))
1776                                         if (! _text.Eof (_index + 2))
1777                                         {
1778                                                 if ((_text[_index+1] == '-') &&
1779                                                         (_text[_index+2] == '-'))
1780                                                 {
1781                                                         _fullcomment = true;
1782                                                 }
1783                                                 else
1784                                                 {
1785                                                         _fullcomment = false;
1786                                                 }
1787                                         }
1788                                         return true;
1789                                 }
1790                         }
1791                         PushNodeStart(HtmlNodeType.Element, _index-1);
1792                         return true;
1793                 }
1794
/// <summary>
/// Looks for an encoding declaration of the form
/// &lt;meta http-equiv="content-type" content="text/html;charset=iso-8859-1" /&gt;
/// on the given node and records the declared encoding in _declaredencoding.
/// Does nothing when OptionReadEncoding is off.
/// </summary>
/// <param name="node">
/// Node whose start tag has just been completed; this is called at node end,
/// so its attributes are already populated.
/// </param>
/// <exception cref="EncodingFoundException">
/// Thrown with a null encoding when a four-letter element that cannot precede a meta
/// node is seen, and with the declared encoding (null if the charset is unknown) when
/// _onlyDetectEncoding is set and a charset declaration was found.
/// </exception>
private void ReadDocumentEncoding(HtmlNode node)
{
        if (!OptionReadEncoding)
                return;

        // quick check on the raw name length, avoids a string alloc for most nodes
        if (node._namelength != 4)
                return;

        string name = node.Name;        // all node names are lowercase

        if (name != "meta")
        {
                // only these nodes can occur before meta
                // if we started seeing any other node, we will never see a meta node
                if (node.NodeType == HtmlNodeType.Element &&
                    name != "head" && name != "script" &&
                    name != "style" && name != "title" &&
                    name != "link" && name != "html")
                        throw new EncodingFoundException(null);
                return;
        }

        HtmlAttribute att = node.Attributes["http-equiv"];
        if (att == null || string.Compare(att.Value, "content-type", true) != 0)
                return;

        HtmlAttribute content = node.Attributes["content"];
        if (content == null)
                return;

        string charset = NameValuePairList.GetNameValuePairsValue(content.Value, "charset");
        if (charset == null)
                return;

        // An unknown, empty or unsupported charset name must not abort parsing:
        // Encoding.GetEncoding throws for those, so treat them as "nothing declared".
        Encoding declared;
        try
        {
                declared = Encoding.GetEncoding(charset);
        }
        catch (ArgumentException)
        {
                declared = null;
        }
        catch (NotSupportedException)
        {
                declared = null;
        }
        _declaredencoding = declared;

        if (_onlyDetectEncoding)
        {
                throw new EncodingFoundException(_declaredencoding);
        }

        if (_streamencoding != null && _declaredencoding != null &&
            _declaredencoding.WindowsCodePage != _streamencoding.WindowsCodePage)
        {
                AddError(
                        HtmlParseErrorCode.CharsetMismatch,
                        _line, _lineposition,
                        _index, node.OuterHtml,
                        "Encoding mismatch between StreamEncoding: " +
                        _streamencoding.WebName + " and DeclaredEncoding: " + _declaredencoding.WebName);
        }
}
1851
/// <summary>
/// Starts a new attribute: creates it through CreateAttribute, makes it the current
/// attribute and stamps it with the current line, line position and the given offset.
/// </summary>
/// <param name="index">Offset in the text where the attribute name begins; also stored as the stream position.</param>
private void PushAttributeNameStart(int index)
{
        _currentattribute = CreateAttribute();

        // where the attribute sits in the source
        _currentattribute._line = _line;
        _currentattribute._lineposition = _lineposition;
        _currentattribute._streamposition = index;

        // where its name begins in the text
        _currentattribute._namestartindex = index;
}
1860
/// <summary>
/// Ends the name of the current attribute and appends the attribute to the current node.
/// </summary>
/// <param name="index">Offset in the text just past the attribute name.</param>
private void PushAttributeNameEnd(int index)
{
        int nameStart = _currentattribute._namestartindex;
        _currentattribute._namelength = index - nameStart;

        _currentnode.Attributes.Append(_currentattribute);
}
1866
/// <summary>
/// Records where the value of the current attribute begins.
/// </summary>
/// <param name="index">Offset in the text of the first character of the value.</param>
private void PushAttributeValueStart(int index)
{
        int valueStart = index;
        _currentattribute._valuestartindex = valueStart;
}
1871
/// <summary>
/// Records the length of the current attribute's value, measured from its stored start offset.
/// </summary>
/// <param name="index">Offset in the text just past the value.</param>
private void PushAttributeValueEnd(int index)
{
        int valueStart = _currentattribute._valuestartindex;
        _currentattribute._valuelength = index - valueStart;
}
1876
/// <summary>
/// Starts a new node of the given type and makes it the current node, stamping it
/// with the current line, line position and the given stream offset.
/// </summary>
/// <param name="type">Type of node to create.</param>
/// <param name="index">Offset in the text where the node begins.</param>
private void PushNodeStart(HtmlNodeType type, int index)
{
        _currentnode = CreateNode(type, index);
        _currentnode._streamposition = index;
        _currentnode._line = _line;

        // elements are reported one column earlier than the current position
        // (presumably because the opening '<' has already been consumed)
        _currentnode._lineposition = (type == HtmlNodeType.Element)
                ? _lineposition - 1
                : _lineposition;
}
1888
/// <summary>
/// Completes the node currently being parsed: fixes its outer length, reports it to the
/// ReportNode callback in stream mode, links it into the tree when not in stream mode,
/// and closes it when requested or when it is an end tag or a closed/empty element.
/// </summary>
/// <param name="index">Offset in the text where the node ends.</param>
/// <param name="close">true to close the node as soon as it has been pushed.</param>
private void PushNodeEnd(int index, bool close)
{
        _currentnode._outerlength = index - _currentnode._outerstartindex;

        //SLIM: inform caller; the negated callback result is kept in _stop_parsing
        if (_streammode && ReportNode != null)
                _stop_parsing = !ReportNode(_currentnode);

        if (_debug)
        {
                string trace;
                if (_currentnode._nodetype == HtmlNodeType.Text)
                        trace = "Text:" + _currentnode.InnerText;
                else
                        trace = (_currentnode.StartTag ? "Start-" : "End-") + _currentnode.Name;
                Debug(trace);
        }

        bool textOrComment =
                (_currentnode._nodetype == HtmlNodeType.Text) ||
                (_currentnode._nodetype == HtmlNodeType.Comment);

        if (textOrComment)
        {
                // forget about void nodes
                if (_currentnode._outerlength > 0)
                {
                        // text and comments have no markup of their own: inner span == outer span
                        _currentnode._innerlength = _currentnode._outerlength;
                        _currentnode._innerstartindex = _currentnode._outerstartindex;

                        // SLIM: no need to append child in stream mode
                        if (_lastparentnode != null && !_streammode)
                                _lastparentnode.AppendChild(_currentnode);
                }
        }
        else if (_currentnode._starttag && (_lastparentnode != _currentnode))
        {
                // add to parent node
                // SLIM: no need to append child in stream mode
                if (_lastparentnode != null && !_streammode)
                        _lastparentnode.AppendChild(_currentnode);

                ReadDocumentEncoding(_currentnode);

                // remember last node of this kind
                // SLIM: the node is still stored to help other tags on the same level
                string nodeName = _currentnode.Name;
                _currentnode._prevwithsamename = (HtmlNode)_lastnodes[nodeName];
                _lastnodes[nodeName] = _currentnode;

                // documents and elements become the parent of whatever follows
                HtmlNodeType kind = _currentnode.NodeType;
                if (kind == HtmlNodeType.Document || kind == HtmlNodeType.Element)
                        _lastparentnode = _currentnode;

                // content of CDATA-like elements is consumed in the PcData state, without closing here
                if (HtmlNode.IsCDataElement(CurrentNodeName()))
                {
                        _state = ParseState.PcData;
                        return;
                }

                if (HtmlNode.IsClosedElement(nodeName) || HtmlNode.IsEmptyElement(nodeName))
                        close = true;
        }

        // an open start tag stays open; everything else is closed now
        if (!close && _currentnode._starttag)
                return;

        CloseCurrentNode();

        // text and comment nodes are finished for good once pushed
        if ((_currentnode._nodetype == HtmlNodeType.Text) ||
            (_currentnode._nodetype == HtmlNodeType.Comment))
                _currentnode = null;
}
1968
/// <summary>
/// Records where the name of the current node begins and whether the tag is a start tag.
/// </summary>
/// <param name="starttag">true for a start tag, false for an end tag.</param>
/// <param name="index">Offset in the text of the first character of the name.</param>
private void PushNodeNameStart(bool starttag, int index)
{
        _currentnode._namestartindex = index;
        _currentnode._starttag = starttag;
}
1974
/// <summary>
/// Returns the names of the elements that "reset" nesting for the given tag, i.e. the
/// containers inside which a new tag of that name may legitimately appear while an
/// earlier tag of the same name is still open.
/// </summary>
/// <param name="name">Lower-case tag name.</param>
/// <returns>The resetter names, or null when the tag has none.</returns>
private string[] GetResetters(string name)
{
        switch (name)
        {
                case "li":
                        // list items nest through ordered lists as well as unordered ones
                        return new string[]{"ul", "ol"};

                case "tr":
                        return new string[]{"table"};

                case "th":
                case "td":
                        return new string[]{"tr", "table"};

                default:
                        return null;
        }
}
1993
/// <summary>
/// Closes a previous unclosed tag with the same name as the current start tag
/// when no resetter element has been opened in between.
/// </summary>
private void FixNestedTags()
{
        // we are only interested by start tags, not closing tags
        if (!_currentnode._starttag)
                return;

        // Tag names are matched against fixed ASCII literals, so the lowering must not
        // depend on the thread culture (under tr-TR, "LI" would otherwise become "lı").
        string name = CurrentNodeName().ToLower(System.Globalization.CultureInfo.InvariantCulture);
        FixNestedTag(name, GetResetters(name));
}
2003
/// <summary>
/// Closes the last unclosed node with the given name, unless one of the resetter
/// elements has been opened after it.
/// </summary>
/// <param name="name">Tag name to look up among the last seen nodes.</param>
/// <param name="resetters">Names of the resetter elements for that tag; null means nothing to fix.</param>
private void FixNestedTag(string name, string[] resetters)
{
        if (resetters == null)
                return;

        // nothing to do without a previous, still open node of the same name
        HtmlNode open = (HtmlNode)_lastnodes[name];
        if ((open == null) || open.Closed)
                return;

        // a resetter opened in between makes the nesting legitimate
        if (FindResetterNodes(open, resetters))
                return;

        // otherwise close the previous node with a fake closer node
        HtmlNode closer = new HtmlNode(open.NodeType, this, -1);
        closer._endnode = closer;
        open.CloseNode(closer);
}
2030
/// <summary>
/// Tells whether any of the named resetter elements is currently in effect for the given node.
/// </summary>
/// <param name="node">Node the resetters are checked against.</param>
/// <param name="names">Resetter element names; may be null.</param>
/// <returns>true as soon as FindResetterNode finds one of the names; false otherwise or when names is null.</returns>
private bool FindResetterNodes(HtmlNode node, string[] names)
{
        if (names != null)
        {
                foreach (string candidate in names)
                {
                        if (FindResetterNode(node, candidate) != null)
                                return true;
                }
        }
        return false;
}
2046
/// <summary>
/// Looks up the last seen node with the given name and returns it when it acts as a
/// resetter for the given node.
/// </summary>
/// <param name="node">Node the resetter is checked against.</param>
/// <param name="name">Name of the resetter element.</param>
/// <returns>
/// The last node with that name if it exists, is not closed and does not start
/// before <paramref name="node"/> in the stream; otherwise null.
/// </returns>
private HtmlNode FindResetterNode(HtmlNode node, string name)
{
        HtmlNode candidate = (HtmlNode)_lastnodes[name];

        bool inEffect =
                (candidate != null) &&
                !candidate.Closed &&
                (candidate._streamposition >= node._streamposition);

        return inEffect ? candidate : null;
}
2062
/// <summary>
/// Records the length of the current node's name and, when OptionFixNestedTags is set,
/// runs the nested-tag fix-up for it.
/// </summary>
/// <param name="index">Offset in the text just past the node name.</param>
private void PushNodeNameEnd(int index)
{
        int nameStart = _currentnode._namestartindex;
        _currentnode._namelength = index - nameStart;

        if (!OptionFixNestedTags)
                return;

        FixNestedTags();
}
2071
/// <summary>
/// Closes the current node. When an open node with the same name is known, that node is
/// closed by the current one. Otherwise the current node is treated as a stray end tag:
/// closed elements (e.g. &lt;/br&gt;) are kept as elements, overlapping elements are kept
/// as lower-cased text, and anything else is reported as a parse error.
/// Afterwards the last parent node is moved up to the nearest ancestor that is still open.
/// </summary>
private void CloseCurrentNode()
{
        if (_currentnode.Closed) // text or document are by def closed
                return;

        bool error = false;

        // find last node of this kind
        HtmlNode prev = (HtmlNode)_lastnodes[_currentnode.Name];
        if (prev == null)
        {
                if (HtmlNode.IsClosedElement(_currentnode.Name))
                {
                        // </br> will be seen as <br>
                        _currentnode.CloseNode(_currentnode);

                        // add to parent node
                        if (_lastparentnode != null)
                        {
                                // Walk the parent's children backwards looking for a childless node
                                // with the same name; everything walked over on the way is stacked.
                                HtmlNode foundNode = null;
                                Stack futureChild = new Stack();
                                for (HtmlNode node = _lastparentnode.LastChild; node != null; node = node.PreviousSibling)
                                {
                                        if ((node.Name == _currentnode.Name) && (!node.HasChildNodes))
                                        {
                                                foundNode = node;
                                                break;
                                        }
                                        futureChild.Push(node);
                                }
                                if (foundNode != null)
                                {
                                        // Re-parent the stacked siblings under the found node;
                                        // popping restores their original document order.
                                        while (futureChild.Count != 0)
                                        {
                                                HtmlNode moved = (HtmlNode)futureChild.Pop();
                                                _lastparentnode.RemoveChild(moved);
                                                foundNode.AppendChild(moved);
                                        }
                                }
                                else
                                {
                                        _lastparentnode.AppendChild(_currentnode);
                                }
                        }
                }
                else
                {
                        // node has no parent
                        // node is not a closed node

                        if (HtmlNode.CanOverlapElement(_currentnode.Name))
                        {
                                // this is a hack: add it as a text node
                                HtmlNode closenode = CreateNode(HtmlNodeType.Text, _currentnode._outerstartindex);
                                closenode._outerlength = _currentnode._outerlength;

                                // Lower the tag text with the invariant culture: a culture-sensitive
                                // ToLower would turn "</I>" into "</ı>" under Turkish/Azeri cultures.
                                HtmlTextNode overlapText = (HtmlTextNode)closenode;
                                overlapText.Text = overlapText.Text.ToLower(System.Globalization.CultureInfo.InvariantCulture);

                                if (_lastparentnode != null)
                                {
                                        _lastparentnode.AppendChild(closenode);
                                }
                        }
                        else
                        {
                                if (HtmlNode.IsEmptyElement(_currentnode.Name))
                                {
                                        // reported, but not treated as an error that blocks the parent update
                                        AddError(
                                                HtmlParseErrorCode.EndTagNotRequired,
                                                _currentnode._line, _currentnode._lineposition,
                                                _currentnode._streamposition, _currentnode.OuterHtml,
                                                "End tag </" + _currentnode.Name + "> is not required");
                                }
                                else
                                {
                                        // node cannot overlap, node is not empty
                                        AddError(
                                                HtmlParseErrorCode.TagNotOpened,
                                                _currentnode._line, _currentnode._lineposition,
                                                _currentnode._streamposition, _currentnode.OuterHtml,
                                                "Start tag <" + _currentnode.Name + "> was not found");
                                        error = true;
                                }
                        }
                }
        }
        else
        {
                if (OptionFixNestedTags)
                {
                        // an end tag is invalid while a resetter opened after its start tag is still open
                        if (FindResetterNodes(prev, GetResetters(_currentnode.Name)))
                        {
                                AddError(
                                        HtmlParseErrorCode.EndTagInvalidHere,
                                        _currentnode._line, _currentnode._lineposition,
                                        _currentnode._streamposition, _currentnode.OuterHtml,
                                        "End tag </" + _currentnode.Name + "> invalid here");
                                error = true;
                        }
                }

                if (!error)
                {
                        // the previous node of this name becomes the last open one again
                        _lastnodes[_currentnode.Name] = prev._prevwithsamename;
                        prev.CloseNode(_currentnode);
                }
        }

        // we close this node, get grandparent
        if (!error)
        {
                if ((_lastparentnode != null) &&
                        ((!HtmlNode.IsClosedElement(_currentnode.Name)) ||
                        (_currentnode._starttag)))
                {
                        UpdateLastParentNode();
                }
        }
}
2193
/// <summary>
/// Moves the last parent node up to its nearest ancestor that is not closed, falling back
/// to the document node when no such ancestor exists. A last parent node that is already
/// null is handled the same way instead of being dereferenced.
/// </summary>
internal void UpdateLastParentNode()
{
        // climb while the candidate parent exists but is already closed
        while ((_lastparentnode != null) && (_lastparentnode.Closed))
        {
                _lastparentnode = _lastparentnode.ParentNode;
        }

        // ran off the top of the tree: new nodes attach to the document itself
        if (_lastparentnode == null)
        {
                _lastparentnode = _documentnode;
        }
}
2209
/// <summary>
/// Extracts the name of the current attribute from the parsed text.
/// </summary>
/// <returns>The slice of the text delimited by the attribute's name start offset and name length.</returns>
private string CurrentAttributeName()
{
        int start = _currentattribute._namestartindex;
        int length = _currentattribute._namelength;
        return _text.Substring(start, length);
}
2214
/// <summary>
/// Extracts the value of the current attribute from the parsed text.
/// </summary>
/// <returns>The slice of the text delimited by the attribute's value start offset and value length.</returns>
private string CurrentAttributeValue()
{
        int start = _currentattribute._valuestartindex;
        int length = _currentattribute._valuelength;
        return _text.Substring(start, length);
}
2219
/// <summary>
/// Extracts the name of the current node from the parsed text, exactly as written there.
/// </summary>
/// <returns>The slice of the text delimited by the node's name start offset and name length.</returns>
private string CurrentNodeName()
{
        int start = _currentnode._namestartindex;
        int length = _currentnode._namelength;
        return _text.Substring(start, length);
}
2224
/// <summary>
/// Extracts the outer text of the current node from the parsed text.
/// </summary>
/// <returns>The slice of the text delimited by the node's outer start offset and outer length.</returns>
private string CurrentNodeOuter()
{
        int start = _currentnode._outerstartindex;
        int length = _currentnode._outerlength;
        return _text.Substring(start, length);
}
2229
/// <summary>
/// Extracts the inner text of the current node from the parsed text.
/// </summary>
/// <returns>The slice of the text delimited by the node's inner start offset and inner length.</returns>
private string CurrentNodeInner()
{
        int start = _currentnode._innerstartindex;
        int length = _currentnode._innerlength;
        return _text.Substring(start, length);
}
2234
/// <summary>
/// Tells whether a character code counts as whitespace for the parser.
/// </summary>
/// <param name="c">The character code to check.</param>
/// <returns>true for tab (9), line feed (10), carriage return (13) and space (32); false for anything else.</returns>
public static bool IsWhiteSpace(int c)
{
        switch (c)
        {
                case 9:         // tab
                case 10:        // line feed
                case 13:        // carriage return
                case 32:        // space
                        return true;

                default:
                        return false;
        }
}
2248
2249         }
2250
/// <summary>
/// Exception used by the parser to hand back the result of encoding detection.
/// It carries the encoding that was found, or null when none was.
/// </summary>
internal class EncodingFoundException: Exception
{
        // encoding carried by this exception; may be null
        private Encoding _detected;

        /// <summary>
        /// Creates the exception for the given encoding.
        /// </summary>
        /// <param name="encoding">The encoding to carry; may be null.</param>
        internal EncodingFoundException(Encoding encoding)
        {
                _detected = encoding;
        }

        /// <summary>
        /// Gets the encoding this exception was created with.
        /// </summary>
        internal Encoding Encoding
        {
                get { return _detected; }
        }
}
2268 }