Filters/HtmlAgilityPack/HtmlDocument.cs

   1 // HtmlAgilityPack V1.0 - Simon Mourier <simonm@microsoft.com>
   2
   3 /*
   4 Copyright (C) 2003 Simon Mourier <simonm@microsoft.com>
   5 All rights reserved.
   6
   7 Redistribution and use in source and binary forms, with or without
   8 modification, are permitted provided that the following conditions
   9 are met:
  10 1. Redistributions of source code must retain the above copyright
  11    notice, this list of conditions and the following disclaimer.
  12 2. Redistributions in binary form must reproduce the above copyright
  13    notice, this list of conditions and the following disclaimer in the
  14    documentation and/or other materials provided with the distribution.
  15 3. The name of the author may not be used to endorse or promote products
  16    derived from this software without specific prior written permission.
  17
  18 THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  19 IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  20 OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  21 IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  22 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  23 NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  24 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  25 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  26 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  27 THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  28 */
  29
  30 using System;
  31 using System.IO;
  32 using System.Text;
  33 using System.Diagnostics;
  34 using System.Collections;
  35 using System.Text.RegularExpressions;
  36 using System.Xml;
  37 using System.Xml.XPath;
  38
  39 namespace HtmlAgilityPack
  40 {
  41         /// <summary>
  42         /// Represents the type of parsing error.
  43         /// </summary>
  44         public enum HtmlParseErrorCode
  45         {
  46                 /// <summary>
  47                 /// A tag was not closed.
  48                 /// </summary>
  49                 TagNotClosed,
  50
  51                 /// <summary>
  52                 /// A tag was not opened.
  53                 /// </summary>
  54                 TagNotOpened,
  55
  56                 /// <summary>
  57                 /// There is a charset mismatch between stream and declared (META) encoding.
  58                 /// </summary>
  59                 CharsetMismatch,
  60
  61                 /// <summary>
  62                 /// An end tag was not required.
  63                 /// </summary>
  64                 EndTagNotRequired,
  65
  66                 /// <summary>
  67                 /// An end tag is invalid at this position.
  68                 /// </summary>
  69                 EndTagInvalidHere
  70         }
  71
  72         /// <summary>
  73         /// Represents a parsing error found during document parsing.
  74         /// </summary>
  75         public class HtmlParseError
  76         {
  77                 private HtmlParseErrorCode _code;
  78                 private int _line;
  79                 private int _linePosition;
  80                 private int _streamPosition;
  81                 private string _sourceText;
  82                 private string _reason;
  83
  84                 internal HtmlParseError(
  85                         HtmlParseErrorCode code,
  86                         int line,
  87                         int linePosition,
  88                         int streamPosition,
  89                         string sourceText,
  90                         string reason)
  91                 {
  92                         _code = code;
  93                         _line = line;
  94                         _linePosition = linePosition;
  95                         _streamPosition = streamPosition;
  96                         _sourceText = sourceText;
  97                         _reason = reason;
  98                 }
  99
 100                 /// <summary>
 101                 /// Gets the type of error.
 102                 /// </summary>
 103                 public HtmlParseErrorCode Code
 104                 {
 105                         get
 106                         {
 107                                 return _code;
 108                         }
 109                 }
 110
 111                 /// <summary>
 112                 /// Gets the line number of this error in the document.
 113                 /// </summary>
 114                 public int Line
 115                 {
 116                         get
 117                         {
 118                                 return _line;
 119                         }
 120                 }
 121
 122                 /// <summary>
 123                 /// Gets the column number of this error in the document.
 124                 /// </summary>
 125                 public int LinePosition
 126                 {
 127                         get
 128                         {
 129                                 return _linePosition;
 130                         }
 131                 }
 132
 133                 /// <summary>
 134                 /// Gets the absolstream position of this error in the document, relative to the start of the document.
 135                 /// </summary>
 136                 public int StreamPosition
 137                 {
 138                         get
 139                         {
 140                                 return _streamPosition;
 141                         }
 142                 }
 143
 144                 /// <summary>
 145                 /// Gets the the full text of the line containing the error.
 146                 /// </summary>
 147                 public string SourceText
 148                 {
 149                         get
 150                         {
 151                                 return _sourceText;
 152                                 }
 153                 }
 154
 155                 /// <summary>
 156                 /// Gets a description for the error.
 157                 /// </summary>
 158                 public string Reason
 159                 {
 160                         get
 161                         {
 162                                 return _reason;
 163                         }
 164                 }
 165         }
 166
 167         /// <summary>
 168         /// Represents a complete HTML document.
 169         /// </summary>
 170         public class HtmlDocument: IXPathNavigable
 171         {
 172                 internal static readonly string HtmlExceptionRefNotChild = "Reference node must be a child of this node";
 173                 internal static readonly string HtmlExceptionUseIdAttributeFalse = "You need to set UseIdAttribute property to true to enable this feature";
 174
 175                 internal Hashtable _openednodes;
 176                 internal Hashtable _lastnodes = new Hashtable();
 177                 internal Hashtable _nodesid;
 178                 private HtmlNode _documentnode;
 179                 internal string _text;
 180                 private HtmlNode _currentnode;
 181                 private HtmlNode _lastparentnode;
 182                 private HtmlAttribute _currentattribute;
 183                 private int _index;
 184                 private int _line;
 185                 private int _lineposition, _maxlineposition;
 186                 private int _c;
 187                 private bool _fullcomment;
 188                 private System.Text.Encoding _streamencoding;
 189                 private System.Text.Encoding _declaredencoding;
 190                 private ArrayList _parseerrors = new ArrayList();
 191                 private ParseState _state, _oldstate;
 192                 private Crc32 _crc32 = null;
 193                 private bool _onlyDetectEncoding = false;
 194
 195                 // public props
 196
 197                 /// <summary>
 198                 /// Defines if a checksum must be computed for the document while parsing. Default is false.
 199                 /// </summary>
 200                 public bool OptionComputeChecksum = false;
 201
 202                 /// <summary>
 203                 /// Defines if declared encoding must be read from the document.
 204                 /// Declared encoding is determined using the meta http-equiv="content-type" content="text/html;charset=XXXXX" html node.
 205                 /// Default is true.
 206                 /// </summary>
 207                 public bool OptionReadEncoding = true;
 208
 209
 210                 /// <summary>
 211                 /// Defines if non closed nodes will be checked at the end of parsing. Default is true.
 212                 /// </summary>
 213                 public bool OptionCheckSyntax = true;
 214
 215                 /// <summary>
 216                 /// Defines if the 'id' attribute must be specifically used. Default is true.
 217                 /// </summary>
 218                 public bool OptionUseIdAttribute = true;
 219
 220                 /// <summary>
 221                 /// Defines if empty nodes must be written as closed during output. Default is false.
 222                 /// </summary>
 223                 public bool OptionWriteEmptyNodes = false;
 224
 225                 /// <summary>
 226                 /// Defines if output must conform to XML, instead of HTML.
 227                 /// </summary>
 228                 public bool OptionOutputAsXml = false;
 229
 230                 /// <summary>
 231                 /// Defines if name must be output in uppercase. Default is false.
 232                 /// </summary>
 233                 public bool OptionOutputUpperCase = false;
 234
 235                 /// <summary>
 236                 /// Defines if attribute value output must be optimized (not bound with double quotes if it is possible). Default is false.
 237                 /// </summary>
 238                 public bool OptionOutputOptimizeAttributeValues = false;
 239
 240                 /// <summary>
 241                 /// Adds Debugging attributes to node. Default is false.
 242                 /// </summary>
 243                 public bool OptionAddDebuggingAttributes = false;
 244
 245                 /// <summary>
 246                 /// Defines if source text must be extracted while parsing errors.
 247                 /// If the document has a lot of errors, or cascading errors, parsing performance can be dramatically affected if set to true.
 248                 /// Default is false.
 249                 /// </summary>
 250                 public bool OptionExtractErrorSourceText = false; // turning this on can dramatically slow performance if a lot of errors are detected
 251
 252                 /// <summary>
 253                 /// Defines if closing for non closed nodes must be done at the end or directly in the document.
 254                 /// Setting this to true can actually change how browsers render the page. Default is false.
 255                 /// </summary>
 256                 public bool OptionAutoCloseOnEnd = false; // close errors at the end
 257
 258                 /// <summary>
 259                 /// Defines if LI, TR, TH, TD tags must be partially fixed when nesting errors are detected. Default is false.
 260                 /// </summary>
 261                 public bool OptionFixNestedTags = false; // fix li, tr, th, td tags
 262
 263                 /// <summary>
 264                 /// Defines the maximum length of source text or parse errors. Default is 100.
 265                 /// </summary>
 266                 public int OptionExtractErrorSourceTextMaxLength = 100;
 267
 268                 /// <summary>
 269                 /// Defines the default stream encoding to use. Default is System.Text.Encoding.Default.
 270                 /// </summary>
 271                 public System.Text.Encoding OptionDefaultStreamEncoding = System.Text.Encoding.Default;
 272
 273                 /// <summary>
 274                 /// Gets a list of parse errors found in the document.
 275                 /// </summary>
 276                 public ArrayList ParseErrors
 277                 {
 278                         get
 279                         {
 280                                 return _parseerrors;
 281                         }
 282                 }
 283
 284                 /// <summary>
 285                 /// Gets the document's stream encoding.
 286                 /// </summary>
 287                 public System.Text.Encoding StreamEncoding
 288                 {
 289                         get
 290                         {
 291                                 return _streamencoding;
 292                         }
 293                 }
 294
 295                 /// <summary>
 296                 /// Gets the document's declared encoding.
 297                 /// Declared encoding is determined using the meta http-equiv="content-type" content="text/html;charset=XXXXX" html node.
 298                 /// </summary>
 299                 public System.Text.Encoding DeclaredEncoding
 300                 {
 301                         get
 302                         {
 303                                 return _declaredencoding;
 304                         }
 305                 }
 306
 307                 /// <summary>
 308                 /// Creates an instance of an HTML document.
 309                 /// </summary>
 310                 public HtmlDocument()
 311                 {
 312                         _documentnode = CreateNode(HtmlNodeType.Document, 0);
 313                 }
 314
 315                 internal HtmlNode GetXmlDeclaration()
 316                 {
 317                         if (!_documentnode.HasChildNodes)
 318                         {
 319                                 return null;
 320                         }
 321
 322                         foreach(HtmlNode node in _documentnode._childnodes)
 323                         {
 324                                 if (node.Name == "?xml") // it's ok, names are case sensitive
 325                                 {
 326                                         return node;
 327                                 }
 328                         }
 329                         return null;
 330                 }
 331
 332                 /// <summary>
 333                 /// Applies HTML encoding to a specified string.
 334                 /// </summary>
 335                 /// <param name="html">The input string to encode. May not be null.</param>
 336                 /// <returns>The encoded string.</returns>
 337                 public static string HtmlEncode(string html)
 338                 {
 339                         if (html == null)
 340                         {
 341                                 throw new ArgumentNullException("html");
 342                         }
 343                         // replace & by &amp; but only once!
 344                         Regex rx = new Regex("&(?!(amp;)|(lt;)|(gt;)|(quot;))", RegexOptions.IgnoreCase);
 345                         return rx.Replace(html, "&amp;").Replace("<", "&lt;").Replace(">", "&gt;").Replace("\"", "&quot;");
 346                 }
 347
 348                 /// <summary>
 349                 /// Detects the encoding of an HTML stream.
 350                 /// </summary>
 351                 /// <param name="stream">The input stream. May not be null.</param>
 352                 /// <returns>The detected encoding.</returns>
 353                 public Encoding DetectEncoding(Stream stream)
 354                 {
 355                         if (stream == null)
 356                         {
 357                                 throw new ArgumentNullException("stream");
 358                         }
 359                         return DetectEncoding(new StreamReader(stream));
 360                 }
 361
 362                 /// <summary>
 363                 /// Detects the encoding of an HTML file.
 364                 /// </summary>
 365                 /// <param name="path">Path for the file containing the HTML document to detect. May not be null.</param>
 366                 /// <returns>The detected encoding.</returns>
 367                 public Encoding DetectEncoding(string path)
 368                 {
 369                         if (path == null)
 370                         {
 371                                 throw new ArgumentNullException("path");
 372                         }
 373                         StreamReader sr = new StreamReader(path, OptionDefaultStreamEncoding);
 374                         Encoding encoding = DetectEncoding(sr);
 375                         sr.Close();
 376                         return encoding;
 377                 }
 378
 379                 /// <summary>
 380                 /// Detects the encoding of an HTML text.
 381                 /// </summary>
 382                 /// <param name="html">The input html text. May not be null.</param>
 383                 /// <returns>The detected encoding.</returns>
 384                 public Encoding DetectEncodingHtml(string html)
 385                 {
 386                         if (html == null)
 387                         {
 388                                 throw new ArgumentNullException("html");
 389                         }
 390                         StringReader sr = new StringReader(html);
 391                         Encoding encoding = DetectEncoding(sr);
 392                         sr.Close();
 393                         return encoding;
 394                 }
 395
 396                 /// <summary>
 397                 /// Detects the encoding of an HTML text provided on a TextReader.
 398                 /// </summary>
 399                 /// <param name="reader">The TextReader used to feed the HTML. May not be null.</param>
 400                 /// <returns>The detected encoding.</returns>
 401                 public Encoding DetectEncoding(TextReader reader)
 402                 {
 403                         if (reader == null)
 404                         {
 405                                 throw new ArgumentNullException("reader");
 406                         }
 407                         _onlyDetectEncoding = true;
 408                         if (OptionCheckSyntax)
 409                         {
 410                                 _openednodes = new Hashtable();
 411                         }
 412                         else
 413                         {
 414                                 _openednodes = null;
 415                         }
 416
 417                         if (OptionUseIdAttribute)
 418                         {
 419                                 _nodesid = new Hashtable();
 420                         }
 421                         else
 422                         {
 423                                 _nodesid = null;
 424                         }
 425
 426                         StreamReader sr = reader as StreamReader;
 427                         if (sr != null)
 428                         {
 429                                 _streamencoding = sr.CurrentEncoding;
 430                         }
 431                         else
 432                         {
 433                                 _streamencoding = null;
 434                         }
 435                         _declaredencoding = null;
 436
 437                         _text = reader.ReadToEnd();
 438                         _documentnode = CreateNode(HtmlNodeType.Document, 0);
 439
 440                         // this is a hack, but it allows us not to muck with the original parsing code
 441                         try
 442                         {
 443                                 Parse();
 444                         }
 445                         catch(EncodingFoundException ex)
 446                         {
 447                                 return ex.Encoding;
 448                         }
 449                         return null;
 450                 }
 451
 452                 /// <summary>
 453                 /// Loads an HTML document from a stream.
 454                 /// </summary>
 455                 /// <param name="stream">The input stream.</param>
 456                 public void Load(Stream stream)
 457                 {
 458                         Load(new StreamReader(stream, OptionDefaultStreamEncoding));
 459                 }
 460
 461                 /// <summary>
 462                 /// Loads an HTML document from a stream.
 463                 /// </summary>
 464                 /// <param name="stream">The input stream.</param>
 465                 /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
 466                 public void Load(Stream stream, bool detectEncodingFromByteOrderMarks)
 467                 {
 468                         Load(new StreamReader(stream, detectEncodingFromByteOrderMarks));
 469                 }
 470
 471                 /// <summary>
 472                 /// Loads an HTML document from a stream.
 473                 /// </summary>
 474                 /// <param name="stream">The input stream.</param>
 475                 /// <param name="encoding">The character encoding to use.</param>
 476                 public void Load(Stream stream, Encoding encoding)
 477                 {
 478                         Load(new StreamReader(stream, encoding));
 479                 }
 480
 481                 /// <summary>
 482                 /// Loads an HTML document from a stream.
 483                 /// </summary>
 484                 /// <param name="stream">The input stream.</param>
 485                 /// <param name="encoding">The character encoding to use.</param>
 486                 /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
 487                 public void Load(Stream stream, Encoding encoding, bool detectEncodingFromByteOrderMarks)
 488                 {
 489                         Load(new StreamReader(stream, encoding, detectEncodingFromByteOrderMarks));
 490                 }
 491
 492                 /// <summary>
 493                 /// Loads an HTML document from a stream.
 494                 /// </summary>
 495                 /// <param name="stream">The input stream.</param>
 496                 /// <param name="encoding">The character encoding to use.</param>
 497                 /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
 498                 /// <param name="buffersize">The minimum buffer size.</param>
 499                 public void Load(Stream stream, Encoding encoding, bool detectEncodingFromByteOrderMarks, int buffersize)
 500                 {
 501                         Load(new StreamReader(stream, encoding, detectEncodingFromByteOrderMarks, buffersize));
 502                 }
 503
 504                 /// <summary>
 505                 /// Loads an HTML document from a file.
 506                 /// </summary>
 507                 /// <param name="path">The complete file path to be read. May not be null.</param>
 508                 public void Load(string path)
 509                 {
 510                         if (path == null)
 511                         {
 512                                 throw new ArgumentNullException("path");
 513                         }
 514                         StreamReader sr = new StreamReader(path, OptionDefaultStreamEncoding);
 515                         Load(sr);
 516                         sr.Close();
 517                 }
 518
 519                 /// <summary>
 520                 /// Loads an HTML document from a file.
 521                 /// </summary>
 522                 /// <param name="path">The complete file path to be read. May not be null.</param>
 523                 /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
 524                 public void Load(string path, bool detectEncodingFromByteOrderMarks)
 525                 {
 526                         if (path == null)
 527                         {
 528                                 throw new ArgumentNullException("path");
 529                         }
 530                         StreamReader sr = new StreamReader(path, detectEncodingFromByteOrderMarks);
 531                         Load(sr);
 532                         sr.Close();
 533                 }
 534
 535                 /// <summary>
 536                 /// Loads an HTML document from a file.
 537                 /// </summary>
 538                 /// <param name="path">The complete file path to be read. May not be null.</param>
 539                 /// <param name="encoding">The character encoding to use. May not be null.</param>
 540                 public void Load(string path, Encoding encoding)
 541                 {
 542                         if (path == null)
 543                         {
 544                                 throw new ArgumentNullException("path");
 545                         }
 546                         if (encoding == null)
 547                         {
 548                                 throw new ArgumentNullException("encoding");
 549                         }
 550                         StreamReader sr = new StreamReader(path, encoding);
 551                         Load(sr);
 552                         sr.Close();
 553                 }
 554
 555                 /// <summary>
 556                 /// Loads an HTML document from a file.
 557                 /// </summary>
 558                 /// <param name="path">The complete file path to be read. May not be null.</param>
 559                 /// <param name="encoding">The character encoding to use. May not be null.</param>
 560                 /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
 561                 public void Load(string path, Encoding encoding, bool detectEncodingFromByteOrderMarks)
 562                 {
 563                         if (path == null)
 564                         {
 565                                 throw new ArgumentNullException("path");
 566                         }
 567                         if (encoding == null)
 568                         {
 569                                 throw new ArgumentNullException("encoding");
 570                         }
 571                         StreamReader sr = new StreamReader(path, encoding, detectEncodingFromByteOrderMarks);
 572                         Load(sr);
 573                         sr.Close();
 574                 }
 575
 576                 /// <summary>
 577                 /// Loads an HTML document from a file.
 578                 /// </summary>
 579                 /// <param name="path">The complete file path to be read. May not be null.</param>
 580                 /// <param name="encoding">The character encoding to use. May not be null.</param>
 581                 /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
 582                 /// <param name="buffersize">The minimum buffer size.</param>
 583                 public void Load(string path, Encoding encoding, bool detectEncodingFromByteOrderMarks, int buffersize)
 584                 {
 585                         if (path == null)
 586                         {
 587                                 throw new ArgumentNullException("path");
 588                         }
 589                         if (encoding == null)
 590                         {
 591                                 throw new ArgumentNullException("encoding");
 592                         }
 593                         StreamReader sr = new StreamReader(path, encoding, detectEncodingFromByteOrderMarks, buffersize);
 594                         Load(sr);
 595                         sr.Close();
 596                 }
 597
 598                 /// <summary>
 599                 /// Loads the HTML document from the specified string.
 600                 /// </summary>
 601                 /// <param name="html">String containing the HTML document to load. May not be null.</param>
 602                 public void LoadHtml(string html)
 603                 {
 604                         if (html == null)
 605                         {
 606                                 throw new ArgumentNullException("html");
 607                         }
 608                         StringReader sr = new StringReader(html);
 609                         Load(sr);
 610                         sr.Close();
 611                 }
 612
 613                 /// <summary>
 614                 /// Detects the encoding of an HTML document from a file first, and then loads the file.
 615                 /// </summary>
 616                 /// <param name="path">The complete file path to be read.</param>
 617                 public void DetectEncodingAndLoad(string path)
 618                 {
 619                         DetectEncodingAndLoad(path, true);
 620                 }
 621
 622                 /// <summary>
 623                 /// Detects the encoding of an HTML document from a file first, and then loads the file.
 624                 /// </summary>
 625                 /// <param name="path">The complete file path to be read. May not be null.</param>
 626                 /// <param name="detectEncoding">true to detect encoding, false otherwise.</param>
 627                 public void DetectEncodingAndLoad(string path, bool detectEncoding)
 628                 {
 629                         if (path == null)
 630                         {
 631                                 throw new ArgumentNullException("path");
 632                         }
 633                         System.Text.Encoding enc;
 634                         if (detectEncoding)
 635                         {
 636                                 enc = DetectEncoding(path);
 637                         }
 638                         else
 639                         {
 640                                 enc = null;
 641                         }
 642
 643                         if (enc == null)
 644                         {
 645                                 Load(path);
 646                         }
 647                         else
 648                         {
 649                                 Load(path, enc);
 650                         }
 651                 }
 652
 653                 /// <summary>
 654                 /// Loads the HTML document from the specified TextReader.
 655                 /// </summary>
 656                 /// <param name="reader">The TextReader used to feed the HTML data into the document. May not be null.</param>
 657                 public void Load(TextReader reader)
 658                 {
 659                         // all Load methods pass down to this one
 660                         if (reader == null)
 661                         {
 662                                 throw new ArgumentNullException("reader");
 663                         }
 664
 665                         _onlyDetectEncoding = false;
 666
 667                         if (OptionCheckSyntax)
 668                         {
 669                                 _openednodes = new Hashtable();
 670                         }
 671                         else
 672                         {
 673                                 _openednodes = null;
 674                         }
 675
 676                         if (OptionUseIdAttribute)
 677                         {
 678                                 _nodesid = new Hashtable();
 679                         }
 680                         else
 681                         {
 682                                 _nodesid = null;
 683                         }
 684
 685                         StreamReader sr = reader as StreamReader;
 686                         if (sr != null)
 687                         {
 688                                 try
 689                                 {
 690                                     // trigger bom read if needed
 691                                     sr.Peek();
 692                                 }
 693                                 catch
 694                                 {
 695                                     // void on purpose
 696                                 }
 697                                 _streamencoding = sr.CurrentEncoding;
 698                         }
 699                         else
 700                         {
 701                                 _streamencoding = null;
 702                         }
 703                         _declaredencoding = null;
 704
 705                         _text = reader.ReadToEnd();
 706                         _documentnode = CreateNode(HtmlNodeType.Document, 0);
 707                         Parse();
 708
 709                         if (OptionCheckSyntax)
 710                         {
 711                                 foreach(HtmlNode node in _openednodes.Values)
 712                                 {
 713                                         if (!node._starttag)    // already reported
 714                                         {
 715                                                 continue;
 716                                         }
 717
 718                                         string html;
 719                                         if (OptionExtractErrorSourceText)
 720                                         {
 721                                                 html = node.OuterHtml;
 722                                                 if (html.Length > OptionExtractErrorSourceTextMaxLength)
 723                                                 {
 724                                                         html = html.Substring(0, OptionExtractErrorSourceTextMaxLength);
 725                                                 }
 726                                         }
 727                                         else
 728                                         {
 729                                                 html = string.Empty;
 730                                         }
 731                                         AddError(
 732                                                 HtmlParseErrorCode.TagNotClosed,
 733                                                 node._line, node._lineposition,
 734                                                 node._streamposition, html,
 735                                                 "End tag </" + node.Name + "> was not found");
 736                                 }
 737
 738                                 // we don't need this anymore
 739                                 _openednodes.Clear();
 740                         }
 741                 }
 742
 743                 internal System.Text.Encoding GetOutEncoding()
 744                 {
 745                         // when unspecified, use the stream encoding first
 746                         if (_declaredencoding != null)
 747                         {
 748                                 return _declaredencoding;
 749                         }
 750                         else
 751                         {
 752                                 if (_streamencoding != null)
 753                                 {
 754                                         return _streamencoding;
 755                                 }
 756                         }
 757                         return OptionDefaultStreamEncoding;
 758                 }
 759
 760
 761                 /// <summary>
 762                 /// Gets the document's output encoding.
 763                 /// </summary>
 764                 public System.Text.Encoding Encoding
 765                 {
 766                         get
 767                         {
 768                                 return GetOutEncoding();
 769                         }
 770                 }
 771
 772                 /// <summary>
 773                 /// Saves the HTML document to the specified stream.
 774                 /// </summary>
 775                 /// <param name="outStream">The stream to which you want to save.</param>
 776                 public void Save(Stream outStream)
 777                 {
 778                         StreamWriter sw = new StreamWriter(outStream, GetOutEncoding());
 779                         Save(sw);
 780                 }
 781
 782                 /// <summary>
 783                 /// Saves the HTML document to the specified stream.
 784                 /// </summary>
 785                 /// <param name="outStream">The stream to which you want to save. May not be null.</param>
 786                 /// <param name="encoding">The character encoding to use. May not be null.</param>
 787                 public void Save(Stream outStream, System.Text.Encoding encoding)
 788                 {
 789                         if (outStream == null)
 790                         {
 791                                 throw new ArgumentNullException("outStream");
 792                         }
 793                         if (encoding == null)
 794                         {
 795                                 throw new ArgumentNullException("encoding");
 796                         }
 797                         StreamWriter sw = new StreamWriter(outStream, encoding);
 798                         Save(sw);
 799                 }
 800
 801                 /// <summary>
 802                 /// Saves the mixed document to the specified file.
 803                 /// </summary>
 804                 /// <param name="filename">The location of the file where you want to save the document.</param>
 805                 public void Save(string filename)
 806                 {
 807                         StreamWriter sw = new StreamWriter(filename, false, GetOutEncoding());
 808                         Save(sw);
 809                         sw.Close();
 810                 }
 811
 812                 /// <summary>
 813                 /// Saves the mixed document to the specified file.
 814                 /// </summary>
 815                 /// <param name="filename">The location of the file where you want to save the document. May not be null.</param>
 816                 /// <param name="encoding">The character encoding to use. May not be null.</param>
 817                 public void Save(string filename, System.Text.Encoding encoding)
 818                 {
 819                         if (filename == null)
 820                         {
 821                                 throw new ArgumentNullException("filename");
 822                         }
 823                         if (encoding == null)
 824                         {
 825                                 throw new ArgumentNullException("encoding");
 826                         }
 827                         StreamWriter sw = new StreamWriter(filename, false, encoding);
 828                         Save(sw);
 829                         sw.Close();
 830                 }
 831
 832                 /// <summary>
 833                 /// Saves the HTML document to the specified StreamWriter.
 834                 /// </summary>
 835                 /// <param name="writer">The StreamWriter to which you want to save.</param>
 836                 public void Save(StreamWriter writer)
 837                 {
 838                         Save((TextWriter)writer);
 839                 }
 840
 841                 /// <summary>
 842                 /// Saves the HTML document to the specified TextWriter.
 843                 /// </summary>
 844                 /// <param name="writer">The TextWriter to which you want to save. May not be null.</param>
 845                 public void Save(TextWriter writer)
 846                 {
 847                         if (writer == null)
 848                         {
 849                                 throw new ArgumentNullException("writer");
 850                         }
 851                         DocumentNode.WriteTo(writer);
 852                 }
 853
 854                 /// <summary>
 855                 /// Saves the HTML document to the specified XmlWriter.
 856                 /// </summary>
 857                 /// <param name="writer">The XmlWriter to which you want to save.</param>
 858                 public void Save(XmlWriter writer)
 859                 {
 860                         DocumentNode.WriteTo(writer);
 861                         writer.Flush();
 862                 }
 863
 864                 /// <summary>
 865                 /// Creates a new XPathNavigator object for navigating this HTML document.
 866                 /// </summary>
 867                 /// <returns>An XPathNavigator object. The XPathNavigator is positioned on the root of the document.</returns>
 868                 public XPathNavigator CreateNavigator()
 869                 {
 870                         return new HtmlNodeNavigator(this, _documentnode);
 871                 }
 872
 873                 internal void SetIdForNode(HtmlNode node, string id)
 874                 {
 875                         if (!OptionUseIdAttribute)
 876                         {
 877                                 return;
 878                         }
 879
 880                         if ((_nodesid == null) || (id == null))
 881                         {
 882                                 return;
 883                         }
 884
 885                         if (node == null)
 886                         {
 887                                 _nodesid.Remove(id.ToLower());
 888                         }
 889                         else
 890                         {
 891                                 _nodesid[id.ToLower()] = node;
 892                         }
 893                 }
 894
 895                 /// <summary>
 896                 /// Gets the HTML node with the specified 'id' attribute value.
 897                 /// </summary>
 898                 /// <param name="id">The attribute id to match. May not be null.</param>
 899                 /// <returns>The HTML node with the matching id or null if not found.</returns>
 900                 public HtmlNode GetElementbyId(string id)
 901                 {
 902                         if (id == null)
 903                         {
 904                                 throw new ArgumentNullException("id");
 905                         }
 906                         if (_nodesid == null)
 907                         {
 908                                 throw new Exception(HtmlExceptionUseIdAttributeFalse);
 909                         }
 910
 911                         return _nodesid[id.ToLower()] as HtmlNode;
 912                 }
 913
 914                 /// <summary>
 915                 /// Creates an HTML element node with the specified name.
 916                 /// </summary>
 917                 /// <param name="name">The qualified name of the element. May not be null.</param>
 918                 /// <returns>The new HTML node.</returns>
 919                 public HtmlNode CreateElement(string name)
 920                 {
 921                         if (name == null)
 922                         {
 923                                 throw new ArgumentNullException("name");
 924                         }
 925                         HtmlNode node = CreateNode(HtmlNodeType.Element);
 926                         node._name = name;
 927                         return node;
 928                 }
 929
 930                 /// <summary>
 931                 /// Creates an HTML comment node.
 932                 /// </summary>
 933                 /// <returns>The new HTML comment node.</returns>
 934                 public HtmlCommentNode CreateComment()
 935                 {
 936                         return (HtmlCommentNode)CreateNode(HtmlNodeType.Comment);
 937                 }
 938
 939                 /// <summary>
 940                 /// Creates an HTML comment node with the specified comment text.
 941                 /// </summary>
 942                 /// <param name="comment">The comment text. May not be null.</param>
 943                 /// <returns>The new HTML comment node.</returns>
 944                 public HtmlCommentNode CreateComment(string comment)
 945                 {
 946                         if (comment == null)
 947                         {
 948                                 throw new ArgumentNullException("comment");
 949                         }
 950                         HtmlCommentNode c = CreateComment();
 951                         c.Comment = comment;
 952                         return c;
 953                 }
 954
 955                 /// <summary>
 956                 /// Creates an HTML text node.
 957                 /// </summary>
 958                 /// <returns>The new HTML text node.</returns>
 959                 public HtmlTextNode CreateTextNode()
 960                 {
 961                         return (HtmlTextNode)CreateNode(HtmlNodeType.Text);
 962                 }
 963
 964                 /// <summary>
 965                 /// Creates an HTML text node with the specified text.
 966                 /// </summary>
 967                 /// <param name="text">The text of the node. May not be null.</param>
 968                 /// <returns>The new HTML text node.</returns>
 969                 public HtmlTextNode CreateTextNode(string text)
 970                 {
 971                         if (text == null)
 972                         {
 973                                 throw new ArgumentNullException("text");
 974                         }
 975                         HtmlTextNode t = CreateTextNode();
 976                         t.Text = text;
 977                         return t;
 978                 }
 979
 980                 internal HtmlNode CreateNode(HtmlNodeType type)
 981                 {
 982                         return CreateNode(type, -1);
 983                 }
 984
 985                 internal HtmlNode CreateNode(HtmlNodeType type, int index)
 986                 {
 987                         switch (type)
 988                         {
 989                                 case HtmlNodeType.Comment:
 990                                         return new HtmlCommentNode(this, index);
 991
 992                                 case HtmlNodeType.Text:
 993                                         return new HtmlTextNode(this, index);
 994
 995                                 default:
 996                                         return new HtmlNode(type, this, index);
 997                         }
 998                 }
 999
1000                 internal HtmlAttribute CreateAttribute()
1001                 {
1002                         return new HtmlAttribute(this);
1003                 }
1004
1005                 /// <summary>
1006                 /// Creates an HTML attribute with the specified name.
1007                 /// </summary>
1008                 /// <param name="name">The name of the attribute. May not be null.</param>
1009                 /// <returns>The new HTML attribute.</returns>
1010                 public HtmlAttribute CreateAttribute(string name)
1011                 {
1012                         if (name == null)
1013                         {
1014                                 throw new ArgumentNullException("name");
1015                         }
1016                         HtmlAttribute att = CreateAttribute();
1017                         att.Name = name;
1018                         return att;
1019                 }
1020
1021                 /// <summary>
1022                 /// Creates an HTML attribute with the specified name.
1023                 /// </summary>
1024                 /// <param name="name">The name of the attribute. May not be null.</param>
1025                 /// <param name="value">The value of the attribute.</param>
1026                 /// <returns>The new HTML attribute.</returns>
1027                 public HtmlAttribute CreateAttribute(string name, string value)
1028                 {
1029                         if (name == null)
1030                         {
1031                                 throw new ArgumentNullException("name");
1032                         }
1033                         HtmlAttribute att = CreateAttribute(name);
1034                         att.Value = value;
1035                         return att;
1036                 }
1037
1038                 /// <summary>
1039                 /// Gets the root node of the document.
1040                 /// </summary>
1041                 public HtmlNode DocumentNode
1042                 {
1043                         get
1044                         {
1045                                 return _documentnode;
1046                         }
1047                 }
1048
1049                 /// <summary>
1050                 /// Gets the document CRC32 checksum if OptionComputeChecksum was set to true before parsing, 0 otherwise.
1051                 /// </summary>
1052                 public int CheckSum
1053                 {
1054                         get
1055                         {
1056                                 if (_crc32 == null)
1057                                 {
1058                                         return 0;
1059                                 }
1060                                 else
1061                                 {
1062                                         return (int)_crc32.CheckSum;
1063                                 }
1064                         }
1065                 }
1066
1067                 private HtmlParseError AddError(
1068                                 HtmlParseErrorCode code,
1069                                 int line,
1070                                 int linePosition,
1071                                 int streamPosition,
1072                                 string sourceText,
1073                                 string reason)
1074                         {
1075                         HtmlParseError err = new HtmlParseError(code, line, linePosition, streamPosition, sourceText, reason);
1076                         _parseerrors.Add(err);
1077                         return err;
1078                 }
1079
                // States of the character-by-character state machine run by Parse();
                // the current state is held in _state.
                // The notes below describe only what the visible part of Parse() does in each state.
                private enum ParseState
                {
                        // initial state; also re-entered once a tag's closing '>' has been handled
                        Text,
                        // start of a tag name: a '/' here is treated differently from any other
                        // character (presumably closing vs. opening tag - confirm in PushNodeNameStart)
                        WhichTag,
                        // reading the tag name; ended by whitespace, '/' or '>'
                        Tag,
                        // inside a tag, skipping whitespace until an attribute name, '/', '?' or '>'
                        BetweenAttributes,
                        // a '/' (or '?') was seen inside a tag; a following '>' ends the node
                        EmptyTag,
                        // reading an attribute name; ended by whitespace, '=' or '>'
                        AttributeName,
                        // whitespace followed an attribute name; waiting for '=', '>' or a new attribute
                        AttributeBeforeEquals,
                        // '=' was seen; waiting for the value (a quote switches to QuotedAttributeValue)
                        AttributeAfterEquals,
                        AttributeValue,
                        Comment,
                        // reading an attribute value that started with a single or double quote
                        QuotedAttributeValue,
                        ServerSideCode,
                        PcData
                }
1096
1097                 private void IncrementPosition()
1098                 {
1099                         if (_crc32 != null)
1100                         {
1101                                 // REVIEW: should we add some checksum code in DecrementPosition too?
1102                                 _crc32.AddToCRC32(_c);
1103                         }
1104
1105                         _index++;
1106                         _maxlineposition = _lineposition;
1107                         if (_c == 10)
1108                         {
1109                                 _lineposition = 1;
1110                                 _line++;
1111                         }
1112                         else
1113                         {
1114                                 _lineposition++;
1115                         }
1116                 }
1117
1118                 private void DecrementPosition()
1119                 {
1120                         _index--;
1121                         if (_lineposition == 1)
1122                         {
1123                                 _lineposition = _maxlineposition;
1124                                 _line--;
1125                         }
1126                         else
1127                         {
1128                                 _lineposition--;
1129                         }
1130                 }
1131
1132                 private void Parse()
1133                 {
1134                         int lastquote = 0;
1135                         if (OptionComputeChecksum)
1136                         {
1137                                 _crc32 = new Crc32();
1138                         }
1139
1140                         _lastnodes = new Hashtable();
1141                         _c = 0;
1142                         _fullcomment = false;
1143                         _parseerrors = new ArrayList();
1144                         _line = 1;
1145                         _lineposition = 1;
1146                         _maxlineposition = 1;
1147
1148                         _state = ParseState.Text;
1149                         _oldstate = _state;
1150                         _documentnode._innerlength = _text.Length;
1151                         _documentnode._outerlength = _text.Length;
1152
1153                         _lastparentnode = _documentnode;
1154                         _currentnode = CreateNode(HtmlNodeType.Text, 0);
1155                         _currentattribute = null;
1156
1157                         _index = 0;
1158                         PushNodeStart(HtmlNodeType.Text, 0);
1159                         while (_index<_text.Length)
1160                         {
1161                                 _c = _text[_index];
1162                                 IncrementPosition();
1163
1164                                 switch(_state)
1165                                 {
1166                                         case ParseState.Text:
1167                                                 if (NewCheck())
1168                                                         continue;
1169                                                 break;
1170
1171                                         case ParseState.WhichTag:
1172                                                 if (NewCheck())
1173                                                         continue;
1174                                                 if (_c == '/')
1175                                                 {
1176                                                         PushNodeNameStart(false, _index);
1177                                                 }
1178                                                 else
1179                                                 {
1180                                                         PushNodeNameStart(true, _index-1);
1181                                                         DecrementPosition();
1182                                                 }
1183                                                 _state = ParseState.Tag;
1184                                                 break;
1185
1186                                         case ParseState.Tag:
1187                                                 if (NewCheck())
1188                                                         continue;
1189                                                 if (IsWhiteSpace(_c))
1190                                                 {
1191                                                         PushNodeNameEnd(_index-1);
1192                                                         if (_state != ParseState.Tag)
1193                                                                 continue;
1194                                                         _state = ParseState.BetweenAttributes;
1195                                                         continue;
1196                                                 }
1197                                                 if (_c == '/')
1198                                                 {
1199                                                         PushNodeNameEnd(_index-1);
1200                                                         if (_state != ParseState.Tag)
1201                                                                 continue;
1202                                                         _state = ParseState.EmptyTag;
1203                                                         continue;
1204                                                 }
1205                                                 if (_c == '>')
1206                                                 {
1207                                                         PushNodeNameEnd(_index-1);
1208                                                         if (_state != ParseState.Tag)
1209                                                                 continue;
1210                                                         PushNodeEnd(_index, false);
1211                                                         if (_state != ParseState.Tag)
1212                                                                 continue;
1213                                                         _state = ParseState.Text;
1214                                                         PushNodeStart(HtmlNodeType.Text, _index);
1215                                                 }
1216                                                 break;
1217
1218                                         case ParseState.BetweenAttributes:
1219                                                 if (NewCheck())
1220                                                         continue;
1221
1222                                                 if (IsWhiteSpace(_c))
1223                                                         continue;
1224
1225                                                 if ((_c == '/') || (_c == '?'))
1226                                                 {
1227                                                         _state = ParseState.EmptyTag;
1228                                                         continue;
1229                                                 }
1230
1231                                                 if (_c == '>')
1232                                                 {
1233                                                         PushNodeEnd(_index, false);
1234                                                         if (_state != ParseState.BetweenAttributes)
1235                                                                 continue;
1236                                                         _state = ParseState.Text;
1237                                                         PushNodeStart(HtmlNodeType.Text, _index);
1238                                                         continue;
1239                                                 }
1240
1241                                                 PushAttributeNameStart(_index-1);
1242                                                 _state = ParseState.AttributeName;
1243                                                 break;
1244
1245                                         case ParseState.EmptyTag:
1246                                                 if (NewCheck())
1247                                                         continue;
1248
1249                                                 if (_c == '>')
1250                                                 {
1251                                                         PushNodeEnd(_index, true);
1252                                                         if (_state != ParseState.EmptyTag)
1253                                                                 continue;
1254                                                         _state = ParseState.Text;
1255                                                         PushNodeStart(HtmlNodeType.Text, _index);
1256                                                         continue;
1257                                                 }
1258                                                 _state = ParseState.BetweenAttributes;
1259                                                 break;
1260
1261                                         case ParseState.AttributeName:
1262                                                 if (NewCheck())
1263                                                         continue;
1264
1265                                                 if (IsWhiteSpace(_c))
1266                                                 {
1267                                                         PushAttributeNameEnd(_index-1);
1268                                                         _state = ParseState.AttributeBeforeEquals;
1269                                                         continue;
1270                                                 }
1271                                                 if (_c == '=')
1272                                                 {
1273                                                         PushAttributeNameEnd(_index-1);
1274                                                         _state = ParseState.AttributeAfterEquals;
1275                                                         continue;
1276                                                 }
1277                                                 if (_c == '>')
1278                                                 {
1279                                                         PushAttributeNameEnd(_index-1);
1280                                                         PushNodeEnd(_index, false);
1281                                                         if (_state != ParseState.AttributeName)
1282                                                                 continue;
1283                                                         _state = ParseState.Text;
1284                                                         PushNodeStart(HtmlNodeType.Text, _index);
1285                                                         continue;
1286                                                 }
1287                                                 break;
1288
1289                                         case ParseState.AttributeBeforeEquals:
1290                                                 if (NewCheck())
1291                                                         continue;
1292
1293                                                 if (IsWhiteSpace(_c))
1294                                                         continue;
1295                                                 if (_c == '>')
1296                                                 {
1297                                                         PushNodeEnd(_index, false);
1298                                                         if (_state != ParseState.AttributeBeforeEquals)
1299                                                                 continue;
1300                                                         _state = ParseState.Text;
1301                                                         PushNodeStart(HtmlNodeType.Text, _index);
1302                                                         continue;
1303                                                 }
1304                                                 if (_c == '=')
1305                                                 {
1306                                                         _state = ParseState.AttributeAfterEquals;
1307                                                         continue;
1308                                                 }
1309                                                 // no equals, no whitespace, it's a new attribute starting
1310                                                 _state = ParseState.BetweenAttributes;
1311                                                 DecrementPosition();
1312                                                 break;
1313
1314                                         case ParseState.AttributeAfterEquals:
1315                                                 if (NewCheck())
1316                                                         continue;
1317
1318                                                 if (IsWhiteSpace(_c))
1319                                                         continue;
1320
1321                                                 if ((_c == '\'') || (_c == '"'))
1322                                                 {
1323                                                         _state = ParseState.QuotedAttributeValue;
1324                                                         PushAttributeValueStart(_index);
1325                                                         lastquote = _c;
1326                                                         continue;
1327                                                 }
1328                                                 if (_c == '>')
1329                                                 {
1330                                                         PushNodeEnd(_index, false);
1331                                                         if (_state != ParseState.AttributeAfterEquals)
1332                                                                 continue;
1333                                                         _state = ParseState.Text;
1334                                                         PushNodeStart(HtmlNodeType.Text, _index);
1335                                                         continue;
1336                                                 }
1337                                                 PushAttributeValueStart(_index-1);
1338                                                 _state = ParseState.AttributeValue;
1339                                                 break;
1340
1341                                         case ParseState.AttributeValue:
1342                                                 if (NewCheck())
1343                                                         continue;
1344
1345                                                 if (IsWhiteSpace(_c))
1346                                                 {
1347                                                         PushAttributeValueEnd(_index-1);
1348                                                         _state = ParseState.BetweenAttributes;
1349                                                         continue;
1350                                                 }
1351
1352                                                 if (_c == '>')
1353                                                 {
1354                                                         PushAttributeValueEnd(_index-1);
1355                                                         PushNodeEnd(_index, false);
1356                                                         if (_state != ParseState.AttributeValue)
1357                                                                 continue;
1358                                                         _state = ParseState.Text;
1359                                                         PushNodeStart(HtmlNodeType.Text, _index);
1360                                                         continue;
1361                                                 }
1362                                                 break;
1363
1364                                         case ParseState.QuotedAttributeValue:
1365                                                 if (_c == lastquote)
1366                                                 {
1367                                                         PushAttributeValueEnd(_index-1);
1368                                                         _state = ParseState.BetweenAttributes;
1369                                                         continue;
1370                                                 }
1371                                                 if (_c == '<')
1372                                                 {
1373                                                         if (_index<_text.Length)
1374                                                         {
1375                                                                 if (_text[_index] == '%')
1376                                                                 {
1377                                                                         _oldstate = _state;
1378                                                                         _state = ParseState.ServerSideCode;
1379                                                                         continue;
1380                                                                 }
1381                                                         }
1382                                                 }
1383                                                 break;
1384
1385                                         case ParseState.Comment:
1386                                                 if (_c == '>')
1387                                                 {
1388                                                         if (_fullcomment)
1389                                                         {
1390                                                                 if ((_text[_index-2] != '-') ||
1391                                                                         (_text[_index-3] != '-'))
1392                                                                 {
1393                                                                         continue;
1394                                                                 }
1395                                                         }
1396                                                         PushNodeEnd(_index, false);
1397                                                         _state = ParseState.Text;
1398                                                         PushNodeStart(HtmlNodeType.Text, _index);
1399                                                         continue;
1400                                                 }
1401                                                 break;
1402
1403                                         case ParseState.ServerSideCode:
1404                                                 if (_c == '%')
1405                                                 {
1406                                                         if (_index<_text.Length)
1407                                                         {
1408                                                                 if (_text[_index] == '>')
1409                                                                 {
1410                                                                         switch(_oldstate)
1411                                                                         {
1412                                                                                 case ParseState.AttributeAfterEquals:
1413                                                                                         _state = ParseState.AttributeValue;
1414                                                                                         break;
1415
1416                                                                                 case ParseState.BetweenAttributes:
1417                                                                                         PushAttributeNameEnd(_index+1);
1418                                                                                         _state = ParseState.BetweenAttributes;
1419                                                                                         break;
1420
1421                                                                                 default:
1422                                                                                         _state = _oldstate;
1423                                                                                         break;
1424                                                                         }
1425                                                                         IncrementPosition();
1426                                                                 }
1427                                                         }
1428                                                 }
1429                                                 break;
1430
1431                                         case ParseState.PcData:
1432                                                 // look for </tag + 1 char
1433
1434                                                 // check buffer end
1435                                                 if ((_currentnode._namelength+3)<=(_text.Length-(_index-1)))
1436                                                 {
1437                                                         if (string.Compare(_text.Substring(_index-1, _currentnode._namelength+2),
1438                                                                 "</" + _currentnode.Name, true) == 0)
1439                                                         {
1440                                                                 int c = _text[_index-1 + 2 + _currentnode.Name.Length];
1441                                                                 if ((c == '>') || (IsWhiteSpace(c)))
1442                                                                 {
1443                                                                         // add the script as a text node
1444                                                                         HtmlNode script = CreateNode(HtmlNodeType.Text,
1445                                                                                 _currentnode._outerstartindex + _currentnode._outerlength);
1446                                                                         script._outerlength = _index-1 - script._outerstartindex;
1447                                                                         _currentnode.AppendChild(script);
1448
1449
1450                                                                         PushNodeStart(HtmlNodeType.Element, _index-1);
1451                                                                         PushNodeNameStart(false, _index-1 +2);
1452                                                                         _state = ParseState.Tag;
1453                                                                         IncrementPosition();
1454                                                                 }
1455                                                         }
1456                                                 }
1457                                                 break;
1458                                 }
1459                         }
1460
1461                         // finish the current work
1462                         if (_currentnode._namestartindex > 0)
1463                         {
1464                                 PushNodeNameEnd(_index);
1465                         }
1466                         PushNodeEnd(_index, false);
1467
1468                         // we don't need this anymore
1469                         _lastnodes.Clear();
1470                 }
1471
/// <summary>
/// Reacts to a '&lt;' held in the current character (<c>_c</c>). Depending on the
/// character that follows, this starts a server-side code block ("&lt;%"),
/// a comment/declaration node ("&lt;!") or an element node.
/// </summary>
/// <returns>
/// <c>false</c> when the current character is not '&lt;' (nothing is changed);
/// <c>true</c> when it was '&lt;' and the parser state has been updated here.
/// </returns>
private bool NewCheck()
{
    if (_c != '<')
    {
        return false;
    }

    // "<%": server-side code. The node/attribute being parsed stays current; only the
    // state is switched, and the state to resume afterwards is saved in _oldstate.
    if ((_index < _text.Length) && (_text[_index] == '%'))
    {
        switch (_state)
        {
            case ParseState.AttributeAfterEquals:
                PushAttributeValueStart(_index - 1);
                break;

            case ParseState.BetweenAttributes:
                PushAttributeNameStart(_index - 1);
                break;

            case ParseState.WhichTag:
                PushNodeNameStart(true, _index - 1);
                // must happen before _oldstate is captured below
                _state = ParseState.Tag;
                break;
        }
        _oldstate = _state;
        _state = ParseState.ServerSideCode;
        return true;
    }

    // any other '<' ends (and closes) the node that was being built
    PushNodeEnd(_index - 1, true);
    _state = ParseState.WhichTag;

    // "<!": comment or declaration; the condition also guarantees _text[_index] exists
    if (((_index - 1) <= (_text.Length - 2)) && (_text[_index] == '!'))
    {
        PushNodeStart(HtmlNodeType.Comment, _index - 1);
        PushNodeNameStart(true, _index);
        PushNodeNameEnd(_index + 1);
        _state = ParseState.Comment;

        // Always assign the flag: when fewer than two characters follow the '!', the
        // node cannot start with "<!--", and keeping the value left over from a
        // previous comment would make the Comment state wait for a "-->" terminator
        // on a declaration that never opened one.
        _fullcomment = (_index < (_text.Length - 2)) &&
            (_text[_index + 1] == '-') &&
            (_text[_index + 2] == '-');
        return true;
    }

    PushNodeStart(HtmlNodeType.Element, _index - 1);
    return true;
}
1531
/// <summary>
/// Inspects a node for an encoding declaration of the form
/// <c>&lt;meta http-equiv="content-type" content="text/html;charset=iso-8859-1" /&gt;</c>
/// and, when one is found, stores the resolved encoding in <c>_declaredencoding</c>.
/// Does nothing when <c>OptionReadEncoding</c> is off.
/// </summary>
/// <param name="node">The node to inspect; its attributes are read, it is not modified.</param>
/// <exception cref="EncodingFoundException">
/// Thrown with the declared encoding when <c>_onlyDetectEncoding</c> is set.
/// </exception>
private void ReadDocumentEncoding(HtmlNode node)
{
    if (!OptionReadEncoding)
    {
        return;
    }

    // quick length check first, avoids the string allocation done by Name
    if (node._namelength != 4)
    {
        return;
    }

    // all node names are lowercase
    if (node.Name != "meta")
    {
        return;
    }

    HtmlAttribute att = node.Attributes["http-equiv"];
    if (att == null)
    {
        return;
    }

    if (string.Compare(att.Value, "content-type", true) != 0)
    {
        return;
    }

    HtmlAttribute content = node.Attributes["content"];
    if (content == null)
    {
        return;
    }

    string charset = NameValuePairList.GetNameValuePairsValue(content.Value, "charset");
    if (charset == null)
    {
        return;
    }

    Encoding declared;
    try
    {
        declared = Encoding.GetEncoding(charset);
    }
    catch (ArgumentException)
    {
        // Unknown, empty or unsupported charset name: a malformed declaration in the
        // document must not abort parsing, so it is treated as if it were absent.
        return;
    }

    _declaredencoding = declared;
    if (_onlyDetectEncoding)
    {
        throw new EncodingFoundException(_declaredencoding);
    }

    if ((_streamencoding != null) &&
        (_declaredencoding.WindowsCodePage != _streamencoding.WindowsCodePage))
    {
        AddError(
            HtmlParseErrorCode.CharsetMismatch,
            _line, _lineposition,
            _index, node.OuterHtml,
            "Encoding mismatch between StreamEncoding: " +
            _streamencoding.WebName + " and DeclaredEncoding: " + _declaredencoding.WebName);
    }
}
1580
/// <summary>
/// Starts a new attribute: creates it with <c>CreateAttribute</c>, makes it the
/// current attribute and records where its name begins.
/// </summary>
/// <param name="index">Index stored as both the name start and the stream position.</param>
private void PushAttributeNameStart(int index)
{
    _currentattribute = CreateAttribute();

    // position inside the text
    _currentattribute._streamposition = index;
    _currentattribute._namestartindex = index;

    // position as line / column, taken from the parser's current counters
    _currentattribute._lineposition = _lineposition;
    _currentattribute._line = _line;
}
1589
/// <summary>
/// Completes the name of the current attribute and appends the attribute to the
/// current node's attribute collection.
/// </summary>
/// <param name="index">Index marking the end of the name; the length is measured from the name start.</param>
private void PushAttributeNameEnd(int index)
{
    int nameLength = index - _currentattribute._namestartindex;

    // NOTE(review): the length is set before Append on purpose; Append presumably
    // needs the complete name - keep this order.
    _currentattribute._namelength = nameLength;
    _currentnode.Attributes.Append(_currentattribute);
}
1595
/// <summary>
/// Records where the value of the current attribute begins.
/// </summary>
/// <param name="index">Index stored as the value start.</param>
private void PushAttributeValueStart(int index)
{
    this._currentattribute._valuestartindex = index;
}
1600
/// <summary>
/// Completes the value of the current attribute by storing its length.
/// </summary>
/// <param name="index">Index marking the end of the value; the length is measured from the value start.</param>
private void PushAttributeValueEnd(int index)
{
    int valueLength = index - _currentattribute._valuestartindex;
    _currentattribute._valuelength = valueLength;
}
1605
/// <summary>
/// Creates a node of the given type starting at <paramref name="index"/> and makes
/// it the current node, stamping it with the parser's line information.
/// </summary>
/// <param name="type">Type of node to create.</param>
/// <param name="index">Start index passed to <c>CreateNode</c> and stored as the stream position.</param>
private void PushNodeStart(HtmlNodeType type, int index)
{
    HtmlNode started = CreateNode(type, index);

    started._streamposition = index;
    started._line = _line;
    started._lineposition = _lineposition;
    if (type == HtmlNodeType.Element)
    {
        // element nodes are reported one column earlier than the current counter
        started._lineposition -= 1;
    }

    _currentnode = started;
}
1617
/// <summary>
/// Finishes the current node at <paramref name="index"/>: stores its outer length,
/// attaches it to the last parent node where applicable and closes it when required.
/// </summary>
/// <param name="index">Index marking the end of the node; the outer length is measured from the outer start.</param>
/// <param name="close">
/// When <c>true</c> the node is closed. A start tag reported as closed or empty by
/// <c>HtmlNode</c>, and any node that is not a start tag, is closed regardless.
/// </param>
private void PushNodeEnd(int index, bool close)
{
    _currentnode._outerlength = index - _currentnode._outerstartindex;

    bool isTextOrComment =
        (_currentnode._nodetype == HtmlNodeType.Text) ||
        (_currentnode._nodetype == HtmlNodeType.Comment);

    if (isTextOrComment)
    {
        // zero-length text/comment nodes are never attached
        if (_currentnode._outerlength > 0)
        {
            // for these nodes the inner span is the same as the outer span
            _currentnode._innerstartindex = _currentnode._outerstartindex;
            _currentnode._innerlength = _currentnode._outerlength;
            if (_lastparentnode != null)
            {
                _lastparentnode.AppendChild(_currentnode);
            }
        }
    }
    else if ((_currentnode._starttag) && (_lastparentnode != _currentnode))
    {
        // attach the start tag to its parent
        if (_lastparentnode != null)
        {
            _lastparentnode.AppendChild(_currentnode);
        }

        ReadDocumentEncoding(_currentnode);

        // link to the previous node with the same name, then become the latest one
        _currentnode._prevwithsamename = (HtmlNode)_lastnodes[_currentnode.Name];
        _lastnodes[_currentnode.Name] = _currentnode;

        // documents and elements become the parent of whatever follows
        bool canBeParent =
            (_currentnode.NodeType == HtmlNodeType.Document) ||
            (_currentnode.NodeType == HtmlNodeType.Element);
        if (canBeParent)
        {
            _lastparentnode = _currentnode;
        }

        // CDATA-style elements switch the parser to PcData and stay open
        if (HtmlNode.IsCDataElement(CurrentNodeName()))
        {
            _state = ParseState.PcData;
            return;
        }

        if ((HtmlNode.IsClosedElement(_currentnode.Name)) ||
            (HtmlNode.IsEmptyElement(_currentnode.Name)))
        {
            close = true;
        }
    }

    if (close || !_currentnode._starttag)
    {
        CloseCurrentNode();
    }
}
1679
/// <summary>
/// Records where the name of the current node begins and whether the node is a start tag.
/// </summary>
/// <param name="starttag">Value stored in the node's start-tag flag.</param>
/// <param name="index">Index stored as the name start.</param>
private void PushNodeNameStart(bool starttag, int index)
{
    _currentnode._namestartindex = index;
    _currentnode._starttag = starttag;
}
1685
/// <summary>
/// Returns the names of the elements that act as "resetters" for the given tag name,
/// as consumed by <c>FixNestedTag</c>.
/// </summary>
/// <param name="name">Tag name; compared case-sensitively against "li", "tr", "th" and "td".</param>
/// <returns>A new array of resetter names, or <c>null</c> when the tag has none.</returns>
private string[] GetResetters(string name)
{
    if (name == "li")
    {
        return new string[] { "ul" };
    }

    if (name == "tr")
    {
        return new string[] { "table" };
    }

    if ((name == "th") || (name == "td"))
    {
        return new string[] { "tr", "table" };
    }

    return null;
}
1704
/// <summary>
/// Applies the nested-tag fix for the current node: looks up the resetters of its
/// (lowercased) name and delegates to <c>FixNestedTag</c>. Closing tags are ignored.
/// </summary>
private void FixNestedTags()
{
    // we are only interested in start tags, not closing tags
    if (!_currentnode._starttag)
    {
        return;
    }

    // Tag names are identifiers, not linguistic text: lowercase them with the invariant
    // culture so that e.g. "LI" becomes "li" even when the current culture is Turkish
    // (where a culture-sensitive ToLower() yields a dotless i and the lookup in
    // GetResetters would silently fail).
    string name = CurrentNodeName().ToLower(System.Globalization.CultureInfo.InvariantCulture);
    FixNestedTag(name, GetResetters(name));
}
1714
/// <summary>
/// Closes the most recent still-open node named <paramref name="name"/>, unless one
/// of the <paramref name="resetters"/> is found for it by <c>FindResetterNodes</c>.
/// </summary>
/// <param name="name">Name used to look up the previous node in <c>_lastnodes</c>.</param>
/// <param name="resetters">Names of resetter elements; when <c>null</c> nothing is done.</param>
private void FixNestedTag(string name, string[] resetters)
{
    if (resetters == null)
    {
        return;
    }

    // nothing to fix unless a previous node with this name exists and is still open
    HtmlNode previous = (HtmlNode)_lastnodes[name];
    if ((previous == null) || (previous.Closed))
    {
        return;
    }

    // a resetter node was found: leave the previous node alone
    if (FindResetterNodes(previous, resetters))
    {
        return;
    }

    // close the previous node with a fake closer node that is its own end node
    HtmlNode fakeCloser = new HtmlNode(previous.NodeType, this, -1);
    fakeCloser._endnode = fakeCloser;
    previous.CloseNode(fakeCloser);
}
1741
/// <summary>
/// Tells whether <c>FindResetterNode</c> finds a resetter for <paramref name="node"/>
/// under any of the given names.
/// </summary>
/// <param name="node">Node passed through to <c>FindResetterNode</c>.</param>
/// <param name="names">Resetter names to try, in order; may be <c>null</c>.</param>
/// <returns><c>true</c> as soon as one name yields a resetter; <c>false</c> otherwise or when <paramref name="names"/> is <c>null</c>.</returns>
private bool FindResetterNodes(HtmlNode node, string[] names)
{
    if (names != null)
    {
        foreach (string candidate in names)
        {
            if (FindResetterNode(node, candidate) != null)
            {
                return true;
            }
        }
    }
    return false;
}
1757
/// <summary>
/// Returns the last node registered in <c>_lastnodes</c> under <paramref name="name"/>
/// when it is still open and its stream position is not before that of <paramref name="node"/>.
/// </summary>
/// <param name="node">Node whose stream position the candidate is compared against.</param>
/// <param name="name">Name of the resetter element to look up.</param>
/// <returns>The matching node, or <c>null</c> when there is none, it is closed, or it precedes <paramref name="node"/>.</returns>
private HtmlNode FindResetterNode(HtmlNode node, string name)
{
    HtmlNode candidate = (HtmlNode)_lastnodes[name];

    bool usable =
        (candidate != null) &&
        (!candidate.Closed) &&
        (candidate._streamposition >= node._streamposition);

    return usable ? candidate : null;
}
1773
/// <summary>
/// Completes the name of the current node and, when <c>OptionFixNestedTags</c> is
/// enabled, runs the nested-tag fix for it.
/// </summary>
/// <param name="index">Index marking the end of the name; the length is measured from the name start.</param>
private void PushNodeNameEnd(int index)
{
    int nameLength = index - _currentnode._namestartindex;
    _currentnode._namelength = nameLength;

    if (!OptionFixNestedTags)
    {
        return;
    }

    // runs after the length is stored; FixNestedTags reads the current node's name
    FixNestedTags();
}
1782
1783                 private void CloseCurrentNode()
1784                 {
1785                         if (_currentnode.Closed) // text or document are by def closed
1786                                 return;
1787
1788                         bool error = false;
1789
1790                         // find last node of this kind
1791                         HtmlNode prev = (HtmlNode)_lastnodes[_currentnode.Name];
1792                         if (prev == null)
1793                         {
1794                                 if (HtmlNode.IsClosedElement(_currentnode.Name))
1795                                 {
1796                                         // </br> will be seen as <br>
1797                                         _currentnode.CloseNode(_currentnode);
1798
1799                                         // add to parent node
1800                                         if (_lastparentnode != null)
1801                                         {
1802                                                 HtmlNode foundNode = null;
1803                                                 Stack futureChild = new Stack();
1804                                                 for (HtmlNode node = _lastparentnode.LastChild; node != null; node = node.PreviousSibling)
1805                                                 {
1806                                                         if ((node.Name == _currentnode.Name) && (! node.HasChildNodes))
1807                                                         {
1808                                                                 foundNode = node;
1809                                                                 break;
1810                                                         }
1811                                                         futureChild.Push(node);
1812                                                 }
1813                                                 if (foundNode != null)
1814                                                 {
1815                                                         HtmlNode node = null;
1816                                                         while(futureChild.Count != 0)
1817                                                         {
1818                                                                 node = (HtmlNode)futureChild.Pop();
1819                                                                 _lastparentnode.RemoveChild(node);
1820                                                                 foundNode.AppendChild(node);
1821                                                         }
1822                                                 }
1823                                                 else
1824                                                 {
1825                                                         _lastparentnode.AppendChild(_currentnode);
1826                                                 }
1827
1828                                         }
1829                                 }
1830                                 else
1831                                 {
1832                                         // node has no parent
1833                                         // node is not a closed node
1834
1835                                         if (HtmlNode.CanOverlapElement(_currentnode.Name))
1836                                         {
1837                                                 // this is a hack: add it as a text node
1838                                                 HtmlNode closenode = CreateNode(HtmlNodeType.Text, _currentnode._outerstartindex);
1839                                                 closenode._outerlength = _currentnode._outerlength;
1840                                                 ((HtmlTextNode)closenode).Text = ((HtmlTextNode)closenode).Text.ToLower();
1841                                                 if (_lastparentnode != null)
1842                                                 {
1843                                                         _lastparentnode.AppendChild(closenode);
1844                                                 }
1845
1846                                         }
1847                                         else
1848                                         {
1849                                                 if (HtmlNode.IsEmptyElement(_currentnode.Name))
1850                                                 {
1851                                                         AddError(
1852                                                                 HtmlParseErrorCode.EndTagNotRequired,
1853                                                                 _currentnode._line, _currentnode._lineposition,
1854                                                                 _currentnode._streamposition, _currentnode.OuterHtml,
1855                                                                 "End tag </" + _currentnode.Name + "> is not required");
1856                                                 }
1857                                                 else
1858                                                 {
1859                                                         // node cannot overlap, node is not empty
1860                                                         AddError(
1861                                                                 HtmlParseErrorCode.TagNotOpened,
1862                                                                 _currentnode._line, _currentnode._lineposition,
1863                                                                 _currentnode._streamposition, _currentnode.OuterHtml,
1864                                                                 "Start tag <" + _currentnode.Name + "> was not found");
1865                                                         error = true;
1866                                                 }
1867                                         }
1868                                 }
1869                         }
1870                         else
1871                         {
1872                                 if (OptionFixNestedTags)
1873                                 {
1874                                         if (FindResetterNodes(prev, GetResetters(_currentnode.Name)))
1875                                         {
1876                                                 AddError(
1877                                                         HtmlParseErrorCode.EndTagInvalidHere,
1878                                                         _currentnode._line, _currentnode._lineposition,
1879                                                         _currentnode._streamposition, _currentnode.OuterHtml,
1880                                                         "End tag </" + _currentnode.Name + "> invalid here");
1881                                                 error = true;
1882                                         }
1883                                 }
1884
1885                                 if (!error)
1886                                 {
1887                                         _lastnodes[_currentnode.Name] = prev._prevwithsamename;
1888                                         prev.CloseNode(_currentnode);
1889                                 }
1890                         }
1891
1892
1893                         // we close this node, get grandparent
1894                         if (!error)
1895                         {
1896                                 if ((_lastparentnode != null) &&
1897                                         ((!HtmlNode.IsClosedElement(_currentnode.Name)) ||
1898                                         (_currentnode._starttag)))
1899                                 {
1900                                         UpdateLastParentNode();
1901                                 }
1902                         }
1903                 }
1904
1905                 internal void UpdateLastParentNode()
1906                 {
1907                         do
1908                         {
1909                                 if (_lastparentnode.Closed)
1910                                 {
1911                                         _lastparentnode = _lastparentnode.ParentNode;
1912                                 }
1913                         }
1914                         while ((_lastparentnode != null) && (_lastparentnode.Closed));
1915                         if (_lastparentnode == null)
1916                         {
1917                                 _lastparentnode = _documentnode;
1918                         }
1919                 }
1920
1921                 private string CurrentAttributeName()
1922                 {
1923                         return _text.Substring(_currentattribute._namestartindex, _currentattribute._namelength);
1924                 }
1925
1926                 private string CurrentAttributeValue()
1927                 {
1928                         return _text.Substring(_currentattribute._valuestartindex, _currentattribute._valuelength);
1929                 }
1930
1931                 private string CurrentNodeName()
1932                 {
1933                         return _text.Substring(_currentnode._namestartindex, _currentnode._namelength);
1934                 }
1935
1936                 private string CurrentNodeOuter()
1937                 {
1938                         return _text.Substring(_currentnode._outerstartindex, _currentnode._outerlength);
1939                 }
1940
1941                 private string CurrentNodeInner()
1942                 {
1943                         return _text.Substring(_currentnode._innerstartindex, _currentnode._innerlength);
1944                 }
1945
1946                 /// <summary>
1947                 /// Determines if the specified character is considered as a whitespace character.
1948                 /// </summary>
1949                 /// <param name="c">The character to check.</param>
1950                 /// <returns>true if if the specified character is considered as a whitespace character.</returns>
1951                 public static bool IsWhiteSpace(int c)
1952                 {
1953                         if ((c == 10) || (c == 13) || (c == 32) || (c == 9))
1954                         {
1955                                 return true;
1956                         }
1957                         return false;
1958                 }
1959
1960         }
1961
1962         internal class EncodingFoundException: Exception
1963         {
1964                 private Encoding _encoding;
1965
1966                 internal EncodingFoundException(Encoding encoding)
1967                 {
1968                         _encoding = encoding;
1969                 }
1970
1971                 internal Encoding Encoding
1972                 {
1973                         get
1974                         {
1975                                 return _encoding;
1976                         }
1977                 }
1978         }
1979 }