Change the GC.GetTotalMemory() threshold to 10%; otherwise there are just too many...
[beagle.git] / Filters / HtmlAgilityPack / HtmlDocument.cs
blob9f723a1a3606abf09d8a69e7edaed857e6af95da
1 // HtmlAgilityPack V1.0 - Simon Mourier <simonm@microsoft.com>
3 /*
4 Copyright (C) 2003 Simon Mourier <simonm@microsoft.com>
5 All rights reserved.
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions
9 are met:
10 1. Redistributions of source code must retain the above copyright
11 notice, this list of conditions and the following disclaimer.
12 2. Redistributions in binary form must reproduce the above copyright
13 notice, this list of conditions and the following disclaimer in the
14 documentation and/or other materials provided with the distribution.
15 3. The name of the author may not be used to endorse or promote products
16 derived from this software without specific prior written permission.
18 THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
19 IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
20 OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
21 IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
22 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
23 NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
27 THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 using System;
31 using System.IO;
32 using System.Text;
33 using System.Diagnostics;
34 using System.Collections;
35 using System.Text.RegularExpressions;
36 using System.Xml;
37 using System.Xml.XPath;
40 // Legend: SLIM=Comment added describing changes to original HtmlAgilityPack
41 // to reduce memory consumption
42 // Once the parser is free of bugs, the comments will be taken out
43 namespace HtmlAgilityPack
/// <summary>
/// Identifies the kind of problem recorded in an <see cref="HtmlParseError"/>.
/// </summary>
public enum HtmlParseErrorCode
{
    /// <summary>A tag was opened but never closed.</summary>
    TagNotClosed,

    /// <summary>A closing tag was found without a matching opening tag.</summary>
    TagNotOpened,

    /// <summary>The stream encoding and the encoding declared in the document (META) disagree.</summary>
    CharsetMismatch,

    /// <summary>An end tag was present although none was required.</summary>
    EndTagNotRequired,

    /// <summary>An end tag appeared at a position where it is not valid.</summary>
    EndTagInvalidHere
}
/// <summary>
/// Immutable record describing one error met while parsing a document.
/// </summary>
public class HtmlParseError
{
    private readonly HtmlParseErrorCode _code;
    private readonly int _line;
    private readonly int _linePosition;
    private readonly int _streamPosition;
    private readonly string _sourceText;
    private readonly string _reason;

    // Only the parser creates instances; every value is captured as given.
    internal HtmlParseError(
        HtmlParseErrorCode code,
        int line,
        int linePosition,
        int streamPosition,
        string sourceText,
        string reason)
    {
        this._code = code;
        this._line = line;
        this._linePosition = linePosition;
        this._streamPosition = streamPosition;
        this._sourceText = sourceText;
        this._reason = reason;
    }

    /// <summary>
    /// Gets the type of error.
    /// </summary>
    public HtmlParseErrorCode Code
    {
        get { return this._code; }
    }

    /// <summary>
    /// Gets the line number of this error in the document.
    /// </summary>
    public int Line
    {
        get { return this._line; }
    }

    /// <summary>
    /// Gets the column number of this error in the document.
    /// </summary>
    public int LinePosition
    {
        get { return this._linePosition; }
    }

    /// <summary>
    /// Gets the absolute stream position of this error, relative to the start of the document.
    /// </summary>
    public int StreamPosition
    {
        get { return this._streamPosition; }
    }

    /// <summary>
    /// Gets the source text that was supplied with the error.
    /// </summary>
    public string SourceText
    {
        get { return this._sourceText; }
    }

    /// <summary>
    /// Gets a description for the error.
    /// </summary>
    public string Reason
    {
        get { return this._reason; }
    }
}
// Read-only, index-addressable view of the text being parsed. The parser
// talks to this abstraction instead of a string so the backing store can be
// either a buffered reader or an in-memory string.
abstract class StreamAsArray
{
    // Character at the given absolute position.
    public abstract char this[int index] { get; }

    // True when the given position is at the end of the text.
    public abstract bool Eof(int index);

    // Text of the given length starting at startindex.
    public abstract string Substring(int startindex, int length);

    // Total length reported by the backing store.
    public abstract int FullLength { get; }
}
// SLIM: wraps a StreamReader so the parser can index into the text without
// first loading everything with ReadToEnd ().
//
// A sliding window of two blocks (_buf_previous, _buf_current) of
// _block_size characters is kept; _position is the absolute index of the
// first character of _buf_current and is always a multiple of _block_size.
// Anything outside the window is served by OutOfBandRead, which rewinds the
// base stream and re-reads from the start (slow, and needs a seekable stream).
class ImplStreamAsArray : StreamAsArray
{
    private StreamReader _reader;
    private int _length;          // total character count; only valid once _eof is true
    private int _position;        // absolute index of _buf_current [0]
    private bool _eof;            // true once the end of the reader has been seen
    private char[] _buf_previous; // could have used only one array
    private char[] _buf_current;  // but, this is cleaner
    private int _block_size;

    public ImplStreamAsArray (StreamReader r)
    {
        if (r == null)
            throw new ArgumentNullException ("r");

        _reader = r;
        _length = 0;
        _position = 0;
        _eof = false;

        _block_size = 1024;
        _buf_previous = new char [_block_size];
        _buf_current = new char [_block_size];

        Read (true);
    }

    // Slides the window forward by one block (unless this is the initial
    // fill) and loads the next block into _buf_current.
    private void Read (bool initial)
    {
        // Once the end has been seen there is nothing left to load; sliding
        // the window anyway would overwrite _length with a wrong value.
        if (_eof && !initial)
            return;

        if ( !initial) {
            Array.Copy (_buf_current, _buf_previous, _block_size);
            _position += _block_size;
        }
        HtmlDocument.Debug ("Debug: Read in buffer at:" + _position);

        // ReadBlock keeps reading until the block is full or the reader is
        // exhausted. Plain Read may legitimately return fewer characters
        // than requested before the end, which must not be taken for EOF.
        int num_read = _reader.ReadBlock (_buf_current, 0, _block_size);
        if (num_read < _block_size) {
            _eof = true;
            _length = _position + num_read;
        }
        HtmlDocument.Debug ("[" + new string (_buf_current, 0, num_read) + "]");
    }

    // True when index is at or past the end of the text. Asking about a
    // position inside the block following the window pulls that block in.
    public override bool Eof (int index)
    {
        if (!_eof &&
            index >= _position + _block_size &&
            index < _position + _block_size + _block_size)
            Read (false);

        // ">=" (not "==") so an index that stepped over the end is still
        // reported as EOF, matching DummyStreamAsArray.
        return _eof && index >= _length;
    }

    public override char this[int index] {
        get {
            // index % _block_size is the in-block offset because _position
            // is always a multiple of _block_size.
            if (index >= _position &&
                index < _position + _block_size)
                return _buf_current [index % _block_size];
            if (index >= _position - _block_size &&
                index < _position)
                return _buf_previous [index % _block_size];
            if (index >= _position + _block_size &&
                index < _position + _block_size + _block_size) {
                Read (false);
                return _buf_current [index % _block_size];
            }
            return OutOfBandRead (index, 1) [0];
        }
    }

    // evil function ... you get what you pay for!
    // Serves a request outside the window by re-reading the stream from the
    // start, then puts the reader back just after the current block so that
    // sequential reading can continue.
    private string OutOfBandRead (int startindex, int length)
    {
        HtmlDocument.Debug ("Out of band read! From " + startindex + " to " + (startindex + length - 1));
        ResetPosition (startindex);
        // The reader is now at startindex; the temporary buffer need not
        // align with the block boundaries.
        char[] temp_buf = new char [length];
        int num_read = _reader.ReadBlock (temp_buf, 0, length);
        if (num_read < length) {
            // Shouldnt occur!!!
            _eof = true;
            _length = startindex + num_read;
        }
        // discard data and reset stream position
        int t = (_eof ? _length : _position + _block_size);
        ResetPosition (t);
        return new String (temp_buf);
    }

    // streamreader does not allow seeking, and seeking its base stream does
    // not reflect the reader's own buffered position, so rewind the base
    // stream and read forward until pos characters have been consumed.
    // NOTE(review): requires a seekable BaseStream; whether a byte-order
    // mark is skipped again after the rewind should be confirmed.
    private void ResetPosition (int pos)
    {
        _reader.DiscardBufferedData ();
        _reader.BaseStream.Position = 0;

        int n1 = pos / _block_size;
        int n2 = pos % _block_size;
        char[] tmp = new char [_block_size];
        // ReadBlock guarantees each skip really consumes the requested
        // number of characters (unless the stream ends first).
        for (int i = 0; i < n1; ++i)
            _reader.ReadBlock (tmp, 0, _block_size);
        if (n2 > 0)
            _reader.ReadBlock (tmp, 0, n2);
    }

    // Returns length characters starting at startindex. Requests longer than
    // one block, or starting before the window, go through OutOfBandRead.
    // A request that runs past the end of the text is not supported and
    // fails with ArgumentOutOfRangeException from the String constructor.
    public override string Substring (int startindex, int length)
    {
        if (length == 0) {
            HtmlDocument.Debug ("substring:" + startindex + " " + length + " " + _position + ":");
            return String.Empty;
        }
        if (length > _block_size || startindex < _position - _block_size) {
            return OutOfBandRead (startindex, length);
        }
        // Advance the window until the last requested character is inside
        // it; stop at EOF because no further block can be loaded.
        while (!_eof && startindex + length - 1 >= _position + _block_size) {
            Read (false);
        }
        string substr;
        if (startindex < _position) {
            // Request starts in the previous block.
            int len_1 = _position - startindex;
            if (length < len_1)
                substr = new String (_buf_previous, _block_size - len_1, length);
            else {
                // ... and spills over into the current block.
                substr = new String (_buf_previous, _block_size - len_1, len_1);
                substr += new String (_buf_current, 0, length - len_1);
            }
        } else {
            substr = new String (_buf_current, startindex - _position, length);
        }
        return substr;
    }

    // FIXME: Is this costly ?
    // NOTE(review): this is the length of the base stream, which presumably
    // counts bytes rather than decoded characters - confirm callers only
    // need an estimate.
    public override int FullLength {
        get {
            return (int)_reader.BaseStream.Length;
        }
    }
}
// A dummy StreamAsArray wrapper around a string: the whole text is already
// in memory, so every member simply forwards to the string.
class DummyStreamAsArray : StreamAsArray
{
    private string _base_string;
    private int _length;

    public DummyStreamAsArray (string str)
    {
        _base_string = str;
        _length = str.Length;
    }

    // True when index is at or past the end of the string.
    public override bool Eof (int index)
    {
        return (index >= _length);
    }

    // Must be "override", not "new": with "new" the abstract indexer of
    // StreamAsArray stays unimplemented, so access through a StreamAsArray
    // reference (which is how the parser holds it) would never reach here.
    public override char this[int index] {
        get { return _base_string [index]; }
    }

    public override string Substring (int startindex, int length)
    {
        return _base_string.Substring (startindex, length);
    }

    public override int FullLength {
        get { return _length; }
    }
}
360 /// <summary>
361 /// Represents a complete HTML document.
362 /// </summary>
363 public class HtmlDocument: IXPathNavigable
365 // SLIM: Make the parser event driven
366 // callback for FilterHtml
367 // return value is a way for the callback to signal to continue or stop parsing
368 public delegate bool NodeHandler (HtmlNode node);
369 public NodeHandler ReportNode;
370 // misnomer ... should be called event_driven_mode
371 private bool _streammode = false;
372 private bool _stop_parsing = false;
// Messages used when throwing from this class or (presumably) from node classes.
374 internal static readonly string HtmlExceptionRefNotChild = "Reference node must be a child of this node";
375 internal static readonly string HtmlExceptionUseIdAttributeFalse = "You need to set UseIdAttribute property to true to enable this feature";
// Re-created by Load/DetectEncoding: a Hashtable when OptionCheckSyntax is set, otherwise null.
377 internal Hashtable _openednodes;
378 internal Hashtable _lastnodes = new Hashtable();
// Re-created by Load/DetectEncoding: maps lower-cased id values to nodes when
// OptionUseIdAttribute is set, otherwise null.
379 internal Hashtable _nodesid;
380 private HtmlNode _documentnode;
381 //SLIM: internal string _text;
// Text being parsed: an ImplStreamAsArray when loading from a StreamReader,
// a DummyStreamAsArray (whole text in memory) for any other TextReader.
382 internal StreamAsArray _text;
383 private HtmlNode _currentnode;
384 private HtmlNode _lastparentnode;
385 private HtmlAttribute _currentattribute;
386 private int _index;
387 private int _line;
388 private int _lineposition, _maxlineposition;
389 private int _c;
390 private bool _fullcomment;
// Encoding reported by the StreamReader given to Load/DetectEncoding; null for other readers.
391 private System.Text.Encoding _streamencoding;
// Reset to null before every parse; exposed through DeclaredEncoding.
392 private System.Text.Encoding _declaredencoding;
393 private ArrayList _parseerrors = new ArrayList();
394 private ParseState _state, _oldstate;
395 private Crc32 _crc32 = null;
// Set by DetectEncoding and cleared by Load before parsing starts.
396 private bool _onlyDetectEncoding = false;
397 private int _pcdata_quote_char = '\0';
// Switch read by Debug(); nothing in the visible part of the file sets it to true.
399 private static bool _debug = false;
// Writes a trace line to the console, but only while the _debug switch is on.
internal static void Debug (string s)
{
    if (!_debug)
        return;

    Console.WriteLine (s);
}
406 // public props
408 /// <summary>
409 /// Defines if a checksum must be computed for the document while parsing. Default is false.
410 /// </summary>
411 public bool OptionComputeChecksum = false;
413 /// <summary>
414 /// Defines if declared encoding must be read from the document.
415 /// Declared encoding is determined using the meta http-equiv="content-type" content="text/html;charset=XXXXX" html node.
416 /// Default is true.
417 /// </summary>
418 public bool OptionReadEncoding = true;
421 /// <summary>
422 /// Defines if non closed nodes will be checked at the end of parsing. Default is true.
423 /// </summary>
424 public bool OptionCheckSyntax = true;
426 /// <summary>
427 /// Defines if the 'id' attribute must be specifically used. Default is true.
428 /// </summary>
429 public bool OptionUseIdAttribute = true;
431 /// <summary>
432 /// Defines if empty nodes must be written as closed during output. Default is false.
433 /// </summary>
434 public bool OptionWriteEmptyNodes = false;
436 /// <summary>
437 /// Defines if output must conform to XML, instead of HTML.
438 /// </summary>
439 public bool OptionOutputAsXml = false;
441 /// <summary>
442 /// Defines if name must be output in uppercase. Default is false.
443 /// </summary>
444 public bool OptionOutputUpperCase = false;
446 /// <summary>
447 /// Defines if attribute value output must be optimized (not bound with double quotes if it is possible). Default is false.
448 /// </summary>
449 public bool OptionOutputOptimizeAttributeValues = false;
451 /// <summary>
452 /// Adds Debugging attributes to node. Default is false.
453 /// </summary>
454 public bool OptionAddDebuggingAttributes = false;
456 /// <summary>
457 /// Defines if source text must be extracted while parsing errors.
458 /// If the document has a lot of errors, or cascading errors, parsing performance can be dramatically affected if set to true.
459 /// Default is false.
460 /// </summary>
461 public bool OptionExtractErrorSourceText = false; // turning this on can dramatically slow performance if a lot of errors are detected
463 /// <summary>
464 /// Defines if closing for non closed nodes must be done at the end or directly in the document.
465 /// Setting this to true can actually change how browsers render the page. Default is false.
466 /// </summary>
467 public bool OptionAutoCloseOnEnd = false; // close errors at the end
469 /// <summary>
470 /// Defines if LI, TR, TH, TD tags must be partially fixed when nesting errors are detected. Default is false.
471 /// </summary>
472 public bool OptionFixNestedTags = false; // fix li, tr, th, td tags
474 /// <summary>
475 /// Defines the maximum length of the source text extracted for parse errors. Default is 100.
476 /// </summary>
477 public int OptionExtractErrorSourceTextMaxLength = 100;
479 /// <summary>
480 /// Defines the default stream encoding to use. Default is System.Text.Encoding.UTF8.
481 /// </summary>
482 // From http://www.w3.org/TR/REC-html40/charset.html
483 // The HTTP protocol ([RFC2616], section 3.7.1) mentions ISO-8859-1 as a default character encoding when the "charset" parameter is absent from the "Content-Type" header field.
484 // However, we are still using UTF-8 for some unknown reason
485 //FIXME: Fix the default encoding!
486 public System.Text.Encoding OptionDefaultStreamEncoding = Encoding.UTF8;
/// <summary>
/// Gets the list of parse errors collected for the document.
/// </summary>
public ArrayList ParseErrors
{
    get { return _parseerrors; }
}
/// <summary>
/// Gets the encoding of the stream the document was read from, if any.
/// </summary>
public System.Text.Encoding StreamEncoding
{
    get { return _streamencoding; }
}
/// <summary>
/// Gets the encoding the document declares for itself.
/// The declaration is the meta http-equiv="content-type" content="text/html;charset=XXXXX" html node.
/// </summary>
public System.Text.Encoding DeclaredEncoding
{
    get { return _declaredencoding; }
}
/// <summary>
/// Initializes an empty HTML document that contains only its document node.
/// </summary>
public HtmlDocument()
{
    HtmlNode root = CreateNode(HtmlNodeType.Document, 0);
    _documentnode = root;
}
// Returns the first direct child of the document node named "?xml",
// or null when the document has no children or no such child.
internal HtmlNode GetXmlDeclaration()
{
    if (_documentnode.HasChildNodes)
    {
        foreach (HtmlNode child in _documentnode._childnodes)
        {
            // it's ok, names are case sensitive
            if (child.Name == "?xml")
            {
                return child;
            }
        }
    }
    return null;
}
// Matches an ampersand that does not already start one of the entities
// amp, lt, gt or quot (case-insensitive). Built once and shared: Regex
// instances are safe to reuse for matching, and constructing one per call
// is needlessly expensive.
private static readonly Regex _htmlEncodeAmpersandRegex =
    new Regex("&(?!(amp;)|(lt;)|(gt;)|(quot;))", RegexOptions.IgnoreCase);

/// <summary>
/// Applies HTML encoding to a specified string.
/// </summary>
/// <param name="html">The input string to encode. May not be null.</param>
/// <returns>The encoded string: &amp;, &lt;, &gt; and double quotes are replaced by entities.</returns>
/// <exception cref="ArgumentNullException"><paramref name="html"/> is null.</exception>
public static string HtmlEncode(string html)
{
    if (html == null)
    {
        throw new ArgumentNullException("html");
    }

    // replace & by &amp; but only once! Ampersands that already begin a
    // known entity are left untouched so encoding twice is harmless.
    string encoded = _htmlEncodeAmpersandRegex.Replace(html, "&amp;");
    return encoded.Replace("<", "&lt;").Replace(">", "&gt;").Replace("\"", "&quot;");
}
/// <summary>
/// Detects the encoding of an HTML stream.
/// </summary>
/// <param name="stream">The input stream. May not be null.</param>
/// <returns>The detected encoding, or null when none was found.</returns>
public Encoding DetectEncoding(Stream stream)
{
    if (stream == null)
    {
        throw new ArgumentNullException("stream");
    }

    // The reader is deliberately left open: closing it would also close
    // the caller's stream.
    StreamReader reader = new StreamReader(stream);
    return DetectEncoding(reader);
}
/// <summary>
/// Detects the encoding of an HTML file.
/// </summary>
/// <param name="path">Path for the file containing the HTML document to detect. May not be null.</param>
/// <returns>The detected encoding, or null when none was found.</returns>
/// <exception cref="ArgumentNullException"><paramref name="path"/> is null.</exception>
public Encoding DetectEncoding(string path)
{
    if (path == null)
    {
        throw new ArgumentNullException("path");
    }

    // "using" closes the file even when detection throws; previously the
    // handle stayed open until finalization in that case.
    using (StreamReader sr = new StreamReader(path, OptionDefaultStreamEncoding))
    {
        return DetectEncoding(sr);
    }
}
/// <summary>
/// Detects the encoding declared inside an HTML text.
/// </summary>
/// <param name="html">The input html text. May not be null.</param>
/// <returns>The detected encoding, or null when none was found.</returns>
public Encoding DetectEncodingHtml(string html)
{
    if (html == null)
    {
        throw new ArgumentNullException("html");
    }

    using (StringReader text = new StringReader(html))
    {
        return DetectEncoding(text);
    }
}
/// <summary>
/// Detects the encoding of an HTML text provided on a TextReader.
/// </summary>
/// <param name="reader">The TextReader used to feed the HTML. May not be null.</param>
/// <returns>
/// The encoding carried by the EncodingFoundException raised during parsing,
/// or null when parsing completes without one.
/// </returns>
public Encoding DetectEncoding(TextReader reader)
{
    if (reader == null)
    {
        throw new ArgumentNullException("reader");
    }

    _onlyDetectEncoding = true;

    // Fresh bookkeeping tables, only when the matching option asks for them.
    _openednodes = OptionCheckSyntax ? new Hashtable() : null;
    _nodesid = OptionUseIdAttribute ? new Hashtable() : null;

    StreamReader sr = reader as StreamReader;
    if (sr == null)
    {
        _streamencoding = null;
        // Expensive, but cannot avoid since TextReader doesnt have any length of the underlying data
        _text = new DummyStreamAsArray (reader.ReadToEnd());
    }
    else
    {
        _streamencoding = sr.CurrentEncoding;
        _text = new ImplStreamAsArray (sr);
    }
    _declaredencoding = null;

    _documentnode = CreateNode(HtmlNodeType.Document, 0);

    // this is a hack, but it allows us not to muck with the original parsing code:
    // the parser signals a found encoding by throwing.
    try
    {
        Parse();
    }
    catch (EncodingFoundException found)
    {
        _lastnodes.Clear();
        return found.Encoding;
    }
    return null;
}
/// <summary>
/// Loads an HTML document from a stream, decoding it with OptionDefaultStreamEncoding.
/// </summary>
/// <param name="stream">The input stream.</param>
public void Load(Stream stream)
{
    // The reader is not closed here, so the caller's stream stays open.
    StreamReader reader = new StreamReader(stream, OptionDefaultStreamEncoding);
    Load(reader);
}
/// <summary>
/// Loads an HTML document from a stream.
/// </summary>
/// <param name="stream">The input stream.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
public void Load(Stream stream, bool detectEncodingFromByteOrderMarks)
{
    // The reader is not closed here, so the caller's stream stays open.
    StreamReader reader = new StreamReader(stream, detectEncodingFromByteOrderMarks);
    Load(reader);
}
/// <summary>
/// Loads an HTML document from a stream using the given encoding.
/// </summary>
/// <param name="stream">The input stream.</param>
/// <param name="encoding">The character encoding to use.</param>
public void Load(Stream stream, Encoding encoding)
{
    // The reader is not closed here, so the caller's stream stays open.
    StreamReader reader = new StreamReader(stream, encoding);
    Load(reader);
}
/// <summary>
/// Loads an HTML document from a stream using the given encoding.
/// </summary>
/// <param name="stream">The input stream.</param>
/// <param name="encoding">The character encoding to use.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
public void Load(Stream stream, Encoding encoding, bool detectEncodingFromByteOrderMarks)
{
    // The reader is not closed here, so the caller's stream stays open.
    StreamReader reader = new StreamReader(stream, encoding, detectEncodingFromByteOrderMarks);
    Load(reader);
}
/// <summary>
/// Loads an HTML document from a stream using the given encoding and reader buffer size.
/// </summary>
/// <param name="stream">The input stream.</param>
/// <param name="encoding">The character encoding to use.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
/// <param name="buffersize">The minimum buffer size.</param>
public void Load(Stream stream, Encoding encoding, bool detectEncodingFromByteOrderMarks, int buffersize)
{
    // The reader is not closed here, so the caller's stream stays open.
    StreamReader reader = new StreamReader(stream, encoding, detectEncodingFromByteOrderMarks, buffersize);
    Load(reader);
}
/// <summary>
/// Loads an HTML document from a file, decoding it with OptionDefaultStreamEncoding.
/// </summary>
/// <param name="path">The complete file path to be read. May not be null.</param>
/// <exception cref="ArgumentNullException"><paramref name="path"/> is null.</exception>
public void Load(string path)
{
    if (path == null)
    {
        throw new ArgumentNullException("path");
    }

    // "using" releases the file handle even when loading throws.
    using (StreamReader sr = new StreamReader(path, OptionDefaultStreamEncoding))
    {
        Load(sr);
    }
}
/// <summary>
/// Loads an HTML document from a file.
/// </summary>
/// <param name="path">The complete file path to be read. May not be null.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
/// <exception cref="ArgumentNullException"><paramref name="path"/> is null.</exception>
public void Load(string path, bool detectEncodingFromByteOrderMarks)
{
    if (path == null)
    {
        throw new ArgumentNullException("path");
    }

    // "using" releases the file handle even when loading throws.
    using (StreamReader sr = new StreamReader(path, detectEncodingFromByteOrderMarks))
    {
        Load(sr);
    }
}
/// <summary>
/// Loads an HTML document from a file using the given encoding.
/// </summary>
/// <param name="path">The complete file path to be read. May not be null.</param>
/// <param name="encoding">The character encoding to use. May not be null.</param>
/// <exception cref="ArgumentNullException"><paramref name="path"/> or <paramref name="encoding"/> is null.</exception>
public void Load(string path, Encoding encoding)
{
    if (path == null)
    {
        throw new ArgumentNullException("path");
    }
    if (encoding == null)
    {
        throw new ArgumentNullException("encoding");
    }

    // "using" releases the file handle even when loading throws.
    using (StreamReader sr = new StreamReader(path, encoding))
    {
        Load(sr);
    }
}
/// <summary>
/// Loads an HTML document from a file using the given encoding.
/// </summary>
/// <param name="path">The complete file path to be read. May not be null.</param>
/// <param name="encoding">The character encoding to use. May not be null.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
/// <exception cref="ArgumentNullException"><paramref name="path"/> or <paramref name="encoding"/> is null.</exception>
public void Load(string path, Encoding encoding, bool detectEncodingFromByteOrderMarks)
{
    if (path == null)
    {
        throw new ArgumentNullException("path");
    }
    if (encoding == null)
    {
        throw new ArgumentNullException("encoding");
    }

    // "using" releases the file handle even when loading throws.
    using (StreamReader sr = new StreamReader(path, encoding, detectEncodingFromByteOrderMarks))
    {
        Load(sr);
    }
}
/// <summary>
/// Loads an HTML document from a file using the given encoding and reader buffer size.
/// </summary>
/// <param name="path">The complete file path to be read. May not be null.</param>
/// <param name="encoding">The character encoding to use. May not be null.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
/// <param name="buffersize">The minimum buffer size.</param>
/// <exception cref="ArgumentNullException"><paramref name="path"/> or <paramref name="encoding"/> is null.</exception>
public void Load(string path, Encoding encoding, bool detectEncodingFromByteOrderMarks, int buffersize)
{
    if (path == null)
    {
        throw new ArgumentNullException("path");
    }
    if (encoding == null)
    {
        throw new ArgumentNullException("encoding");
    }

    // "using" releases the file handle even when loading throws.
    using (StreamReader sr = new StreamReader(path, encoding, detectEncodingFromByteOrderMarks, buffersize))
    {
        Load(sr);
    }
}
/// <summary>
/// Parses the HTML document held in the specified string.
/// </summary>
/// <param name="html">String containing the HTML document to load. May not be null.</param>
public void LoadHtml(string html)
{
    if (html == null)
    {
        throw new ArgumentNullException("html");
    }

    using (StringReader text = new StringReader(html))
    {
        Load(text);
    }
}
/// <summary>
/// Detects the encoding of an HTML file first, then loads the file with it.
/// Same as calling the two-argument overload with detection enabled.
/// </summary>
/// <param name="path">The complete file path to be read.</param>
public void DetectEncodingAndLoad(string path)
{
    bool detectEncoding = true;
    DetectEncodingAndLoad(path, detectEncoding);
}
/// <summary>
/// Optionally detects the encoding of an HTML file, then loads the file.
/// When no encoding is detected (or detection is off) the file is loaded
/// with the default Load(path) behaviour.
/// </summary>
/// <param name="path">The complete file path to be read. May not be null.</param>
/// <param name="detectEncoding">true to detect encoding, false otherwise.</param>
public void DetectEncodingAndLoad(string path, bool detectEncoding)
{
    if (path == null)
    {
        throw new ArgumentNullException("path");
    }

    System.Text.Encoding detected = detectEncoding ? DetectEncoding(path) : null;

    if (detected != null)
    {
        Load(path, detected);
    }
    else
    {
        Load(path);
    }
}
/// <summary>
/// Loads the HTML document from the specified TextReader.
/// </summary>
/// <param name="reader">The TextReader used to feed the HTML data into the document. May not be null.</param>
/// <exception cref="ArgumentNullException"><paramref name="reader"/> is null.</exception>
public void Load(TextReader reader)
{
    // all Load methods pass down to this one
    if (reader == null)
    {
        throw new ArgumentNullException("reader");
    }

    _onlyDetectEncoding = false;

    // Fresh bookkeeping tables, only when the matching option asks for them.
    _openednodes = OptionCheckSyntax ? new Hashtable() : null;
    _nodesid = OptionUseIdAttribute ? new Hashtable() : null;

    StreamReader sr = reader as StreamReader;
    if (sr != null)
    {
        try
        {
            // trigger bom read if needed, so CurrentEncoding below is accurate
            sr.Peek();
        }
        catch
        {
            // void on purpose: peeking is best-effort
        }
        _streamencoding = sr.CurrentEncoding;
        _text = new ImplStreamAsArray (sr);
    }
    else
    {
        _streamencoding = null;
        // Expensive, but cannot avoid since TextReader doesnt have any length of the underlying data
        _text = new DummyStreamAsArray (reader.ReadToEnd());
    }
    _declaredencoding = null;

    _documentnode = CreateNode(HtmlNodeType.Document, 0);
    Parse();

    // OptionCheckSyntax is a public field and may have been switched on
    // after _openednodes was left null above (e.g. from a ReportNode
    // callback), so the table itself is checked as well.
    if (OptionCheckSyntax && _openednodes != null)
    {
        foreach (HtmlNode node in _openednodes.Values)
        {
            if (!node._starttag) // already reported
            {
                continue;
            }

            string html = string.Empty;
            if (OptionExtractErrorSourceText)
            {
                html = node.OuterHtml;
                if (html.Length > OptionExtractErrorSourceTextMaxLength)
                {
                    html = html.Substring(0, OptionExtractErrorSourceTextMaxLength);
                }
            }

            AddError(
                HtmlParseErrorCode.TagNotClosed,
                node._line, node._lineposition,
                node._streamposition, html,
                "End tag </" + node.Name + "> was not found");
        }

        // we don't need this anymore
        _openednodes.Clear();
    }
}
// Chooses the encoding used for output: the declared encoding wins, then
// the stream encoding, and finally OptionDefaultStreamEncoding.
internal System.Text.Encoding GetOutEncoding()
{
    if (_declaredencoding != null)
    {
        return _declaredencoding;
    }
    if (_streamencoding != null)
    {
        return _streamencoding;
    }
    return OptionDefaultStreamEncoding;
}
/// <summary>
/// Gets the encoding the document will be written with (see GetOutEncoding).
/// </summary>
public System.Text.Encoding Encoding
{
    get { return GetOutEncoding(); }
}
/// <summary>
/// Saves the HTML document to the specified stream, using the document's output encoding.
/// </summary>
/// <param name="outStream">The stream to which you want to save. May not be null.</param>
/// <exception cref="ArgumentNullException"><paramref name="outStream"/> is null.</exception>
public void Save(Stream outStream)
{
    if (outStream == null)
    {
        throw new ArgumentNullException("outStream");
    }

    StreamWriter sw = new StreamWriter(outStream, GetOutEncoding());
    Save(sw);
    // StreamWriter buffers its output; without an explicit flush the tail
    // of the document may never reach outStream. The writer is not closed
    // because that would also close the caller's stream.
    sw.Flush();
}
/// <summary>
/// Saves the HTML document to the specified stream.
/// </summary>
/// <param name="outStream">The stream to which you want to save. May not be null.</param>
/// <param name="encoding">The character encoding to use. May not be null.</param>
/// <exception cref="ArgumentNullException"><paramref name="outStream"/> or <paramref name="encoding"/> is null.</exception>
public void Save(Stream outStream, System.Text.Encoding encoding)
{
    if (outStream == null)
    {
        throw new ArgumentNullException("outStream");
    }
    if (encoding == null)
    {
        throw new ArgumentNullException("encoding");
    }

    StreamWriter sw = new StreamWriter(outStream, encoding);
    Save(sw);
    // StreamWriter buffers its output; without an explicit flush the tail
    // of the document may never reach outStream. The writer is not closed
    // because that would also close the caller's stream.
    sw.Flush();
}
/// <summary>
/// Saves the mixed document to the specified file, using the document's output encoding.
/// An existing file is overwritten.
/// </summary>
/// <param name="filename">The location of the file where you want to save the document.</param>
public void Save(string filename)
{
    // "using" flushes and closes the file even when writing throws.
    using (StreamWriter sw = new StreamWriter(filename, false, GetOutEncoding()))
    {
        Save(sw);
    }
}
/// <summary>
/// Saves the mixed document to the specified file. An existing file is overwritten.
/// </summary>
/// <param name="filename">The location of the file where you want to save the document. May not be null.</param>
/// <param name="encoding">The character encoding to use. May not be null.</param>
/// <exception cref="ArgumentNullException"><paramref name="filename"/> or <paramref name="encoding"/> is null.</exception>
public void Save(string filename, System.Text.Encoding encoding)
{
    if (filename == null)
    {
        throw new ArgumentNullException("filename");
    }
    if (encoding == null)
    {
        throw new ArgumentNullException("encoding");
    }

    // "using" flushes and closes the file even when writing throws.
    using (StreamWriter sw = new StreamWriter(filename, false, encoding))
    {
        Save(sw);
    }
}
/// <summary>
/// Saves the HTML document to the specified StreamWriter.
/// Forwards to the TextWriter overload.
/// </summary>
/// <param name="writer">The StreamWriter to which you want to save.</param>
public void Save(StreamWriter writer)
{
    TextWriter target = writer;
    Save(target);
}
/// <summary>
/// Saves the HTML document to the specified TextWriter.
/// </summary>
/// <param name="writer">The TextWriter to which you want to save. May not be null.</param>
/// <exception cref="ArgumentNullException"><paramref name="writer"/> is null.</exception>
public void Save(TextWriter writer)
{
    if (writer == null)
    {
        throw new ArgumentNullException("writer");
    }

    DocumentNode.WriteTo(writer);
    // Push buffered output through, as Save(XmlWriter) does. The Stream
    // overloads create a writer they never close, so this is what makes
    // the data actually arrive. The writer itself is left open.
    writer.Flush();
}
/// <summary>
/// Writes the HTML document to the specified XmlWriter and flushes it.
/// </summary>
/// <param name="writer">The XmlWriter to which you want to save.</param>
public void Save(XmlWriter writer)
{
    HtmlNode root = DocumentNode;
    root.WriteTo(writer);
    writer.Flush();
}
/// <summary>
/// Creates a new XPathNavigator object for navigating this HTML document.
/// </summary>
/// <returns>An XPathNavigator object created on the document node of this document.</returns>
public XPathNavigator CreateNavigator()
{
    HtmlNodeNavigator navigator = new HtmlNodeNavigator(this, _documentnode);
    return navigator;
}
// Registers node under the lower-cased id, or removes the entry for that id
// when node is null. Does nothing when id tracking is disabled, the id
// table is missing, or id is null.
internal void SetIdForNode(HtmlNode node, string id)
{
    if (!OptionUseIdAttribute || _nodesid == null || id == null)
    {
        return;
    }

    string key = id.ToLower();
    if (node != null)
    {
        _nodesid[key] = node;
    }
    else
    {
        _nodesid.Remove(key);
    }
}
/// <summary>
/// Looks up the HTML node registered under the specified 'id' attribute value.
/// The lookup uses the lower-cased id.
/// </summary>
/// <param name="id">The attribute id to match. May not be null.</param>
/// <returns>The HTML node with the matching id or null if not found.</returns>
public HtmlNode GetElementbyId(string id)
{
    if (id == null)
    {
        throw new ArgumentNullException("id");
    }

    // The table only exists when the document was loaded with OptionUseIdAttribute on.
    if (_nodesid == null)
    {
        throw new Exception(HtmlExceptionUseIdAttributeFalse);
    }

    string key = id.ToLower();
    return _nodesid[key] as HtmlNode;
}
/// <summary>
/// Creates a new element node and gives it the specified name.
/// </summary>
/// <param name="name">The qualified name of the element. May not be null.</param>
/// <returns>The new HTML node.</returns>
public HtmlNode CreateElement(string name)
{
    if (name == null)
    {
        throw new ArgumentNullException("name");
    }

    HtmlNode element = CreateNode(HtmlNodeType.Element);
    element._name = name;
    return element;
}
/// <summary>
/// Creates a new, empty HTML comment node.
/// </summary>
/// <returns>The new HTML comment node.</returns>
public HtmlCommentNode CreateComment()
{
    HtmlNode created = CreateNode(HtmlNodeType.Comment);
    return (HtmlCommentNode)created;
}
/// <summary>
/// Creates a new HTML comment node holding the specified comment text.
/// </summary>
/// <param name="comment">The comment text. May not be null.</param>
/// <returns>The new HTML comment node.</returns>
public HtmlCommentNode CreateComment(string comment)
{
    if (comment == null)
    {
        throw new ArgumentNullException("comment");
    }

    HtmlCommentNode node = CreateComment();
    node.Comment = comment;
    return node;
}
/// <summary>
/// Builds a new, empty HTML text node.
/// </summary>
/// <returns>The freshly created text node.</returns>
public HtmlTextNode CreateTextNode()
{
    HtmlNode created = CreateNode(HtmlNodeType.Text);
    return (HtmlTextNode)created;
}
/// <summary>
/// Builds a new HTML text node holding the supplied text.
/// </summary>
/// <param name="text">The node's text. May not be null.</param>
/// <returns>The freshly created text node.</returns>
public HtmlTextNode CreateTextNode(string text)
{
    if (text == null)
    {
        throw new ArgumentNullException("text");
    }

    HtmlTextNode textNode = CreateTextNode();
    textNode.Text = text;
    return textNode;
}
/// <summary>
/// Creates a node of the given type, passing -1 as its index.
/// </summary>
internal HtmlNode CreateNode(HtmlNodeType type)
{
    const int unspecifiedIndex = -1;
    return CreateNode(type, unspecifiedIndex);
}
/// <summary>
/// Creates a node of the given type at the given index. Comment and Text
/// types get their dedicated classes; every other type becomes a plain HtmlNode.
/// </summary>
internal HtmlNode CreateNode(HtmlNodeType type, int index)
{
    if (type == HtmlNodeType.Comment)
    {
        return new HtmlCommentNode(this, index);
    }

    if (type == HtmlNodeType.Text)
    {
        return new HtmlTextNode(this, index);
    }

    return new HtmlNode(type, this, index);
}
/// <summary>
/// Creates an attribute object owned by this document, with no name or value set here.
/// </summary>
internal HtmlAttribute CreateAttribute()
{
    HtmlAttribute attribute = new HtmlAttribute(this);
    return attribute;
}
/// <summary>
/// Builds a new HTML attribute carrying the given name.
/// </summary>
/// <param name="name">The attribute name. May not be null.</param>
/// <returns>The freshly created attribute.</returns>
public HtmlAttribute CreateAttribute(string name)
{
    if (name == null)
    {
        throw new ArgumentNullException("name");
    }

    HtmlAttribute attribute = CreateAttribute();
    attribute.Name = name;
    return attribute;
}
/// <summary>
/// Builds a new HTML attribute carrying the given name and value.
/// </summary>
/// <param name="name">The attribute name. May not be null.</param>
/// <param name="value">The attribute value.</param>
/// <returns>The freshly created attribute.</returns>
public HtmlAttribute CreateAttribute(string name, string value)
{
    if (name == null)
    {
        throw new ArgumentNullException("name");
    }

    // Delegate naming to the single-argument overload, then attach the value.
    HtmlAttribute attribute = CreateAttribute(name);
    attribute.Value = value;
    return attribute;
}
/// <summary>
/// Gets the document's root node.
/// </summary>
public HtmlNode DocumentNode
{
    get { return _documentnode; }
}
/// <summary>
/// Gets the CRC32 checksum accumulated while parsing, or 0 when no checksum
/// object exists (one is only created when OptionComputeChecksum is set at parse time).
/// </summary>
public int CheckSum
{
    get
    {
        return (_crc32 == null) ? 0 : (int)_crc32.CheckSum;
    }
}
/// <summary>
/// Gets or sets the stream-mode flag. While it is set and ReportNode is
/// non-null, finished nodes are handed to ReportNode, and text, comment and
/// start-tag nodes are not appended to a parent node.
/// </summary>
public bool StreamMode
{
    get { return _streammode; }
    set { _streammode = value; }
}
/// <summary>
/// Creates a parse error from the supplied details and appends it to the
/// document's parse error list.
/// </summary>
/// <returns>The error object that was recorded.</returns>
private HtmlParseError AddError(
    HtmlParseErrorCode code,
    int line,
    int linePosition,
    int streamPosition,
    string sourceText,
    string reason)
{
    HtmlParseError recorded =
        new HtmlParseError(code, line, linePosition, streamPosition, sourceText, reason);
    _parseerrors.Add(recorded);
    return recorded;
}
/// <summary>
/// States of the character-driven parser loop in Parse().
/// </summary>
private enum ParseState
{
    // Outside any tag; characters belong to a text node.
    Text,
    // Just after '<': deciding between a start tag and an end tag ('/').
    WhichTag,
    // Reading the tag name.
    Tag,
    // Inside a tag, between attributes.
    BetweenAttributes,
    // After '/' (or '?') inside a tag; a following '>' ends the node as closed.
    EmptyTag,
    // Reading an attribute name.
    AttributeName,
    // After whitespace following an attribute name, before a possible '='.
    AttributeBeforeEquals,
    // After '=', before the attribute value starts.
    AttributeAfterEquals,
    // Reading an unquoted attribute value.
    AttributeValue,
    // Inside a "<!" construct, until the closing '>'.
    Comment,
    // Reading an attribute value delimited by ' or ".
    QuotedAttributeValue,
    // Inside a "<% ... %>" block.
    ServerSideCode,
    // Inside a quoted string within PcData content.
    PcDataQuote,
    // Inside the content of an element reported by HtmlNode.IsCDataElement,
    // scanning for the matching end tag.
    PcData
}
/// <summary>
/// Advances the parse position by one character: feeds the current character
/// to the checksum (when one is being computed), bumps the index and updates
/// the line / line-position counters.
/// </summary>
private void IncrementPosition()
{
    if (_crc32 != null)
    {
        // REVIEW: should we add some checksum code in DecrementPosition too?
        _crc32.AddToCRC32(_c);
    }

    _index++;

    // Remember the column we are leaving so DecrementPosition can step back over a line break.
    _maxlineposition = _lineposition;

    bool lineFeed = (_c == 10);
    if (!lineFeed)
    {
        _lineposition++;
        return;
    }

    _lineposition = 1;
    _line++;
}
/// <summary>
/// Steps the parse position back by one character and adjusts the
/// line / line-position counters accordingly.
/// </summary>
private void DecrementPosition()
{
    _index--;

    if (_lineposition != 1)
    {
        _lineposition--;
        return;
    }

    // At column 1: go back to the previous line, at the column saved by IncrementPosition.
    _lineposition = _maxlineposition;
    _line--;
}
/// <summary>
/// Runs the parser state machine over _text, one character at a time, until
/// the end of the text is reached or _stop_parsing is set. Nodes and
/// attributes are started and finished through the Push* helpers; the
/// current state lives in _state.
/// </summary>
private void Parse()
{
    int lastquote = 0;
    if (OptionComputeChecksum)
    {
        _crc32 = new Crc32();
    }

    // Reset all per-parse bookkeeping.
    _lastnodes = new Hashtable();
    _c = 0;
    _fullcomment = false;
    _parseerrors = new ArrayList();
    _line = 1;
    _lineposition = 1;
    _maxlineposition = 1;

    _state = ParseState.Text;
    _oldstate = _state;
    _documentnode._innerlength = _text.FullLength;
    _documentnode._outerlength = _text.FullLength;

    _lastparentnode = _documentnode;
    _currentnode = CreateNode(HtmlNodeType.Text, 0);
    _currentattribute = null;

    _index = 0;
    PushNodeStart(HtmlNodeType.Text, 0);
    // SLIM: while (_index<_text.Length)
    while (! _stop_parsing && ! _text.Eof (_index))
    {
        // After IncrementPosition, _index is one past the character held in _c.
        _c = _text[_index];
        IncrementPosition();

        switch(_state)
        {
            case ParseState.Text:
                if (NewCheck())
                    continue;
                break;

            case ParseState.WhichTag:
                if (NewCheck())
                    continue;
                if (_c == '/')
                {
                    PushNodeNameStart(false, _index);
                }
                else
                {
                    // Start tag: the name begins at the current character, so re-read it in Tag state.
                    PushNodeNameStart(true, _index-1);
                    DecrementPosition();
                }
                _state = ParseState.Tag;
                break;

            case ParseState.Tag:
                if (NewCheck())
                    continue;
                if (IsWhiteSpace(_c))
                {
                    PushNodeNameEnd(_index-1);
                    // A Push* helper may have changed the state; if so, leave it alone.
                    if (_state != ParseState.Tag)
                        continue;
                    _state = ParseState.BetweenAttributes;
                    continue;
                }
                if (_c == '/')
                {
                    PushNodeNameEnd(_index-1);
                    if (_state != ParseState.Tag)
                        continue;
                    _state = ParseState.EmptyTag;
                    continue;
                }
                if (_c == '>')
                {
                    PushNodeNameEnd(_index-1);
                    if (_state != ParseState.Tag)
                        continue;
                    PushNodeEnd(_index, false);
                    if (_state != ParseState.Tag)
                        continue;
                    _state = ParseState.Text;
                    PushNodeStart(HtmlNodeType.Text, _index);
                }
                break;

            case ParseState.BetweenAttributes:
                if (NewCheck())
                    continue;

                if (IsWhiteSpace(_c))
                    continue;

                if ((_c == '/') || (_c == '?'))
                {
                    _state = ParseState.EmptyTag;
                    continue;
                }

                if (_c == '>')
                {
                    PushNodeEnd(_index, false);
                    if (_state != ParseState.BetweenAttributes)
                        continue;
                    _state = ParseState.Text;
                    PushNodeStart(HtmlNodeType.Text, _index);
                    continue;
                }

                PushAttributeNameStart(_index-1);
                _state = ParseState.AttributeName;
                break;

            case ParseState.EmptyTag:
                if (NewCheck())
                    continue;

                if (_c == '>')
                {
                    PushNodeEnd(_index, true);
                    if (_state != ParseState.EmptyTag)
                        continue;
                    _state = ParseState.Text;
                    PushNodeStart(HtmlNodeType.Text, _index);
                    continue;
                }
                _state = ParseState.BetweenAttributes;
                break;

            case ParseState.AttributeName:
                if (NewCheck())
                    continue;

                if (IsWhiteSpace(_c))
                {
                    PushAttributeNameEnd(_index-1);
                    _state = ParseState.AttributeBeforeEquals;
                    continue;
                }
                if (_c == '=')
                {
                    PushAttributeNameEnd(_index-1);
                    _state = ParseState.AttributeAfterEquals;
                    continue;
                }
                if (_c == '>')
                {
                    PushAttributeNameEnd(_index-1);
                    PushNodeEnd(_index, false);
                    if (_state != ParseState.AttributeName)
                        continue;
                    _state = ParseState.Text;
                    PushNodeStart(HtmlNodeType.Text, _index);
                    continue;
                }
                break;

            case ParseState.AttributeBeforeEquals:
                if (NewCheck())
                    continue;

                if (IsWhiteSpace(_c))
                    continue;
                if (_c == '>')
                {
                    PushNodeEnd(_index, false);
                    if (_state != ParseState.AttributeBeforeEquals)
                        continue;
                    _state = ParseState.Text;
                    PushNodeStart(HtmlNodeType.Text, _index);
                    continue;
                }
                if (_c == '=')
                {
                    _state = ParseState.AttributeAfterEquals;
                    continue;
                }
                // no equals, no whitespace, it's a new attribute starting
                _state = ParseState.BetweenAttributes;
                DecrementPosition();
                break;

            case ParseState.AttributeAfterEquals:
                if (NewCheck())
                    continue;

                if (IsWhiteSpace(_c))
                    continue;

                if ((_c == '\'') || (_c == '"'))
                {
                    _state = ParseState.QuotedAttributeValue;
                    PushAttributeValueStart(_index);
                    // Remember which quote opened the value so only the same one closes it.
                    lastquote = _c;
                    continue;
                }
                if (_c == '>')
                {
                    PushNodeEnd(_index, false);
                    if (_state != ParseState.AttributeAfterEquals)
                        continue;
                    _state = ParseState.Text;
                    PushNodeStart(HtmlNodeType.Text, _index);
                    continue;
                }
                PushAttributeValueStart(_index-1);
                _state = ParseState.AttributeValue;
                break;

            case ParseState.AttributeValue:
                if (NewCheck())
                    continue;

                if (IsWhiteSpace(_c))
                {
                    PushAttributeValueEnd(_index-1);
                    _state = ParseState.BetweenAttributes;
                    continue;
                }

                if (_c == '>')
                {
                    PushAttributeValueEnd(_index-1);
                    PushNodeEnd(_index, false);
                    if (_state != ParseState.AttributeValue)
                        continue;
                    _state = ParseState.Text;
                    PushNodeStart(HtmlNodeType.Text, _index);
                    continue;
                }
                break;

            case ParseState.QuotedAttributeValue:
                if (_c == lastquote)
                {
                    PushAttributeValueEnd(_index-1);
                    _state = ParseState.BetweenAttributes;
                    continue;
                }
                if (_c == '<')
                {
                    //SLIM: if (_index<_text.Length)
                    if (!_text.Eof (_index))
                    {
                        if (_text[_index] == '%')
                        {
                            _oldstate = _state;
                            _state = ParseState.ServerSideCode;
                            continue;
                        }
                    }
                }
                break;

            case ParseState.Comment:
                if (_c == '>')
                {
                    if (_fullcomment)
                    {
                        // A "<!--" comment only ends on "-->": require two dashes before '>'.
                        if ((_text[_index-2] != '-') ||
                            (_text[_index-3] != '-'))
                        {
                            continue;
                        }
                    }
                    PushNodeEnd(_index, false);
                    _state = ParseState.Text;
                    PushNodeStart(HtmlNodeType.Text, _index);
                    continue;
                }
                break;

            case ParseState.ServerSideCode:
                if (_c == '%')
                {
                    //SLIM: if (_index<_text.Length)
                    if (! _text.Eof (_index))
                    {
                        if (_text[_index] == '>')
                        {
                            // End of "<% ... %>": resume based on where the block started.
                            switch(_oldstate)
                            {
                                case ParseState.AttributeAfterEquals:
                                    _state = ParseState.AttributeValue;
                                    break;

                                case ParseState.BetweenAttributes:
                                    PushAttributeNameEnd(_index+1);
                                    _state = ParseState.BetweenAttributes;
                                    break;

                                default:
                                    _state = _oldstate;
                                    break;
                            }
                            // Skip the '>' that closes the block.
                            IncrementPosition();
                        }
                    }
                }
                break;

            // handle <script>a="</script>"</script>
            case ParseState.PcDataQuote:
                if ((_c == _pcdata_quote_char) && (_text [_index - 2] != '\\')) {
                    _pcdata_quote_char = '\0';
                    _state = ParseState.PcData;
                }
                break;

            case ParseState.PcData:
                // Build the trace string only when debugging: this case runs once per
                // character, and the Substring below precedes the end-of-buffer check.
                if (_debug)
                {
                    Debug ("PCDATA " + _currentnode.Name + " " + _text.Substring(_index-1, _currentnode._namelength+2));
                }
                if (_c == '\"' || _c == '\''){
                    _pcdata_quote_char = _c;
                    _state = ParseState.PcDataQuote;
                    break;
                }
                // look for </tag + 1 char

                // check buffer end
                //SLIM: if ((_currentnode._namelength+3)<=(_text.Length-(_index-1)))
                if (! _text.Eof (_currentnode._namelength + _index + 1))
                {
                    if (string.Compare(_text.Substring(_index-1, _currentnode._namelength+2),
                        "</" + _currentnode.Name, true) == 0)
                    {
                        int c = _text[_index-1 + 2 + _currentnode.Name.Length];
                        if ((c == '>') || (IsWhiteSpace(c)))
                        {
                            // add the script as a text node
                            HtmlNode script = CreateNode(HtmlNodeType.Text,
                                _currentnode._outerstartindex + _currentnode._outerlength);
                            script._outerlength = _index-1 - script._outerstartindex;
                            if (_streammode && ReportNode != null)
                                _stop_parsing = ! ReportNode (script);
                            else
                                _currentnode.AppendChild(script);
                            Debug ("Found script: [" + script.InnerText + "]");

                            PushNodeStart(HtmlNodeType.Element, _index-1);
                            PushNodeNameStart(false, _index-1 +2);
                            _state = ParseState.Tag;
                            IncrementPosition();
                        }
                    }
                }
                break;
        }
    }

    // finish the current work
    if (_currentnode._namestartindex > 0)
    {
        PushNodeNameEnd(_index);
    }
    PushNodeEnd(_index, false);

    // we don't need this anymore
    _lastnodes.Clear();
}
/// <summary>
/// Reacts to a '&lt;' in the input. A following '%' switches to
/// ServerSideCode (remembering the state to return to). Otherwise the
/// current node is ended and a new comment node ("&lt;!") or element node is started.
/// </summary>
/// <returns>
/// false when the current character is not '&lt;' (nothing is changed);
/// true when it was handled here and the parser state has been updated.
/// </returns>
private bool NewCheck()
{
    if (_c != '<')
    {
        return false;
    }

    //SLIM: if (_index<_text.Length)
    if (!_text.Eof(_index) && _text[_index] == '%')
    {
        // "<%": open whatever construct the block stands in for, then enter ServerSideCode.
        switch (_state)
        {
            case ParseState.AttributeAfterEquals:
                PushAttributeValueStart(_index-1);
                break;

            case ParseState.BetweenAttributes:
                PushAttributeNameStart(_index-1);
                break;

            case ParseState.WhichTag:
                PushNodeNameStart(true, _index-1);
                _state = ParseState.Tag;
                break;
        }
        _oldstate = _state;
        _state = ParseState.ServerSideCode;
        return true;
    }

    PushNodeEnd(_index-1, true);
    _state = ParseState.WhichTag;
    //SLIM: if ((_index-1) <= (_text.Length-2))
    if (!_text.Eof(_index) && _text[_index] == '!')
    {
        PushNodeStart(HtmlNodeType.Comment, _index-1);
        PushNodeNameStart(true, _index);
        PushNodeNameEnd(_index+1);
        _state = ParseState.Comment;

        // Always recompute the flag. Previously it was left untouched when fewer
        // than two characters followed '!', so a value from an earlier comment
        // could leak into this one.
        //SLIM: if (_index<(_text.Length-2))
        _fullcomment = !_text.Eof(_index + 2) &&
            (_text[_index+1] == '-') &&
            (_text[_index+2] == '-');
        return true;
    }

    PushNodeStart(HtmlNodeType.Element, _index-1);
    return true;
}
/// <summary>
/// Inspects a finished node for a charset declaration of the form
/// &lt;meta http-equiv="content-type" content="text/html;charset=..." /&gt;
/// and records it in _declaredencoding. Only runs when OptionReadEncoding is set.
/// </summary>
/// <param name="node">The node whose end was just reached; its attributes are already populated.</param>
/// <exception cref="EncodingFoundException">
/// Thrown when _onlyDetectEncoding is set, carrying the declared encoding, or
/// null when none can be determined.
/// </exception>
private void ReadDocumentEncoding(HtmlNode node)
{
    if (!OptionReadEncoding)
    {
        return;
    }

    // when we append a child, we are in node end, so attributes are already populated
    if (node._namelength != 4) // quick check, avoids string alloc
    {
        return;
    }

    // only these nodes can occur before meta
    // if we started seeing any other node, we will never see a meta node
    if (node.NodeType == HtmlNodeType.Element &&
        (node.Name != "head" && node.Name != "script" &&
         node.Name != "style" && node.Name != "title" &&
         node.Name != "link" && node.Name != "html" &&
         node.Name != "meta"))
    {
        _declaredencoding = null;
        if (_onlyDetectEncoding)
        {
            throw new EncodingFoundException (null);
        }
        return;
        // FIXME: Should also handle declaredencoding mismatch with detected
        // encoding, as done below. None of the current filters run in error
        // detection mode currently, so its not needed now.
    }

    if (node.Name != "meta") // all nodes names are lowercase
    {
        return;
    }

    HtmlAttribute att = node.Attributes["http-equiv"];
    if (att == null || string.Compare(att.Value, "content-type", true) != 0)
    {
        return;
    }

    HtmlAttribute content = node.Attributes["content"];
    if (content == null)
    {
        return;
    }

    string charset = NameValuePairList.GetNameValuePairsValue(content.Value, "charset");
    if (charset == null)
    {
        return;
    }

    // A page may declare a charset name the platform does not know. Treat that
    // as "no usable declaration" instead of letting the exception abort the parse.
    try
    {
        _declaredencoding = Encoding.GetEncoding(charset);
    }
    catch (ArgumentException)
    {
        _declaredencoding = null;
    }
    catch (NotSupportedException)
    {
        _declaredencoding = null;
    }

    if (_onlyDetectEncoding)
    {
        throw new EncodingFoundException(_declaredencoding);
    }

    if (_declaredencoding == null || _streamencoding == null)
    {
        return;
    }

    if (_declaredencoding.WindowsCodePage != _streamencoding.WindowsCodePage)
    {
        AddError(
            HtmlParseErrorCode.CharsetMismatch,
            _line, _lineposition,
            _index, node.OuterHtml,
            "Encoding mismatch between StreamEncoding: " +
            _streamencoding.WebName + " and DeclaredEncoding: " + _declaredencoding.WebName);
    }
}
/// <summary>
/// Starts a new current attribute whose name begins at the given index,
/// stamping it with the parser's current line and line position.
/// </summary>
private void PushAttributeNameStart(int index)
{
    HtmlAttribute started = CreateAttribute();
    started._namestartindex = index;
    started._line = _line;
    started._lineposition = _lineposition;
    started._streamposition = index;
    _currentattribute = started;
}
/// <summary>
/// Fixes the current attribute's name length from the given end index and
/// appends the attribute to the current node.
/// </summary>
private void PushAttributeNameEnd(int index)
{
    int nameStart = _currentattribute._namestartindex;
    _currentattribute._namelength = index - nameStart;
    _currentnode.Attributes.Append(_currentattribute);
}
/// <summary>
/// Records where the current attribute's value starts.
/// </summary>
private void PushAttributeValueStart(int index)
{
    HtmlAttribute attribute = _currentattribute;
    attribute._valuestartindex = index;
}
/// <summary>
/// Fixes the current attribute's value length from the given end index.
/// </summary>
private void PushAttributeValueEnd(int index)
{
    int valueStart = _currentattribute._valuestartindex;
    _currentattribute._valuelength = index - valueStart;
}
/// <summary>
/// Makes a new node of the given type the current node and stamps it with
/// the parser's current line, line position and stream position.
/// </summary>
private void PushNodeStart(HtmlNodeType type, int index)
{
    HtmlNode started = CreateNode(type, index);
    started._line = _line;
    // Element nodes are recorded one column to the left of the current position.
    started._lineposition = (type == HtmlNodeType.Element)
        ? _lineposition - 1
        : _lineposition;
    started._streamposition = index;
    _currentnode = started;
}
/// <summary>
/// Finishes the current node at the given index: fixes its outer length,
/// reports it in stream mode, attaches it to the tree where applicable and
/// closes it when required.
/// </summary>
/// <param name="index">Position just past the node's last character.</param>
/// <param name="close">true to close the node even though it is a start tag.</param>
private void PushNodeEnd(int index, bool close)
{
    _currentnode._outerlength = index - _currentnode._outerstartindex;

    //SLIM: inform caller
    if (_streammode && ReportNode != null)
    {
        _stop_parsing = ! ReportNode (_currentnode);
    }

    if (_debug)
    {
        if (_currentnode._nodetype == HtmlNodeType.Text)
        {
            Debug ("Text:" + _currentnode.InnerText);
        }
        else
        {
            Debug ((_currentnode.StartTag ? "Start-" : "End-") + _currentnode.Name);
        }
    }

    bool textOrComment =
        (_currentnode._nodetype == HtmlNodeType.Text) ||
        (_currentnode._nodetype == HtmlNodeType.Comment);

    if (textOrComment)
    {
        // forget about void nodes
        if (_currentnode._outerlength > 0)
        {
            _currentnode._innerlength = _currentnode._outerlength;
            _currentnode._innerstartindex = _currentnode._outerstartindex;
            // SLIM: no need to append child in stream mode
            // SLIM: whatever the caller needs to do, tell it to do now
            if (!_streammode && _lastparentnode != null)
            {
                _lastparentnode.AppendChild(_currentnode);
            }
        }
    }
    else if ((_currentnode._starttag) && (_lastparentnode != _currentnode))
    {
        // add to parent node
        // SLIM: no need to append child in stream mode
        // SLIM: whatever the caller needs to do, tell it to do now
        if (!_streammode && _lastparentnode != null)
        {
            _lastparentnode.AppendChild(_currentnode);
        }

        ReadDocumentEncoding(_currentnode);

        // remember last node of this kind
        // SLIM: we still to store _currentnode to help other tags in the same level
        HtmlNode previousSameName = (HtmlNode)_lastnodes[_currentnode.Name];
        _currentnode._prevwithsamename = previousSameName;
        _lastnodes[_currentnode.Name] = _currentnode;

        // change parent?
        if ((_currentnode.NodeType == HtmlNodeType.Document) ||
            (_currentnode.NodeType == HtmlNodeType.Element))
        {
            _lastparentnode = _currentnode;
        }

        // Content of these elements is scanned in PcData state, so stop here.
        if (HtmlNode.IsCDataElement(CurrentNodeName()))
        {
            _state = ParseState.PcData;
            return;
        }

        if ((HtmlNode.IsClosedElement(_currentnode.Name)) ||
            (HtmlNode.IsEmptyElement(_currentnode.Name)))
        {
            close = true;
        }
    }

    if (!close && _currentnode._starttag)
    {
        return;
    }

    CloseCurrentNode();
    if ((_currentnode._nodetype == HtmlNodeType.Text) ||
        (_currentnode._nodetype == HtmlNodeType.Comment))
    {
        _currentnode = null;
    }
}
/// <summary>
/// Records on the current node whether it is a start tag and where its name begins.
/// </summary>
private void PushNodeNameStart(bool starttag, int index)
{
    HtmlNode current = _currentnode;
    current._starttag = starttag;
    current._namestartindex = index;
}
/// <summary>
/// Returns the names of the "resetter" elements for a tag name, as used by
/// the nested-tag fixing code.
/// </summary>
/// <param name="name">The tag name to look up.</param>
/// <returns>
/// {"ul"} for "li", {"table"} for "tr", {"tr", "table"} for "th" and "td";
/// null for any other name.
/// </returns>
private string[] GetResetters(string name)
{
    if (name == "li")
    {
        return new string[]{"ul"};
    }

    if (name == "tr")
    {
        return new string[]{"table"};
    }

    if (name == "th" || name == "td")
    {
        return new string[]{"tr", "table"};
    }

    return null;
}
/// <summary>
/// For a start tag, runs the nested-tag fix using the lower-cased current
/// node name and its resetter names. Closing tags are ignored.
/// </summary>
private void FixNestedTags()
{
    // we are only interested by start tags, not closing tags
    if (_currentnode._starttag)
    {
        string lowered = CurrentNodeName().ToLower();
        string[] resetters = GetResetters(lowered);
        FixNestedTag(lowered, resetters);
    }
}
/// <summary>
/// Closes the last still-open node with the given name, unless one of the
/// resetter nodes is found for it.
/// </summary>
/// <param name="name">The tag name to look up in the last-nodes table.</param>
/// <param name="resetters">Resetter names for that tag; null means nothing to do.</param>
private void FixNestedTag(string name, string[] resetters)
{
    if (resetters == null)
    {
        return;
    }

    // if we find a previous unclosed same name node, without a resetter node between, we must close it
    HtmlNode unclosed = (HtmlNode)_lastnodes[name];
    if ((unclosed == null) || unclosed.Closed)
    {
        return;
    }

    // try to find a resetter node, if found, we do nothing
    if (FindResetterNodes(unclosed, resetters))
    {
        return;
    }

    // ok we need to close the prev now
    // create a fake closer node
    HtmlNode closer = new HtmlNode(unclosed.NodeType, this, -1);
    closer._endnode = closer;
    unclosed.CloseNode(closer);
}
/// <summary>
/// Tells whether FindResetterNode finds a resetter for the node under any of the given names.
/// </summary>
/// <returns>true as soon as one name yields a resetter; false otherwise, or when names is null.</returns>
private bool FindResetterNodes(HtmlNode node, string[] names)
{
    if (names == null)
    {
        return false;
    }

    foreach (string candidate in names)
    {
        if (FindResetterNode(node, candidate) != null)
        {
            return true;
        }
    }

    return false;
}
/// <summary>
/// Returns the last node recorded under the given name when it is still open
/// and its stream position is not before the given node's; otherwise null.
/// </summary>
private HtmlNode FindResetterNode(HtmlNode node, string name)
{
    HtmlNode candidate = (HtmlNode)_lastnodes[name];

    bool unusable =
        (candidate == null) ||
        candidate.Closed ||
        (candidate._streamposition < node._streamposition);

    return unusable ? null : candidate;
}
/// <summary>
/// Fixes the current node's name length from the given end index and, when
/// OptionFixNestedTags is set, runs the nested-tag fix.
/// </summary>
private void PushNodeNameEnd(int index)
{
    int nameStart = _currentnode._namestartindex;
    _currentnode._namelength = index - nameStart;

    if (!OptionFixNestedTags)
    {
        return;
    }

    FixNestedTags();
}
/// <summary>
/// Closes the current node. When a node with the same name is recorded in the
/// last-nodes table, that node is closed by the current one. Otherwise the
/// current node is handled as a stray end tag, which may record a parse
/// error. Unless an error blocks it, the last parent node is then updated.
/// </summary>
private void CloseCurrentNode()
{
    if (_currentnode.Closed) // text or document are by def closed
    {
        return;
    }

    bool failed = false;

    // find last node of this kind
    HtmlNode opener = (HtmlNode)_lastnodes[_currentnode.Name];
    if (opener != null)
    {
        if (OptionFixNestedTags &&
            FindResetterNodes(opener, GetResetters(_currentnode.Name)))
        {
            AddError(
                HtmlParseErrorCode.EndTagInvalidHere,
                _currentnode._line, _currentnode._lineposition,
                _currentnode._streamposition, _currentnode.OuterHtml,
                "End tag </" + _currentnode.Name + "> invalid here");
            failed = true;
        }

        if (!failed)
        {
            _lastnodes[_currentnode.Name] = opener._prevwithsamename;
            opener.CloseNode(_currentnode);
        }
    }
    else if (HtmlNode.IsClosedElement(_currentnode.Name))
    {
        // </br> will be seen as <br>
        _currentnode.CloseNode(_currentnode);

        // add to parent node
        if (_lastparentnode != null)
        {
            // Walk the parent's children backwards looking for a childless node
            // with the same name, collecting the nodes passed on the way.
            HtmlNode target = null;
            Stack pending = new Stack();
            HtmlNode sibling = _lastparentnode.LastChild;
            while (sibling != null)
            {
                if ((sibling.Name == _currentnode.Name) && (! sibling.HasChildNodes))
                {
                    target = sibling;
                    break;
                }
                pending.Push(sibling);
                sibling = sibling.PreviousSibling;
            }

            if (target == null)
            {
                _lastparentnode.AppendChild(_currentnode);
            }
            else
            {
                // Move the collected nodes under the node that was found.
                while (pending.Count != 0)
                {
                    HtmlNode moved = (HtmlNode)pending.Pop();
                    _lastparentnode.RemoveChild(moved);
                    target.AppendChild(moved);
                }
            }
        }
    }
    else if (HtmlNode.CanOverlapElement(_currentnode.Name))
    {
        // node has no parent
        // node is not a closed node

        // this is a hack: add it as a text node
        HtmlNode asText = CreateNode(HtmlNodeType.Text, _currentnode._outerstartindex);
        asText._outerlength = _currentnode._outerlength;
        ((HtmlTextNode)asText).Text = ((HtmlTextNode)asText).Text.ToLower();
        if (_lastparentnode != null)
        {
            _lastparentnode.AppendChild(asText);
        }
    }
    else if (HtmlNode.IsEmptyElement(_currentnode.Name))
    {
        AddError(
            HtmlParseErrorCode.EndTagNotRequired,
            _currentnode._line, _currentnode._lineposition,
            _currentnode._streamposition, _currentnode.OuterHtml,
            "End tag </" + _currentnode.Name + "> is not required");
    }
    else
    {
        // node cannot overlap, node is not empty
        AddError(
            HtmlParseErrorCode.TagNotOpened,
            _currentnode._line, _currentnode._lineposition,
            _currentnode._streamposition, _currentnode.OuterHtml,
            "Start tag <" + _currentnode.Name + "> was not found");
        failed = true;
    }

    if (failed)
    {
        return;
    }

    // we close this node, get grandparent
    if ((_lastparentnode != null) &&
        ((!HtmlNode.IsClosedElement(_currentnode.Name)) ||
         (_currentnode._starttag)))
    {
        UpdateLastParentNode();
    }
}
/// <summary>
/// Moves _lastparentnode up through its ancestors until it refers to a node
/// that is not closed, falling back to the document node when none is left.
/// </summary>
internal void UpdateLastParentNode()
{
    // The null test comes first, so a null _lastparentnode on entry falls
    // through to the document-node fallback instead of being dereferenced.
    while ((_lastparentnode != null) && (_lastparentnode.Closed))
    {
        _lastparentnode = _lastparentnode.ParentNode;
    }

    if (_lastparentnode == null)
    {
        _lastparentnode = _documentnode;
    }
}
/// <summary>
/// Returns the slice of the parsed text recorded as the current attribute's name.
/// </summary>
private string CurrentAttributeName()
{
    int start = _currentattribute._namestartindex;
    int length = _currentattribute._namelength;
    return _text.Substring(start, length);
}
/// <summary>
/// Returns the slice of the parsed text recorded as the current attribute's value.
/// </summary>
private string CurrentAttributeValue()
{
    int start = _currentattribute._valuestartindex;
    int length = _currentattribute._valuelength;
    return _text.Substring(start, length);
}
/// <summary>
/// Returns the slice of the parsed text recorded as the current node's name.
/// </summary>
private string CurrentNodeName()
{
    int start = _currentnode._namestartindex;
    int length = _currentnode._namelength;
    return _text.Substring(start, length);
}
/// <summary>
/// Returns the slice of the parsed text covered by the current node's outer span.
/// </summary>
private string CurrentNodeOuter()
{
    int start = _currentnode._outerstartindex;
    int length = _currentnode._outerlength;
    return _text.Substring(start, length);
}
/// <summary>
/// Returns the slice of the parsed text covered by the current node's inner span.
/// </summary>
private string CurrentNodeInner()
{
    int start = _currentnode._innerstartindex;
    int length = _currentnode._innerlength;
    return _text.Substring(start, length);
}
/// <summary>
/// Tells whether a character code counts as whitespace for the parser.
/// </summary>
/// <param name="c">The character code to test.</param>
/// <returns>true for tab (9), line feed (10), carriage return (13) and space (32); false for anything else.</returns>
public static bool IsWhiteSpace(int c)
{
    switch (c)
    {
        case 9:
        case 10:
        case 13:
        case 32:
            return true;

        default:
            return false;
    }
}
/// <summary>
/// Exception that carries an Encoding to whoever catches it. The parser
/// throws it from ReadDocumentEncoding when only encoding detection was
/// requested; the carried encoding may be null.
/// </summary>
internal class EncodingFoundException: Exception
{
    // The encoding handed to the constructor (possibly null).
    private Encoding _detected;

    internal EncodingFoundException(Encoding encoding)
    {
        _detected = encoding;
    }

    /// <summary>
    /// Gets the encoding supplied when the exception was created.
    /// </summary>
    internal Encoding Encoding
    {
        get { return _detected; }
    }
}