Bring back old behaviour in HtmlDocument. Required for compatibility reasons.
[beagle.git] / Filters / HtmlAgilityPack / HtmlDocument.cs
blob556f31b3eb3e6f307398b47d698894589238de8f
1 // HtmlAgilityPack V1.0 - Simon Mourier <simonm@microsoft.com>
3 /*
4 Copyright (C) 2003 Simon Mourier <simonm@microsoft.com>
5 All rights reserved.
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions
9 are met:
10 1. Redistributions of source code must retain the above copyright
11 notice, this list of conditions and the following disclaimer.
12 2. Redistributions in binary form must reproduce the above copyright
13 notice, this list of conditions and the following disclaimer in the
14 documentation and/or other materials provided with the distribution.
15 3. The name of the author may not be used to endorse or promote products
16 derived from this software without specific prior written permission.
18 THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
19 IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
20 OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
21 IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
22 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
23 NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
27 THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 using System;
31 using System.IO;
32 using System.Text;
33 using System.Diagnostics;
34 using System.Collections;
35 using System.Text.RegularExpressions;
36 using System.Xml;
37 using System.Xml.XPath;
40 // Legend: SLIM=Comment added describing changes to original HtmlAgilityPack
41 // to reduce memory consumption
42 // Once the parser is free of bugs, the comments will be taken out
43 namespace HtmlAgilityPack
/// <summary>
/// Identifies the kind of problem reported by a parse error.
/// </summary>
public enum HtmlParseErrorCode
{
    /// <summary>
    /// A start tag had no matching end tag.
    /// </summary>
    TagNotClosed,

    /// <summary>
    /// An end tag had no matching start tag.
    /// </summary>
    TagNotOpened,

    /// <summary>
    /// The stream encoding and the encoding declared in the document (META) disagree.
    /// </summary>
    CharsetMismatch,

    /// <summary>
    /// An end tag was present although none was required.
    /// </summary>
    EndTagNotRequired,

    /// <summary>
    /// An end tag appeared at a position where it is not valid.
    /// </summary>
    EndTagInvalidHere
}
/// <summary>
/// Describes a single error recorded while a document was being parsed.
/// Instances are immutable snapshots filled in by the constructor.
/// </summary>
public class HtmlParseError
{
    private HtmlParseErrorCode _code;
    private int _line;
    private int _linePosition;
    private int _streamPosition;
    private string _sourceText;
    private string _reason;

    internal HtmlParseError(
        HtmlParseErrorCode code,
        int line,
        int linePosition,
        int streamPosition,
        string sourceText,
        string reason)
    {
        _code = code;
        _reason = reason;
        _sourceText = sourceText;
        _line = line;
        _linePosition = linePosition;
        _streamPosition = streamPosition;
    }

    /// <summary>
    /// Gets the type of error.
    /// </summary>
    public HtmlParseErrorCode Code
    {
        get { return _code; }
    }

    /// <summary>
    /// Gets the line number of this error in the document.
    /// </summary>
    public int Line
    {
        get { return _line; }
    }

    /// <summary>
    /// Gets the column number of this error in the document.
    /// </summary>
    public int LinePosition
    {
        get { return _linePosition; }
    }

    /// <summary>
    /// Gets the absolute stream position of this error, relative to the start of the document.
    /// </summary>
    public int StreamPosition
    {
        get { return _streamPosition; }
    }

    /// <summary>
    /// Gets the source text associated with the error.
    /// </summary>
    public string SourceText
    {
        get { return _sourceText; }
    }

    /// <summary>
    /// Gets a description for the error.
    /// </summary>
    public string Reason
    {
        get { return _reason; }
    }
}
// Array-like, index-addressed view over the characters of a document.
// Implementations decide how the characters are actually stored or fetched.
abstract class StreamAsArray {
    // True when 'index' denotes the end of the text.
    public abstract bool Eof (int index);

    // The character at position 'index'.
    public abstract char this [int index] { get; }

    // The 'length' characters starting at 'startindex'.
    public abstract string Substring (int startindex, int length);

    // Overall length reported by the implementation.
    public abstract int FullLength { get; }
}
// SLIM: wraps a StreamReader to emulate ReadToEnd () behaviour without
// keeping the whole text in memory. Two blocks of _block_size characters are
// kept: the block currently being parsed and the one just before it. Requests
// outside that window (or longer than one block) fall back to re-reading the
// underlying stream from the beginning (see OutOfBandRead).
class ImplStreamAsArray : StreamAsArray {
    private StreamReader _reader;
    private int _length;            // total number of chars; only set once _eof is true
    private int _position;          // index of the first char held in _buf_current
    private bool _eof;              // true once the reader ran out of characters
    private char[] _buf_previous;   // could have used only one array
    private char[] _buf_current;    // but, this is cleaner
    private int _block_size;

    public ImplStreamAsArray (StreamReader r)
    {
        _reader = r;
        _length = 0;
        _position = 0;
        _eof = false;

        _block_size = 1024;
        _buf_previous = new char [_block_size];
        _buf_current = new char [_block_size];

        Read (true);
    }

    // Reads up to 'count' chars into buffer[0..count) and returns how many
    // were stored. StreamReader.Read may legally return fewer chars than
    // requested before the end of the stream, so keep reading until the
    // request is satisfied or the reader reports that nothing is left.
    private int FillBuffer (char[] buffer, int count)
    {
        int total = 0;
        while (total < count) {
            int n = _reader.Read (buffer, total, count - total);
            if (n <= 0)
                break;
            total += n;
        }
        return total;
    }

    // Loads the next block. Unless this is the initial read, the current
    // block becomes the previous one and the window advances by one block.
    private void Read (bool initial)
    {
        if ( !initial) {
            Array.Copy (_buf_current, _buf_previous, _block_size);
            _position += _block_size;
        }
        HtmlDocument.Debug ("Debug: Read in buffer at:" + _position);

        int num_read = FillBuffer (_buf_current, _block_size);
        // Only a genuinely short block means the end of the stream.
        if (num_read < _block_size) {
            _eof = true;
            _length = _position + num_read;
        }
        HtmlDocument.Debug ("[" + new string (_buf_current, 0, num_read) + "]");
    }

    // True when 'index' is exactly the end of the text. May read ahead one
    // block when 'index' falls in the block right after the current one.
    public override bool Eof (int index) {
        if (_eof)
            return (index == _length);
        else {
            if (index >= _position + _block_size &&
                index < _position + _block_size + _block_size)
                Read (false);
            if (_eof)
                return (index == _length);
            else
                return false;
        }
    }

    // Character at 'index'. Served from the current or previous block, or
    // from the next block after reading ahead; anything else throws.
    public override char this[int index] {
        get {
            if (index >= _position &&
                index < _position + _block_size)
                return _buf_current [index % _block_size];
            if (index >= _position - _block_size &&
                index < _position)
                return _buf_previous [ index % _block_size];
            if (index >= _position + _block_size &&
                index < _position + _block_size + _block_size) {
                Read (false);
                return _buf_current [index % _block_size];
            }
            // Keep the trace, but route it through the debug switch instead
            // of writing unconditionally to the process' standard output.
            HtmlDocument.Debug ("EXCEPTION!!!");
            throw new Exception (String.Format ("{0} is out of current bounds:[{1}-{2}] and further than read-ahead",
                                index,
                                _position - _block_size,
                                _position + _block_size - 1));
        }
    }

    // evil function ... you get what you pay for!
    // Re-reads the stream from the start to fetch a range that the two
    // buffered blocks cannot serve, then puts the reader back where the
    // buffered window ends.
    private string OutOfBandRead (int startindex, int length)
    {
        HtmlDocument.Debug ("Out of band read! From " + startindex + " to " + (startindex + length - 1));
        ResetPosition (startindex);
        // ahh.. now we are at the correct place
        // create a buffer of required length
        // who cares if the buffer size does not align well
        // with page boundary
        char[] temp_buf = new char [length];
        int num_read = FillBuffer (temp_buf, length);
        if (num_read < length) {
            // Shouldnt occur!!!
            _eof = true;
            _length = startindex + num_read;
        }
        // discard data and reset stream position
        int t = (_eof ? _length :_position + _block_size);
        ResetPosition (t);
        // Return only what was actually read; the tail of temp_buf would
        // otherwise show up as '\0' characters in the result.
        return new String (temp_buf, 0, num_read);
    }

    // streamreader does not allow seeking
    // seek on its basestream does not reflect the position
    // of the reader - it is governed by the buffer size
    // of the underlying stream
    // :( so, rewind and read forward from the beginning ...
    private void ResetPosition (int pos)
    {
        _reader.DiscardBufferedData ();
        _reader.BaseStream.Position = 0;
        // skip forward in chunks of at most block_size characters
        char[] tmp = new char [_block_size];
        int remaining = pos;
        while (remaining > 0) {
            int want = Math.Min (remaining, _block_size);
            int got = FillBuffer (tmp, want);
            remaining -= got;
            if (got < want)
                break;  // stream ended before 'pos'
        }
    }

    // Returns 'length' characters starting at 'startindex'. Ranges within the
    // buffered window are assembled from the two blocks; others go out of band.
    public override string Substring (int startindex, int length)
    {
        if (length == 0) {
            HtmlDocument.Debug ("substring:" + startindex + " " + length + " " + _position + ":");
            return String.Empty;
        }
        if (length > _block_size || startindex < _position - _block_size) {
            return OutOfBandRead (startindex, length);
        }
        // The range runs past the current block: pull in the next one.
        if (startindex + length - 1 >= _position + _block_size) {
            Read (false);
        }
        string substr;
        if (startindex < _position) {
            // Starts in the previous block, possibly spilling into the current one.
            int len_1 = _position - startindex;
            if (length < len_1)
                substr = new String (_buf_previous, _block_size - len_1, length);
            else {
                substr = new String (_buf_previous, _block_size - len_1, len_1);
                substr += new String (_buf_current, 0, length - len_1);
            }
        } else {
            substr = new String (_buf_current, startindex - _position, length);
        }
        return substr;
    }

    // FIXME: Is this costly ?
    // Note: this is the length of the underlying stream, not a character count.
    public override int FullLength {
        get {
            return (int)_reader.BaseStream.Length;
        }
    }
}
// A dummy StreamAsArray wrapper around a string that is already fully in memory.
class DummyStreamAsArray : StreamAsArray {
    private string _base_string;
    private int _length;

    public DummyStreamAsArray(string str)
    {
        _base_string = str;
        _length = str.Length;
    }

    // True for any index at or beyond the end of the string.
    public override bool Eof(int index)
    {
        return (index >= _length);
    }

    // Must be 'override', not 'new': the base indexer is abstract, and callers
    // access this object through a StreamAsArray reference, so a hiding member
    // would leave the abstract indexer unimplemented.
    public override char this[int index] {
        get { return _base_string [index]; }
    }

    public override string Substring (int startindex, int length)
    {
        return _base_string.Substring (startindex, length);
    }

    // Number of characters in the wrapped string.
    public override int FullLength {
        get { return _length; }
    }
}
364 /// <summary>
365 /// Represents a complete HTML document.
366 /// </summary>
367 public class HtmlDocument: IXPathNavigable
369 // SLIM: Make the parser event driven
370 // callback for FilterHtml
371 // return value is a way for the callback to signal to continue or stop parsing
372 public delegate bool NodeHandler (HtmlNode node);
373 public NodeHandler ReportNode;
374 // misnomer ... should be called event_driven_mode
375 private bool _streammode = false;
376 private bool _stop_parsing = false;
378 internal static readonly string HtmlExceptionRefNotChild = "Reference node must be a child of this node";
379 internal static readonly string HtmlExceptionUseIdAttributeFalse = "You need to set UseIdAttribute property to true to enable this feature";
381 internal Hashtable _openednodes;
382 internal Hashtable _lastnodes = new Hashtable();
383 internal Hashtable _nodesid;
384 private HtmlNode _documentnode;
385 //SLIM: internal string _text;
386 internal StreamAsArray _text;
387 private HtmlNode _currentnode;
388 private HtmlNode _lastparentnode;
389 private HtmlAttribute _currentattribute;
390 private int _index;
391 private int _line;
392 private int _lineposition, _maxlineposition;
393 private int _c;
394 private bool _fullcomment;
395 private System.Text.Encoding _streamencoding;
396 private System.Text.Encoding _declaredencoding;
397 private ArrayList _parseerrors = new ArrayList();
398 private ParseState _state, _oldstate;
399 private Crc32 _crc32 = null;
400 private bool _onlyDetectEncoding = false;
401 private int _pcdata_quote_char = '\0';
// Switch for the parser's diagnostic trace output.
private static bool _debug = false;

// Writes a diagnostic line to the console, but only while _debug is set.
internal static void Debug (string s)
{
    if (!_debug)
        return;

    Console.WriteLine (s);
}
410 // public props
412 /// <summary>
413 /// Defines if a checksum must be computed for the document while parsing. Default is false.
414 /// </summary>
415 public bool OptionComputeChecksum = false;
417 /// <summary>
418 /// Defines if declared encoding must be read from the document.
419 /// Declared encoding is determined using the meta http-equiv="content-type" content="text/html;charset=XXXXX" html node.
420 /// Default is true.
421 /// </summary>
422 public bool OptionReadEncoding = true;
425 /// <summary>
426 /// Defines if non closed nodes will be checked at the end of parsing. Default is true.
427 /// </summary>
428 public bool OptionCheckSyntax = true;
430 /// <summary>
431 /// Defines if the 'id' attribute must be specifically used. Default is true.
432 /// </summary>
433 public bool OptionUseIdAttribute = true;
435 /// <summary>
436 /// Defines if empty nodes must be written as closed during output. Default is false.
437 /// </summary>
438 public bool OptionWriteEmptyNodes = false;
440 /// <summary>
441 /// Defines if output must conform to XML, instead of HTML.
442 /// </summary>
443 public bool OptionOutputAsXml = false;
445 /// <summary>
446 /// Defines if name must be output in uppercase. Default is false.
447 /// </summary>
448 public bool OptionOutputUpperCase = false;
450 /// <summary>
451 /// Defines if attribute value output must be optimized (not bound with double quotes if it is possible). Default is false.
452 /// </summary>
453 public bool OptionOutputOptimizeAttributeValues = false;
455 /// <summary>
456 /// Adds Debugging attributes to node. Default is false.
457 /// </summary>
458 public bool OptionAddDebuggingAttributes = false;
460 /// <summary>
461 /// Defines if source text must be extracted while parsing errors.
462 /// If the document has a lot of errors, or cascading errors, parsing performance can be dramatically affected if set to true.
463 /// Default is false.
464 /// </summary>
465 public bool OptionExtractErrorSourceText = false; // turning this on can dramatically slow performance if a lot of errors are detected
467 /// <summary>
468 /// Defines if closing for non closed nodes must be done at the end or directly in the document.
469 /// Setting this to true can actually change how browsers render the page. Default is false.
470 /// </summary>
471 public bool OptionAutoCloseOnEnd = false; // close errors at the end
473 /// <summary>
474 /// Defines if LI, TR, TH, TD tags must be partially fixed when nesting errors are detected. Default is false.
475 /// </summary>
476 public bool OptionFixNestedTags = false; // fix li, tr, th, td tags
478 /// <summary>
479 /// Defines the maximum length of source text or parse errors. Default is 100.
480 /// </summary>
481 public int OptionExtractErrorSourceTextMaxLength = 100;
483 /// <summary>
484 /// Defines the default stream encoding to use. Default is System.Text.Encoding.UTF8.
485 /// </summary>
486 // From http://www.w3.org/TR/REC-html40/charset.html
487 // The HTTP protocol ([RFC2616], section 3.7.1) mentions ISO-8859-1 as a default character encoding when the "charset" parameter is absent from the "Content-Type" header field.
488 // However, we are still using UTF-8 for some unknown reason.
489 //FIXME: Fix the default encoding!
490 public System.Text.Encoding OptionDefaultStreamEncoding = Encoding.UTF8;
/// <summary>
/// Gets the list of parse errors recorded for the document.
/// </summary>
public ArrayList ParseErrors
{
    get { return _parseerrors; }
}
/// <summary>
/// Gets the encoding reported by the stream reader the document was loaded from,
/// or null when the document was not loaded from a StreamReader.
/// </summary>
public System.Text.Encoding StreamEncoding
{
    get { return _streamencoding; }
}
/// <summary>
/// Gets the encoding declared inside the document itself.
/// Declared encoding is determined using the meta http-equiv="content-type" content="text/html;charset=XXXXX" html node.
/// </summary>
public System.Text.Encoding DeclaredEncoding
{
    get { return _declaredencoding; }
}
/// <summary>
/// Initializes a new, empty HTML document that owns a fresh document node.
/// </summary>
public HtmlDocument()
{
    HtmlNode root = CreateNode(HtmlNodeType.Document, 0);
    _documentnode = root;
}
// Returns the first direct child of the document node named "?xml",
// or null when the document node has no children or no such child.
internal HtmlNode GetXmlDeclaration()
{
    HtmlNode declaration = null;
    if (_documentnode.HasChildNodes)
    {
        foreach (HtmlNode child in _documentnode._childnodes)
        {
            // it's ok, names are case sensitive
            if (child.Name == "?xml")
            {
                declaration = child;
                break;
            }
        }
    }
    return declaration;
}
/// <summary>
/// Applies HTML encoding to a specified string: escapes ampersands, angle brackets and double quotes.
/// </summary>
/// <param name="html">The input string to encode. May not be null.</param>
/// <returns>The encoded string.</returns>
/// <exception cref="ArgumentNullException"><paramref name="html"/> is null.</exception>
public static string HtmlEncode(string html)
{
    if (html == null)
    {
        throw new ArgumentNullException("html");
    }

    // Replace & by &amp; but only once: an ampersand that already starts one of
    // the four entities produced below is left untouched.
    string encoded = Regex.Replace(html, "&(?!(amp;)|(lt;)|(gt;)|(quot;))", "&amp;", RegexOptions.IgnoreCase);
    encoded = encoded.Replace("<", "&lt;");
    encoded = encoded.Replace(">", "&gt;");
    encoded = encoded.Replace("\"", "&quot;");
    return encoded;
}
/// <summary>
/// Detects the encoding of an HTML stream.
/// </summary>
/// <param name="stream">The input stream. May not be null.</param>
/// <returns>The detected encoding, or null when none was found.</returns>
/// <exception cref="ArgumentNullException"><paramref name="stream"/> is null.</exception>
public Encoding DetectEncoding(Stream stream)
{
    if (stream == null)
    {
        throw new ArgumentNullException("stream");
    }

    TextReader reader = new StreamReader(stream);
    return DetectEncoding(reader);
}
/// <summary>
/// Detects the encoding of an HTML file.
/// </summary>
/// <param name="path">Path for the file containing the HTML document to detect. May not be null.</param>
/// <returns>The detected encoding, or null when none was found.</returns>
/// <exception cref="ArgumentNullException"><paramref name="path"/> is null.</exception>
public Encoding DetectEncoding(string path)
{
    if (path == null)
    {
        throw new ArgumentNullException("path");
    }

    // 'using' guarantees the file handle is released even when detection throws.
    using (StreamReader sr = new StreamReader(path, OptionDefaultStreamEncoding))
    {
        return DetectEncoding(sr);
    }
}
/// <summary>
/// Detects the encoding of an HTML text.
/// </summary>
/// <param name="html">The input html text. May not be null.</param>
/// <returns>The detected encoding, or null when none was found.</returns>
/// <exception cref="ArgumentNullException"><paramref name="html"/> is null.</exception>
public Encoding DetectEncodingHtml(string html)
{
    if (html == null)
    {
        throw new ArgumentNullException("html");
    }

    StringReader textReader = new StringReader(html);
    Encoding detected = DetectEncoding(textReader);
    textReader.Close();
    return detected;
}
/// <summary>
/// Detects the encoding of an HTML text provided on a TextReader.
/// </summary>
/// <param name="reader">The TextReader used to feed the HTML. May not be null.</param>
/// <returns>
/// The encoding carried by the EncodingFoundException raised during parsing,
/// or null when parsing completes without raising one.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="reader"/> is null.</exception>
public Encoding DetectEncoding(TextReader reader)
{
    if (reader == null)
    {
        throw new ArgumentNullException("reader");
    }

    _onlyDetectEncoding = true;

    // The bookkeeping tables exist only when the matching option is enabled.
    _openednodes = OptionCheckSyntax ? new Hashtable() : null;
    _nodesid = OptionUseIdAttribute ? new Hashtable() : null;

    StreamReader streamReader = reader as StreamReader;
    if (streamReader == null)
    {
        _streamencoding = null;
        // Expensive, but cannot avoid since TextReader doesnt have any length of the underlying data
        _text = new DummyStreamAsArray (reader.ReadToEnd());
    }
    else
    {
        _streamencoding = streamReader.CurrentEncoding;
        _text = new ImplStreamAsArray (streamReader);
    }
    _declaredencoding = null;

    // SLIM: _text = reader.ReadToEnd();
    _documentnode = CreateNode(HtmlNodeType.Document, 0);

    // this is a hack, but it allows us not to muck with the original parsing code
    try
    {
        Parse();
    }
    catch (EncodingFoundException found)
    {
        _lastnodes.Clear();
        return found.Encoding;
    }
    return null;
}
/// <summary>
/// Loads an HTML document from a stream, decoding it with <c>OptionDefaultStreamEncoding</c>.
/// </summary>
/// <param name="stream">The input stream.</param>
public void Load(Stream stream)
{
    TextReader reader = new StreamReader(stream, OptionDefaultStreamEncoding);
    Load(reader);
}
/// <summary>
/// Loads an HTML document from a stream.
/// </summary>
/// <param name="stream">The input stream.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
public void Load(Stream stream, bool detectEncodingFromByteOrderMarks)
{
    TextReader reader = new StreamReader(stream, detectEncodingFromByteOrderMarks);
    Load(reader);
}
/// <summary>
/// Loads an HTML document from a stream using the given character encoding.
/// </summary>
/// <param name="stream">The input stream.</param>
/// <param name="encoding">The character encoding to use.</param>
public void Load(Stream stream, Encoding encoding)
{
    TextReader reader = new StreamReader(stream, encoding);
    Load(reader);
}
/// <summary>
/// Loads an HTML document from a stream using the given character encoding.
/// </summary>
/// <param name="stream">The input stream.</param>
/// <param name="encoding">The character encoding to use.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
public void Load(Stream stream, Encoding encoding, bool detectEncodingFromByteOrderMarks)
{
    TextReader reader = new StreamReader(stream, encoding, detectEncodingFromByteOrderMarks);
    Load(reader);
}
/// <summary>
/// Loads an HTML document from a stream using the given character encoding and reader buffer size.
/// </summary>
/// <param name="stream">The input stream.</param>
/// <param name="encoding">The character encoding to use.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
/// <param name="buffersize">The minimum buffer size.</param>
public void Load(Stream stream, Encoding encoding, bool detectEncodingFromByteOrderMarks, int buffersize)
{
    TextReader reader = new StreamReader(stream, encoding, detectEncodingFromByteOrderMarks, buffersize);
    Load(reader);
}
/// <summary>
/// Loads an HTML document from a file, decoding it with <c>OptionDefaultStreamEncoding</c>.
/// </summary>
/// <param name="path">The complete file path to be read. May not be null.</param>
/// <exception cref="ArgumentNullException"><paramref name="path"/> is null.</exception>
public void Load(string path)
{
    if (path == null)
    {
        throw new ArgumentNullException("path");
    }

    // 'using' closes the file even when loading throws.
    using (StreamReader sr = new StreamReader(path, OptionDefaultStreamEncoding))
    {
        Load(sr);
    }
}
/// <summary>
/// Loads an HTML document from a file.
/// </summary>
/// <param name="path">The complete file path to be read. May not be null.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
/// <exception cref="ArgumentNullException"><paramref name="path"/> is null.</exception>
public void Load(string path, bool detectEncodingFromByteOrderMarks)
{
    if (path == null)
    {
        throw new ArgumentNullException("path");
    }

    // 'using' closes the file even when loading throws.
    using (StreamReader sr = new StreamReader(path, detectEncodingFromByteOrderMarks))
    {
        Load(sr);
    }
}
/// <summary>
/// Loads an HTML document from a file using the given character encoding.
/// </summary>
/// <param name="path">The complete file path to be read. May not be null.</param>
/// <param name="encoding">The character encoding to use. May not be null.</param>
/// <exception cref="ArgumentNullException"><paramref name="path"/> or <paramref name="encoding"/> is null.</exception>
public void Load(string path, Encoding encoding)
{
    if (path == null)
    {
        throw new ArgumentNullException("path");
    }
    if (encoding == null)
    {
        throw new ArgumentNullException("encoding");
    }

    // 'using' closes the file even when loading throws.
    using (StreamReader sr = new StreamReader(path, encoding))
    {
        Load(sr);
    }
}
/// <summary>
/// Loads an HTML document from a file using the given character encoding.
/// </summary>
/// <param name="path">The complete file path to be read. May not be null.</param>
/// <param name="encoding">The character encoding to use. May not be null.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
/// <exception cref="ArgumentNullException"><paramref name="path"/> or <paramref name="encoding"/> is null.</exception>
public void Load(string path, Encoding encoding, bool detectEncodingFromByteOrderMarks)
{
    if (path == null)
    {
        throw new ArgumentNullException("path");
    }
    if (encoding == null)
    {
        throw new ArgumentNullException("encoding");
    }

    // 'using' closes the file even when loading throws.
    using (StreamReader sr = new StreamReader(path, encoding, detectEncodingFromByteOrderMarks))
    {
        Load(sr);
    }
}
/// <summary>
/// Loads an HTML document from a file using the given character encoding and reader buffer size.
/// </summary>
/// <param name="path">The complete file path to be read. May not be null.</param>
/// <param name="encoding">The character encoding to use. May not be null.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
/// <param name="buffersize">The minimum buffer size.</param>
/// <exception cref="ArgumentNullException"><paramref name="path"/> or <paramref name="encoding"/> is null.</exception>
public void Load(string path, Encoding encoding, bool detectEncodingFromByteOrderMarks, int buffersize)
{
    if (path == null)
    {
        throw new ArgumentNullException("path");
    }
    if (encoding == null)
    {
        throw new ArgumentNullException("encoding");
    }

    // 'using' closes the file even when loading throws.
    using (StreamReader sr = new StreamReader(path, encoding, detectEncodingFromByteOrderMarks, buffersize))
    {
        Load(sr);
    }
}
/// <summary>
/// Loads the HTML document from the specified string.
/// </summary>
/// <param name="html">String containing the HTML document to load. May not be null.</param>
/// <exception cref="ArgumentNullException"><paramref name="html"/> is null.</exception>
public void LoadHtml(string html)
{
    if (html == null)
    {
        throw new ArgumentNullException("html");
    }

    StringReader textReader = new StringReader(html);
    Load(textReader);
    textReader.Close();
}
/// <summary>
/// Detects the encoding of an HTML document from a file first, and then loads the file.
/// Equivalent to calling the two-argument overload with encoding detection enabled.
/// </summary>
/// <param name="path">The complete file path to be read.</param>
public void DetectEncodingAndLoad(string path)
{
    const bool detectEncoding = true;
    DetectEncodingAndLoad(path, detectEncoding);
}
/// <summary>
/// Optionally detects the encoding of an HTML file, then loads the file.
/// When no encoding is detected (or detection is disabled) the file is loaded
/// through <c>Load(path)</c>; otherwise through <c>Load(path, encoding)</c>.
/// </summary>
/// <param name="path">The complete file path to be read. May not be null.</param>
/// <param name="detectEncoding">true to detect encoding, false otherwise.</param>
/// <exception cref="ArgumentNullException"><paramref name="path"/> is null.</exception>
public void DetectEncodingAndLoad(string path, bool detectEncoding)
{
    if (path == null)
    {
        throw new ArgumentNullException("path");
    }

    System.Text.Encoding detected = detectEncoding ? DetectEncoding(path) : null;
    if (detected != null)
    {
        Load(path, detected);
        return;
    }
    Load(path);
}
/// <summary>
/// Loads the HTML document from the specified TextReader.
/// When syntax checking is enabled, a TagNotClosed error is recorded afterwards
/// for every node still registered as opened whose start tag flag is set.
/// </summary>
/// <param name="reader">The TextReader used to feed the HTML data into the document. May not be null.</param>
/// <exception cref="ArgumentNullException"><paramref name="reader"/> is null.</exception>
public void Load(TextReader reader)
{
    // all Load methods pass down to this one
    if (reader == null)
    {
        throw new ArgumentNullException("reader");
    }

    _onlyDetectEncoding = false;

    // The bookkeeping tables exist only when the matching option is enabled.
    _openednodes = OptionCheckSyntax ? new Hashtable() : null;
    _nodesid = OptionUseIdAttribute ? new Hashtable() : null;

    StreamReader streamReader = reader as StreamReader;
    if (streamReader == null)
    {
        _streamencoding = null;
        // Expensive, but cannot avoid since TextReader doesnt have any length of the underlying data
        _text = new DummyStreamAsArray (reader.ReadToEnd());
    }
    else
    {
        try
        {
            // trigger bom read if needed
            streamReader.Peek();
        }
        catch
        {
            // void on purpose
        }
        _streamencoding = streamReader.CurrentEncoding;
        _text = new ImplStreamAsArray (streamReader);
    }
    _declaredencoding = null;

    // SLIM: _text = reader.ReadToEnd();
    _documentnode = CreateNode(HtmlNodeType.Document, 0);
    Parse();

    if (!OptionCheckSyntax)
    {
        return;
    }

    foreach (HtmlNode unclosed in _openednodes.Values)
    {
        if (!unclosed._starttag) // already reported
        {
            continue;
        }

        // The source excerpt is optional and capped in length.
        string excerpt = string.Empty;
        if (OptionExtractErrorSourceText)
        {
            excerpt = unclosed.OuterHtml;
            if (excerpt.Length > OptionExtractErrorSourceTextMaxLength)
            {
                excerpt = excerpt.Substring(0, OptionExtractErrorSourceTextMaxLength);
            }
        }

        AddError(
            HtmlParseErrorCode.TagNotClosed,
            unclosed._line, unclosed._lineposition,
            unclosed._streamposition, excerpt,
            "End tag </" + unclosed.Name + "> was not found");
    }

    // we don't need this anymore
    _openednodes.Clear();
}
// Picks the encoding used for output: the declared encoding when there is one,
// otherwise the stream encoding, otherwise OptionDefaultStreamEncoding.
internal System.Text.Encoding GetOutEncoding()
{
    if (_declaredencoding != null)
    {
        return _declaredencoding;
    }
    if (_streamencoding != null)
    {
        return _streamencoding;
    }
    return OptionDefaultStreamEncoding;
}
/// <summary>
/// Gets the document's output encoding, as computed by <c>GetOutEncoding</c>.
/// </summary>
public System.Text.Encoding Encoding
{
    get { return GetOutEncoding(); }
}
/// <summary>
/// Saves the HTML document to the specified stream, using the document's output encoding.
/// The stream is left open; ownership stays with the caller.
/// </summary>
/// <param name="outStream">The stream to which you want to save. May not be null.</param>
/// <exception cref="ArgumentNullException"><paramref name="outStream"/> is null.</exception>
public void Save(Stream outStream)
{
    if (outStream == null)
    {
        throw new ArgumentNullException("outStream");
    }

    StreamWriter sw = new StreamWriter(outStream, GetOutEncoding());
    Save(sw);
    // StreamWriter buffers its output; without a flush the tail of the
    // document (or all of it) may never reach the caller's stream.
    // The writer is deliberately not closed, as that would close outStream.
    sw.Flush();
}
/// <summary>
/// Saves the HTML document to the specified stream using the given encoding.
/// The stream is left open; ownership stays with the caller.
/// </summary>
/// <param name="outStream">The stream to which you want to save. May not be null.</param>
/// <param name="encoding">The character encoding to use. May not be null.</param>
/// <exception cref="ArgumentNullException"><paramref name="outStream"/> or <paramref name="encoding"/> is null.</exception>
public void Save(Stream outStream, System.Text.Encoding encoding)
{
    if (outStream == null)
    {
        throw new ArgumentNullException("outStream");
    }
    if (encoding == null)
    {
        throw new ArgumentNullException("encoding");
    }

    StreamWriter sw = new StreamWriter(outStream, encoding);
    Save(sw);
    // StreamWriter buffers its output; without a flush the tail of the
    // document (or all of it) may never reach the caller's stream.
    // The writer is deliberately not closed, as that would close outStream.
    sw.Flush();
}
/// <summary>
/// Saves the document to the specified file, overwriting it, using the document's output encoding.
/// </summary>
/// <param name="filename">The location of the file where you want to save the document. May not be null.</param>
/// <exception cref="ArgumentNullException"><paramref name="filename"/> is null.</exception>
public void Save(string filename)
{
    if (filename == null)
    {
        throw new ArgumentNullException("filename");
    }

    // 'using' flushes and closes the file even when writing throws.
    using (StreamWriter sw = new StreamWriter(filename, false, GetOutEncoding()))
    {
        Save(sw);
    }
}
/// <summary>
/// Saves the document to the specified file, overwriting it, using the given encoding.
/// </summary>
/// <param name="filename">The location of the file where you want to save the document. May not be null.</param>
/// <param name="encoding">The character encoding to use. May not be null.</param>
/// <exception cref="ArgumentNullException"><paramref name="filename"/> or <paramref name="encoding"/> is null.</exception>
public void Save(string filename, System.Text.Encoding encoding)
{
    if (filename == null)
    {
        throw new ArgumentNullException("filename");
    }
    if (encoding == null)
    {
        throw new ArgumentNullException("encoding");
    }

    // 'using' flushes and closes the file even when writing throws.
    using (StreamWriter sw = new StreamWriter(filename, false, encoding))
    {
        Save(sw);
    }
}
/// <summary>
/// Saves the HTML document to the specified StreamWriter by forwarding to the TextWriter overload.
/// </summary>
/// <param name="writer">The StreamWriter to which you want to save.</param>
public void Save(StreamWriter writer)
{
    TextWriter target = writer;
    Save(target);
}
/// <summary>
/// Saves the HTML document to the specified TextWriter by writing the document node to it.
/// </summary>
/// <param name="writer">The TextWriter to which you want to save. May not be null.</param>
/// <exception cref="ArgumentNullException"><paramref name="writer"/> is null.</exception>
public void Save(TextWriter writer)
{
    if (writer != null)
    {
        DocumentNode.WriteTo(writer);
        return;
    }
    throw new ArgumentNullException("writer");
}
/// <summary>
/// Saves the HTML document to the specified XmlWriter and flushes the writer.
/// </summary>
/// <param name="writer">The XmlWriter to which you want to save. May not be null.</param>
/// <exception cref="ArgumentNullException"><paramref name="writer"/> is null.</exception>
public void Save(XmlWriter writer)
{
    // Fail fast with a named argument, matching the TextWriter overload.
    if (writer == null)
    {
        throw new ArgumentNullException("writer");
    }

    DocumentNode.WriteTo(writer);
    writer.Flush();
}
/// <summary>
/// Creates a new XPathNavigator object for navigating this HTML document.
/// </summary>
/// <returns>An XPathNavigator object created on this document's document node.</returns>
public XPathNavigator CreateNavigator()
{
    XPathNavigator navigator = new HtmlNodeNavigator(this, _documentnode);
    return navigator;
}
// Registers 'node' under the lower-cased 'id', or removes the entry for 'id'
// when 'node' is null. Does nothing when id tracking is disabled, when the id
// table has not been created, or when 'id' is null.
internal void SetIdForNode(HtmlNode node, string id)
{
    if (!OptionUseIdAttribute || (_nodesid == null) || (id == null))
    {
        return;
    }

    string key = id.ToLower();
    if (node != null)
    {
        _nodesid[key] = node;
    }
    else
    {
        _nodesid.Remove(key);
    }
}
/// <summary>
/// Gets the HTML node with the specified 'id' attribute value. The lookup uses the lower-cased id.
/// </summary>
/// <param name="id">The attribute id to match. May not be null.</param>
/// <returns>The HTML node with the matching id or null if not found.</returns>
/// <exception cref="ArgumentNullException"><paramref name="id"/> is null.</exception>
/// <exception cref="Exception">The id table does not exist (see <c>HtmlExceptionUseIdAttributeFalse</c>).</exception>
public HtmlNode GetElementbyId(string id)
{
    if (id == null)
    {
        throw new ArgumentNullException("id");
    }
    if (_nodesid == null)
    {
        throw new Exception(HtmlExceptionUseIdAttributeFalse);
    }

    string key = id.ToLower();
    return _nodesid[key] as HtmlNode;
}
/// <summary>
/// Creates an HTML element node with the specified name.
/// </summary>
/// <param name="name">The qualified name of the element. May not be null.</param>
/// <returns>The new HTML node.</returns>
/// <exception cref="ArgumentNullException"><paramref name="name"/> is null.</exception>
public HtmlNode CreateElement(string name)
{
    if (name == null)
    {
        throw new ArgumentNullException("name");
    }

    HtmlNode element = CreateNode(HtmlNodeType.Element);
    element._name = name;
    return element;
}
/// <summary>
/// Creates an HTML comment node.
/// </summary>
/// <returns>The new HTML comment node.</returns>
public HtmlCommentNode CreateComment()
{
    HtmlNode created = CreateNode(HtmlNodeType.Comment);
    return (HtmlCommentNode)created;
}
/// <summary>
/// Creates a new HTML comment node and assigns it the given comment text.
/// </summary>
/// <param name="comment">The comment text. May not be null.</param>
/// <returns>The newly created comment node.</returns>
/// <exception cref="ArgumentNullException"><paramref name="comment"/> is null.</exception>
public HtmlCommentNode CreateComment(string comment)
{
    if (comment == null)
    {
        throw new ArgumentNullException("comment");
    }

    HtmlCommentNode commentNode = CreateComment();
    commentNode.Comment = comment;
    return commentNode;
}
/// <summary>
/// Creates a new, empty HTML text node.
/// </summary>
/// <returns>The newly created text node.</returns>
public HtmlTextNode CreateTextNode()
{
    HtmlNode created = CreateNode(HtmlNodeType.Text);
    return (HtmlTextNode)created;
}
/// <summary>
/// Creates a new HTML text node and assigns it the given text.
/// </summary>
/// <param name="text">The text of the node. May not be null.</param>
/// <returns>The newly created text node.</returns>
/// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
public HtmlTextNode CreateTextNode(string text)
{
    if (text == null)
    {
        throw new ArgumentNullException("text");
    }

    HtmlTextNode textNode = CreateTextNode();
    textNode.Text = text;
    return textNode;
}
/// <summary>
/// Creates a node of the given type, passing -1 as its index.
/// </summary>
/// <param name="type">The kind of node to create.</param>
/// <returns>The newly created node.</returns>
internal HtmlNode CreateNode(HtmlNodeType type)
{
    const int noIndex = -1;
    return CreateNode(type, noIndex);
}
/// <summary>
/// Creates a node of the given type for this document. Comment and text types get
/// their dedicated classes; every other type gets a plain HtmlNode.
/// </summary>
/// <param name="type">The kind of node to create.</param>
/// <param name="index">The index value handed to the node constructor.</param>
/// <returns>The newly created node.</returns>
internal HtmlNode CreateNode(HtmlNodeType type, int index)
{
    if (type == HtmlNodeType.Comment)
    {
        return new HtmlCommentNode(this, index);
    }

    if (type == HtmlNodeType.Text)
    {
        return new HtmlTextNode(this, index);
    }

    return new HtmlNode(type, this, index);
}
/// <summary>
/// Creates a new, unnamed attribute owned by this document.
/// </summary>
/// <returns>The newly created attribute.</returns>
internal HtmlAttribute CreateAttribute()
{
    HtmlAttribute attribute = new HtmlAttribute(this);
    return attribute;
}
/// <summary>
/// Creates a new HTML attribute and assigns it the given name.
/// </summary>
/// <param name="name">The name of the attribute. May not be null.</param>
/// <returns>The newly created attribute.</returns>
/// <exception cref="ArgumentNullException"><paramref name="name"/> is null.</exception>
public HtmlAttribute CreateAttribute(string name)
{
    if (name == null)
    {
        throw new ArgumentNullException("name");
    }

    HtmlAttribute attribute = CreateAttribute();
    attribute.Name = name;
    return attribute;
}
/// <summary>
/// Creates a new HTML attribute and assigns it the given name and value.
/// </summary>
/// <param name="name">The name of the attribute. May not be null.</param>
/// <param name="value">The value of the attribute.</param>
/// <returns>The newly created attribute.</returns>
/// <exception cref="ArgumentNullException"><paramref name="name"/> is null.</exception>
public HtmlAttribute CreateAttribute(string name, string value)
{
    if (name == null)
    {
        throw new ArgumentNullException("name");
    }

    HtmlAttribute attribute = CreateAttribute(name);
    attribute.Value = value;
    return attribute;
}
/// <summary>
/// Gets the node that is the root of this document.
/// </summary>
public HtmlNode DocumentNode
{
    get { return _documentnode; }
}
/// <summary>
/// Gets the CRC32 checksum of the document. The checksum object only exists when
/// OptionComputeChecksum was true at parse time; otherwise this returns 0.
/// </summary>
public int CheckSum
{
    get
    {
        return (_crc32 != null) ? (int)_crc32.CheckSum : 0;
    }
}
/// <summary>
/// Gets or sets the stream-mode flag. While it is set, the parser does not append
/// parsed nodes to their parent, and hands them to the ReportNode callback when
/// one is registered.
/// </summary>
public bool StreamMode
{
    get { return _streammode; }
    set { _streammode = value; }
}
/// <summary>
/// Creates a parse error from the given details, appends it to the document's
/// parse error list and returns it.
/// </summary>
/// <param name="code">The kind of parse error.</param>
/// <param name="line">Line number of the error.</param>
/// <param name="linePosition">Position of the error within the line.</param>
/// <param name="streamPosition">Position of the error within the whole text.</param>
/// <param name="sourceText">The source text the error refers to.</param>
/// <param name="reason">Human-readable description of the problem.</param>
/// <returns>The recorded error.</returns>
private HtmlParseError AddError(
    HtmlParseErrorCode code,
    int line,
    int linePosition,
    int streamPosition,
    string sourceText,
    string reason)
{
    HtmlParseError recorded =
        new HtmlParseError(code, line, linePosition, streamPosition, sourceText, reason);
    _parseerrors.Add(recorded);
    return recorded;
}
/// <summary>
/// States of the character-level state machine driven by Parse().
/// </summary>
private enum ParseState
{
    /// <summary>Outside of any tag; characters belong to a text node.</summary>
    Text,

    /// <summary>A '&lt;' was just seen; the next character tells whether this is a closing tag.</summary>
    WhichTag,

    /// <summary>Reading a tag name.</summary>
    Tag,

    /// <summary>Inside a tag, between two attributes.</summary>
    BetweenAttributes,

    /// <summary>A '/' or '?' was seen inside a tag; a following '&gt;' ends the tag as closed.</summary>
    EmptyTag,

    /// <summary>Reading an attribute name.</summary>
    AttributeName,

    /// <summary>Whitespace followed an attribute name; waiting for '=' or the next attribute.</summary>
    AttributeBeforeEquals,

    /// <summary>An '=' was seen; waiting for the attribute value to start.</summary>
    AttributeAfterEquals,

    /// <summary>Reading an unquoted attribute value.</summary>
    AttributeValue,

    /// <summary>Inside a "&lt;!" construct, up to its terminating '&gt;'.</summary>
    Comment,

    /// <summary>Reading an attribute value enclosed in single or double quotes.</summary>
    QuotedAttributeValue,

    /// <summary>Inside a "&lt;% ... %&gt;" server-side code block.</summary>
    ServerSideCode,

    /// <summary>Inside a quoted string within PcData content.</summary>
    PcDataQuote,

    /// <summary>Inside the raw content of a CDATA-style element, looking for its end tag.</summary>
    PcData
}
/// <summary>
/// Advances the parser by one character: feeds the current character to the checksum
/// (when one is being computed), moves the index forward and updates the line and
/// line-position counters. The previous line position is remembered so that
/// DecrementPosition can step back across a line break.
/// </summary>
private void IncrementPosition()
{
    if (_crc32 != null)
    {
        // REVIEW: should we add some checksum code in DecrementPosition too?
        _crc32.AddToCRC32(_c);
    }

    _index++;
    _maxlineposition = _lineposition;

    // 10 is the line feed character: any other character just moves right on the same line
    if (_c != 10)
    {
        _lineposition++;
        return;
    }

    _lineposition = 1;
    _line++;
}
/// <summary>
/// Steps the parser back by one character. When the position is at the start of a
/// line, the line counter goes back by one and the line position is restored from
/// the value saved by IncrementPosition.
/// </summary>
private void DecrementPosition()
{
    _index--;

    if (_lineposition != 1)
    {
        _lineposition--;
        return;
    }

    _lineposition = _maxlineposition;
    _line--;
}
// Runs the character-level state machine over _text.
// It first resets all per-parse state (optional checksum, last-node table, error list,
// line counters, current node/attribute), then consumes one character per loop
// iteration until the text is exhausted or _stop_parsing is set, and finally
// closes whatever node is still open and clears the last-node table.
// Inside the loop, "continue" means the character has been fully handled;
// most states first give NewCheck() the chance to handle a '<'.
1371 private void Parse()
// quote character (' or ") that opened the quoted attribute value being read
1373 int lastquote = 0;
1374 if (OptionComputeChecksum)
1376 _crc32 = new Crc32();
1379 _lastnodes = new Hashtable();
1380 _c = 0;
1381 _fullcomment = false;
1382 _parseerrors = new ArrayList();
1383 _line = 1;
1384 _lineposition = 1;
1385 _maxlineposition = 1;
1387 _state = ParseState.Text;
1388 _oldstate = _state;
1389 _documentnode._innerlength = _text.FullLength;
1390 _documentnode._outerlength = _text.FullLength;
1392 _lastparentnode = _documentnode;
1393 _currentnode = CreateNode(HtmlNodeType.Text, 0);
1394 _currentattribute = null;
1396 _index = 0;
1397 PushNodeStart(HtmlNodeType.Text, 0);
1398 // SLIM: while (_index<_text.Length)
1399 while (! _stop_parsing && ! _text.Eof (_index))
// read the current character; after IncrementPosition, _index points at the NEXT character
1401 _c = _text[_index];
1402 IncrementPosition();
1404 switch(_state)
1406 case ParseState.Text:
1407 if (NewCheck())
1408 continue;
1409 break;
// just after '<': a '/' means a closing tag whose name starts at the next character;
// anything else is the first character of an opening tag name and is re-read in the Tag state
1411 case ParseState.WhichTag:
1412 if (NewCheck())
1413 continue;
1414 if (_c == '/')
1416 PushNodeNameStart(false, _index);
1418 else
1420 PushNodeNameStart(true, _index-1);
1421 DecrementPosition();
1423 _state = ParseState.Tag;
1424 break;
// reading a tag name; whitespace, '/' or '>' ends the name.
// _state is re-checked after each Push* call and left alone if the call changed it
// (PushNodeEnd switches to PcData for CDATA-style elements).
1426 case ParseState.Tag:
1427 if (NewCheck())
1428 continue;
1429 if (IsWhiteSpace(_c))
1431 PushNodeNameEnd(_index-1);
1432 if (_state != ParseState.Tag)
1433 continue;
1434 _state = ParseState.BetweenAttributes;
1435 continue;
1437 if (_c == '/')
1439 PushNodeNameEnd(_index-1);
1440 if (_state != ParseState.Tag)
1441 continue;
1442 _state = ParseState.EmptyTag;
1443 continue;
1445 if (_c == '>')
1447 PushNodeNameEnd(_index-1);
1448 if (_state != ParseState.Tag)
1449 continue;
1450 PushNodeEnd(_index, false);
1451 if (_state != ParseState.Tag)
1452 continue;
1453 _state = ParseState.Text;
1454 PushNodeStart(HtmlNodeType.Text, _index);
1456 break;
// inside a tag, between attributes: skip whitespace, detect the end of the tag,
// otherwise the current character starts a new attribute name
1458 case ParseState.BetweenAttributes:
1459 if (NewCheck())
1460 continue;
1462 if (IsWhiteSpace(_c))
1463 continue;
1465 if ((_c == '/') || (_c == '?'))
1467 _state = ParseState.EmptyTag;
1468 continue;
1471 if (_c == '>')
1473 PushNodeEnd(_index, false);
1474 if (_state != ParseState.BetweenAttributes)
1475 continue;
1476 _state = ParseState.Text;
1477 PushNodeStart(HtmlNodeType.Text, _index);
1478 continue;
1481 PushAttributeNameStart(_index-1);
1482 _state = ParseState.AttributeName;
1483 break;
// after '/' or '?': a '>' ends the tag as closed (close == true);
// any other character sends the parser back to BetweenAttributes
1485 case ParseState.EmptyTag:
1486 if (NewCheck())
1487 continue;
1489 if (_c == '>')
1491 PushNodeEnd(_index, true);
1492 if (_state != ParseState.EmptyTag)
1493 continue;
1494 _state = ParseState.Text;
1495 PushNodeStart(HtmlNodeType.Text, _index);
1496 continue;
1498 _state = ParseState.BetweenAttributes;
1499 break;
// reading an attribute name; whitespace, '=' or '>' ends it
1501 case ParseState.AttributeName:
1502 if (NewCheck())
1503 continue;
1505 if (IsWhiteSpace(_c))
1507 PushAttributeNameEnd(_index-1);
1508 _state = ParseState.AttributeBeforeEquals;
1509 continue;
1511 if (_c == '=')
1513 PushAttributeNameEnd(_index-1);
1514 _state = ParseState.AttributeAfterEquals;
1515 continue;
1517 if (_c == '>')
1519 PushAttributeNameEnd(_index-1);
1520 PushNodeEnd(_index, false);
1521 if (_state != ParseState.AttributeName)
1522 continue;
1523 _state = ParseState.Text;
1524 PushNodeStart(HtmlNodeType.Text, _index);
1525 continue;
1527 break;
1529 case ParseState.AttributeBeforeEquals:
1530 if (NewCheck())
1531 continue;
1533 if (IsWhiteSpace(_c))
1534 continue;
1535 if (_c == '>')
1537 PushNodeEnd(_index, false);
1538 if (_state != ParseState.AttributeBeforeEquals)
1539 continue;
1540 _state = ParseState.Text;
1541 PushNodeStart(HtmlNodeType.Text, _index);
1542 continue;
1544 if (_c == '=')
1546 _state = ParseState.AttributeAfterEquals;
1547 continue;
1549 // no equals, no whitespace, it's a new attribute starting
// step back so the character is read again in the BetweenAttributes state
1550 _state = ParseState.BetweenAttributes;
1551 DecrementPosition();
1552 break;
1554 case ParseState.AttributeAfterEquals:
1555 if (NewCheck())
1556 continue;
1558 if (IsWhiteSpace(_c))
1559 continue;
// quoted value: the value starts after the quote, and the quote is remembered
// so that only the same character can end the value
1561 if ((_c == '\'') || (_c == '"'))
1563 _state = ParseState.QuotedAttributeValue;
1564 PushAttributeValueStart(_index);
1565 lastquote = _c;
1566 continue;
1568 if (_c == '>')
1570 PushNodeEnd(_index, false);
1571 if (_state != ParseState.AttributeAfterEquals)
1572 continue;
1573 _state = ParseState.Text;
1574 PushNodeStart(HtmlNodeType.Text, _index);
1575 continue;
// unquoted value: it starts at the current character
1577 PushAttributeValueStart(_index-1);
1578 _state = ParseState.AttributeValue;
1579 break;
// reading an unquoted attribute value; whitespace or '>' ends it
1581 case ParseState.AttributeValue:
1582 if (NewCheck())
1583 continue;
1585 if (IsWhiteSpace(_c))
1587 PushAttributeValueEnd(_index-1);
1588 _state = ParseState.BetweenAttributes;
1589 continue;
1592 if (_c == '>')
1594 PushAttributeValueEnd(_index-1);
1595 PushNodeEnd(_index, false);
1596 if (_state != ParseState.AttributeValue)
1597 continue;
1598 _state = ParseState.Text;
1599 PushNodeStart(HtmlNodeType.Text, _index);
1600 continue;
1602 break;
// NewCheck() is deliberately not called here: inside quotes a '<' is part of the value,
// except for "<%", which starts a server-side code block
1604 case ParseState.QuotedAttributeValue:
1605 if (_c == lastquote)
1607 PushAttributeValueEnd(_index-1);
1608 _state = ParseState.BetweenAttributes;
1609 continue;
1611 if (_c == '<')
1613 //SLIM: if (_index<_text.Length)
1614 if (!_text.Eof (_index))
1616 if (_text[_index] == '%')
1618 _oldstate = _state;
1619 _state = ParseState.ServerSideCode;
1620 continue;
1624 break;
// a '>' ends the comment; when _fullcomment is set it must be preceded by "--",
// otherwise the '>' is treated as part of the comment
1626 case ParseState.Comment:
1627 if (_c == '>')
1629 if (_fullcomment)
1631 if ((_text[_index-2] != '-') ||
1632 (_text[_index-3] != '-'))
1634 continue;
1637 PushNodeEnd(_index, false);
1638 _state = ParseState.Text;
1639 PushNodeStart(HtmlNodeType.Text, _index);
1640 continue;
1642 break;
// "%>" ends the server-side block and the state saved in _oldstate is restored
// (with adjustments for two of the states)
1644 case ParseState.ServerSideCode:
1645 if (_c == '%')
1647 //SLIM: if (_index<_text.Length)
1648 if (! _text.Eof (_index))
1650 if (_text[_index] == '>')
1652 switch(_oldstate)
1654 case ParseState.AttributeAfterEquals:
1655 _state = ParseState.AttributeValue;
1656 break;
1658 case ParseState.BetweenAttributes:
1659 PushAttributeNameEnd(_index+1);
1660 _state = ParseState.BetweenAttributes;
1661 break;
1663 default:
1664 _state = _oldstate;
1665 break;
// skip the '>' of "%>"
// NOTE(review): _c still holds '%' here, so IncrementPosition feeds '%' (not '>') to the checksum
1667 IncrementPosition();
1671 break;
1673 // handle <script>a="</script>"</script>
// the quoted string ends at the matching quote character, unless the character before it is a backslash
1674 case ParseState.PcDataQuote:
1675 if ((_c == _pcdata_quote_char) && (_text [_index - 2] != '\\')) {
1676 _pcdata_quote_char = '\0';
1677 _state = ParseState.PcData;
1679 break;
1681 case ParseState.PcData:
// NOTE(review): this message (including the Substring) is built for every character before the Eof check below
1682 Debug ("PCDATA " + _currentnode.Name + " " + _text.Substring(_index-1, _currentnode._namelength+2));
1683 if (_c == '\"' || _c == '\''){
1684 _pcdata_quote_char = _c;
1685 _state = ParseState.PcDataQuote;
1686 break;
1688 // look for </tag + 1 char
1690 // check buffer end
1691 //SLIM: if ((_currentnode._namelength+3)<=(_text.Length-(_index-1)))
1692 if (! _text.Eof (_currentnode._namelength + _index + 1))
// case-insensitive comparison against "</" + the current element name
1694 if (string.Compare(_text.Substring(_index-1, _currentnode._namelength+2),
1695 "</" + _currentnode.Name, true) == 0)
// the character right after "</name" must be '>' or whitespace for this to be the end tag
1697 int c = _text[_index-1 + 2 + _currentnode.Name.Length];
1698 if ((c == '>') || (IsWhiteSpace(c)))
1700 // add the script as a text node
1701 HtmlNode script = CreateNode(HtmlNodeType.Text,
1702 _currentnode._outerstartindex + _currentnode._outerlength);
1703 script._outerlength = _index-1 - script._outerstartindex;
// in stream mode the raw content is reported to the caller instead of being appended to the tree
1704 if (_streammode && ReportNode != null)
1705 _stop_parsing = ! ReportNode (script);
1706 else
1707 _currentnode.AppendChild(script);
1708 Debug ("Found script: [" + script.InnerText + "]");
// start the closing-tag node; its name begins after "</"
1710 PushNodeStart(HtmlNodeType.Element, _index-1);
1711 PushNodeNameStart(false, _index-1 +2);
1712 _state = ParseState.Tag;
1713 IncrementPosition();
1717 break;
1721 // finish the current work
1722 if (_currentnode._namestartindex > 0)
1724 PushNodeNameEnd(_index);
1726 PushNodeEnd(_index, false);
1728 // we don't need this anymore
1729 _lastnodes.Clear();
/// <summary>
/// Handles a '&lt;' at the current position. "&lt;%" switches to server-side code;
/// any other '&lt;' ends the current node and starts either a comment node
/// (when followed by '!') or an element node.
/// </summary>
/// <returns>
/// false when the current character is not '&lt;' (nothing is done);
/// true when the character has been handled here.
/// </returns>
private bool NewCheck()
{
    if (_c != '<')
    {
        return false;
    }

    //SLIM: if (_index<_text.Length)
    // "<%": remember where we were and enter server-side code
    if (!_text.Eof(_index) && (_text[_index] == '%'))
    {
        if (_state == ParseState.AttributeAfterEquals)
        {
            PushAttributeValueStart(_index - 1);
        }
        else if (_state == ParseState.BetweenAttributes)
        {
            PushAttributeNameStart(_index - 1);
        }
        else if (_state == ParseState.WhichTag)
        {
            PushNodeNameStart(true, _index - 1);
            _state = ParseState.Tag;
        }

        _oldstate = _state;
        _state = ParseState.ServerSideCode;
        return true;
    }

    // a new tag begins: finish the node that was being read
    PushNodeEnd(_index - 1, true);
    _state = ParseState.WhichTag;

    //SLIM: if ((_index-1) <= (_text.Length-2))
    bool startsWithBang = !_text.Eof(_index) && (_text[_index] == '!');
    if (!startsWithBang)
    {
        PushNodeStart(HtmlNodeType.Element, _index - 1);
        return true;
    }

    PushNodeStart(HtmlNodeType.Comment, _index - 1);
    PushNodeNameStart(true, _index);
    PushNodeNameEnd(_index + 1);
    _state = ParseState.Comment;

    //SLIM: if (_index<(_text.Length-2))
    // "<!--" marks a full comment; _fullcomment keeps its old value when the text is too short to tell
    if (!_text.Eof(_index + 2))
    {
        _fullcomment = (_text[_index + 1] == '-') && (_text[_index + 2] == '-');
    }

    return true;
}
/// <summary>
/// Inspects a freshly parsed node for an encoding declaration of the form
/// &lt;meta http-equiv="content-type" content="text/html;charset=iso-8859-1" /&gt;
/// and stores the declared encoding in _declaredencoding. Does nothing unless
/// OptionReadEncoding is set.
/// </summary>
/// <remarks>
/// An unknown or malformed charset name no longer aborts parsing: the declared
/// encoding is then treated as absent (null).
/// </remarks>
/// <param name="node">The node whose start tag has just been completed.</param>
/// <exception cref="EncodingFoundException">
/// Thrown with a null encoding when a four-letter element other than head, link, html or
/// meta is seen; thrown with the declared encoding (null if the charset is not recognised)
/// when only encoding detection was requested.
/// </exception>
private void ReadDocumentEncoding(HtmlNode node)
{
    if (!OptionReadEncoding)
        return;

    // format is
    // <meta http-equiv="content-type" content="text/html;charset=iso-8859-1" />

    // when we append a child, we are in node end, so attributes are already populated
    if (node._namelength != 4) // quick check, avoids string alloc
        return;

    // only these nodes can occur before meta
    // if we started seeing any other node, we will never see a meta node
    if (node.NodeType == HtmlNodeType.Element &&
        (node.Name != "head" && node.Name != "script" &&
         node.Name != "style" && node.Name != "title" &&
         node.Name != "link" &&
         node.Name != "html" && node.Name != "meta"))
    {
        throw new EncodingFoundException(null);
    }

    if (node.Name != "meta") // all nodes names are lowercase
        return;

    HtmlAttribute att = node.Attributes["http-equiv"];
    if (att == null)
        return;

    if (string.Compare(att.Value, "content-type", true) != 0)
        return;

    HtmlAttribute content = node.Attributes["content"];
    if (content == null)
        return;

    string charset = NameValuePairList.GetNameValuePairsValue(content.Value, "charset");
    if (charset == null)
        return;

    // Encoding.GetEncoding throws ArgumentException for names it does not know
    // (including an empty name); a bad charset in the page must not abort the parse.
    try
    {
        _declaredencoding = Encoding.GetEncoding(charset);
    }
    catch (ArgumentException)
    {
        _declaredencoding = null;
    }

    if (_onlyDetectEncoding)
    {
        throw new EncodingFoundException(_declaredencoding);
    }

    if ((_streamencoding != null) && (_declaredencoding != null))
    {
        if (_declaredencoding.WindowsCodePage != _streamencoding.WindowsCodePage)
        {
            AddError(
                HtmlParseErrorCode.CharsetMismatch,
                _line, _lineposition,
                _index, node.OuterHtml,
                "Encoding mismatch between StreamEncoding: " +
                _streamencoding.WebName + " and DeclaredEncoding: " + _declaredencoding.WebName);
        }
    }
}
/// <summary>
/// Starts a new current attribute whose name begins at <paramref name="index"/>
/// and stamps it with the parser's current line, line position and that index.
/// </summary>
/// <param name="index">Index in the text where the attribute name starts.</param>
private void PushAttributeNameStart(int index)
{
    HtmlAttribute attribute = CreateAttribute();
    attribute._namestartindex = index;
    attribute._line = _line;
    attribute._lineposition = _lineposition;
    attribute._streamposition = index;
    _currentattribute = attribute;
}
/// <summary>
/// Records the length of the current attribute's name and appends the attribute
/// to the current node.
/// </summary>
/// <param name="index">Index in the text just past the end of the attribute name.</param>
private void PushAttributeNameEnd(int index)
{
    int nameStart = _currentattribute._namestartindex;
    _currentattribute._namelength = index - nameStart;
    _currentnode.Attributes.Append(_currentattribute);
}
/// <summary>
/// Records where the value of the current attribute starts.
/// </summary>
/// <param name="index">Index in the text of the first character of the value.</param>
private void PushAttributeValueStart(int index)
{
    HtmlAttribute attribute = _currentattribute;
    attribute._valuestartindex = index;
}
/// <summary>
/// Records the length of the current attribute's value.
/// </summary>
/// <param name="index">Index in the text just past the end of the value.</param>
private void PushAttributeValueEnd(int index)
{
    int valueStart = _currentattribute._valuestartindex;
    _currentattribute._valuelength = index - valueStart;
}
/// <summary>
/// Makes a new node of the given type the current node and stamps it with the
/// parser's current line and line position. For element nodes the line position
/// is moved back by one.
/// </summary>
/// <param name="type">The kind of node that starts here.</param>
/// <param name="index">Index in the text where the node starts.</param>
private void PushNodeStart(HtmlNodeType type, int index)
{
    HtmlNode started = CreateNode(type, index);
    started._line = _line;
    started._lineposition = (type == HtmlNodeType.Element) ? _lineposition - 1 : _lineposition;
    started._streamposition = index;
    _currentnode = started;
}
/// <summary>
/// Finishes the current node at <paramref name="index"/>: fixes its outer length,
/// reports it to the ReportNode callback in stream mode, attaches it to the tree
/// (outside stream mode), and closes it when required.
/// For a start tag of a CDATA-style element the parser is switched to
/// ParseState.PcData and nothing is closed.
/// </summary>
/// <param name="index">Index in the text just past the end of the node.</param>
/// <param name="close">
/// true to close the node right away; it is also forced to true for start tags of
/// closed or empty elements.
/// </param>
private void PushNodeEnd(int index, bool close)
{
    _currentnode._outerlength = index - _currentnode._outerstartindex;

    //SLIM: inform caller
    if (_streammode && ReportNode != null)
    {
        _stop_parsing = !ReportNode(_currentnode);
    }

    if (_debug)
    {
        if (_currentnode._nodetype == HtmlNodeType.Text)
        {
            Debug("Text:" + _currentnode.InnerText);
        }
        else
        {
            Debug((_currentnode.StartTag ? "Start-" : "End-") + _currentnode.Name);
        }
    }

    bool isTextOrComment =
        (_currentnode._nodetype == HtmlNodeType.Text) ||
        (_currentnode._nodetype == HtmlNodeType.Comment);

    if (isTextOrComment)
    {
        // forget about void nodes
        if (_currentnode._outerlength > 0)
        {
            _currentnode._innerlength = _currentnode._outerlength;
            _currentnode._innerstartindex = _currentnode._outerstartindex;

            // SLIM: no need to append child in stream mode
            // SLIM: whatever the caller needs to do, tell it to do now
            if (!_streammode && _lastparentnode != null)
            {
                _lastparentnode.AppendChild(_currentnode);
            }
        }
    }
    else if (_currentnode._starttag && (_lastparentnode != _currentnode))
    {
        // add to parent node
        // SLIM: no need to append child in stream mode
        // SLIM: whatever the caller needs to do, tell it to do now
        if (!_streammode && _lastparentnode != null)
        {
            _lastparentnode.AppendChild(_currentnode);
        }

        ReadDocumentEncoding(_currentnode);

        // remember last node of this kind
        // SLIM: we still need to store _currentnode to help other tags in the same level
        string nodeName = _currentnode.Name;
        _currentnode._prevwithsamename = (HtmlNode)_lastnodes[nodeName];
        _lastnodes[nodeName] = _currentnode;

        // change parent?
        if ((_currentnode.NodeType == HtmlNodeType.Document) ||
            (_currentnode.NodeType == HtmlNodeType.Element))
        {
            _lastparentnode = _currentnode;
        }

        // raw-content elements: the content is consumed by the PcData state
        if (HtmlNode.IsCDataElement(CurrentNodeName()))
        {
            _state = ParseState.PcData;
            return;
        }

        if (HtmlNode.IsClosedElement(_currentnode.Name) ||
            HtmlNode.IsEmptyElement(_currentnode.Name))
        {
            close = true;
        }
    }

    // an open start tag that was not asked to close stays open
    if (!close && _currentnode._starttag)
    {
        return;
    }

    CloseCurrentNode();
    if (isTextOrComment)
    {
        _currentnode = null;
    }
}
/// <summary>
/// Records where the name of the current node starts and whether the node is a start tag.
/// </summary>
/// <param name="starttag">true for a start tag, false for an end tag.</param>
/// <param name="index">Index in the text of the first character of the name.</param>
private void PushNodeNameStart(bool starttag, int index)
{
    HtmlNode node = _currentnode;
    node._starttag = starttag;
    node._namestartindex = index;
}
/// <summary>
/// Returns the names of the "resetter" elements for a tag name: "ul" for "li",
/// "table" for "tr", and "tr" plus "table" for "th" and "td".
/// </summary>
/// <param name="name">The tag name to look up.</param>
/// <returns>The resetter element names, or null when the tag has none.</returns>
private string[] GetResetters(string name)
{
    if (name == "li")
    {
        return new string[] { "ul" };
    }

    if (name == "tr")
    {
        return new string[] { "table" };
    }

    if ((name == "th") || (name == "td"))
    {
        return new string[] { "tr", "table" };
    }

    return null;
}
/// <summary>
/// Applies the nested-tag fix to the current node, using its lower-cased name.
/// Closing tags are ignored.
/// </summary>
private void FixNestedTags()
{
    // we are only interested by start tags, not closing tags
    if (_currentnode._starttag)
    {
        string name = CurrentNodeName().ToLower();
        string[] resetters = GetResetters(name);
        FixNestedTag(name, resetters);
    }
}
/// <summary>
/// Closes the previous, still open node with the same name, unless a resetter node
/// is found for it. The node is closed with a fake closing node.
/// </summary>
/// <param name="name">The name of the tag that is starting.</param>
/// <param name="resetters">Resetter element names for that tag; nothing is done when null.</param>
private void FixNestedTag(string name, string[] resetters)
{
    if (resetters == null)
    {
        return;
    }

    // if we find a previous unclosed same name node, without a resetter node between, we must close it
    HtmlNode previous = (HtmlNode)_lastnodes[name];
    if ((previous == null) || previous.Closed)
    {
        return;
    }

    // try to find a resetter node, if found, we do nothing
    if (FindResetterNodes(previous, resetters))
    {
        return;
    }

    // ok we need to close the prev now
    // create a fake closer node
    HtmlNode closer = new HtmlNode(previous.NodeType, this, -1);
    closer._endnode = closer;
    previous.CloseNode(closer);
}
/// <summary>
/// Tells whether a resetter node exists for <paramref name="node"/> under any of the given names.
/// </summary>
/// <param name="node">The node the resetter is searched for.</param>
/// <param name="names">Candidate resetter element names; may be null.</param>
/// <returns>true when at least one name yields a resetter node; false otherwise or when names is null.</returns>
private bool FindResetterNodes(HtmlNode node, string[] names)
{
    if (names == null)
    {
        return false;
    }

    foreach (string candidate in names)
    {
        if (FindResetterNode(node, candidate) != null)
        {
            return true;
        }
    }

    return false;
}
/// <summary>
/// Returns the last recorded node with the given name when it qualifies as a resetter
/// for <paramref name="node"/>: it must still be open and its stream position must
/// not be before that of <paramref name="node"/>.
/// </summary>
/// <param name="node">The node the resetter is searched for.</param>
/// <param name="name">The resetter element name.</param>
/// <returns>The resetter node, or null when there is none.</returns>
private HtmlNode FindResetterNode(HtmlNode node, string name)
{
    HtmlNode candidate = (HtmlNode)_lastnodes[name];

    bool usable =
        (candidate != null) &&
        !candidate.Closed &&
        (candidate._streamposition >= node._streamposition);

    return usable ? candidate : null;
}
/// <summary>
/// Records the length of the current node's name and, when OptionFixNestedTags is set,
/// applies the nested-tag fix.
/// </summary>
/// <param name="index">Index in the text just past the end of the name.</param>
private void PushNodeNameEnd(int index)
{
    int nameStart = _currentnode._namestartindex;
    _currentnode._namelength = index - nameStart;

    if (!OptionFixNestedTags)
    {
        return;
    }

    FixNestedTags();
}
// Closes _currentnode.
// If an open node with the same name is recorded in _lastnodes, that node is closed
// with the current one (unless nested-tag fixing finds a resetter node, which is
// reported as an error). If none is recorded, the current node is either a closed
// element written as an end tag, an overlapping element (kept as text), or an
// unexpected end tag (reported as an error).
// Unless an error was recorded, _lastparentnode is then updated via UpdateLastParentNode
// (subject to the condition at the end of the method).
2072 private void CloseCurrentNode()
2074 if (_currentnode.Closed) // text or document are by def closed
2075 return;
// set when the end tag cannot be matched or is invalid at this position
2077 bool error = false;
2079 // find last node of this kind
2080 HtmlNode prev = (HtmlNode)_lastnodes[_currentnode.Name];
2081 if (prev == null)
2083 if (HtmlNode.IsClosedElement(_currentnode.Name))
2085 // </br> will be seen as <br>
2086 _currentnode.CloseNode(_currentnode);
2088 // add to parent node
2089 if (_lastparentnode != null)
// walk the parent's children from last to first looking for a childless node with the
// same name; the siblings passed on the way are remembered on a stack
2091 HtmlNode foundNode = null;
2092 Stack futureChild = new Stack();
2093 for (HtmlNode node = _lastparentnode.LastChild; node != null; node = node.PreviousSibling)
2095 if ((node.Name == _currentnode.Name) && (! node.HasChildNodes))
2097 foundNode = node;
2098 break;
2100 futureChild.Push(node);
// when such a node exists, the remembered siblings are moved under it, in document order;
// otherwise the current node itself is appended to the parent
2102 if (foundNode != null)
2104 HtmlNode node = null;
2105 while(futureChild.Count != 0)
2107 node = (HtmlNode)futureChild.Pop();
2108 _lastparentnode.RemoveChild(node);
2109 foundNode.AppendChild(node);
2112 else
2114 _lastparentnode.AppendChild(_currentnode);
2119 else
2121 // node has no parent
2122 // node is not a closed node
2124 if (HtmlNode.CanOverlapElement(_currentnode.Name))
2126 // this is a hack: add it as a text node
2127 HtmlNode closenode = CreateNode(HtmlNodeType.Text, _currentnode._outerstartindex);
2128 closenode._outerlength = _currentnode._outerlength;
2129 ((HtmlTextNode)closenode).Text = ((HtmlTextNode)closenode).Text.ToLower();
2130 if (_lastparentnode != null)
2132 _lastparentnode.AppendChild(closenode);
2136 else
// an end tag for an empty element is recorded as an error but does not set 'error'
2138 if (HtmlNode.IsEmptyElement(_currentnode.Name))
2140 AddError(
2141 HtmlParseErrorCode.EndTagNotRequired,
2142 _currentnode._line, _currentnode._lineposition,
2143 _currentnode._streamposition, _currentnode.OuterHtml,
2144 "End tag </" + _currentnode.Name + "> is not required");
2146 else
2148 // node cannot overlap, node is not empty
2149 AddError(
2150 HtmlParseErrorCode.TagNotOpened,
2151 _currentnode._line, _currentnode._lineposition,
2152 _currentnode._streamposition, _currentnode.OuterHtml,
2153 "Start tag <" + _currentnode.Name + "> was not found");
2154 error = true;
2159 else
2161 if (OptionFixNestedTags)
// a resetter node found for the matching start tag makes this end tag invalid here
2163 if (FindResetterNodes(prev, GetResetters(_currentnode.Name)))
2165 AddError(
2166 HtmlParseErrorCode.EndTagInvalidHere,
2167 _currentnode._line, _currentnode._lineposition,
2168 _currentnode._streamposition, _currentnode.OuterHtml,
2169 "End tag </" + _currentnode.Name + "> invalid here");
2170 error = true;
2174 if (!error)
// restore the previous node of the same name as the last one of its kind, then close the matched node
2176 _lastnodes[_currentnode.Name] = prev._prevwithsamename;
2177 prev.CloseNode(_currentnode);
2182 // we close this node, get grandparent
2183 if (!error)
2185 if ((_lastparentnode != null) &&
2186 ((!HtmlNode.IsClosedElement(_currentnode.Name)) ||
2187 (_currentnode._starttag)))
2189 UpdateLastParentNode();
/// <summary>
/// Moves _lastparentnode up the tree until it refers to a node that is not closed.
/// When the top of the tree is passed, the document node is used.
/// </summary>
internal void UpdateLastParentNode()
{
    while (true)
    {
        // an open node is a valid parent: stop here
        if (!_lastparentnode.Closed)
        {
            break;
        }

        _lastparentnode = _lastparentnode.ParentNode;
        if (_lastparentnode == null)
        {
            break;
        }
    }

    if (_lastparentnode == null)
    {
        _lastparentnode = _documentnode;
    }
}
/// <summary>
/// Extracts the name of the current attribute from the document text.
/// </summary>
/// <returns>The text between the recorded name start and name length.</returns>
private string CurrentAttributeName()
{
    HtmlAttribute attribute = _currentattribute;
    return _text.Substring(attribute._namestartindex, attribute._namelength);
}
/// <summary>
/// Extracts the value of the current attribute from the document text.
/// </summary>
/// <returns>The text between the recorded value start and value length.</returns>
private string CurrentAttributeValue()
{
    HtmlAttribute attribute = _currentattribute;
    return _text.Substring(attribute._valuestartindex, attribute._valuelength);
}
/// <summary>
/// Extracts the name of the current node from the document text, exactly as written there.
/// </summary>
/// <returns>The text between the recorded name start and name length.</returns>
private string CurrentNodeName()
{
    HtmlNode node = _currentnode;
    return _text.Substring(node._namestartindex, node._namelength);
}
/// <summary>
/// Extracts the outer text of the current node from the document text.
/// </summary>
/// <returns>The text between the recorded outer start and outer length.</returns>
private string CurrentNodeOuter()
{
    HtmlNode node = _currentnode;
    return _text.Substring(node._outerstartindex, node._outerlength);
}
/// <summary>
/// Extracts the inner text of the current node from the document text.
/// </summary>
/// <returns>The text between the recorded inner start and inner length.</returns>
private string CurrentNodeInner()
{
    HtmlNode node = _currentnode;
    return _text.Substring(node._innerstartindex, node._innerlength);
}
/// <summary>
/// Tells whether a character code is treated as whitespace by the parser:
/// tab (9), line feed (10), carriage return (13) or space (32).
/// </summary>
/// <param name="c">The character code to check.</param>
/// <returns>true if the character is one of the four whitespace characters; false otherwise.</returns>
public static bool IsWhiteSpace(int c)
{
    switch (c)
    {
        case 9:
        case 10:
        case 13:
        case 32:
            return true;

        default:
            return false;
    }
}
/// <summary>
/// Exception used to carry an encoding out of the parser.
/// The carried encoding may be null.
/// </summary>
internal class EncodingFoundException : Exception
{
    /// <summary>The encoding handed to the constructor.</summary>
    private Encoding _encoding;

    /// <summary>
    /// Creates the exception for the given encoding.
    /// </summary>
    /// <param name="encoding">The encoding to carry; may be null.</param>
    internal EncodingFoundException(Encoding encoding)
    {
        _encoding = encoding;
    }

    /// <summary>
    /// Gets the encoding carried by this exception.
    /// </summary>
    internal Encoding Encoding
    {
        get { return _encoding; }
    }
}