configure.in, AssemblyInfo.cs: For those unfortunate earthlings without libchm, libwv...
[beagle.git] / Filters / HtmlAgilityPack / HtmlDocument.cs
blob05bc5d8cb46be6f41255f301501c0e1593efb606
1 // HtmlAgilityPack V1.0 - Simon Mourier <simonm@microsoft.com>
3 /*
4 Copyright (C) 2003 Simon Mourier <simonm@microsoft.com>
5 All rights reserved.
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions
9 are met:
10 1. Redistributions of source code must retain the above copyright
11 notice, this list of conditions and the following disclaimer.
12 2. Redistributions in binary form must reproduce the above copyright
13 notice, this list of conditions and the following disclaimer in the
14 documentation and/or other materials provided with the distribution.
15 3. The name of the author may not be used to endorse or promote products
16 derived from this software without specific prior written permission.
18 THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
19 IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
20 OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
21 IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
22 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
23 NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
27 THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 using System;
31 using System.IO;
32 using System.Text;
33 using System.Diagnostics;
34 using System.Collections;
35 using System.Text.RegularExpressions;
36 using System.Xml;
37 using System.Xml.XPath;
40 // Legend: SLIM=Comment added describing changes to original HtmlAgilityPack
41 // to reduce memory consumption
42 // Once the parser is free of bugs, the comments will be taken out
43 namespace HtmlAgilityPack
/// <summary>
/// Classifies the problems that can be reported while a document is parsed.
/// </summary>
public enum HtmlParseErrorCode
{
    /// <summary>
    /// A tag was opened but never closed.
    /// </summary>
    TagNotClosed = 0,

    /// <summary>
    /// A closing tag was found for a tag that was never opened.
    /// </summary>
    TagNotOpened = 1,

    /// <summary>
    /// The encoding of the stream differs from the encoding declared in the document (META).
    /// </summary>
    CharsetMismatch = 2,

    /// <summary>
    /// An end tag was present although none was required.
    /// </summary>
    EndTagNotRequired = 3,

    /// <summary>
    /// An end tag appears at a position where it is not valid.
    /// </summary>
    EndTagInvalidHere = 4
}
/// <summary>
/// Immutable record describing one error encountered while parsing a document.
/// </summary>
public class HtmlParseError
{
    // All values are captured once by the constructor and never modified afterwards.
    private readonly HtmlParseErrorCode _code;
    private readonly string _reason;
    private readonly string _sourceText;
    private readonly int _line;
    private readonly int _linePosition;
    private readonly int _streamPosition;

    internal HtmlParseError(
        HtmlParseErrorCode code,
        int line,
        int linePosition,
        int streamPosition,
        string sourceText,
        string reason)
    {
        this._code = code;
        this._line = line;
        this._linePosition = linePosition;
        this._streamPosition = streamPosition;
        this._sourceText = sourceText;
        this._reason = reason;
    }

    /// <summary>
    /// Gets the kind of error that was detected.
    /// </summary>
    public HtmlParseErrorCode Code
    {
        get { return this._code; }
    }

    /// <summary>
    /// Gets the line of the document on which the error was detected.
    /// </summary>
    public int Line
    {
        get { return this._line; }
    }

    /// <summary>
    /// Gets the column, within its line, at which the error was detected.
    /// </summary>
    public int LinePosition
    {
        get { return this._linePosition; }
    }

    /// <summary>
    /// Gets the absolute stream position of the error, relative to the start of the document.
    /// </summary>
    public int StreamPosition
    {
        get { return this._streamPosition; }
    }

    /// <summary>
    /// Gets the full text of the line containing the error.
    /// </summary>
    public string SourceText
    {
        get { return this._sourceText; }
    }

    /// <summary>
    /// Gets a human-readable description of the error.
    /// </summary>
    public string Reason
    {
        get { return this._reason; }
    }
}
// Contract used by the parser to address its input by character index,
// whether the characters come from a reader or from an in-memory string.
abstract class StreamAsArray
{
    // Length reported by the implementation for the whole input.
    // NOTE(review): implementations in this file differ in unit (stream bytes vs. string chars) - confirm callers cope.
    public abstract int FullLength { get; }

    // Character located at the given index.
    public abstract char this [int index] { get; }

    // True when the given index denotes the end of the input.
    public abstract bool Eof (int index);

    // Text of the given length starting at startindex.
    public abstract string Substring (int startindex, int length);
}
// SLIM: wraps a StreamReader so the parser can address the input by index
// (emulating what ReadToEnd () would give) while only two blocks of
// _block_size characters are kept in memory: the current one and the one
// before it.
class ImplStreamAsArray : StreamAsArray {
	private StreamReader _reader;
	private int _length;          // total number of characters; only valid once _eof is set
	private int _position;        // index of the first character held in _buf_current
	private bool _eof;            // set once the end of the reader has been reached
	private char[] _buf_previous; // block that precedes _buf_current
	private char[] _buf_current;  // block starting at _position
	private int _block_size;

	public ImplStreamAsArray (StreamReader r)
	{
		_reader = r;
		_length = 0;
		_position = 0;
		_eof = false;

		_block_size = 1024;
		_buf_previous = new char [_block_size];
		_buf_current = new char [_block_size];

		Read (true);
	}

	// Reads up to 'count' characters into 'buffer', starting at offset 0.
	// StreamReader.Read may legitimately return fewer characters than asked
	// for without being at the end of the stream, so keep reading until the
	// request is satisfied or the reader reports that nothing is left.
	// Returns the number of characters actually stored.
	private int ReadFully (char[] buffer, int count)
	{
		int total = 0;
		while (total < count) {
			int n = _reader.Read (buffer, total, count - total);
			if (n <= 0)
				break;
			total += n;
		}
		return total;
	}

	// Loads the next block. Unless this is the initial load, the current
	// block becomes the previous one and _position advances by one block.
	private void Read (bool initial)
	{
		if ( !initial) {
			Array.Copy (_buf_current, _buf_previous, _block_size);
			_position += _block_size;
		}
		HtmlDocument.Debug ("Debug: Read in buffer at:" + _position);

		// A block that could not be filled completely means the reader is exhausted.
		int num_read = ReadFully (_buf_current, _block_size);
		if (num_read < _block_size) {
			_eof = true;
			_length = _position + num_read;
		}
		HtmlDocument.Debug ("[" + new string (_buf_current, 0, num_read) + "]");
	}

	// Tells whether 'index' is at (or beyond) the end of the input, reading
	// the next block first when 'index' falls into it.
	public override bool Eof (int index) {
		if (!_eof &&
		    index >= _position + _block_size &&
		    index < _position + _block_size + _block_size)
			Read (false);

		// '>=' rather than '==': an index past the end is also "end of input",
		// matching DummyStreamAsArray.Eof.
		return _eof && index >= _length;
	}

	// Returns the character at 'index'. The index must lie in the previous
	// block, the current block, or the block right after the current one
	// (which is then read in); anything else throws.
	public override char this[int index] {
		get {
			if (index >= _position &&
			    index < _position + _block_size)
				return _buf_current [index % _block_size];
			if (index >= _position - _block_size &&
			    index < _position)
				return _buf_previous [ index % _block_size];
			if (index >= _position + _block_size &&
			    index < _position + _block_size + _block_size) {
				Read (false);
				return _buf_current [index % _block_size];
			}
			Console.WriteLine ("EXCEPTION!!!");
			throw new Exception (String.Format ("{0} is out of current bounds:[{1}-{2}] and further than read-ahead",
							    index,
							    _position - _block_size,
							    _position + _block_size - 1));
		}
	}

	// evil function ... you get what you pay for!
	// Serves a substring that is not covered by the two buffered blocks by
	// rewinding the reader, reading the requested range directly, and then
	// putting the reader back where sequential reading left off.
	private string OutOfBandRead (int startindex, int length)
	{
		HtmlDocument.Debug ("Out of band read! From " + startindex + " to " + (startindex + length - 1));
		ResetPosition (startindex);
		// ahh.. now we are at the correct place
		// create a buffer of required length
		// who cares if the buffer size does not align well
		// with page boundary
		char[] temp_buf = new char [length];
		int num_read = ReadFully (temp_buf, length);
		if (num_read < length) {
			// Shouldnt occur!!!
			_eof = true;
			_length = startindex + num_read;
		}
		// discard data and reset stream position
		int t = (_eof ? _length :_position + _block_size);
		ResetPosition (t);
		// Only hand back what was really read; the tail of temp_buf is unfilled.
		return new String (temp_buf, 0, num_read);
	}

	// streamreader does not allow seeking
	// seek on its basestream does not reflect the position
	// of the reader - it is governed by the buffer size
	// of the underlying stream
	// :( so, rewind to the beginning and read forward until 'pos'
	// characters have been consumed (or the reader runs out).
	private void ResetPosition (int pos)
	{
		_reader.DiscardBufferedData ();
		_reader.BaseStream.Position = 0;

		char[] tmp = new char [_block_size];
		int remaining = pos;
		while (remaining > 0) {
			int want = Math.Min (remaining, _block_size);
			int got = ReadFully (tmp, want);
			remaining -= got;
			if (got < want)
				break; // reader exhausted before reaching pos
		}
	}

	// Returns 'length' characters starting at 'startindex'. Requests that
	// are longer than one block, or that start before the previous block,
	// are served by an out-of-band read; everything else comes from the
	// two buffered blocks.
	public override string Substring (int startindex, int length)
	{
		if (length == 0) {
			HtmlDocument.Debug ("substring:" + startindex + " " + length + " " + _position + ":");
			return String.Empty;
		}
		if (length > _block_size || startindex < _position - _block_size) {
			return OutOfBandRead (startindex, length);
		}
		// The request reaches past the current block: pull in the next one.
		if (startindex + length - 1 >= _position + _block_size) {
			Read (false);
		}
		string substr;
		if (startindex < _position) {
			// Starts in the previous block; len_1 is how much of it lies there.
			int len_1 = _position - startindex;
			if (length < len_1)
				substr = new String (_buf_previous, _block_size - len_1, length);
			else {
				substr = new String (_buf_previous, _block_size - len_1, len_1);
				substr += new String (_buf_current, 0, length - len_1);
			}
		} else {
			substr = new String (_buf_current, startindex - _position, length);
		}
		return substr;
	}

	// FIXME: Is this costly ?
	// Reports the length of the reader's base stream (Stream.Length, i.e. bytes),
	// not the number of decoded characters.
	public override int FullLength {
		get {
			return (int)_reader.BaseStream.Length;
		}
	}
}
// A dummy StreamAsArray wrapper around a string that is already fully in memory.
class DummyStreamAsArray : StreamAsArray {
	private string _base_string;
	private int _length; // cached _base_string.Length

	public DummyStreamAsArray(string str)
	{
		_base_string = str;
		_length = str.Length;
	}

	// True when 'index' is at or beyond the end of the wrapped string.
	public override bool Eof(int index)
	{
		return (index >= _length);
	}

	// Must be 'override', not 'new': the base indexer is abstract, and callers
	// access the text through a StreamAsArray reference, so a hiding member
	// would never be reached (and leaves the abstract member unimplemented).
	public override char this[int index] {
		get { return _base_string [index]; }
	}

	public override string Substring (int startindex, int length)
	{
		return _base_string.Substring (startindex, length);
	}

	// Number of characters in the wrapped string.
	public override int FullLength {
		get { return _length; }
	}
}
364 /// <summary>
365 /// Represents a complete HTML document.
366 /// </summary>
367 public class HtmlDocument: IXPathNavigable
369 // SLIM: Make the parser event driven
370 // callback for FilterHtml
371 // return value is a way for the callback to signal to continue or stop parsing
372 public delegate bool NodeHandler (HtmlNode node);
373 public NodeHandler ReportNode;
374 // misnomer ... should be called event_driven_mode
375 private bool _streammode = false;
376 private bool _stop_parsing = false;
378 internal static readonly string HtmlExceptionRefNotChild = "Reference node must be a child of this node";
379 internal static readonly string HtmlExceptionUseIdAttributeFalse = "You need to set UseIdAttribute property to true to enable this feature";
381 internal Hashtable _openednodes;
382 internal Hashtable _lastnodes = new Hashtable();
383 internal Hashtable _nodesid;
384 private HtmlNode _documentnode;
385 //SLIM: internal string _text;
386 internal StreamAsArray _text;
387 private HtmlNode _currentnode;
388 private HtmlNode _lastparentnode;
389 private HtmlAttribute _currentattribute;
390 private int _index;
391 private int _line;
392 private int _lineposition, _maxlineposition;
393 private int _c;
394 private bool _fullcomment;
395 private System.Text.Encoding _streamencoding;
396 private System.Text.Encoding _declaredencoding;
397 private ArrayList _parseerrors = new ArrayList();
398 private ParseState _state, _oldstate;
399 private Crc32 _crc32 = null;
400 private bool _onlyDetectEncoding = false;
401 private int _pcdata_quote_char = '\0';
// Switch for the diagnostic output produced by Debug ().
private static bool _debug = false;

// Writes the message to the console, but only while _debug is set.
internal static void Debug (string s)
{
    if (!_debug)
    {
        return;
    }
    Console.WriteLine (s);
}
410 // public props
412 /// <summary>
413 /// Defines if a checksum must be computed for the document while parsing. Default is false.
414 /// </summary>
415 public bool OptionComputeChecksum = false;
417 /// <summary>
418 /// Defines if declared encoding must be read from the document.
419 /// Declared encoding is determined using the meta http-equiv="content-type" content="text/html;charset=XXXXX" html node.
420 /// Default is true.
421 /// </summary>
422 public bool OptionReadEncoding = true;
425 /// <summary>
426 /// Defines if non closed nodes will be checked at the end of parsing. Default is true.
427 /// </summary>
428 public bool OptionCheckSyntax = true;
430 /// <summary>
431 /// Defines if the 'id' attribute must be specifically used. Default is true.
432 /// </summary>
433 public bool OptionUseIdAttribute = true;
435 /// <summary>
436 /// Defines if empty nodes must be written as closed during output. Default is false.
437 /// </summary>
438 public bool OptionWriteEmptyNodes = false;
440 /// <summary>
441 /// Defines if output must conform to XML, instead of HTML.
442 /// </summary>
443 public bool OptionOutputAsXml = false;
445 /// <summary>
446 /// Defines if name must be output in uppercase. Default is false.
447 /// </summary>
448 public bool OptionOutputUpperCase = false;
450 /// <summary>
451 /// Defines if attribute value output must be optimized (not bound with double quotes if it is possible). Default is false.
452 /// </summary>
453 public bool OptionOutputOptimizeAttributeValues = false;
455 /// <summary>
456 /// Adds Debugging attributes to node. Default is false.
457 /// </summary>
458 public bool OptionAddDebuggingAttributes = false;
460 /// <summary>
461 /// Defines if source text must be extracted while parsing errors.
462 /// If the document has a lot of errors, or cascading errors, parsing performance can be dramatically affected if set to true.
463 /// Default is false.
464 /// </summary>
465 public bool OptionExtractErrorSourceText = false; // turning this on can dramatically slow performance if a lot of errors are detected
467 /// <summary>
468 /// Defines if closing for non closed nodes must be done at the end or directly in the document.
469 /// Setting this to true can actually change how browsers render the page. Default is false.
470 /// </summary>
471 public bool OptionAutoCloseOnEnd = false; // close errors at the end
473 /// <summary>
474 /// Defines if LI, TR, TH, TD tags must be partially fixed when nesting errors are detected. Default is false.
475 /// </summary>
476 public bool OptionFixNestedTags = false; // fix li, tr, th, td tags
478 /// <summary>
479 /// Defines the maximum length of source text or parse errors. Default is 100.
480 /// </summary>
481 public int OptionExtractErrorSourceTextMaxLength = 100;
483 /// <summary>
484 /// Defines the default stream encoding to use. Default is System.Text.Encoding.UTF8.
485 /// </summary>
486 // From http://www.w3.org/TR/REC-html40/charset.html
487 // The HTTP protocol ([RFC2616], section 3.7.1) mentions ISO-8859-1 as a default character encoding when the "charset" parameter is absent from the "Content-Type" header field.
488 // Nevertheless, UTF-8 is still used as the default here; the reason is not recorded.
489 //FIXME: Fix the default encoding!
490 public System.Text.Encoding OptionDefaultStreamEncoding = Encoding.UTF8;
/// <summary>
/// Gets the list holding the parse errors recorded for this document.
/// </summary>
public ArrayList ParseErrors
{
    get { return this._parseerrors; }
}
/// <summary>
/// Gets the encoding of the stream the document was read from
/// (null when the document was not read from a StreamReader).
/// </summary>
public System.Text.Encoding StreamEncoding
{
    get { return this._streamencoding; }
}
/// <summary>
/// Gets the encoding the document declares for itself.
/// The declaration is taken from the meta http-equiv="content-type" content="text/html;charset=XXXXX" html node.
/// </summary>
public System.Text.Encoding DeclaredEncoding
{
    get { return this._declaredencoding; }
}
/// <summary>
/// Initializes an empty HTML document consisting only of its document node.
/// </summary>
public HtmlDocument()
{
    HtmlNode root = this.CreateNode(HtmlNodeType.Document, 0);
    this._documentnode = root;
}
// Returns the first direct child of the document node named "?xml",
// or null when the document node has no children or no such child.
internal HtmlNode GetXmlDeclaration()
{
    if (_documentnode.HasChildNodes)
    {
        foreach (HtmlNode child in _documentnode._childnodes)
        {
            // it's ok, names are case sensitive
            if (child.Name == "?xml")
            {
                return child;
            }
        }
    }
    return null;
}
// Matches an ampersand that does not already start one of the entities
// &amp; &lt; &gt; &quot; (case-insensitive). Built once instead of on every call.
private static readonly Regex _bareampersandregex =
    new Regex("&(?!(amp;)|(lt;)|(gt;)|(quot;))", RegexOptions.IgnoreCase);

/// <summary>
/// Applies HTML encoding to a specified string: bare ampersands, angle brackets
/// and double quotes are replaced by their entities. Ampersands that already
/// introduce &amp;amp;, &amp;lt;, &amp;gt; or &amp;quot; are left alone so they are not encoded twice.
/// </summary>
/// <param name="html">The input string to encode. May not be null.</param>
/// <returns>The encoded string.</returns>
/// <exception cref="ArgumentNullException"><paramref name="html"/> is null.</exception>
public static string HtmlEncode(string html)
{
    if (html == null)
    {
        throw new ArgumentNullException("html");
    }
    // replace & by &amp; but only once!
    return _bareampersandregex.Replace(html, "&amp;")
        .Replace("<", "&lt;")
        .Replace(">", "&gt;")
        .Replace("\"", "&quot;");
}
/// <summary>
/// Determines the encoding of the HTML carried by a stream.
/// </summary>
/// <param name="stream">The input stream. May not be null.</param>
/// <returns>The detected encoding.</returns>
public Encoding DetectEncoding(Stream stream)
{
    if (stream != null)
    {
        StreamReader streamReader = new StreamReader(stream);
        return DetectEncoding(streamReader);
    }
    throw new ArgumentNullException("stream");
}
/// <summary>
/// Detects the encoding of an HTML file.
/// </summary>
/// <param name="path">Path for the file containing the HTML document to detect. May not be null.</param>
/// <returns>The detected encoding.</returns>
/// <exception cref="ArgumentNullException"><paramref name="path"/> is null.</exception>
public Encoding DetectEncoding(string path)
{
    if (path == null)
    {
        throw new ArgumentNullException("path");
    }
    // 'using' closes the file even when detection throws.
    using (StreamReader sr = new StreamReader(path, OptionDefaultStreamEncoding))
    {
        return DetectEncoding(sr);
    }
}
/// <summary>
/// Detects the encoding of an HTML text.
/// </summary>
/// <param name="html">The input html text. May not be null.</param>
/// <returns>The detected encoding.</returns>
/// <exception cref="ArgumentNullException"><paramref name="html"/> is null.</exception>
public Encoding DetectEncodingHtml(string html)
{
    if (html == null)
    {
        throw new ArgumentNullException("html");
    }
    // 'using' releases the reader even when detection throws.
    using (StringReader sr = new StringReader(html))
    {
        return DetectEncoding(sr);
    }
}
/// <summary>
/// Detects the encoding of an HTML text provided on a TextReader.
/// </summary>
/// <param name="reader">The TextReader used to feed the HTML. May not be null.</param>
/// <returns>The encoding carried by the EncodingFoundException raised during parsing, or null when parsing finishes without one.</returns>
/// <exception cref="ArgumentNullException"><paramref name="reader"/> is null.</exception>
public Encoding DetectEncoding(TextReader reader)
{
    if (reader == null)
    {
        throw new ArgumentNullException("reader");
    }

    _onlyDetectEncoding = true;
    if (OptionCheckSyntax)
    {
        _openednodes = new Hashtable();
    }
    else
    {
        _openednodes = null;
    }

    if (OptionUseIdAttribute)
    {
        _nodesid = new Hashtable();
    }
    else
    {
        _nodesid = null;
    }

    StreamReader sr = reader as StreamReader;
    if (sr != null)
    {
        try
        {
            // StreamReader only settles CurrentEncoding (e.g. from a byte order
            // mark) on its first read, so trigger one before sampling it -
            // the same thing Load(TextReader) does.
            sr.Peek();
        }
        catch (IOException)
        {
            // best effort: a failing stream will surface again on the real read below
        }
        _streamencoding = sr.CurrentEncoding;
        _text = new ImplStreamAsArray (sr);
    }
    else
    {
        _streamencoding = null;
        // Expensive, but cannot avoid since TextReader doesnt have any length of the underlying data
        _text = new DummyStreamAsArray (reader.ReadToEnd());
    }
    _declaredencoding = null;

    // SLIM: _text = reader.ReadToEnd();
    _documentnode = CreateNode(HtmlNodeType.Document, 0);

    // this is a hack, but it allows us not to muck with the original parsing code
    try
    {
        Parse();
    }
    catch (EncodingFoundException ex)
    {
        _lastnodes.Clear();
        return ex.Encoding;
    }
    return null;
}
/// <summary>
/// Loads an HTML document from a stream, decoding it with OptionDefaultStreamEncoding.
/// </summary>
/// <param name="stream">The input stream.</param>
public void Load(Stream stream)
{
    StreamReader streamReader = new StreamReader(stream, OptionDefaultStreamEncoding);
    Load(streamReader);
}
/// <summary>
/// Loads an HTML document from a stream.
/// </summary>
/// <param name="stream">The input stream.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
public void Load(Stream stream, bool detectEncodingFromByteOrderMarks)
{
    StreamReader streamReader = new StreamReader(stream, detectEncodingFromByteOrderMarks);
    Load(streamReader);
}
/// <summary>
/// Loads an HTML document from a stream using the given character encoding.
/// </summary>
/// <param name="stream">The input stream.</param>
/// <param name="encoding">The character encoding to use.</param>
public void Load(Stream stream, Encoding encoding)
{
    StreamReader streamReader = new StreamReader(stream, encoding);
    Load(streamReader);
}
/// <summary>
/// Loads an HTML document from a stream using the given character encoding.
/// </summary>
/// <param name="stream">The input stream.</param>
/// <param name="encoding">The character encoding to use.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
public void Load(Stream stream, Encoding encoding, bool detectEncodingFromByteOrderMarks)
{
    StreamReader streamReader = new StreamReader(stream, encoding, detectEncodingFromByteOrderMarks);
    Load(streamReader);
}
/// <summary>
/// Loads an HTML document from a stream using the given character encoding and reader buffer size.
/// </summary>
/// <param name="stream">The input stream.</param>
/// <param name="encoding">The character encoding to use.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
/// <param name="buffersize">The minimum buffer size.</param>
public void Load(Stream stream, Encoding encoding, bool detectEncodingFromByteOrderMarks, int buffersize)
{
    StreamReader streamReader = new StreamReader(stream, encoding, detectEncodingFromByteOrderMarks, buffersize);
    Load(streamReader);
}
/// <summary>
/// Loads an HTML document from a file, decoding it with OptionDefaultStreamEncoding.
/// </summary>
/// <param name="path">The complete file path to be read. May not be null.</param>
/// <exception cref="ArgumentNullException"><paramref name="path"/> is null.</exception>
public void Load(string path)
{
    if (path == null)
    {
        throw new ArgumentNullException("path");
    }
    // 'using' closes the file even when loading throws.
    using (StreamReader sr = new StreamReader(path, OptionDefaultStreamEncoding))
    {
        Load(sr);
    }
}
/// <summary>
/// Loads an HTML document from a file.
/// </summary>
/// <param name="path">The complete file path to be read. May not be null.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
/// <exception cref="ArgumentNullException"><paramref name="path"/> is null.</exception>
public void Load(string path, bool detectEncodingFromByteOrderMarks)
{
    if (path == null)
    {
        throw new ArgumentNullException("path");
    }
    // 'using' closes the file even when loading throws.
    using (StreamReader sr = new StreamReader(path, detectEncodingFromByteOrderMarks))
    {
        Load(sr);
    }
}
/// <summary>
/// Loads an HTML document from a file using the given character encoding.
/// </summary>
/// <param name="path">The complete file path to be read. May not be null.</param>
/// <param name="encoding">The character encoding to use. May not be null.</param>
/// <exception cref="ArgumentNullException"><paramref name="path"/> or <paramref name="encoding"/> is null.</exception>
public void Load(string path, Encoding encoding)
{
    if (path == null)
    {
        throw new ArgumentNullException("path");
    }
    if (encoding == null)
    {
        throw new ArgumentNullException("encoding");
    }
    // 'using' closes the file even when loading throws.
    using (StreamReader sr = new StreamReader(path, encoding))
    {
        Load(sr);
    }
}
/// <summary>
/// Loads an HTML document from a file using the given character encoding.
/// </summary>
/// <param name="path">The complete file path to be read. May not be null.</param>
/// <param name="encoding">The character encoding to use. May not be null.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
/// <exception cref="ArgumentNullException"><paramref name="path"/> or <paramref name="encoding"/> is null.</exception>
public void Load(string path, Encoding encoding, bool detectEncodingFromByteOrderMarks)
{
    if (path == null)
    {
        throw new ArgumentNullException("path");
    }
    if (encoding == null)
    {
        throw new ArgumentNullException("encoding");
    }
    // 'using' closes the file even when loading throws.
    using (StreamReader sr = new StreamReader(path, encoding, detectEncodingFromByteOrderMarks))
    {
        Load(sr);
    }
}
/// <summary>
/// Loads an HTML document from a file using the given character encoding and reader buffer size.
/// </summary>
/// <param name="path">The complete file path to be read. May not be null.</param>
/// <param name="encoding">The character encoding to use. May not be null.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
/// <param name="buffersize">The minimum buffer size.</param>
/// <exception cref="ArgumentNullException"><paramref name="path"/> or <paramref name="encoding"/> is null.</exception>
public void Load(string path, Encoding encoding, bool detectEncodingFromByteOrderMarks, int buffersize)
{
    if (path == null)
    {
        throw new ArgumentNullException("path");
    }
    if (encoding == null)
    {
        throw new ArgumentNullException("encoding");
    }
    // 'using' closes the file even when loading throws.
    using (StreamReader sr = new StreamReader(path, encoding, detectEncodingFromByteOrderMarks, buffersize))
    {
        Load(sr);
    }
}
/// <summary>
/// Loads the HTML document from the specified string.
/// </summary>
/// <param name="html">String containing the HTML document to load. May not be null.</param>
/// <exception cref="ArgumentNullException"><paramref name="html"/> is null.</exception>
public void LoadHtml(string html)
{
    if (html == null)
    {
        throw new ArgumentNullException("html");
    }
    // 'using' releases the reader even when loading throws.
    using (StringReader sr = new StringReader(html))
    {
        Load(sr);
    }
}
/// <summary>
/// Detects the encoding of an HTML file and then loads that file; shorthand for
/// the two-argument overload with encoding detection switched on.
/// </summary>
/// <param name="path">The complete file path to be read.</param>
public void DetectEncodingAndLoad(string path)
{
    const bool detect = true;
    DetectEncodingAndLoad(path, detect);
}
/// <summary>
/// Optionally detects the encoding of an HTML file and then loads the file,
/// using the detected encoding when one was found.
/// </summary>
/// <param name="path">The complete file path to be read. May not be null.</param>
/// <param name="detectEncoding">true to detect encoding, false otherwise.</param>
public void DetectEncodingAndLoad(string path, bool detectEncoding)
{
    if (path == null)
    {
        throw new ArgumentNullException("path");
    }

    System.Text.Encoding detected = detectEncoding ? DetectEncoding(path) : null;

    if (detected != null)
    {
        Load(path, detected);
    }
    else
    {
        Load(path);
    }
}
/// <summary>
/// Loads the HTML document from the specified TextReader. Every other Load
/// overload ends up here.
/// </summary>
/// <param name="reader">The TextReader used to feed the HTML data into the document. May not be null.</param>
public void Load(TextReader reader)
{
    if (reader == null)
    {
        throw new ArgumentNullException("reader");
    }

    _onlyDetectEncoding = false;

    // The bookkeeping tables only exist while the matching option is on.
    _openednodes = OptionCheckSyntax ? new Hashtable() : null;
    _nodesid = OptionUseIdAttribute ? new Hashtable() : null;

    StreamReader streamReader = reader as StreamReader;
    if (streamReader == null)
    {
        _streamencoding = null;
        // Expensive, but cannot avoid since TextReader doesnt have any length of the underlying data
        _text = new DummyStreamAsArray (reader.ReadToEnd());
    }
    else
    {
        try
        {
            // trigger bom read if needed
            streamReader.Peek();
        }
        catch
        {
            // void on purpose
        }
        _streamencoding = streamReader.CurrentEncoding;
        _text = new ImplStreamAsArray (streamReader);
    }
    _declaredencoding = null;

    // SLIM: _text = reader.ReadToEnd();
    _documentnode = CreateNode(HtmlNodeType.Document, 0);
    Parse();

    if (!OptionCheckSyntax)
    {
        return;
    }

    // Whatever is still registered as opened after parsing lacks its end tag.
    foreach (HtmlNode unclosed in _openednodes.Values)
    {
        if (unclosed._starttag) // otherwise: already reported
        {
            string excerpt = string.Empty;
            if (OptionExtractErrorSourceText)
            {
                excerpt = unclosed.OuterHtml;
                if (excerpt.Length > OptionExtractErrorSourceTextMaxLength)
                {
                    excerpt = excerpt.Substring(0, OptionExtractErrorSourceTextMaxLength);
                }
            }

            AddError(
                HtmlParseErrorCode.TagNotClosed,
                unclosed._line, unclosed._lineposition,
                unclosed._streamposition, excerpt,
                "End tag </" + unclosed.Name + "> was not found");
        }
    }

    // we don't need this anymore
    _openednodes.Clear();
}
// Picks the encoding used for output: the declared encoding when there is one,
// otherwise the stream encoding, otherwise OptionDefaultStreamEncoding.
internal System.Text.Encoding GetOutEncoding()
{
    if (_declaredencoding == null)
    {
        return (_streamencoding != null) ? _streamencoding : OptionDefaultStreamEncoding;
    }
    return _declaredencoding;
}
/// <summary>
/// Gets the encoding the document is written with, as chosen by GetOutEncoding.
/// </summary>
public System.Text.Encoding Encoding
{
    get
    {
        System.Text.Encoding output = GetOutEncoding();
        return output;
    }
}
/// <summary>
/// Saves the HTML document to the specified stream, using the document's output encoding.
/// The stream is left open; it remains owned by the caller.
/// </summary>
/// <param name="outStream">The stream to which you want to save.</param>
public void Save(Stream outStream)
{
    StreamWriter sw = new StreamWriter(outStream, GetOutEncoding());
    Save(sw);
    // StreamWriter buffers its output and this writer is never closed, so
    // push the buffered text to the caller's stream before returning.
    sw.Flush();
}
/// <summary>
/// Saves the HTML document to the specified stream using the given encoding.
/// The stream is left open; it remains owned by the caller.
/// </summary>
/// <param name="outStream">The stream to which you want to save. May not be null.</param>
/// <param name="encoding">The character encoding to use. May not be null.</param>
/// <exception cref="ArgumentNullException"><paramref name="outStream"/> or <paramref name="encoding"/> is null.</exception>
public void Save(Stream outStream, System.Text.Encoding encoding)
{
    if (outStream == null)
    {
        throw new ArgumentNullException("outStream");
    }
    if (encoding == null)
    {
        throw new ArgumentNullException("encoding");
    }
    StreamWriter sw = new StreamWriter(outStream, encoding);
    Save(sw);
    // StreamWriter buffers its output and this writer is never closed, so
    // push the buffered text to the caller's stream before returning.
    sw.Flush();
}
/// <summary>
/// Saves the mixed document to the specified file (overwriting it), using the document's output encoding.
/// </summary>
/// <param name="filename">The location of the file where you want to save the document.</param>
public void Save(string filename)
{
    // 'using' flushes and closes the file even when writing throws.
    using (StreamWriter sw = new StreamWriter(filename, false, GetOutEncoding()))
    {
        Save(sw);
    }
}
/// <summary>
/// Saves the mixed document to the specified file (overwriting it) using the given encoding.
/// </summary>
/// <param name="filename">The location of the file where you want to save the document. May not be null.</param>
/// <param name="encoding">The character encoding to use. May not be null.</param>
/// <exception cref="ArgumentNullException"><paramref name="filename"/> or <paramref name="encoding"/> is null.</exception>
public void Save(string filename, System.Text.Encoding encoding)
{
    if (filename == null)
    {
        throw new ArgumentNullException("filename");
    }
    if (encoding == null)
    {
        throw new ArgumentNullException("encoding");
    }
    // 'using' flushes and closes the file even when writing throws.
    using (StreamWriter sw = new StreamWriter(filename, false, encoding))
    {
        Save(sw);
    }
}
/// <summary>
/// Saves the HTML document to the specified StreamWriter by handing it to the TextWriter overload.
/// </summary>
/// <param name="writer">The StreamWriter to which you want to save.</param>
public void Save(StreamWriter writer)
{
    // Typed as TextWriter so the TextWriter overload is selected, not this one again.
    TextWriter target = writer;
    Save(target);
}
/// <summary>
/// Writes the document node to the specified TextWriter.
/// </summary>
/// <param name="writer">The TextWriter to which you want to save. May not be null.</param>
public void Save(TextWriter writer)
{
    if (writer != null)
    {
        DocumentNode.WriteTo(writer);
        return;
    }
    throw new ArgumentNullException("writer");
}
/// <summary>
/// Saves the HTML document to the specified XmlWriter and flushes the writer.
/// </summary>
/// <param name="writer">The XmlWriter to which you want to save. May not be null.</param>
/// <exception cref="ArgumentNullException"><paramref name="writer"/> is null.</exception>
public void Save(XmlWriter writer)
{
    // Same guard as Save(TextWriter): fail with a clear argument error
    // instead of a null dereference further down.
    if (writer == null)
    {
        throw new ArgumentNullException("writer");
    }
    DocumentNode.WriteTo(writer);
    writer.Flush();
}
/// <summary>
/// Creates a new XPathNavigator for this HTML document, built on the document node.
/// </summary>
/// <returns>An XPathNavigator object. The XPathNavigator is positioned on the root of the document.</returns>
public XPathNavigator CreateNavigator()
{
    XPathNavigator navigator = new HtmlNodeNavigator(this, _documentnode);
    return navigator;
}
// Registers 'node' under the lower-cased 'id', or removes that entry when
// 'node' is null. Does nothing when id tracking is off, when there is no id
// table, or when 'id' is null.
internal void SetIdForNode(HtmlNode node, string id)
{
    if (!OptionUseIdAttribute || _nodesid == null || id == null)
    {
        return;
    }

    string key = id.ToLower();
    if (node != null)
    {
        _nodesid[key] = node;
    }
    else
    {
        _nodesid.Remove(key);
    }
}
/// <summary>
/// Looks up the HTML node registered under the given 'id' attribute value
/// (the lookup key is the lower-cased id).
/// </summary>
/// <param name="id">The attribute id to match. May not be null.</param>
/// <returns>The HTML node with the matching id or null if not found.</returns>
public HtmlNode GetElementbyId(string id)
{
    if (id == null)
    {
        throw new ArgumentNullException("id");
    }
    if (_nodesid != null)
    {
        string key = id.ToLower();
        return _nodesid[key] as HtmlNode;
    }
    // No id table exists, so lookups by id are not available.
    throw new Exception(HtmlExceptionUseIdAttributeFalse);
}
/// <summary>
/// Creates a new element node belonging to this document and gives it the specified name.
/// </summary>
/// <param name="name">The qualified name of the element. May not be null.</param>
/// <returns>The new HTML node.</returns>
public HtmlNode CreateElement(string name)
{
    if (name != null)
    {
        HtmlNode element = CreateNode(HtmlNodeType.Element);
        element._name = name;
        return element;
    }
    throw new ArgumentNullException("name");
}
/// <summary>
/// Creates a new, empty HTML comment node belonging to this document.
/// </summary>
/// <returns>The new HTML comment node.</returns>
public HtmlCommentNode CreateComment()
{
    HtmlNode created = CreateNode(HtmlNodeType.Comment);
    return (HtmlCommentNode)created;
}
/// <summary>
/// Creates a new HTML comment node and assigns the given comment text to it.
/// </summary>
/// <param name="comment">The comment text. May not be null.</param>
/// <returns>The newly created comment node.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="comment"/> is null.</exception>
public HtmlCommentNode CreateComment(string comment)
{
    if (comment != null)
    {
        HtmlCommentNode commentNode = CreateComment();
        commentNode.Comment = comment;
        return commentNode;
    }

    throw new ArgumentNullException("comment");
}
/// <summary>
/// Creates a new, empty HTML text node owned by this document.
/// </summary>
/// <returns>The newly created text node.</returns>
public HtmlTextNode CreateTextNode()
{
    HtmlNode created = CreateNode(HtmlNodeType.Text);
    return (HtmlTextNode)created;
}
/// <summary>
/// Creates a new HTML text node and assigns the given text to it.
/// </summary>
/// <param name="text">The text of the node. May not be null.</param>
/// <returns>The newly created text node.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="text"/> is null.</exception>
public HtmlTextNode CreateTextNode(string text)
{
    if (text != null)
    {
        HtmlTextNode textNode = CreateTextNode();
        textNode.Text = text;
        return textNode;
    }

    throw new ArgumentNullException("text");
}
// Creates a node of the requested type, passing -1 as the index to the
// two-argument overload.
internal HtmlNode CreateNode(HtmlNodeType type)
{
    const int noIndex = -1;
    return CreateNode(type, noIndex);
}
// Factory for nodes owned by this document: comment and text types get their
// dedicated subclasses, every other type is created as a plain HtmlNode.
internal HtmlNode CreateNode(HtmlNodeType type, int index)
{
    if (type == HtmlNodeType.Comment)
    {
        return new HtmlCommentNode(this, index);
    }

    if (type == HtmlNodeType.Text)
    {
        return new HtmlTextNode(this, index);
    }

    return new HtmlNode(type, this, index);
}
// Creates a new attribute object owned by this document.
internal HtmlAttribute CreateAttribute()
{
    HtmlAttribute created = new HtmlAttribute(this);
    return created;
}
/// <summary>
/// Creates a new HTML attribute owned by this document and gives it the specified name.
/// </summary>
/// <param name="name">The name of the attribute. May not be null.</param>
/// <returns>The newly created attribute.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="name"/> is null.</exception>
public HtmlAttribute CreateAttribute(string name)
{
    if (name != null)
    {
        HtmlAttribute attribute = CreateAttribute();
        attribute.Name = name;
        return attribute;
    }

    throw new ArgumentNullException("name");
}
/// <summary>
/// Creates a new HTML attribute owned by this document with the specified name and value.
/// </summary>
/// <param name="name">The name of the attribute. May not be null.</param>
/// <param name="value">The value of the attribute.</param>
/// <returns>The newly created attribute.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="name"/> is null.</exception>
public HtmlAttribute CreateAttribute(string name, string value)
{
    if (name != null)
    {
        HtmlAttribute attribute = CreateAttribute(name);
        attribute.Value = value;
        return attribute;
    }

    throw new ArgumentNullException("name");
}
/// <summary>
/// Gets the node that sits at the root of this document.
/// </summary>
public HtmlNode DocumentNode
{
    get { return this._documentnode; }
}
/// <summary>
/// Gets the document CRC32 checksum if OptionComputeChecksum was set to true before parsing;
/// returns 0 when no checksum object exists.
/// </summary>
public int CheckSum
{
    get
    {
        return (_crc32 != null) ? (int)_crc32.CheckSum : 0;
    }
}
/// <summary>
/// Gets or sets the stream-mode flag. While it is set, the parser hands finished
/// nodes to the ReportNode callback (when one is attached) and does not append
/// them to their parent node.
/// </summary>
public bool StreamMode
{
    get { return this._streammode; }
    set { this._streammode = value; }
}
// Builds an HtmlParseError from the given details, appends it to the
// document's parse error list and hands it back to the caller.
private HtmlParseError AddError(
    HtmlParseErrorCode code,
    int line,
    int linePosition,
    int streamPosition,
    string sourceText,
    string reason)
{
    HtmlParseError parseError =
        new HtmlParseError(code, line, linePosition, streamPosition, sourceText, reason);
    _parseerrors.Add(parseError);
    return parseError;
}
// States of the character-level state machine driven by Parse().
// The declaration order is kept as-is so the underlying values do not change.
private enum ParseState
{
    // Plain text between tags; also the initial state.
    Text,

    // A '<' was just seen; the next character decides between start and end tag.
    WhichTag,

    // Reading the tag name.
    Tag,

    // Inside a tag, between attributes.
    BetweenAttributes,

    // A '/' (or '?') was seen inside a tag; a following '>' closes the node.
    EmptyTag,

    // Reading an attribute name.
    AttributeName,

    // Whitespace after an attribute name; an '=' may still follow.
    AttributeBeforeEquals,

    // An '=' was seen; the attribute value has not started yet.
    AttributeAfterEquals,

    // Reading an unquoted attribute value.
    AttributeValue,

    // Inside a "<!" construct, up to its closing '>'.
    Comment,

    // Reading an attribute value delimited by ' or ".
    QuotedAttributeValue,

    // Inside a "<% ... %>" block.
    ServerSideCode,

    // Inside a quoted string within PcData content.
    PcDataQuote,

    // Raw content of an element reported by HtmlNode.IsCDataElement.
    PcData
}
// Advances the read position by one character: feeds the current character to
// the checksum (when one is being computed), bumps the stream index and keeps
// the line / column counters up to date.
private void IncrementPosition()
{
    if (_crc32 != null)
    {
        // REVIEW: should we add some checksum code in DecrementPosition too?
        _crc32.AddToCRC32(_c);
    }

    _index++;

    // Remember the column we are leaving so DecrementPosition can restore it.
    _maxlineposition = _lineposition;

    if (_c != 10)
    {
        _lineposition++;
    }
    else
    {
        // Line feed: start a new line at column 1.
        _lineposition = 1;
        _line++;
    }
}
// Steps the read position back by one character, undoing the index and
// line / column bookkeeping done by IncrementPosition.
private void DecrementPosition()
{
    _index--;

    if (_lineposition != 1)
    {
        _lineposition--;
    }
    else
    {
        // At the first column: go back to the previous line, using the column
        // saved by the last IncrementPosition call.
        _lineposition = _maxlineposition;
        _line--;
    }
}
// Tokenizes the document text held in _text, one character per loop iteration,
// using the ParseState state machine. Node and attribute boundaries are recorded
// through the Push* helpers. The loop ends at end of text or as soon as
// _stop_parsing is set (PushNodeEnd sets it from the ReportNode callback).
//
// Inside the loop the current character is read into _c and IncrementPosition()
// is called immediately, so while a state is being handled _index already points
// at the character AFTER _c; _index-1 is the position of _c itself.
1371 private void Parse()
// Quote character (' or ") that opened the quoted attribute value being read.
1373 int lastquote = 0;
1374 if (OptionComputeChecksum)
1376 _crc32 = new Crc32();
// Reset all parser bookkeeping before starting.
1379 _lastnodes = new Hashtable();
1380 _c = 0;
1381 _fullcomment = false;
1382 _parseerrors = new ArrayList();
1383 _line = 1;
1384 _lineposition = 1;
1385 _maxlineposition = 1;
1387 _state = ParseState.Text;
1388 _oldstate = _state;
1389 _documentnode._innerlength = _text.FullLength;
1390 _documentnode._outerlength = _text.FullLength;
1392 _lastparentnode = _documentnode;
1393 _currentnode = CreateNode(HtmlNodeType.Text, 0);
1394 _currentattribute = null;
1396 _index = 0;
1397 PushNodeStart(HtmlNodeType.Text, 0);
1398 // SLIM: while (_index<_text.Length)
1399 while (! _stop_parsing && ! _text.Eof (_index))
1401 _c = _text[_index];
1402 IncrementPosition();
// In most states NewCheck() is tried first: it returns true when _c is '<' and
// it has already started a new tag, comment or server-side block.
1404 switch(_state)
1406 case ParseState.Text:
1407 if (NewCheck())
1408 continue;
1409 break;
// Right after '<': a '/' means an end tag whose name starts at the next
// character; anything else is the first character of a start tag name, so it is
// pushed back and re-read in the Tag state.
1411 case ParseState.WhichTag:
1412 if (NewCheck())
1413 continue;
1414 if (_c == '/')
1416 PushNodeNameStart(false, _index);
1418 else
1420 PushNodeNameStart(true, _index-1);
1421 DecrementPosition();
1423 _state = ParseState.Tag;
1424 break;
// Reading the tag name. After each Push* call the state is re-checked and the
// transition is skipped if the callee changed _state (PushNodeEnd can switch to
// PcData).
1426 case ParseState.Tag:
1427 if (NewCheck())
1428 continue;
1429 if (IsWhiteSpace(_c))
1431 PushNodeNameEnd(_index-1);
1432 if (_state != ParseState.Tag)
1433 continue;
1434 _state = ParseState.BetweenAttributes;
1435 continue;
1437 if (_c == '/')
1439 PushNodeNameEnd(_index-1);
1440 if (_state != ParseState.Tag)
1441 continue;
1442 _state = ParseState.EmptyTag;
1443 continue;
1445 if (_c == '>')
1447 PushNodeNameEnd(_index-1);
1448 if (_state != ParseState.Tag)
1449 continue;
1450 PushNodeEnd(_index, false);
1451 if (_state != ParseState.Tag)
1452 continue;
1453 _state = ParseState.Text;
1454 PushNodeStart(HtmlNodeType.Text, _index);
1456 break;
// Inside a tag, between attributes: skip whitespace, detect '/', '?' and '>',
// otherwise the current character starts a new attribute name.
1458 case ParseState.BetweenAttributes:
1459 if (NewCheck())
1460 continue;
1462 if (IsWhiteSpace(_c))
1463 continue;
1465 if ((_c == '/') || (_c == '?'))
1467 _state = ParseState.EmptyTag;
1468 continue;
1471 if (_c == '>')
1473 PushNodeEnd(_index, false);
1474 if (_state != ParseState.BetweenAttributes)
1475 continue;
1476 _state = ParseState.Text;
1477 PushNodeStart(HtmlNodeType.Text, _index);
1478 continue;
1481 PushAttributeNameStart(_index-1);
1482 _state = ParseState.AttributeName;
1483 break;
// After '/' or '?': a '>' ends the node with close=true; any other character
// returns to attribute scanning.
1485 case ParseState.EmptyTag:
1486 if (NewCheck())
1487 continue;
1489 if (_c == '>')
1491 PushNodeEnd(_index, true);
1492 if (_state != ParseState.EmptyTag)
1493 continue;
1494 _state = ParseState.Text;
1495 PushNodeStart(HtmlNodeType.Text, _index);
1496 continue;
1498 _state = ParseState.BetweenAttributes;
1499 break;
// Reading an attribute name; it ends at whitespace, '=' or '>'.
1501 case ParseState.AttributeName:
1502 if (NewCheck())
1503 continue;
1505 if (IsWhiteSpace(_c))
1507 PushAttributeNameEnd(_index-1);
1508 _state = ParseState.AttributeBeforeEquals;
1509 continue;
1511 if (_c == '=')
1513 PushAttributeNameEnd(_index-1);
1514 _state = ParseState.AttributeAfterEquals;
1515 continue;
1517 if (_c == '>')
1519 PushAttributeNameEnd(_index-1);
1520 PushNodeEnd(_index, false);
1521 if (_state != ParseState.AttributeName)
1522 continue;
1523 _state = ParseState.Text;
1524 PushNodeStart(HtmlNodeType.Text, _index);
1525 continue;
1527 break;
// Whitespace followed the attribute name; an '=' may still come.
1529 case ParseState.AttributeBeforeEquals:
1530 if (NewCheck())
1531 continue;
1533 if (IsWhiteSpace(_c))
1534 continue;
1535 if (_c == '>')
1537 PushNodeEnd(_index, false);
1538 if (_state != ParseState.AttributeBeforeEquals)
1539 continue;
1540 _state = ParseState.Text;
1541 PushNodeStart(HtmlNodeType.Text, _index);
1542 continue;
1544 if (_c == '=')
1546 _state = ParseState.AttributeAfterEquals;
1547 continue;
1549 // no equals, no whitespace, it's a new attribute starting
1550 _state = ParseState.BetweenAttributes;
1551 DecrementPosition();
1552 break;
// After '=': a quote starts a quoted value (beginning after the quote), any
// other non-space character starts an unquoted value at _c.
1554 case ParseState.AttributeAfterEquals:
1555 if (NewCheck())
1556 continue;
1558 if (IsWhiteSpace(_c))
1559 continue;
1561 if ((_c == '\'') || (_c == '"'))
1563 _state = ParseState.QuotedAttributeValue;
1564 PushAttributeValueStart(_index);
1565 lastquote = _c;
1566 continue;
1568 if (_c == '>')
1570 PushNodeEnd(_index, false);
1571 if (_state != ParseState.AttributeAfterEquals)
1572 continue;
1573 _state = ParseState.Text;
1574 PushNodeStart(HtmlNodeType.Text, _index);
1575 continue;
1577 PushAttributeValueStart(_index-1);
1578 _state = ParseState.AttributeValue;
1579 break;
// Unquoted attribute value; it ends at whitespace or '>'.
1581 case ParseState.AttributeValue:
1582 if (NewCheck())
1583 continue;
1585 if (IsWhiteSpace(_c))
1587 PushAttributeValueEnd(_index-1);
1588 _state = ParseState.BetweenAttributes;
1589 continue;
1592 if (_c == '>')
1594 PushAttributeValueEnd(_index-1);
1595 PushNodeEnd(_index, false);
1596 if (_state != ParseState.AttributeValue)
1597 continue;
1598 _state = ParseState.Text;
1599 PushNodeStart(HtmlNodeType.Text, _index);
1600 continue;
1602 break;
// Quoted attribute value: NewCheck() is deliberately not called here, so a '<'
// inside the quotes does not start a tag; only "<%" is recognised.
1604 case ParseState.QuotedAttributeValue:
1605 if (_c == lastquote)
1607 PushAttributeValueEnd(_index-1);
1608 _state = ParseState.BetweenAttributes;
1609 continue;
1611 if (_c == '<')
1613 //SLIM: if (_index<_text.Length)
1614 if (!_text.Eof (_index))
1616 if (_text[_index] == '%')
1618 _oldstate = _state;
1619 _state = ParseState.ServerSideCode;
1620 continue;
1624 break;
// Inside "<!...": a '>' ends the node. When _fullcomment is set (the node
// started with "<!--"), the '>' only counts if the two characters before it
// are both '-'.
1626 case ParseState.Comment:
1627 if (_c == '>')
1629 if (_fullcomment)
1631 if ((_text[_index-2] != '-') ||
1632 (_text[_index-3] != '-'))
1634 continue;
1637 PushNodeEnd(_index, false);
1638 _state = ParseState.Text;
1639 PushNodeStart(HtmlNodeType.Text, _index);
1640 continue;
1642 break;
// Inside "<% ... %>": on "%>" the state saved in _oldstate is restored (with
// two special cases) and the '>' is skipped.
// NOTE(review): IncrementPosition() is called here without re-reading _c, so
// the checksum and line bookkeeping see '%' rather than '>' - confirm this is
// intended.
1644 case ParseState.ServerSideCode:
1645 if (_c == '%')
1647 //SLIM: if (_index<_text.Length)
1648 if (! _text.Eof (_index))
1650 if (_text[_index] == '>')
1652 switch(_oldstate)
1654 case ParseState.AttributeAfterEquals:
1655 _state = ParseState.AttributeValue;
1656 break;
1658 case ParseState.BetweenAttributes:
1659 PushAttributeNameEnd(_index+1);
1660 _state = ParseState.BetweenAttributes;
1661 break;
1663 default:
1664 _state = _oldstate;
1665 break;
1667 IncrementPosition();
1671 break;
1673 // handle <script>a="</script>"</script>
// Inside a quoted string within PcData: leave on the matching quote character
// unless the character just before it is a backslash.
1674 case ParseState.PcDataQuote:
1675 if ((_c == _pcdata_quote_char) && (_text [_index - 2] != '\\')) {
1676 _pcdata_quote_char = '\0';
1677 _state = ParseState.PcData;
1679 break;
// Raw content of a CDATA-style element: scan for "</" + current node name
// followed by '>' or whitespace; everything before it becomes one text node.
// NOTE(review): the Debug(...) call below builds its message (including a
// Substring) for every character and before the end-of-buffer check that
// follows; PushNodeEnd guards its Debug calls with _debug - consider doing the
// same here.
1681 case ParseState.PcData:
1682 Debug ("PCDATA " + _currentnode.Name + " " + _text.Substring(_index-1, _currentnode._namelength+2));
1683 if (_c == '\"' || _c == '\''){
1684 _pcdata_quote_char = _c;
1685 _state = ParseState.PcDataQuote;
1686 break;
1688 // look for </tag + 1 char
1690 // check buffer end
1691 //SLIM: if ((_currentnode._namelength+3)<=(_text.Length-(_index-1)))
1692 if (! _text.Eof (_currentnode._namelength + _index + 1))
1694 if (string.Compare(_text.Substring(_index-1, _currentnode._namelength+2),
1695 "</" + _currentnode.Name, true) == 0)
1697 int c = _text[_index-1 + 2 + _currentnode.Name.Length];
1698 if ((c == '>') || (IsWhiteSpace(c)))
1700 // add the script as a text node
1701 HtmlNode script = CreateNode(HtmlNodeType.Text,
1702 _currentnode._outerstartindex + _currentnode._outerlength);
1703 script._outerlength = _index-1 - script._outerstartindex;
// In stream mode the text node is reported to the callback instead of being
// attached to the tree; a false return value stops parsing.
1704 if (_streammode && ReportNode != null)
1705 _stop_parsing = ! ReportNode (script);
1706 else
1707 _currentnode.AppendChild(script);
1708 Debug ("Found script: [" + script.InnerText + "]");
// Start the closing tag node at the '<' and continue in the Tag state with the
// name beginning right after "</".
1710 PushNodeStart(HtmlNodeType.Element, _index-1);
1711 PushNodeNameStart(false, _index-1 +2);
1712 _state = ParseState.Tag;
1713 IncrementPosition();
1717 break;
1721 // finish the current work
1722 if (_currentnode._namestartindex > 0)
1724 PushNodeNameEnd(_index);
1726 PushNodeEnd(_index, false);
1728 // we don't need this anymore
1729 _lastnodes.Clear();
// Handles a '<' in the input. Returns false (doing nothing) when the current
// character is not '<'. Otherwise it either enters a "<% ... %>" server-side
// block, or ends the node in progress and starts a comment ("<!") or element
// node, and returns true.
private bool NewCheck()
{
    if (_c != '<')
    {
        return false;
    }

    //SLIM: if (_index<_text.Length)
    if (!_text.Eof(_index) && (_text[_index] == '%'))
    {
        // "<%": record where the pending attribute value / attribute name /
        // node name begins, depending on what was being parsed.
        if (_state == ParseState.AttributeAfterEquals)
        {
            PushAttributeValueStart(_index - 1);
        }
        else if (_state == ParseState.BetweenAttributes)
        {
            PushAttributeNameStart(_index - 1);
        }
        else if (_state == ParseState.WhichTag)
        {
            PushNodeNameStart(true, _index - 1);
            _state = ParseState.Tag;
        }

        // Remember where to come back to once "%>" is found.
        _oldstate = _state;
        _state = ParseState.ServerSideCode;
        return true;
    }

    // A regular '<': finish the node in progress just before it.
    PushNodeEnd(_index - 1, true);
    _state = ParseState.WhichTag;

    //SLIM: if ((_index-1) <= (_text.Length-2))
    if (_text.Eof(_index) || (_text[_index] != '!'))
    {
        PushNodeStart(HtmlNodeType.Element, _index - 1);
        return true;
    }

    // "<!": start a comment node whose name is the '!' character.
    PushNodeStart(HtmlNodeType.Comment, _index - 1);
    PushNodeNameStart(true, _index);
    PushNodeNameEnd(_index + 1);
    _state = ParseState.Comment;

    //SLIM: if (_index<(_text.Length-2))
    if (!_text.Eof(_index + 2))
    {
        // "<!--" marks a full comment; the flag is left untouched when the
        // buffer is too short to tell.
        _fullcomment = (_text[_index + 1] == '-') && (_text[_index + 2] == '-');
    }

    return true;
}
// Inspects a freshly parsed node for an encoding declaration of the form
//   <meta http-equiv="content-type" content="text/html;charset=iso-8859-1" />
// and stores the result in _declaredencoding.
//
// When _onlyDetectEncoding is set, the outcome is delivered by throwing
// EncodingFoundException (carrying the encoding, or null when none is usable).
// Otherwise a CharsetMismatch parse error is recorded if the declared encoding
// disagrees with _streamencoding.
//
// node: the node that was just added; its attributes are already populated
// because this is called at node end.
private void ReadDocumentEncoding(HtmlNode node)
{
    if (!OptionReadEncoding)
    {
        return;
    }

    // quick check, avoids string alloc
    // NOTE(review): because of this length test, the "script", "style" and
    // "title" comparisons below can never match, and nodes whose name is not
    // four characters long never trigger the early exit - confirm whether the
    // early exit was meant to apply to every node.
    if (node._namelength != 4)
    {
        return;
    }

    string name = node.Name;

    // only these nodes can occur before meta
    // if we started seeing any other node, we will never see a meta node
    bool mayPrecedeMeta =
        (name == "head") || (name == "script") ||
        (name == "style") || (name == "title") ||
        (name == "link") || (name == "html") ||
        (name == "meta");

    if ((node.NodeType == HtmlNodeType.Element) && !mayPrecedeMeta)
    {
        _declaredencoding = null;
        if (_onlyDetectEncoding)
        {
            throw new EncodingFoundException(null);
        }

        // FIXME: Should also handle declaredencoding mismatch with detected
        // encoding, as done below. None of the current filters run in error
        // detection mode currently, so its not needed now.
        return;
    }

    if (name != "meta") // all nodes names are lowercase
    {
        return;
    }

    HtmlAttribute att = node.Attributes["http-equiv"];
    if ((att == null) || (string.Compare(att.Value, "content-type", true) != 0))
    {
        return;
    }

    HtmlAttribute content = node.Attributes["content"];
    if (content == null)
    {
        return;
    }

    string charset = NameValuePairList.GetNameValuePairsValue(content.Value, "charset");
    if (charset == null)
    {
        return;
    }

    // The charset name comes straight from the document and may be unknown or
    // malformed. Encoding.GetEncoding throws in that case; treat it as
    // "no usable declaration" instead of aborting the whole parse.
    try
    {
        _declaredencoding = Encoding.GetEncoding(charset);
    }
    catch (ArgumentException)
    {
        _declaredencoding = null;
    }
    catch (NotSupportedException)
    {
        _declaredencoding = null;
    }

    if (_onlyDetectEncoding)
    {
        throw new EncodingFoundException(_declaredencoding);
    }

    if ((_streamencoding != null) && (_declaredencoding != null))
    {
        if (_declaredencoding.WindowsCodePage != _streamencoding.WindowsCodePage)
        {
            AddError(
                HtmlParseErrorCode.CharsetMismatch,
                _line, _lineposition,
                _index, node.OuterHtml,
                "Encoding mismatch between StreamEncoding: " +
                _streamencoding.WebName + " and DeclaredEncoding: " + _declaredencoding.WebName);
        }
    }
}
// Starts a new attribute at the given text index: creates it, stamps it with
// the current line / column and the stream position, and makes it the
// attribute currently being parsed.
private void PushAttributeNameStart(int index)
{
    HtmlAttribute attribute = CreateAttribute();
    attribute._line = _line;
    attribute._lineposition = _lineposition;
    attribute._namestartindex = index;
    attribute._streamposition = index;
    _currentattribute = attribute;
}
// Ends the current attribute's name at the given text index (exclusive) and
// attaches the attribute to the node being parsed.
private void PushAttributeNameEnd(int index)
{
    int nameLength = index - _currentattribute._namestartindex;
    _currentattribute._namelength = nameLength;
    _currentnode.Attributes.Append(_currentattribute);
}
// Records the text index at which the current attribute's value begins.
private void PushAttributeValueStart(int index)
{
    HtmlAttribute attribute = _currentattribute;
    attribute._valuestartindex = index;
}
// Ends the current attribute's value at the given text index (exclusive) by
// storing the resulting value length.
private void PushAttributeValueEnd(int index)
{
    int valueLength = index - _currentattribute._valuestartindex;
    _currentattribute._valuelength = valueLength;
}
// Starts a new node of the given type at the given text index and makes it the
// node currently being parsed. The node is stamped with the current line and
// column; for elements the column is moved back by one.
private void PushNodeStart(HtmlNodeType type, int index)
{
    HtmlNode started = CreateNode(type, index);
    started._line = _line;
    started._lineposition = (type == HtmlNodeType.Element)
        ? _lineposition - 1
        : _lineposition;
    started._streamposition = index;
    _currentnode = started;
}
// Finishes the node currently being parsed at text position 'index'.
//
// index: position just past the node; the outer length is measured from the
//        node's outer start index up to it.
// close: true to close the node right away; it is also forced to true for
//        elements reported by HtmlNode.IsClosedElement / IsEmptyElement.
//
// Side effects visible here: may set _stop_parsing (stream mode callback), may
// append the node to _lastparentnode, may change _lastparentnode, may switch
// _state to PcData, and may set _currentnode to null.
1897 private void PushNodeEnd(int index, bool close)
1899 _currentnode._outerlength = index - _currentnode._outerstartindex;
1901 //SLIM: inform caller
// A false return value from ReportNode asks the parser to stop.
1902 if (_streammode && ReportNode != null)
1903 _stop_parsing = ! ReportNode (_currentnode);
1905 if (_debug) {
1906 if (_currentnode._nodetype == HtmlNodeType.Text)
1907 Debug ("Text:" + _currentnode.InnerText);
1908 else
1909 Debug ((_currentnode.StartTag ? "Start-" : "End-") + _currentnode.Name);
// Text and comment nodes: inner span equals outer span.
1911 if ((_currentnode._nodetype == HtmlNodeType.Text) ||
1912 (_currentnode._nodetype == HtmlNodeType.Comment))
1914 // forget about void nodes
1915 if (_currentnode._outerlength>0)
1917 _currentnode._innerlength = _currentnode._outerlength;
1918 _currentnode._innerstartindex = _currentnode._outerstartindex;
1919 // SLIM: no need to append child in stream mode
1920 // SLIM: whatever the caller needs to do, tell it to do now
1921 if (!_streammode && _lastparentnode != null)
1923 _lastparentnode.AppendChild(_currentnode);
// Every other node type: only start tags are linked in here.
1927 else
1929 if ((_currentnode._starttag) && (_lastparentnode != _currentnode))
1931 // add to parent node
1932 // SLIM: no need to append child in stream mode
1933 // SLIM: whatever the caller needs to do, tell it to do now
1934 if (!_streammode && _lastparentnode != null)
1936 _lastparentnode.AppendChild(_currentnode);
1939 ReadDocumentEncoding(_currentnode);
1941 // remember last node of this kind
1942 // SLIM: we still to store _currentnode to help other tags in the same level
1943 HtmlNode prev = (HtmlNode)_lastnodes[_currentnode.Name];
1944 _currentnode._prevwithsamename = prev;
1945 _lastnodes[_currentnode.Name] = _currentnode;
1947 // change parent?
1948 if ((_currentnode.NodeType == HtmlNodeType.Document) ||
1949 (_currentnode.NodeType == HtmlNodeType.Element))
1951 _lastparentnode = _currentnode;
// CDATA-style elements: switch the parser to PcData and return without closing
// the node.
1954 if (HtmlNode.IsCDataElement(CurrentNodeName()))
1956 _state = ParseState.PcData;
1957 return;
1960 if ((HtmlNode.IsClosedElement(_currentnode.Name)) ||
1961 (HtmlNode.IsEmptyElement(_currentnode.Name)))
1963 close = true;
// Close when requested or when this is an end tag; text and comment nodes are
// dropped as the current node afterwards.
1968 if ((close) || (!_currentnode._starttag))
1970 CloseCurrentNode();
1971 if ((_currentnode._nodetype == HtmlNodeType.Text) ||
1972 (_currentnode._nodetype == HtmlNodeType.Comment))
1973 _currentnode = null;
// Records whether the current node is a start tag and the text index at which
// its name begins.
private void PushNodeNameStart(bool starttag, int index)
{
    HtmlNode current = _currentnode;
    current._starttag = starttag;
    current._namestartindex = index;
}
// Returns the names of the "resetter" tags for the given tag name, or null
// when the tag has none. FixNestedTag uses them to decide whether a previous
// unclosed tag of the same name must be closed implicitly.
private string[] GetResetters(string name)
{
    if (name == "li")
    {
        return new string[] { "ul" };
    }

    if (name == "tr")
    {
        return new string[] { "table" };
    }

    if ((name == "th") || (name == "td"))
    {
        return new string[] { "tr", "table" };
    }

    return null;
}
// Applies the nested-tag fix for the node whose name has just been read.
// Closing tags are ignored; only start tags are of interest.
private void FixNestedTags()
{
    if (_currentnode._starttag)
    {
        string lowered = CurrentNodeName().ToLower();
        string[] resetters = GetResetters(lowered);
        FixNestedTag(lowered, resetters);
    }
}
// If the last node seen with this name is still open and no resetter node has
// been opened after it, closes that previous node with a synthetic closing
// node. Does nothing when the tag has no resetters.
private void FixNestedTag(string name, string[] resetters)
{
    if (resetters == null)
    {
        return;
    }

    HtmlNode previous = (HtmlNode)_lastnodes[name];
    if ((previous == null) || previous.Closed)
    {
        return;
    }

    // A resetter node in between means the previous node must stay open.
    if (FindResetterNodes(previous, resetters))
    {
        return;
    }

    // Create a fake closer node, marked as its own end node, and close the
    // previous node with it.
    HtmlNode closer = new HtmlNode(previous.NodeType, this, -1);
    closer._endnode = closer;
    previous.CloseNode(closer);
}
// Returns true when FindResetterNode finds a resetter for at least one of the
// given names; false when names is null or none is found.
private bool FindResetterNodes(HtmlNode node, string[] names)
{
    if (names != null)
    {
        foreach (string candidate in names)
        {
            if (FindResetterNode(node, candidate) != null)
            {
                return true;
            }
        }
    }

    return false;
}
// Returns the last node seen with the given name when it is still open and
// does not start before 'node' in the stream; otherwise returns null.
private HtmlNode FindResetterNode(HtmlNode node, string name)
{
    HtmlNode candidate = (HtmlNode)_lastnodes[name];

    bool unusable =
        (candidate == null) ||
        candidate.Closed ||
        (candidate._streamposition < node._streamposition);

    return unusable ? null : candidate;
}
// Ends the current node's name at the given text index (exclusive) and, when
// OptionFixNestedTags is on, runs the nested-tag fix for it.
private void PushNodeNameEnd(int index)
{
    int nameStart = _currentnode._namestartindex;
    _currentnode._namelength = index - nameStart;

    if (OptionFixNestedTags)
    {
        FixNestedTags();
    }
}
// Closes the node currently being parsed by pairing it with the last open node
// of the same name recorded in _lastnodes. When no such node exists, the cases
// handled are: closed elements (treated as their own opening tag), elements
// allowed to overlap (kept as lower-cased text), and error reporting for the
// rest. Unless an error was flagged, the last-parent pointer is updated at the
// end.
2080 private void CloseCurrentNode()
2082 if (_currentnode.Closed) // text or document are by def closed
2083 return;
// Set when the closing tag must be ignored; it suppresses the bookkeeping below.
2085 bool error = false;
2087 // find last node of this kind
2088 HtmlNode prev = (HtmlNode)_lastnodes[_currentnode.Name];
// No node with this name is recorded: there is nothing to pair the tag with.
2089 if (prev == null)
2091 if (HtmlNode.IsClosedElement(_currentnode.Name))
2093 // </br> will be seen as <br>
2094 _currentnode.CloseNode(_currentnode);
2096 // add to parent node
2097 if (_lastparentnode != null)
// Walk the parent's children from last to first looking for a childless node
// with the same name; the nodes passed on the way are stacked so they can be
// moved under it.
2099 HtmlNode foundNode = null;
2100 Stack futureChild = new Stack();
2101 for (HtmlNode node = _lastparentnode.LastChild; node != null; node = node.PreviousSibling)
2103 if ((node.Name == _currentnode.Name) && (! node.HasChildNodes))
2105 foundNode = node;
2106 break;
2108 futureChild.Push(node);
// Found one: re-parent the stacked siblings under it (the stack pops them in
// their original document order).
2110 if (foundNode != null)
2112 HtmlNode node = null;
2113 while(futureChild.Count != 0)
2115 node = (HtmlNode)futureChild.Pop();
2116 _lastparentnode.RemoveChild(node);
2117 foundNode.AppendChild(node);
// None found: the current node itself is added to the parent.
2120 else
2122 _lastparentnode.AppendChild(_currentnode);
2127 else
2129 // node has no parent
2130 // node is not a closed node
2132 if (HtmlNode.CanOverlapElement(_currentnode.Name))
2134 // this is a hack: add it as a text node
2135 HtmlNode closenode = CreateNode(HtmlNodeType.Text, _currentnode._outerstartindex);
2136 closenode._outerlength = _currentnode._outerlength;
2137 ((HtmlTextNode)closenode).Text = ((HtmlTextNode)closenode).Text.ToLower();
2138 if (_lastparentnode != null)
2140 _lastparentnode.AppendChild(closenode);
2144 else
// Stray end tag of an empty element: recorded as an error, but 'error' stays
// false so the parent update below still runs.
2146 if (HtmlNode.IsEmptyElement(_currentnode.Name))
2148 AddError(
2149 HtmlParseErrorCode.EndTagNotRequired,
2150 _currentnode._line, _currentnode._lineposition,
2151 _currentnode._streamposition, _currentnode.OuterHtml,
2152 "End tag </" + _currentnode.Name + "> is not required");
2154 else
2156 // node cannot overlap, node is not empty
2157 AddError(
2158 HtmlParseErrorCode.TagNotOpened,
2159 _currentnode._line, _currentnode._lineposition,
2160 _currentnode._streamposition, _currentnode.OuterHtml,
2161 "Start tag <" + _currentnode.Name + "> was not found");
2162 error = true;
// A node with this name is recorded.
2167 else
2169 if (OptionFixNestedTags)
// A resetter node found by FindResetterNodes for 'prev' makes this end tag invalid.
2171 if (FindResetterNodes(prev, GetResetters(_currentnode.Name)))
2173 AddError(
2174 HtmlParseErrorCode.EndTagInvalidHere,
2175 _currentnode._line, _currentnode._lineposition,
2176 _currentnode._streamposition, _currentnode.OuterHtml,
2177 "End tag </" + _currentnode.Name + "> invalid here");
2178 error = true;
// Pop 'prev' from the per-name chain and close it with the current node.
2182 if (!error)
2184 _lastnodes[_currentnode.Name] = prev._prevwithsamename;
2185 prev.CloseNode(_currentnode);
2190 // we close this node, get grandparent
2191 if (!error)
2193 if ((_lastparentnode != null) &&
2194 ((!HtmlNode.IsClosedElement(_currentnode.Name)) ||
2195 (_currentnode._starttag)))
2197 UpdateLastParentNode();
// Moves _lastparentnode up the tree until it points at a node that is not
// closed. If the walk runs off the top of the tree (or there was no last
// parent to begin with), the document node becomes the last parent.
internal void UpdateLastParentNode()
{
    // The null test comes first so that calling this with no current parent
    // falls back to the document node instead of failing on a null reference.
    while ((_lastparentnode != null) && _lastparentnode.Closed)
    {
        _lastparentnode = _lastparentnode.ParentNode;
    }

    if (_lastparentnode == null)
    {
        _lastparentnode = _documentnode;
    }
}
// Returns the name of the attribute being parsed, cut from the document text
// using its recorded start index and length.
private string CurrentAttributeName()
{
    int start = _currentattribute._namestartindex;
    int length = _currentattribute._namelength;
    return _text.Substring(start, length);
}
// Returns the value of the attribute being parsed, cut from the document text
// using its recorded start index and length.
private string CurrentAttributeValue()
{
    int start = _currentattribute._valuestartindex;
    int length = _currentattribute._valuelength;
    return _text.Substring(start, length);
}
// Returns the name of the node being parsed, cut from the document text using
// its recorded name start index and length (no case conversion is applied here).
private string CurrentNodeName()
{
    int start = _currentnode._namestartindex;
    int length = _currentnode._namelength;
    return _text.Substring(start, length);
}
// Returns the outer text span of the node being parsed, cut from the document
// text using its recorded outer start index and length.
private string CurrentNodeOuter()
{
    int start = _currentnode._outerstartindex;
    int length = _currentnode._outerlength;
    return _text.Substring(start, length);
}
// Returns the inner text span of the node being parsed, cut from the document
// text using its recorded inner start index and length.
private string CurrentNodeInner()
{
    int start = _currentnode._innerstartindex;
    int length = _currentnode._innerlength;
    return _text.Substring(start, length);
}
/// <summary>
/// Tells whether the given character code is treated as whitespace by the parser:
/// line feed (10), carriage return (13), space (32) or horizontal tab (9).
/// </summary>
/// <param name="c">The character to check.</param>
/// <returns>true if the specified character is considered a whitespace character; otherwise false.</returns>
public static bool IsWhiteSpace(int c)
{
    switch (c)
    {
        case 9:
        case 10:
        case 13:
        case 32:
            return true;

        default:
            return false;
    }
}
2259 internal class EncodingFoundException: Exception
2261 private Encoding _encoding;
// Creates the exception carrying the encoding that was detected; the value may
// be null (ReadDocumentEncoding throws it with null when no encoding applies).
internal EncodingFoundException(Encoding encoding)
{
    this._encoding = encoding;
}
2268 internal Encoding Encoding
2272 return _encoding;