Forgot to bump FSQ version. Weekend syndrome.
[beagle.git] / Filters / HtmlAgilityPack / HtmlDocument.cs
blob6111f4851e4729072ed248360c6c51564e4fbbc5
1 // HtmlAgilityPack V1.0 - Simon Mourier <simonm@microsoft.com>
3 /*
4 Copyright (C) 2003 Simon Mourier <simonm@microsoft.com>
5 All rights reserved.
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions
9 are met:
10 1. Redistributions of source code must retain the above copyright
11 notice, this list of conditions and the following disclaimer.
12 2. Redistributions in binary form must reproduce the above copyright
13 notice, this list of conditions and the following disclaimer in the
14 documentation and/or other materials provided with the distribution.
15 3. The name of the author may not be used to endorse or promote products
16 derived from this software without specific prior written permission.
18 THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
19 IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
20 OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
21 IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
22 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
23 NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
27 THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 using System;
31 using System.IO;
32 using System.Text;
33 using System.Diagnostics;
34 using System.Collections;
35 using System.Text.RegularExpressions;
36 using System.Xml;
37 using System.Xml.XPath;
40 // Legend: SLIM=Comment added describing changes to original HtmlAgilityPack
41 // to reduce memory consumption
42 // Once the parser is free of bugs, the comments will be taken out
43 namespace HtmlAgilityPack
/// <summary>
/// Identifies the kind of problem reported by the parser.
/// </summary>
public enum HtmlParseErrorCode
{
    /// <summary>
    /// A tag was opened but never closed.
    /// </summary>
    TagNotClosed = 0,

    /// <summary>
    /// A tag was closed without having been opened.
    /// </summary>
    TagNotOpened = 1,

    /// <summary>
    /// The stream encoding and the encoding declared in the document (META) disagree.
    /// </summary>
    CharsetMismatch = 2,

    /// <summary>
    /// An end tag was present although none was required.
    /// </summary>
    EndTagNotRequired = 3,

    /// <summary>
    /// An end tag appeared at a position where it is not valid.
    /// </summary>
    EndTagInvalidHere = 4
}
/// <summary>
/// Describes a single parsing error found while parsing a document.
/// Instances are immutable; all values are supplied at construction time.
/// </summary>
public class HtmlParseError
{
    private readonly HtmlParseErrorCode _code;
    private readonly string _reason;
    private readonly string _sourceText;
    private readonly int _line;
    private readonly int _linePosition;
    private readonly int _streamPosition;

    // Only code inside this assembly can create error records.
    internal HtmlParseError(
        HtmlParseErrorCode code,
        int line,
        int linePosition,
        int streamPosition,
        string sourceText,
        string reason)
    {
        _code = code;
        _reason = reason;
        _sourceText = sourceText;
        _line = line;
        _linePosition = linePosition;
        _streamPosition = streamPosition;
    }

    /// <summary>
    /// Gets the type of error.
    /// </summary>
    public HtmlParseErrorCode Code
    {
        get { return _code; }
    }

    /// <summary>
    /// Gets the line number of this error in the document.
    /// </summary>
    public int Line
    {
        get { return _line; }
    }

    /// <summary>
    /// Gets the column number of this error in the document.
    /// </summary>
    public int LinePosition
    {
        get { return _linePosition; }
    }

    /// <summary>
    /// Gets the absolute stream position of this error, relative to the start of the document.
    /// </summary>
    public int StreamPosition
    {
        get { return _streamPosition; }
    }

    /// <summary>
    /// Gets the full text of the line containing the error.
    /// </summary>
    public string SourceText
    {
        get { return _sourceText; }
    }

    /// <summary>
    /// Gets a description of the error.
    /// </summary>
    public string Reason
    {
        get { return _reason; }
    }
}
// SLIM: creating this class to wrap around a textreader
// to emulate ReadToEnd () behaviour.
// Only two blocks of text are kept in memory at any time:
//   _buf_current  holds characters [_position, _position + _block_size)
//   _buf_previous holds characters [_position - _block_size, _position)
// Anything outside that window is served by re-reading the stream from the start.
class StreamAsArray {
    private StreamReader _reader;
    private int _length;            // total character count; only meaningful once _eof is true
    private int _position;          // absolute index of _buf_current [0]
    private bool _eof;              // set once a block read came back short
    private char[] _buf_previous;   // could have used only one array
    private char[] _buf_current;    // but, this is cleaner
    private int _block_size;

    // Wraps the reader and loads the first block.
    // Throws ArgumentNullException when r is null (previously a NullReferenceException
    // surfaced later, inside the first Read).
    public StreamAsArray (StreamReader r)
    {
        if (r == null)
            throw new ArgumentNullException ("r");

        _reader = r;
        _length = 0;
        _position = 0;
        _eof = false;

        _block_size = 1024;
        _buf_previous = new char [_block_size];
        _buf_current = new char [_block_size];

        Read (true);
    }

    // Loads the next block into _buf_current. Unless this is the initial read,
    // the current block is first moved to _buf_previous and the window advances.
    private void Read (bool initial)
    {
        if ( !initial) {
            Array.Copy (_buf_current, _buf_previous, _block_size);
            _position += _block_size;
        }
        HtmlDocument.Debug ("Debug: Read in buffer at:" + _position);

        // ReadBlock keeps reading until the block is full or the stream ends.
        // Read () may legitimately return fewer characters before the end of
        // the stream, which used to be mistaken for EOF.
        int num_read = _reader.ReadBlock (_buf_current, 0, _block_size);
        if (num_read < _block_size) {
            _eof = true;
            _length = _position + num_read;
        }
        HtmlDocument.Debug ("[" + new string (_buf_current, 0, num_read) + "]");
    }

    // Returns true when index is exactly one past the last character.
    // If the end has not been seen yet and index falls in the block right
    // after the current one, that block is read first.
    public bool Eof (int index) {
        if (_eof)
            return (index == _length);
        else {
            if (index >= _position + _block_size &&
                index < _position + _block_size + _block_size)
                Read (false);
            if (_eof)
                return (index == _length);
            else
                return false;
        }
    }

    // Returns the character at the given absolute index. The index must lie in
    // the previous block, the current block, or the block immediately after
    // the current one (which is then read); anything else throws.
    public char this[int index] {
        get {
            if (index >= _position &&
                index < _position + _block_size)
                return _buf_current [index % _block_size];
            if (index >= _position - _block_size &&
                index < _position)
                return _buf_previous [ index % _block_size];
            if (index >= _position + _block_size &&
                index < _position + _block_size + _block_size) {
                Read (false);
                return _buf_current [index % _block_size];
            }
            // Routed through the debug switch instead of writing to stdout unconditionally.
            HtmlDocument.Debug ("EXCEPTION!!!");
            throw new Exception (String.Format ("{0} is out of current bounds:[{1}-{2}] and further than read-ahead",
                                index,
                                _position - _block_size,
                                _position + _block_size - 1));
        }
    }

    // evil function ... you get what you pay for!
    // Re-reads the stream from the beginning to fetch text that is no longer
    // (or not entirely) inside the two-block window, then puts the reader back
    // where the window expects it.
    private string OutOfBandRead (int startindex, int length)
    {
        HtmlDocument.Debug ("Out of band read! From " + startindex + " to " + (startindex + length - 1));
        ResetPosition (startindex);
        // ahh.. now we are at the correct place
        // create a buffer of required length
        // who cares if the buffer size does not align well
        // with page boundary
        char[] temp_buf = new char [length];
        int num_read = _reader.ReadBlock (temp_buf, 0, length);
        if (num_read < length) {
            // Shouldnt occur!!!
            _eof = true;
            _length = startindex + num_read;
        }
        // discard data and reset stream position
        int t = (_eof ? _length :_position + _block_size);
        ResetPosition (t);
        // Only return what was actually read; the tail of temp_buf is '\0' padding.
        return new String (temp_buf, 0, num_read);
    }

    // streamreader does not allow seeking
    // seek on its basestream does not reflect the position
    // of the reader - it is governed by the buffer size
    // of the underlying stream
    // :( so, rewind and read forward from the beginning ...
    // NOTE(review): after rewinding, a byte-order mark that the reader skipped
    // on the first pass may be decoded as a character, shifting indices by
    // one - confirm against StreamReader.DiscardBufferedData behaviour.
    private void ResetPosition (int pos)
    {
        _reader.DiscardBufferedData ();
        _reader.BaseStream.Position = 0;
        // read in chunks of block_size
        int n1 = pos / _block_size;
        int n2 = pos % _block_size;
        char[] tmp = new char [_block_size];
        // skip the whole blocks, then the remainder, until we reach pos
        for (int i = 0; i < n1; ++i)
            _reader.ReadBlock (tmp, 0, _block_size);
        if (n2 > 0)
            _reader.ReadBlock (tmp, 0, n2);
    }

    // Returns length characters starting at startindex. Requests longer than
    // one block, or starting before the previous block, go through
    // OutOfBandRead; everything else is assembled from the two buffers.
    public string Substring (int startindex, int length)
    {
        if (length == 0) {
            HtmlDocument.Debug ("substring:" + startindex + " " + length + " " + _position + ":");
            return String.Empty;
        }
        if (length > _block_size || startindex < _position - _block_size) {
            return OutOfBandRead (startindex, length);
        }
        // The request runs past the current block: advance the window by one block.
        if (startindex + length - 1 >= _position + _block_size) {
            Read (false);
        }
        string substr;
        if (startindex < _position) {
            // Starts in the previous block; len_1 is how much of it lies there.
            int len_1 = _position - startindex;
            if (length < len_1)
                substr = new String (_buf_previous, _block_size - len_1, length);
            else {
                substr = new String (_buf_previous, _block_size - len_1, len_1);
                substr += new String (_buf_current, 0, length - len_1);
            }
        } else {
            substr = new String (_buf_current, startindex - _position, length);
        }
        return substr;
    }

    // FIXME: Is this costly ?
    // Returns the length of the underlying stream as reported by
    // BaseStream.Length (a byte count, not a character count).
    public int FullLength {
        get {
            return (int)_reader.BaseStream.Length;
        }
    }
}
326 /// <summary>
327 /// Represents a complete HTML document.
328 /// </summary>
329 public class HtmlDocument: IXPathNavigable
// SLIM: Make the parser event driven.
// NodeHandler is the callback type used by FilterHtml; the bool it returns
// lets the callback signal whether parsing should continue or stop.
public delegate bool NodeHandler (HtmlNode node);

// Callback that receives nodes when the document is used in event-driven mode.
public NodeHandler ReportNode;

// misnomer ... should be called event_driven_mode (exposed through StreamMode)
private bool _streammode;
private bool _stop_parsing = false;
// Messages for exceptions thrown by the document and its nodes.
internal static readonly string HtmlExceptionRefNotChild = "Reference node must be a child of this node";
internal static readonly string HtmlExceptionUseIdAttributeFalse = "You need to set UseIdAttribute property to true to enable this feature";

// Allocated by Load/DetectEncoding only when OptionCheckSyntax is set, null otherwise.
internal Hashtable _openednodes;
internal Hashtable _lastnodes = new Hashtable();
// Lower-cased id -> node; allocated only when OptionUseIdAttribute is set, null otherwise.
internal Hashtable _nodesid;

private HtmlNode _documentnode;
//SLIM: internal string _text;
internal StreamAsArray _text;

// Parser working state.
private HtmlNode _currentnode;
private HtmlNode _lastparentnode;
private HtmlAttribute _currentattribute;
private int _index;
private int _line;
private int _lineposition;
private int _maxlineposition;
private int _c;
private bool _fullcomment;
private ParseState _state;
private ParseState _oldstate;
private int _pcdata_quote_char = '\0';

// Encodings recorded while loading (see StreamEncoding / DeclaredEncoding).
private System.Text.Encoding _streamencoding;
private System.Text.Encoding _declaredencoding;

// Errors exposed through ParseErrors.
private ArrayList _parseerrors = new ArrayList();
// Backing object for CheckSum; CheckSum reports 0 while this is null.
private Crc32 _crc32 = null;
// True while DetectEncoding is running, reset to false by Load.
private bool _onlyDetectEncoding;
// Compile-time switch for the parser's diagnostic output.
private static bool _debug = false;

// Writes s to the console, but only when _debug is enabled.
internal static void Debug (string s)
{
    if (!_debug)
    {
        return;
    }
    Console.WriteLine (s);
}
// public props

/// <summary>
/// Defines if a checksum must be computed for the document while parsing. Default is false.
/// </summary>
public bool OptionComputeChecksum;

/// <summary>
/// Defines if declared encoding must be read from the document.
/// Declared encoding is determined using the meta http-equiv="content-type" content="text/html;charset=XXXXX" html node.
/// Default is true.
/// </summary>
public bool OptionReadEncoding = true;

/// <summary>
/// Defines if non closed nodes will be checked at the end of parsing. Default is true.
/// </summary>
public bool OptionCheckSyntax = true;

/// <summary>
/// Defines if the 'id' attribute must be specifically used. Default is true.
/// </summary>
public bool OptionUseIdAttribute = true;

/// <summary>
/// Defines if empty nodes must be written as closed during output. Default is false.
/// </summary>
public bool OptionWriteEmptyNodes;

/// <summary>
/// Defines if output must conform to XML, instead of HTML. Default is false.
/// </summary>
public bool OptionOutputAsXml;

/// <summary>
/// Defines if name must be output in uppercase. Default is false.
/// </summary>
public bool OptionOutputUpperCase;

/// <summary>
/// Defines if attribute value output must be optimized (not bound with double quotes if it is possible). Default is false.
/// </summary>
public bool OptionOutputOptimizeAttributeValues;

/// <summary>
/// Adds debugging attributes to nodes. Default is false.
/// </summary>
public bool OptionAddDebuggingAttributes;

/// <summary>
/// Defines if source text must be extracted while parsing errors.
/// If the document has a lot of errors, or cascading errors, parsing performance can be dramatically affected if set to true.
/// Default is false.
/// </summary>
public bool OptionExtractErrorSourceText;

/// <summary>
/// Defines if closing for non closed nodes must be done at the end or directly in the document.
/// Setting this to true can actually change how browsers render the page. Default is false.
/// </summary>
public bool OptionAutoCloseOnEnd;

/// <summary>
/// Defines if LI, TR, TH, TD tags must be partially fixed when nesting errors are detected. Default is false.
/// </summary>
public bool OptionFixNestedTags;

/// <summary>
/// Defines the maximum length of source text extracted for parse errors. Default is 100.
/// </summary>
public int OptionExtractErrorSourceTextMaxLength = 100;

/// <summary>
/// Defines the default stream encoding to use. Default is UTF-8.
/// </summary>
// From http://www.w3.org/TR/REC-html40/charset.html
// The HTTP protocol ([RFC2616], section 3.7.1) mentions ISO-8859-1 as a default character encoding when the "charset" parameter is absent from the "Content-Type" header field.
// So, however we are still using UTF-8 for some unknown reason
//FIXME: Fix the default encoding!
public System.Text.Encoding OptionDefaultStreamEncoding = Encoding.UTF8;
/// <summary>
/// Gets the list of parse errors recorded for the document.
/// </summary>
public ArrayList ParseErrors
{
    get { return _parseerrors; }
}
/// <summary>
/// Gets the encoding of the stream the document was read from,
/// or null when the document was not read from a StreamReader.
/// </summary>
public System.Text.Encoding StreamEncoding
{
    get { return _streamencoding; }
}
/// <summary>
/// Gets the encoding the document declares for itself.
/// Declared encoding is determined using the meta http-equiv="content-type" content="text/html;charset=XXXXX" html node.
/// </summary>
public System.Text.Encoding DeclaredEncoding
{
    get { return _declaredencoding; }
}
/// <summary>
/// Creates an empty HTML document that contains only its document node.
/// </summary>
public HtmlDocument()
{
    HtmlNode root = CreateNode(HtmlNodeType.Document, 0);
    _documentnode = root;
}
// Returns the first direct child of the document node named "?xml",
// or null when the document has no children or no such child.
internal HtmlNode GetXmlDeclaration()
{
    if (_documentnode.HasChildNodes)
    {
        foreach (HtmlNode child in _documentnode._childnodes)
        {
            // it's ok, names are case sensitive
            if (child.Name != "?xml")
            {
                continue;
            }
            return child;
        }
    }
    return null;
}
// Matches an ampersand that does not already start one of the entities
// &amp; &lt; &gt; &quot; (case-insensitive). Built once instead of on every call;
// Regex instances are safe to share for matching.
private static readonly Regex _htmlEncodeBareAmpersand =
    new Regex("&(?!(amp;)|(lt;)|(gt;)|(quot;))", RegexOptions.IgnoreCase);

/// <summary>
/// Applies HTML encoding to a specified string: bare ampersands, angle brackets
/// and double quotes are replaced by their entities. Ampersands that already
/// start &amp;amp;, &amp;lt;, &amp;gt; or &amp;quot; are left alone so text is not encoded twice.
/// </summary>
/// <param name="html">The input string to encode. May not be null.</param>
/// <returns>The encoded string.</returns>
/// <exception cref="ArgumentNullException"><paramref name="html"/> is null.</exception>
public static string HtmlEncode(string html)
{
    if (html == null)
    {
        throw new ArgumentNullException("html");
    }
    // replace & by &amp; but only once!
    // The ampersand pass must run first, otherwise the entities produced by
    // the later replacements would be encoded again.
    string encoded = _htmlEncodeBareAmpersand.Replace(html, "&amp;");
    return encoded.Replace("<", "&lt;").Replace(">", "&gt;").Replace("\"", "&quot;");
}
/// <summary>
/// Detects the encoding of an HTML stream.
/// The stream is wrapped in a StreamReader that is not closed here.
/// </summary>
/// <param name="stream">The input stream. May not be null.</param>
/// <returns>The detected encoding, or null if none was found.</returns>
public Encoding DetectEncoding(Stream stream)
{
    if (stream == null)
    {
        throw new ArgumentNullException("stream");
    }
    TextReader reader = new StreamReader(stream);
    return DetectEncoding(reader);
}
/// <summary>
/// Detects the encoding of an HTML file.
/// The file is opened with OptionDefaultStreamEncoding and is always closed
/// again, even when detection throws.
/// </summary>
/// <param name="path">Path for the file containing the HTML document to detect. May not be null.</param>
/// <returns>The detected encoding, or null if none was found.</returns>
public Encoding DetectEncoding(string path)
{
    if (path == null)
    {
        throw new ArgumentNullException("path");
    }
    // using guarantees the file handle is released on every exit path.
    using (StreamReader sr = new StreamReader(path, OptionDefaultStreamEncoding))
    {
        return DetectEncoding(sr);
    }
}
/// <summary>
/// Detects the encoding declared by an HTML text held in a string.
/// </summary>
/// <param name="html">The input html text. May not be null.</param>
/// <returns>The detected encoding, or null if none was found.</returns>
public Encoding DetectEncodingHtml(string html)
{
    if (html == null)
    {
        throw new ArgumentNullException("html");
    }

    StringReader source = new StringReader(html);
    Encoding detected = DetectEncoding(source);
    source.Close();
    return detected;
}
/// <summary>
/// Detects the encoding of an HTML text provided on a TextReader.
/// Runs the parser in detect-only mode; the parser signals a found encoding
/// by throwing EncodingFoundException, which is caught here.
/// </summary>
/// <param name="reader">The TextReader used to feed the HTML. May not be null.</param>
/// <returns>The detected encoding, or null if parsing finished without finding one.</returns>
public Encoding DetectEncoding(TextReader reader)
{
    if (reader == null)
    {
        throw new ArgumentNullException("reader");
    }

    _onlyDetectEncoding = true;
    _openednodes = OptionCheckSyntax ? new Hashtable() : null;
    _nodesid = OptionUseIdAttribute ? new Hashtable() : null;

    StreamReader sr = reader as StreamReader;
    if (sr != null)
    {
        _streamencoding = sr.CurrentEncoding;
    }
    else
    {
        _streamencoding = null;
        // StreamAsArray can only work on a StreamReader. For any other reader
        // (e.g. the StringReader used by DetectEncodingHtml) sr used to be null
        // here and the next statement crashed. Buffer the text in memory and
        // read it back through a StreamReader instead.
        byte[] buffered = System.Text.Encoding.UTF8.GetBytes(reader.ReadToEnd());
        sr = new StreamReader(new MemoryStream(buffered), System.Text.Encoding.UTF8);
    }
    _declaredencoding = null;

    // SLIM: _text = reader.ReadToEnd();
    _text = new StreamAsArray (sr);
    _documentnode = CreateNode(HtmlNodeType.Document, 0);

    // this is a hack, but it allows us not to muck with the original parsing code
    try
    {
        Parse();
    }
    catch(EncodingFoundException ex)
    {
        _lastnodes.Clear();
        return ex.Encoding;
    }
    return null;
}
/// <summary>
/// Loads an HTML document from a stream, decoding it with OptionDefaultStreamEncoding.
/// </summary>
/// <param name="stream">The input stream.</param>
public void Load(Stream stream)
{
    TextReader reader = new StreamReader(stream, OptionDefaultStreamEncoding);
    Load(reader);
}
/// <summary>
/// Loads an HTML document from a stream.
/// </summary>
/// <param name="stream">The input stream.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
public void Load(Stream stream, bool detectEncodingFromByteOrderMarks)
{
    TextReader reader = new StreamReader(stream, detectEncodingFromByteOrderMarks);
    Load(reader);
}
/// <summary>
/// Loads an HTML document from a stream using the given character encoding.
/// </summary>
/// <param name="stream">The input stream.</param>
/// <param name="encoding">The character encoding to use.</param>
public void Load(Stream stream, Encoding encoding)
{
    TextReader reader = new StreamReader(stream, encoding);
    Load(reader);
}
/// <summary>
/// Loads an HTML document from a stream using the given character encoding.
/// </summary>
/// <param name="stream">The input stream.</param>
/// <param name="encoding">The character encoding to use.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
public void Load(Stream stream, Encoding encoding, bool detectEncodingFromByteOrderMarks)
{
    TextReader reader = new StreamReader(stream, encoding, detectEncodingFromByteOrderMarks);
    Load(reader);
}
/// <summary>
/// Loads an HTML document from a stream using the given character encoding and reader buffer size.
/// </summary>
/// <param name="stream">The input stream.</param>
/// <param name="encoding">The character encoding to use.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
/// <param name="buffersize">The minimum buffer size.</param>
public void Load(Stream stream, Encoding encoding, bool detectEncodingFromByteOrderMarks, int buffersize)
{
    TextReader reader = new StreamReader(stream, encoding, detectEncodingFromByteOrderMarks, buffersize);
    Load(reader);
}
/// <summary>
/// Loads an HTML document from a file, decoding it with OptionDefaultStreamEncoding.
/// The file is always closed again, even when loading throws.
/// </summary>
/// <param name="path">The complete file path to be read. May not be null.</param>
public void Load(string path)
{
    if (path == null)
    {
        throw new ArgumentNullException("path");
    }
    // using releases the file handle on every exit path.
    using (StreamReader sr = new StreamReader(path, OptionDefaultStreamEncoding))
    {
        Load(sr);
    }
}
/// <summary>
/// Loads an HTML document from a file.
/// The file is always closed again, even when loading throws.
/// </summary>
/// <param name="path">The complete file path to be read. May not be null.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
public void Load(string path, bool detectEncodingFromByteOrderMarks)
{
    if (path == null)
    {
        throw new ArgumentNullException("path");
    }
    // using releases the file handle on every exit path.
    using (StreamReader sr = new StreamReader(path, detectEncodingFromByteOrderMarks))
    {
        Load(sr);
    }
}
/// <summary>
/// Loads an HTML document from a file using the given character encoding.
/// The file is always closed again, even when loading throws.
/// </summary>
/// <param name="path">The complete file path to be read. May not be null.</param>
/// <param name="encoding">The character encoding to use. May not be null.</param>
public void Load(string path, Encoding encoding)
{
    if (path == null)
    {
        throw new ArgumentNullException("path");
    }
    if (encoding == null)
    {
        throw new ArgumentNullException("encoding");
    }
    // using releases the file handle on every exit path.
    using (StreamReader sr = new StreamReader(path, encoding))
    {
        Load(sr);
    }
}
/// <summary>
/// Loads an HTML document from a file using the given character encoding.
/// The file is always closed again, even when loading throws.
/// </summary>
/// <param name="path">The complete file path to be read. May not be null.</param>
/// <param name="encoding">The character encoding to use. May not be null.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
public void Load(string path, Encoding encoding, bool detectEncodingFromByteOrderMarks)
{
    if (path == null)
    {
        throw new ArgumentNullException("path");
    }
    if (encoding == null)
    {
        throw new ArgumentNullException("encoding");
    }
    // using releases the file handle on every exit path.
    using (StreamReader sr = new StreamReader(path, encoding, detectEncodingFromByteOrderMarks))
    {
        Load(sr);
    }
}
/// <summary>
/// Loads an HTML document from a file using the given character encoding and reader buffer size.
/// The file is always closed again, even when loading throws.
/// </summary>
/// <param name="path">The complete file path to be read. May not be null.</param>
/// <param name="encoding">The character encoding to use. May not be null.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
/// <param name="buffersize">The minimum buffer size.</param>
public void Load(string path, Encoding encoding, bool detectEncodingFromByteOrderMarks, int buffersize)
{
    if (path == null)
    {
        throw new ArgumentNullException("path");
    }
    if (encoding == null)
    {
        throw new ArgumentNullException("encoding");
    }
    // using releases the file handle on every exit path.
    using (StreamReader sr = new StreamReader(path, encoding, detectEncodingFromByteOrderMarks, buffersize))
    {
        Load(sr);
    }
}
/// <summary>
/// Loads the HTML document from the specified string.
/// </summary>
/// <param name="html">String containing the HTML document to load. May not be null.</param>
public void LoadHtml(string html)
{
    if (html == null)
    {
        throw new ArgumentNullException("html");
    }

    StringReader source = new StringReader(html);
    Load(source);
    source.Close();
}
/// <summary>
/// Detects the encoding of an HTML document from a file first, and then loads the file.
/// Equivalent to calling the two-argument overload with detection enabled.
/// </summary>
/// <param name="path">The complete file path to be read.</param>
public void DetectEncodingAndLoad(string path)
{
    const bool detectEncoding = true;
    DetectEncodingAndLoad(path, detectEncoding);
}
/// <summary>
/// Optionally detects the encoding of an HTML file, then loads the file.
/// When detection is disabled or finds nothing, the file is loaded through
/// Load(path); otherwise it is loaded with the detected encoding.
/// </summary>
/// <param name="path">The complete file path to be read. May not be null.</param>
/// <param name="detectEncoding">true to detect encoding, false otherwise.</param>
public void DetectEncodingAndLoad(string path, bool detectEncoding)
{
    if (path == null)
    {
        throw new ArgumentNullException("path");
    }

    System.Text.Encoding detected = detectEncoding ? DetectEncoding(path) : null;
    if (detected != null)
    {
        Load(path, detected);
        return;
    }
    Load(path);
}
/// <summary>
/// Loads the HTML document from the specified TextReader.
/// Resets the per-load state, parses the text and, when OptionCheckSyntax is
/// set, records a TagNotClosed error for every node that is still open.
/// </summary>
/// <param name="reader">The TextReader used to feed the HTML data into the document. May not be null.</param>
public void Load(TextReader reader)
{
    // all Load methods pass down to this one
    if (reader == null)
    {
        throw new ArgumentNullException("reader");
    }

    _onlyDetectEncoding = false;
    _openednodes = OptionCheckSyntax ? new Hashtable() : null;
    _nodesid = OptionUseIdAttribute ? new Hashtable() : null;

    StreamReader sr = reader as StreamReader;
    if (sr != null)
    {
        try
        {
            // trigger bom read if needed
            sr.Peek();
        }
        catch
        {
            // void on purpose
        }
        _streamencoding = sr.CurrentEncoding;
    }
    else
    {
        _streamencoding = null;
        // StreamAsArray can only work on a StreamReader. For any other reader
        // (e.g. the StringReader used by LoadHtml) sr used to be null here and
        // the next statement crashed. Buffer the text in memory and read it
        // back through a StreamReader instead.
        byte[] buffered = System.Text.Encoding.UTF8.GetBytes(reader.ReadToEnd());
        sr = new StreamReader(new MemoryStream(buffered), System.Text.Encoding.UTF8);
    }
    _declaredencoding = null;

    // SLIM: _text = reader.ReadToEnd();
    _text = new StreamAsArray (sr);
    _documentnode = CreateNode(HtmlNodeType.Document, 0);
    Parse();

    if (OptionCheckSyntax)
    {
        foreach(HtmlNode node in _openednodes.Values)
        {
            if (!node._starttag) // already reported
            {
                continue;
            }

            string html;
            if (OptionExtractErrorSourceText)
            {
                // Attach at most OptionExtractErrorSourceTextMaxLength characters of the node's markup.
                html = node.OuterHtml;
                if (html.Length > OptionExtractErrorSourceTextMaxLength)
                {
                    html = html.Substring(0, OptionExtractErrorSourceTextMaxLength);
                }
            }
            else
            {
                html = string.Empty;
            }
            AddError(
                HtmlParseErrorCode.TagNotClosed,
                node._line, node._lineposition,
                node._streamposition, html,
                "End tag </" + node.Name + "> was not found");
        }

        // we don't need this anymore
        _openednodes.Clear();
    }
}
// Picks the encoding used for output, in order of preference:
// the declared encoding, then the stream encoding, then OptionDefaultStreamEncoding.
internal System.Text.Encoding GetOutEncoding()
{
    if (_declaredencoding != null)
    {
        return _declaredencoding;
    }
    if (_streamencoding != null)
    {
        return _streamencoding;
    }
    return OptionDefaultStreamEncoding;
}
/// <summary>
/// Gets the encoding the document will be written with (see GetOutEncoding).
/// </summary>
public System.Text.Encoding Encoding
{
    get { return GetOutEncoding(); }
}
/// <summary>
/// Saves the HTML document to the specified stream using the document's output encoding.
/// The stream is flushed but left open.
/// </summary>
/// <param name="outStream">The stream to which you want to save. May not be null.</param>
public void Save(Stream outStream)
{
    if (outStream == null)
    {
        throw new ArgumentNullException("outStream");
    }
    StreamWriter sw = new StreamWriter(outStream, GetOutEncoding());
    Save(sw);
    // StreamWriter buffers its output; without a flush the tail of the document
    // (or all of it) never reaches the stream. The writer is deliberately not
    // closed, because that would close the caller's stream as well.
    sw.Flush();
}
/// <summary>
/// Saves the HTML document to the specified stream using the given encoding.
/// The stream is flushed but left open.
/// </summary>
/// <param name="outStream">The stream to which you want to save. May not be null.</param>
/// <param name="encoding">The character encoding to use. May not be null.</param>
public void Save(Stream outStream, System.Text.Encoding encoding)
{
    if (outStream == null)
    {
        throw new ArgumentNullException("outStream");
    }
    if (encoding == null)
    {
        throw new ArgumentNullException("encoding");
    }
    StreamWriter sw = new StreamWriter(outStream, encoding);
    Save(sw);
    // StreamWriter buffers its output; flush so everything reaches the stream.
    // The writer is not closed, because that would close the caller's stream.
    sw.Flush();
}
/// <summary>
/// Saves the document to the specified file using the document's output encoding.
/// An existing file is overwritten. The file is always closed, even when writing throws.
/// </summary>
/// <param name="filename">The location of the file where you want to save the document. May not be null.</param>
public void Save(string filename)
{
    if (filename == null)
    {
        throw new ArgumentNullException("filename");
    }
    // using flushes and closes the file on every exit path.
    using (StreamWriter sw = new StreamWriter(filename, false, GetOutEncoding()))
    {
        Save(sw);
    }
}
/// <summary>
/// Saves the document to the specified file using the given encoding.
/// An existing file is overwritten. The file is always closed, even when writing throws.
/// </summary>
/// <param name="filename">The location of the file where you want to save the document. May not be null.</param>
/// <param name="encoding">The character encoding to use. May not be null.</param>
public void Save(string filename, System.Text.Encoding encoding)
{
    if (filename == null)
    {
        throw new ArgumentNullException("filename");
    }
    if (encoding == null)
    {
        throw new ArgumentNullException("encoding");
    }
    // using flushes and closes the file on every exit path.
    using (StreamWriter sw = new StreamWriter(filename, false, encoding))
    {
        Save(sw);
    }
}
/// <summary>
/// Saves the HTML document to the specified StreamWriter.
/// Forwards to the TextWriter overload.
/// </summary>
/// <param name="writer">The StreamWriter to which you want to save.</param>
public void Save(StreamWriter writer)
{
    TextWriter target = writer;
    Save(target);
}
/// <summary>
/// Saves the HTML document to the specified TextWriter and flushes it.
/// The writer is left open.
/// </summary>
/// <param name="writer">The TextWriter to which you want to save. May not be null.</param>
public void Save(TextWriter writer)
{
    if (writer == null)
    {
        throw new ArgumentNullException("writer");
    }
    DocumentNode.WriteTo(writer);
    // Flush like Save(XmlWriter) does, so buffered writers do not silently
    // hold back output when the caller never closes them.
    writer.Flush();
}
/// <summary>
/// Saves the HTML document to the specified XmlWriter and flushes it.
/// </summary>
/// <param name="writer">The XmlWriter to which you want to save. May not be null.</param>
public void Save(XmlWriter writer)
{
    // Same guard as the TextWriter overload, instead of failing later with a NullReferenceException.
    if (writer == null)
    {
        throw new ArgumentNullException("writer");
    }
    DocumentNode.WriteTo(writer);
    writer.Flush();
}
/// <summary>
/// Creates a new XPathNavigator object for navigating this HTML document.
/// </summary>
/// <returns>An XPathNavigator created on the document node.</returns>
public XPathNavigator CreateNavigator()
{
    XPathNavigator navigator = new HtmlNodeNavigator(this, _documentnode);
    return navigator;
}
// Registers node under the lower-cased id, or removes that id when node is null.
// Does nothing when id tracking is disabled, the id table does not exist, or id is null.
internal void SetIdForNode(HtmlNode node, string id)
{
    if (!OptionUseIdAttribute)
    {
        return;
    }
    if (_nodesid == null || id == null)
    {
        return;
    }

    // Keys are stored lower-cased; GetElementbyId lower-cases its argument the same way.
    string key = id.ToLower();
    if (node != null)
    {
        _nodesid[key] = node;
    }
    else
    {
        _nodesid.Remove(key);
    }
}
/// <summary>
/// Gets the HTML node with the specified 'id' attribute value.
/// The lookup lower-cases the id before searching.
/// </summary>
/// <param name="id">The attribute id to match. May not be null.</param>
/// <returns>The HTML node with the matching id or null if not found.</returns>
public HtmlNode GetElementbyId(string id)
{
    if (id == null)
    {
        throw new ArgumentNullException("id");
    }
    // The id table only exists when the document was loaded with OptionUseIdAttribute set.
    if (_nodesid == null)
    {
        throw new Exception(HtmlExceptionUseIdAttributeFalse);
    }

    object found = _nodesid[id.ToLower()];
    return found as HtmlNode;
}
/// <summary>
/// Creates an HTML element node with the specified name.
/// </summary>
/// <param name="name">The qualified name of the element. May not be null.</param>
/// <returns>The new HTML node.</returns>
public HtmlNode CreateElement(string name)
{
    if (name == null)
    {
        throw new ArgumentNullException("name");
    }

    HtmlNode element = CreateNode(HtmlNodeType.Element);
    element._name = name;
    return element;
}
/// <summary>
/// Creates an HTML comment node.
/// </summary>
/// <returns>The new HTML comment node.</returns>
public HtmlCommentNode CreateComment()
{
    HtmlNode created = CreateNode(HtmlNodeType.Comment);
    return (HtmlCommentNode)created;
}
/// <summary>
/// Creates an HTML comment node with the specified comment text.
/// </summary>
/// <param name="comment">The comment text. May not be null.</param>
/// <returns>The new HTML comment node.</returns>
public HtmlCommentNode CreateComment(string comment)
{
    if (comment == null)
    {
        throw new ArgumentNullException("comment");
    }

    HtmlCommentNode node = CreateComment();
    node.Comment = comment;
    return node;
}
/// <summary>
/// Creates an HTML text node.
/// </summary>
/// <returns>The new HTML text node.</returns>
public HtmlTextNode CreateTextNode()
{
    HtmlNode created = CreateNode(HtmlNodeType.Text);
    return (HtmlTextNode)created;
}
/// <summary>
/// Creates an HTML text node with the specified text.
/// </summary>
/// <param name="text">The text of the node. May not be null.</param>
/// <returns>The new HTML text node.</returns>
public HtmlTextNode CreateTextNode(string text)
{
    if (text == null)
    {
        throw new ArgumentNullException("text");
    }

    HtmlTextNode node = CreateTextNode();
    node.Text = text;
    return node;
}
// Creates a node of the given type, passing -1 as the index.
internal HtmlNode CreateNode(HtmlNodeType type)
{
    const int noIndex = -1;
    return CreateNode(type, noIndex);
}
// Node factory: comment and text node types get their dedicated classes,
// every other node type is created as a plain HtmlNode. The new node is
// constructed with this document and the supplied index.
internal HtmlNode CreateNode(HtmlNodeType type, int index)
{
    if (type == HtmlNodeType.Comment)
    {
        return new HtmlCommentNode(this, index);
    }

    if (type == HtmlNodeType.Text)
    {
        return new HtmlTextNode(this, index);
    }

    return new HtmlNode(type, this, index);
}
// Creates a blank attribute constructed with this document.
internal HtmlAttribute CreateAttribute()
{
    HtmlAttribute blank = new HtmlAttribute(this);
    return blank;
}
/// <summary>
/// Builds a new HTML attribute and sets its name.
/// </summary>
/// <param name="name">The name of the attribute. May not be null.</param>
/// <returns>The newly created attribute.</returns>
/// <exception cref="ArgumentNullException"><paramref name="name"/> is null.</exception>
public HtmlAttribute CreateAttribute(string name)
{
    if (name != null)
    {
        HtmlAttribute created = CreateAttribute();
        created.Name = name;
        return created;
    }

    throw new ArgumentNullException("name");
}
/// <summary>
/// Builds a new HTML attribute and sets both its name and its value.
/// </summary>
/// <param name="name">The name of the attribute. May not be null.</param>
/// <param name="value">The value of the attribute.</param>
/// <returns>The newly created attribute.</returns>
/// <exception cref="ArgumentNullException"><paramref name="name"/> is null.</exception>
public HtmlAttribute CreateAttribute(string name, string value)
{
    if (name != null)
    {
        HtmlAttribute created = CreateAttribute(name);
        created.Value = value;
        return created;
    }

    throw new ArgumentNullException("name");
}
/// <summary>
/// Gets the node that represents the document itself, i.e. the root of the tree.
/// </summary>
public HtmlNode DocumentNode
{
    get { return _documentnode; }
}
/// <summary>
/// Gets the CRC32 checksum of the document when OptionComputeChecksum was true before parsing;
/// returns 0 when no checksum object exists.
/// </summary>
public int CheckSum
{
    get { return (_crc32 != null) ? (int)_crc32.CheckSum : 0; }
}
/// <summary>
/// Gets or sets the stream-mode flag. While it is set, PushNodeEnd hands finished nodes
/// to ReportNode (when one is assigned) and does not append them to their parent.
/// </summary>
public bool StreamMode
{
    get { return _streammode; }
    set { _streammode = value; }
}
// Records a parse error: builds an HtmlParseError from the arguments, appends it
// to _parseerrors and returns the same instance to the caller.
private HtmlParseError AddError(
    HtmlParseErrorCode code,
    int line,
    int linePosition,
    int streamPosition,
    string sourceText,
    string reason)
{
    HtmlParseError parseError =
        new HtmlParseError(code, line, linePosition, streamPosition, sourceText, reason);

    _parseerrors.Add(parseError);

    return parseError;
}
// States of the character-driven state machine implemented by Parse().
// The numeric values are the ones the compiler assigns by declaration order.
private enum ParseState
{
    // plain text; only '<' leaves this state
    Text = 0,
    // just after '<': deciding between an opening and a closing tag
    WhichTag = 1,
    // reading the tag name
    Tag = 2,
    // inside a tag, between two attributes
    BetweenAttributes = 3,
    // after '/' or '?' inside a tag; a following '>' ends the node as closed
    EmptyTag = 4,
    // reading an attribute name
    AttributeName = 5,
    // whitespace after an attribute name, '=' not seen yet
    AttributeBeforeEquals = 6,
    // '=' seen, value not started yet
    AttributeAfterEquals = 7,
    // reading an unquoted attribute value
    AttributeValue = 8,
    // inside "<!...>"
    Comment = 9,
    // reading an attribute value delimited by ' or "
    QuotedAttributeValue = 10,
    // inside "<% ... %>"
    ServerSideCode = 11,
    // inside a quoted string within PcData content
    PcDataQuote = 12,
    // raw content of a CData-style element, scanned for its closing tag
    PcData = 13
}
// Advances the read position by one character. The current character (_c) is fed to the
// checksum when one is being computed, and the line / column counters are updated:
// a line feed (10) starts a new line at column 1, anything else moves one column right.
// The column before the move is kept in _maxlineposition so DecrementPosition can restore it.
private void IncrementPosition()
{
    // REVIEW: should we add some checksum code in DecrementPosition too?
    if (_crc32 != null)
    {
        _crc32.AddToCRC32(_c);
    }

    _index += 1;
    _maxlineposition = _lineposition;

    bool lineFeed = (_c == 10);
    if (!lineFeed)
    {
        _lineposition += 1;
        return;
    }

    _lineposition = 1;
    _line += 1;
}
// Steps the read position back by one character. When the column is already 1 the
// previous line is restored, using the column saved in _maxlineposition.
private void DecrementPosition()
{
    _index -= 1;

    if (_lineposition != 1)
    {
        _lineposition -= 1;
        return;
    }

    _lineposition = _maxlineposition;
    _line -= 1;
}
// Parse: single-pass, character-driven state machine over _text.
// It first resets the per-parse bookkeeping (checksum, last-node table, error list,
// line/column counters, current node/attribute), then reads one character per iteration
// into _c, advances with IncrementPosition() and dispatches on _state. Node and attribute
// boundaries are recorded through the Push* helpers. The loop ends when _text.Eof(_index)
// is true or _stop_parsing has been set.
// Note on indices: because IncrementPosition() runs before the switch, _index-1 is the
// position of the character held in _c and _index is the position of the next one.
1329 private void Parse()
// quote character (' or ") that opened the attribute value currently being read
1331 int lastquote = 0;
1332 if (OptionComputeChecksum)
1334 _crc32 = new Crc32();
1337 _lastnodes = new Hashtable();
1338 _c = 0;
1339 _fullcomment = false;
1340 _parseerrors = new ArrayList();
1341 _line = 1;
1342 _lineposition = 1;
1343 _maxlineposition = 1;
1345 _state = ParseState.Text;
1346 _oldstate = _state;
1347 _documentnode._innerlength = _text.FullLength;
1348 _documentnode._outerlength = _text.FullLength;
1350 _lastparentnode = _documentnode;
1351 _currentnode = CreateNode(HtmlNodeType.Text, 0);
1352 _currentattribute = null;
1354 _index = 0;
// the document starts out as a text node at position 0
1355 PushNodeStart(HtmlNodeType.Text, 0);
1356 // SLIM: while (_index<_text.Length)
1357 while (! _stop_parsing && ! _text.Eof (_index))
1359 _c = _text[_index];
1360 IncrementPosition();
// In most states NewCheck() runs first: it returns true when _c is '<' and it has
// already started a new node / server-side block, so the iteration is done.
1362 switch(_state)
1364 case ParseState.Text:
1365 if (NewCheck())
1366 continue;
1367 break;
// WhichTag: first character after '<'. A '/' marks a closing tag whose name starts at the
// next character; otherwise the name starts at this character, so the position is stepped
// back for the Tag state to read it again.
1369 case ParseState.WhichTag:
1370 if (NewCheck())
1371 continue;
1372 if (_c == '/')
1374 PushNodeNameStart(false, _index);
1376 else
1378 PushNodeNameStart(true, _index-1);
1379 DecrementPosition();
1381 _state = ParseState.Tag;
1382 break;
// Tag: the name ends at whitespace, '/' or '>'. The Push* helpers may change _state
// (PushNodeEnd switches to PcData for CData elements), so _state is re-checked after
// each call and the iteration is abandoned if it is no longer Tag.
1384 case ParseState.Tag:
1385 if (NewCheck())
1386 continue;
1387 if (IsWhiteSpace(_c))
1389 PushNodeNameEnd(_index-1);
1390 if (_state != ParseState.Tag)
1391 continue;
1392 _state = ParseState.BetweenAttributes;
1393 continue;
1395 if (_c == '/')
1397 PushNodeNameEnd(_index-1);
1398 if (_state != ParseState.Tag)
1399 continue;
1400 _state = ParseState.EmptyTag;
1401 continue;
1403 if (_c == '>')
1405 PushNodeNameEnd(_index-1);
1406 if (_state != ParseState.Tag)
1407 continue;
1408 PushNodeEnd(_index, false);
1409 if (_state != ParseState.Tag)
1410 continue;
1411 _state = ParseState.Text;
1412 PushNodeStart(HtmlNodeType.Text, _index);
1414 break;
// BetweenAttributes: skip whitespace; '/' or '?' leads to EmptyTag, '>' ends the tag,
// any other character is the first character of an attribute name.
1416 case ParseState.BetweenAttributes:
1417 if (NewCheck())
1418 continue;
1420 if (IsWhiteSpace(_c))
1421 continue;
1423 if ((_c == '/') || (_c == '?'))
1425 _state = ParseState.EmptyTag;
1426 continue;
1429 if (_c == '>')
1431 PushNodeEnd(_index, false);
1432 if (_state != ParseState.BetweenAttributes)
1433 continue;
1434 _state = ParseState.Text;
1435 PushNodeStart(HtmlNodeType.Text, _index);
1436 continue;
1439 PushAttributeNameStart(_index-1);
1440 _state = ParseState.AttributeName;
1441 break;
// EmptyTag: a '>' right after '/' (or '?') ends the node with close = true;
// anything else falls back to attribute scanning.
1443 case ParseState.EmptyTag:
1444 if (NewCheck())
1445 continue;
1447 if (_c == '>')
1449 PushNodeEnd(_index, true);
1450 if (_state != ParseState.EmptyTag)
1451 continue;
1452 _state = ParseState.Text;
1453 PushNodeStart(HtmlNodeType.Text, _index);
1454 continue;
1456 _state = ParseState.BetweenAttributes;
1457 break;
// AttributeName: the name ends at whitespace, '=' or '>'.
1459 case ParseState.AttributeName:
1460 if (NewCheck())
1461 continue;
1463 if (IsWhiteSpace(_c))
1465 PushAttributeNameEnd(_index-1);
1466 _state = ParseState.AttributeBeforeEquals;
1467 continue;
1469 if (_c == '=')
1471 PushAttributeNameEnd(_index-1);
1472 _state = ParseState.AttributeAfterEquals;
1473 continue;
1475 if (_c == '>')
1477 PushAttributeNameEnd(_index-1);
1478 PushNodeEnd(_index, false);
1479 if (_state != ParseState.AttributeName)
1480 continue;
1481 _state = ParseState.Text;
1482 PushNodeStart(HtmlNodeType.Text, _index);
1483 continue;
1485 break;
// AttributeBeforeEquals: whitespace followed the attribute name; wait for '=' or '>'.
1487 case ParseState.AttributeBeforeEquals:
1488 if (NewCheck())
1489 continue;
1491 if (IsWhiteSpace(_c))
1492 continue;
1493 if (_c == '>')
1495 PushNodeEnd(_index, false);
1496 if (_state != ParseState.AttributeBeforeEquals)
1497 continue;
1498 _state = ParseState.Text;
1499 PushNodeStart(HtmlNodeType.Text, _index);
1500 continue;
1502 if (_c == '=')
1504 _state = ParseState.AttributeAfterEquals;
1505 continue;
1507 // no equals, no whitespace, it's a new attribute starting
// step back so BetweenAttributes sees this character as the start of the next name
1508 _state = ParseState.BetweenAttributes;
1509 DecrementPosition();
1510 break;
// AttributeAfterEquals: a quote starts a quoted value (beginning after the quote, with the
// quote character remembered in lastquote); any other non-blank character starts an
// unquoted value at that character.
1512 case ParseState.AttributeAfterEquals:
1513 if (NewCheck())
1514 continue;
1516 if (IsWhiteSpace(_c))
1517 continue;
1519 if ((_c == '\'') || (_c == '"'))
1521 _state = ParseState.QuotedAttributeValue;
1522 PushAttributeValueStart(_index);
1523 lastquote = _c;
1524 continue;
1526 if (_c == '>')
1528 PushNodeEnd(_index, false);
1529 if (_state != ParseState.AttributeAfterEquals)
1530 continue;
1531 _state = ParseState.Text;
1532 PushNodeStart(HtmlNodeType.Text, _index);
1533 continue;
1535 PushAttributeValueStart(_index-1);
1536 _state = ParseState.AttributeValue;
1537 break;
// AttributeValue (unquoted): ends at whitespace or '>'.
1539 case ParseState.AttributeValue:
1540 if (NewCheck())
1541 continue;
1543 if (IsWhiteSpace(_c))
1545 PushAttributeValueEnd(_index-1);
1546 _state = ParseState.BetweenAttributes;
1547 continue;
1550 if (_c == '>')
1552 PushAttributeValueEnd(_index-1);
1553 PushNodeEnd(_index, false);
1554 if (_state != ParseState.AttributeValue)
1555 continue;
1556 _state = ParseState.Text;
1557 PushNodeStart(HtmlNodeType.Text, _index);
1558 continue;
1560 break;
// QuotedAttributeValue: NewCheck() is deliberately not called here, so a '<' inside quotes
// does not start a node; only the matching quote ends the value. "<%" still switches to
// ServerSideCode, remembering this state in _oldstate.
1562 case ParseState.QuotedAttributeValue:
1563 if (_c == lastquote)
1565 PushAttributeValueEnd(_index-1);
1566 _state = ParseState.BetweenAttributes;
1567 continue;
1569 if (_c == '<')
1571 //SLIM: if (_index<_text.Length)
1572 if (!_text.Eof (_index))
1574 if (_text[_index] == '%')
1576 _oldstate = _state;
1577 _state = ParseState.ServerSideCode;
1578 continue;
1582 break;
// Comment: ends at '>'. For "<!--" comments (_fullcomment) a '>' only terminates the
// comment when the two characters before it are "--".
1584 case ParseState.Comment:
1585 if (_c == '>')
1587 if (_fullcomment)
1589 if ((_text[_index-2] != '-') ||
1590 (_text[_index-3] != '-'))
1592 continue;
1595 PushNodeEnd(_index, false);
1596 _state = ParseState.Text;
1597 PushNodeStart(HtmlNodeType.Text, _index);
1598 continue;
1600 break;
// ServerSideCode: wait for "%>", then resume according to the state saved in _oldstate.
1602 case ParseState.ServerSideCode:
1603 if (_c == '%')
1605 //SLIM: if (_index<_text.Length)
1606 if (! _text.Eof (_index))
1608 if (_text[_index] == '>')
1610 switch(_oldstate)
1612 case ParseState.AttributeAfterEquals:
1613 _state = ParseState.AttributeValue;
1614 break;
1616 case ParseState.BetweenAttributes:
1617 PushAttributeNameEnd(_index+1);
1618 _state = ParseState.BetweenAttributes;
1619 break;
1621 default:
1622 _state = _oldstate;
1623 break;
// skip the '>' of "%>"
// NOTE(review): _c is not reloaded before this call, so when a checksum is being
// computed IncrementPosition() feeds '%' to it a second time instead of '>' - confirm
// whether that is intended.
1625 IncrementPosition();
1629 break;
1631 // handle <script>a="</script>"</script>
// PcDataQuote: inside a quoted string in PcData content; leave it at the matching quote
// unless the character before that quote is a backslash.
1632 case ParseState.PcDataQuote:
1633 if ((_c == _pcdata_quote_char) && (_text [_index - 2] != '\\')) {
1634 _pcdata_quote_char = '\0';
1635 _state = ParseState.PcData;
1637 break;
// PcData: raw element content. Quotes switch to PcDataQuote; otherwise look for the
// closing tag of the current node.
1639 case ParseState.PcData:
// NOTE(review): this Debug call builds its message (including a Substring) for every
// character read in this state, and does so before the buffer-end check below - confirm
// that _text.Substring tolerates a range running past the end of the text.
1640 Debug ("PCDATA " + _currentnode.Name + " " + _text.Substring(_index-1, _currentnode._namelength+2));
1641 if (_c == '\"' || _c == '\''){
1642 _pcdata_quote_char = _c;
1643 _state = ParseState.PcDataQuote;
1644 break;
1646 // look for </tag + 1 char
1648 // check buffer end
1649 //SLIM: if ((_currentnode._namelength+3)<=(_text.Length-(_index-1)))
1650 if (! _text.Eof (_currentnode._namelength + _index + 1))
// case-insensitive match of "</" + current node name starting at the current character
1652 if (string.Compare(_text.Substring(_index-1, _currentnode._namelength+2),
1653 "</" + _currentnode.Name, true) == 0)
// the character right after the name must be '>' or whitespace for this to be the closing tag
1655 int c = _text[_index-1 + 2 + _currentnode.Name.Length];
1656 if ((c == '>') || (IsWhiteSpace(c)))
1658 // add the script as a text node
1659 HtmlNode script = CreateNode(HtmlNodeType.Text,
1660 _currentnode._outerstartindex + _currentnode._outerlength);
1661 script._outerlength = _index-1 - script._outerstartindex;
// in stream mode the text is reported to the caller instead of being appended to the tree
1662 if (_streammode && ReportNode != null)
1663 _stop_parsing = ReportNode (script);
1664 else
1665 _currentnode.AppendChild(script);
1666 Debug ("Found script: [" + script.InnerText + "]");
// start the closing-tag node at '<'; its name begins two characters later, after "</"
1668 PushNodeStart(HtmlNodeType.Element, _index-1);
1669 PushNodeNameStart(false, _index-1 +2);
1670 _state = ParseState.Tag;
1671 IncrementPosition();
1675 break;
1679 // finish the current work
// a node whose name start was recorded gets its name closed at the end of input
1680 if (_currentnode._namestartindex > 0)
1682 PushNodeNameEnd(_index);
1684 PushNodeEnd(_index, false);
1686 // we don't need this anymore
1687 _lastnodes.Clear();
// Handles a '<' in the input. Returns false when the current character is not '<'.
// Otherwise it either enters ServerSideCode (for "<%"), starts a comment node (for "<!")
// or starts an element node, and returns true so the caller skips the rest of the iteration.
private bool NewCheck()
{
    if (_c != '<')
        return false;

    //SLIM: if (_index<_text.Length)
    if (!_text.Eof(_index) && _text[_index] == '%')
    {
        // "<%": record where the enclosing attribute / node name begins, then remember
        // the state to return to once "%>" is found
        if (_state == ParseState.AttributeAfterEquals)
        {
            PushAttributeValueStart(_index - 1);
        }
        else if (_state == ParseState.BetweenAttributes)
        {
            PushAttributeNameStart(_index - 1);
        }
        else if (_state == ParseState.WhichTag)
        {
            PushNodeNameStart(true, _index - 1);
            _state = ParseState.Tag;
        }

        _oldstate = _state;
        _state = ParseState.ServerSideCode;
        return true;
    }

    // a new node begins: the current one ends just before the '<'
    PushNodeEnd(_index - 1, true);
    _state = ParseState.WhichTag;

    //SLIM: if ((_index-1) <= (_text.Length-2))
    bool startsWithBang = !_text.Eof(_index) && _text[_index] == '!';
    if (!startsWithBang)
    {
        PushNodeStart(HtmlNodeType.Element, _index - 1);
        return true;
    }

    // "<!": comment node whose name is the single '!' character
    PushNodeStart(HtmlNodeType.Comment, _index - 1);
    PushNodeNameStart(true, _index);
    PushNodeNameEnd(_index + 1);
    _state = ParseState.Comment;

    //SLIM: if (_index<(_text.Length-2))
    if (!_text.Eof(_index + 2))
    {
        // "<!--" comments must be closed by "-->"
        _fullcomment = (_text[_index + 1] == '-') && (_text[_index + 2] == '-');
    }

    return true;
}
// Looks at a node that has just been ended and, when it is a
// <meta http-equiv="content-type" content="...;charset=..."> element, stores the declared
// encoding in _declaredencoding. Does nothing unless OptionReadEncoding is set.
//
// Throws EncodingFoundException (used as a signal, not an error):
//   - with a null encoding as soon as an element with a four-letter name other than the
//     ones listed below is seen, because no meta node is expected after it;
//   - with the declared encoding when _onlyDetectEncoding is set (null when the declared
//     charset name is not recognised).
//
// A charset name that Encoding.GetEncoding rejects no longer aborts the parse with an
// ArgumentException; the declared encoding is treated as unknown (null) instead.
private void ReadDocumentEncoding(HtmlNode node)
{
    if (!OptionReadEncoding)
        return;

    // format is
    // <meta http-equiv="content-type" content="text/html;charset=iso-8859-1" />

    // when we append a child, we are in node end, so attributes are already populated
    if (node._namelength != 4) // quick check, avoids string alloc
        return;

    // only these nodes can occur before meta
    // if we started seeing any other node, we will never see a meta node
    if (node.NodeType == HtmlNodeType.Element &&
        (node.Name != "head" && node.Name != "script" &&
         node.Name != "style" && node.Name != "title" &&
         node.Name != "link" && node.Name != "html" &&
         node.Name != "meta"))
        throw new EncodingFoundException(null);

    if (node.Name != "meta") // all nodes names are lowercase
        return;

    HtmlAttribute att = node.Attributes["http-equiv"];
    if (att == null)
        return;

    if (string.Compare(att.Value, "content-type", true) != 0)
        return;

    HtmlAttribute content = node.Attributes["content"];
    if (content == null)
        return;

    string charset = NameValuePairList.GetNameValuePairsValue(content.Value, "charset");
    if (charset == null)
        return;

    try
    {
        _declaredencoding = Encoding.GetEncoding(charset);
    }
    catch (ArgumentException)
    {
        // unknown or malformed charset name in the document: treat as "not declared"
        _declaredencoding = null;
    }

    if (_onlyDetectEncoding)
    {
        throw new EncodingFoundException(_declaredencoding);
    }

    // a mismatch can only be reported when both encodings are known
    if (_declaredencoding == null || _streamencoding == null)
        return;

    if (_declaredencoding.WindowsCodePage != _streamencoding.WindowsCodePage)
    {
        AddError(
            HtmlParseErrorCode.CharsetMismatch,
            _line, _lineposition,
            _index, node.OuterHtml,
            "Encoding mismatch between StreamEncoding: " +
            _streamencoding.WebName + " and DeclaredEncoding: " + _declaredencoding.WebName);
    }
}
// Starts a new current attribute whose name begins at the given index, stamping it
// with the current line, column and stream position.
private void PushAttributeNameStart(int index)
{
    HtmlAttribute started = CreateAttribute();
    _currentattribute = started;

    started._namestartindex = index;
    started._line = _line;
    started._lineposition = _lineposition;
    started._streamposition = index;
}
// Closes the name of the current attribute at the given index (exclusive) and appends
// the attribute to the current node's attribute collection.
private void PushAttributeNameEnd(int index)
{
    int nameLength = index - _currentattribute._namestartindex;
    _currentattribute._namelength = nameLength;

    _currentnode.Attributes.Append(_currentattribute);
}
// Records where the value of the current attribute begins.
private void PushAttributeValueStart(int index)
{
    HtmlAttribute target = _currentattribute;
    target._valuestartindex = index;
}
// Records the length of the current attribute's value, measured from its recorded
// start up to (not including) the given index.
private void PushAttributeValueEnd(int index)
{
    int valueLength = index - _currentattribute._valuestartindex;
    _currentattribute._valuelength = valueLength;
}
// Makes a freshly created node of the given type the current node and stamps it with
// the current line, column and stream position. For elements the column is moved back
// by one.
private void PushNodeStart(HtmlNodeType type, int index)
{
    HtmlNode started = CreateNode(type, index);
    _currentnode = started;

    started._line = _line;
    started._lineposition = (type == HtmlNodeType.Element)
        ? _lineposition - 1
        : _lineposition;
    started._streamposition = index;
}
// Ends the current node at the given index (exclusive).
//   - In stream mode the node is handed to ReportNode (when assigned) and its return value
//     is stored in _stop_parsing.
//   - Non-empty text / comment nodes and start tags are appended to the last parent node,
//     except in stream mode.
//   - Start tags are also checked for an encoding declaration, registered in _lastnodes and
//     made the new last parent; CData elements switch the parser to PcData and return early.
//   - Finally the node is closed when requested (or implied by the element kind), or when it
//     is an end tag.
private void PushNodeEnd(int index, bool close)
{
    _currentnode._outerlength = index - _currentnode._outerstartindex;

    //SLIM: inform caller
    if (_streammode && ReportNode != null)
    {
        _stop_parsing = ReportNode(_currentnode);
    }

    if (_debug)
    {
        if (_currentnode._nodetype != HtmlNodeType.Text)
        {
            Debug((_currentnode.StartTag ? "Start-" : "End-") + _currentnode.Name);
        }
        else
        {
            Debug("Text:" + _currentnode.InnerText);
        }
    }

    HtmlNodeType kind = _currentnode._nodetype;
    if (kind != HtmlNodeType.Text && kind != HtmlNodeType.Comment)
    {
        if (_currentnode._starttag && (_lastparentnode != _currentnode))
        {
            // add to parent node
            // SLIM: no need to append child in stream mode
            // SLIM: whatever the caller needs to do, tell it to do now
            if (!_streammode && _lastparentnode != null)
            {
                _lastparentnode.AppendChild(_currentnode);
            }

            ReadDocumentEncoding(_currentnode);

            // remember last node of this kind
            // SLIM: we still need to store _currentnode to help other tags in the same level
            HtmlNode previousSameName = (HtmlNode)_lastnodes[_currentnode.Name];
            _currentnode._prevwithsamename = previousSameName;
            _lastnodes[_currentnode.Name] = _currentnode;

            // change parent?
            if ((_currentnode.NodeType == HtmlNodeType.Document) ||
                (_currentnode.NodeType == HtmlNodeType.Element))
            {
                _lastparentnode = _currentnode;
            }

            if (HtmlNode.IsCDataElement(CurrentNodeName()))
            {
                _state = ParseState.PcData;
                return;
            }

            if (HtmlNode.IsClosedElement(_currentnode.Name) ||
                HtmlNode.IsEmptyElement(_currentnode.Name))
            {
                close = true;
            }
        }
    }
    else if (_currentnode._outerlength > 0) // forget about void nodes
    {
        _currentnode._innerlength = _currentnode._outerlength;
        _currentnode._innerstartindex = _currentnode._outerstartindex;

        // SLIM: no need to append child in stream mode
        // SLIM: whatever the caller needs to do, tell it to do now
        if (!_streammode && _lastparentnode != null)
        {
            _lastparentnode.AppendChild(_currentnode);
        }
    }

    if (close || !_currentnode._starttag)
    {
        CloseCurrentNode();

        // finished text and comment nodes are dropped as the current node
        if ((_currentnode._nodetype == HtmlNodeType.Text) ||
            (_currentnode._nodetype == HtmlNodeType.Comment))
        {
            _currentnode = null;
        }
    }
}
// Records on the current node where its name begins and whether it is a start tag.
private void PushNodeNameStart(bool starttag, int index)
{
    HtmlNode target = _currentnode;
    target._starttag = starttag;
    target._namestartindex = index;
}
// Returns the names of the "resetter" elements for the given tag name, or null when the
// tag has none. Resetters are consulted by FixNestedTag / CloseCurrentNode:
//   li     -> ul
//   tr     -> table
//   th, td -> tr, table
private string[] GetResetters(string name)
{
    if (name == "li")
    {
        return new string[] { "ul" };
    }

    if (name == "tr")
    {
        return new string[] { "table" };
    }

    if (name == "th" || name == "td")
    {
        return new string[] { "tr", "table" };
    }

    return null;
}
// Applies the nested-tag fix to the current node when it is a start tag: looks up the
// resetters for the node's lower-cased name and lets FixNestedTag close a previous,
// still open node of the same name if necessary.
private void FixNestedTags()
{
    // we are only interested by start tags, not closing tags
    if (!_currentnode._starttag)
        return;

    // Lower-case with the invariant culture: the result is matched against fixed ASCII
    // keys ("li", "tr", ...), and a culture-sensitive ToLower() maps "LI" to "lı"
    // under Turkish cultures, which would silently disable the fix.
    string name = CurrentNodeName().ToLower(System.Globalization.CultureInfo.InvariantCulture);
    FixNestedTag(name, GetResetters(name));
}
// If a previous node with the given name is still open and no resetter node is found
// for it, closes that previous node with a synthetic closing node.
// Does nothing when the tag has no resetters.
private void FixNestedTag(string name, string[] resetters)
{
    if (resetters == null)
    {
        return;
    }

    // if we find a previous unclosed same name node, without a resetter node between, we must close it
    HtmlNode stillOpen = (HtmlNode)_lastnodes[name];
    if (stillOpen == null || stillOpen.Closed)
    {
        return;
    }

    // try to find a resetter node, if found, we do nothing
    if (FindResetterNodes(stillOpen, resetters))
    {
        return;
    }

    // ok we need to close the prev now
    // create a fake closer node
    HtmlNode fakeCloser = new HtmlNode(stillOpen.NodeType, this, -1);
    fakeCloser._endnode = fakeCloser;
    stillOpen.CloseNode(fakeCloser);
}
// Returns true when FindResetterNode finds a resetter for at least one of the given
// names; false when there is none or when names is null.
private bool FindResetterNodes(HtmlNode node, string[] names)
{
    if (names != null)
    {
        foreach (string candidateName in names)
        {
            if (FindResetterNode(node, candidateName) != null)
            {
                return true;
            }
        }
    }

    return false;
}
// Returns the last recorded node with the given name when it is still open and its
// stream position is not before that of the given node; otherwise returns null.
private HtmlNode FindResetterNode(HtmlNode node, string name)
{
    HtmlNode candidate = (HtmlNode)_lastnodes[name];

    bool usable = candidate != null
        && !candidate.Closed
        && candidate._streamposition >= node._streamposition;

    return usable ? candidate : null;
}
// Records the length of the current node's name (up to, not including, the given index)
// and, when OptionFixNestedTags is set, runs the nested-tag fix.
private void PushNodeNameEnd(int index)
{
    int nameLength = index - _currentnode._namestartindex;
    _currentnode._namelength = nameLength;

    if (!OptionFixNestedTags)
    {
        return;
    }

    FixNestedTags();
}
// Closes the current node.
//   - A previous open node with the same name exists: it is closed by the current node,
//     unless nested-tag fixing finds a resetter, in which case an EndTagInvalidHere error
//     is recorded instead.
//   - No such node and the element is a "closed" element: the node closes itself and
//     either adopts the siblings that followed an earlier childless node of the same name,
//     or is appended to the last parent.
//   - No such node and the element can overlap: it is re-added as lower-cased text.
//   - Otherwise an error is recorded (EndTagNotRequired or TagNotOpened).
// Unless an error that blocks closing was recorded, the last parent node is then updated.
private void CloseCurrentNode()
{
    // text or document are by def closed
    if (_currentnode.Closed)
    {
        return;
    }

    bool failed = false;

    // find last node of this kind
    HtmlNode opener = (HtmlNode)_lastnodes[_currentnode.Name];
    if (opener != null)
    {
        if (OptionFixNestedTags &&
            FindResetterNodes(opener, GetResetters(_currentnode.Name)))
        {
            AddError(
                HtmlParseErrorCode.EndTagInvalidHere,
                _currentnode._line, _currentnode._lineposition,
                _currentnode._streamposition, _currentnode.OuterHtml,
                "End tag </" + _currentnode.Name + "> invalid here");
            failed = true;
        }

        if (!failed)
        {
            _lastnodes[_currentnode.Name] = opener._prevwithsamename;
            opener.CloseNode(_currentnode);
        }
    }
    else if (HtmlNode.IsClosedElement(_currentnode.Name))
    {
        // </br> will be seen as <br>
        _currentnode.CloseNode(_currentnode);

        // add to parent node
        if (_lastparentnode != null)
        {
            // walk the parent's children backwards looking for a childless node with the
            // same name, collecting the nodes passed on the way
            HtmlNode adopter = null;
            Stack pending = new Stack();
            HtmlNode sibling = _lastparentnode.LastChild;
            while (sibling != null)
            {
                if ((sibling.Name == _currentnode.Name) && (!sibling.HasChildNodes))
                {
                    adopter = sibling;
                    break;
                }
                pending.Push(sibling);
                sibling = sibling.PreviousSibling;
            }

            if (adopter == null)
            {
                _lastparentnode.AppendChild(_currentnode);
            }
            else
            {
                // move the collected nodes under the node that was found
                while (pending.Count != 0)
                {
                    HtmlNode moved = (HtmlNode)pending.Pop();
                    _lastparentnode.RemoveChild(moved);
                    adopter.AppendChild(moved);
                }
            }
        }
    }
    else if (HtmlNode.CanOverlapElement(_currentnode.Name))
    {
        // node has no parent
        // node is not a closed node

        // this is a hack: add it as a text node
        HtmlNode closenode = CreateNode(HtmlNodeType.Text, _currentnode._outerstartindex);
        closenode._outerlength = _currentnode._outerlength;
        HtmlTextNode closeText = (HtmlTextNode)closenode;
        closeText.Text = closeText.Text.ToLower();
        if (_lastparentnode != null)
        {
            _lastparentnode.AppendChild(closenode);
        }
    }
    else if (HtmlNode.IsEmptyElement(_currentnode.Name))
    {
        AddError(
            HtmlParseErrorCode.EndTagNotRequired,
            _currentnode._line, _currentnode._lineposition,
            _currentnode._streamposition, _currentnode.OuterHtml,
            "End tag </" + _currentnode.Name + "> is not required");
    }
    else
    {
        // node cannot overlap, node is not empty
        AddError(
            HtmlParseErrorCode.TagNotOpened,
            _currentnode._line, _currentnode._lineposition,
            _currentnode._streamposition, _currentnode.OuterHtml,
            "Start tag <" + _currentnode.Name + "> was not found");
        failed = true;
    }

    if (failed)
    {
        return;
    }

    // we close this node, get grandparent
    if ((_lastparentnode != null) &&
        ((!HtmlNode.IsClosedElement(_currentnode.Name)) ||
         (_currentnode._starttag)))
    {
        UpdateLastParentNode();
    }
}
// Moves _lastparentnode up the tree to the nearest ancestor-or-self that is not closed,
// falling back to the document node when there is none.
// A null _lastparentnode on entry is handled like running off the top of the tree
// (the document node is used) instead of raising a NullReferenceException.
internal void UpdateLastParentNode()
{
    while ((_lastparentnode != null) && (_lastparentnode.Closed))
    {
        _lastparentnode = _lastparentnode.ParentNode;
    }

    if (_lastparentnode == null)
    {
        _lastparentnode = _documentnode;
    }
}
// Returns the name of the current attribute, cut out of the source text.
private string CurrentAttributeName()
{
    int start = _currentattribute._namestartindex;
    int length = _currentattribute._namelength;
    return _text.Substring(start, length);
}
// Returns the value of the current attribute, cut out of the source text.
private string CurrentAttributeValue()
{
    int start = _currentattribute._valuestartindex;
    int length = _currentattribute._valuelength;
    return _text.Substring(start, length);
}
// Returns the name of the current node exactly as written in the source text.
private string CurrentNodeName()
{
    int start = _currentnode._namestartindex;
    int length = _currentnode._namelength;
    return _text.Substring(start, length);
}
// Returns the outer source text of the current node.
private string CurrentNodeOuter()
{
    int start = _currentnode._outerstartindex;
    int length = _currentnode._outerlength;
    return _text.Substring(start, length);
}
// Returns the inner source text of the current node.
private string CurrentNodeInner()
{
    int start = _currentnode._innerstartindex;
    int length = _currentnode._innerlength;
    return _text.Substring(start, length);
}
/// <summary>
/// Determines whether the specified character code is treated as whitespace by the parser:
/// tab (9), line feed (10), carriage return (13) or space (32).
/// </summary>
/// <param name="c">The character to check.</param>
/// <returns>true if the specified character is considered as a whitespace character; otherwise false.</returns>
public static bool IsWhiteSpace(int c)
{
    switch (c)
    {
        case 9:
        case 10:
        case 13:
        case 32:
            return true;

        default:
            return false;
    }
}
// Exception used as a signal by the encoding-detection code: it carries the encoding
// that was found, or null when none was.
internal class EncodingFoundException: Exception
{
    // encoding handed to the constructor; may be null
    private Encoding _found;

    internal EncodingFoundException(Encoding encoding)
    {
        _found = encoding;
    }

    // Gets the encoding this exception was created with.
    internal Encoding Encoding
    {
        get { return _found; }
    }
}