Compute lucene-style scores for our hits.
[beagle.git] / Filters / HtmlAgilityPack / HtmlDocument.cs
blob7a2739a4027d4459d50e2cca7a9cd0ee244b1387
1 // HtmlAgilityPack V1.0 - Simon Mourier <simonm@microsoft.com>
3 /*
4 Copyright (C) 2003 Simon Mourier <simonm@microsoft.com>
5 All rights reserved.
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions
9 are met:
10 1. Redistributions of source code must retain the above copyright
11 notice, this list of conditions and the following disclaimer.
12 2. Redistributions in binary form must reproduce the above copyright
13 notice, this list of conditions and the following disclaimer in the
14 documentation and/or other materials provided with the distribution.
15 3. The name of the author may not be used to endorse or promote products
16 derived from this software without specific prior written permission.
18 THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
19 IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
20 OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
21 IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
22 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
23 NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
27 THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 using System;
31 using System.IO;
32 using System.Text;
33 using System.Diagnostics;
34 using System.Collections;
35 using System.Text.RegularExpressions;
36 using System.Xml;
37 using System.Xml.XPath;
39 namespace HtmlAgilityPack
/// <summary>
/// Identifies the category of a problem reported while parsing HTML.
/// </summary>
public enum HtmlParseErrorCode
{
    /// <summary>
    /// A tag was opened but never closed.
    /// </summary>
    TagNotClosed,

    /// <summary>
    /// A closing tag was found for a tag that was never opened.
    /// </summary>
    TagNotOpened,

    /// <summary>
    /// The encoding of the stream differs from the encoding declared by the document (META).
    /// </summary>
    CharsetMismatch,

    /// <summary>
    /// An end tag was found where none was required.
    /// </summary>
    EndTagNotRequired,

    /// <summary>
    /// An end tag was found at a position where it is not valid.
    /// </summary>
    EndTagInvalidHere
}
/// <summary>
/// Describes a single problem detected while a document was being parsed.
/// Instances are immutable: every value is supplied to the constructor and
/// exposed through a read-only property.
/// </summary>
public class HtmlParseError
{
    private readonly HtmlParseErrorCode _code;
    private readonly int _line;
    private readonly int _linePosition;
    private readonly int _streamPosition;
    private readonly string _sourceText;
    private readonly string _reason;

    // Only the parser (same assembly) creates errors; the arguments are stored as given.
    internal HtmlParseError(
        HtmlParseErrorCode code,
        int line,
        int linePosition,
        int streamPosition,
        string sourceText,
        string reason)
    {
        _reason = reason;
        _sourceText = sourceText;
        _streamPosition = streamPosition;
        _linePosition = linePosition;
        _line = line;
        _code = code;
    }

    /// <summary>
    /// Gets the type of error.
    /// </summary>
    public HtmlParseErrorCode Code
    {
        get { return _code; }
    }

    /// <summary>
    /// Gets the line number of this error in the document.
    /// </summary>
    public int Line
    {
        get { return _line; }
    }

    /// <summary>
    /// Gets the column number of this error in the document.
    /// </summary>
    public int LinePosition
    {
        get { return _linePosition; }
    }

    /// <summary>
    /// Gets the absolute stream position of this error, relative to the start of the document.
    /// </summary>
    public int StreamPosition
    {
        get { return _streamPosition; }
    }

    /// <summary>
    /// Gets the full text of the line containing the error.
    /// </summary>
    public string SourceText
    {
        get { return _sourceText; }
    }

    /// <summary>
    /// Gets a description of the error.
    /// </summary>
    public string Reason
    {
        get { return _reason; }
    }
}
167 /// <summary>
168 /// Represents a complete HTML document.
169 /// </summary>
170 public class HtmlDocument: IXPathNavigable
172 internal static readonly string HtmlExceptionRefNotChild = "Reference node must be a child of this node";
173 internal static readonly string HtmlExceptionUseIdAttributeFalse = "You need to set UseIdAttribute property to true to enable this feature";
175 internal Hashtable _openednodes;
176 internal Hashtable _lastnodes = new Hashtable();
177 internal Hashtable _nodesid;
178 private HtmlNode _documentnode;
179 internal string _text;
180 private HtmlNode _currentnode;
181 private HtmlNode _lastparentnode;
182 private HtmlAttribute _currentattribute;
183 private int _index;
184 private int _line;
185 private int _lineposition, _maxlineposition;
186 private int _c;
187 private bool _fullcomment;
188 private System.Text.Encoding _streamencoding;
189 private System.Text.Encoding _declaredencoding;
190 private ArrayList _parseerrors = new ArrayList();
191 private ParseState _state, _oldstate;
192 private Crc32 _crc32 = null;
193 private bool _onlyDetectEncoding = false;
195 // public props
197 /// <summary>
198 /// Defines if a checksum must be computed for the document while parsing. Default is false.
199 /// </summary>
200 public bool OptionComputeChecksum = false;
202 /// <summary>
203 /// Defines if declared encoding must be read from the document.
204 /// Declared encoding is determined using the meta http-equiv="content-type" content="text/html;charset=XXXXX" html node.
205 /// Default is true.
206 /// </summary>
207 public bool OptionReadEncoding = true;
210 /// <summary>
211 /// Defines if non closed nodes will be checked at the end of parsing. Default is true.
212 /// </summary>
213 public bool OptionCheckSyntax = true;
215 /// <summary>
216 /// Defines if the 'id' attribute must be specifically used. Default is true.
217 /// </summary>
218 public bool OptionUseIdAttribute = true;
220 /// <summary>
221 /// Defines if empty nodes must be written as closed during output. Default is false.
222 /// </summary>
223 public bool OptionWriteEmptyNodes = false;
225 /// <summary>
226 /// Defines if output must conform to XML, instead of HTML.
227 /// </summary>
228 public bool OptionOutputAsXml = false;
230 /// <summary>
231 /// Defines if name must be output in uppercase. Default is false.
232 /// </summary>
233 public bool OptionOutputUpperCase = false;
235 /// <summary>
236 /// Defines if attribute value output must be optimized (not bound with double quotes if it is possible). Default is false.
237 /// </summary>
238 public bool OptionOutputOptimizeAttributeValues = false;
240 /// <summary>
241 /// Adds Debugging attributes to node. Default is false.
242 /// </summary>
243 public bool OptionAddDebuggingAttributes = false;
245 /// <summary>
246 /// Defines if source text must be extracted while parsing errors.
247 /// If the document has a lot of errors, or cascading errors, parsing performance can be dramatically affected if set to true.
248 /// Default is false.
249 /// </summary>
250 public bool OptionExtractErrorSourceText = false; // turning this on can dramatically slow performance if a lot of errors are detected
252 /// <summary>
253 /// Defines if closing for non closed nodes must be done at the end or directly in the document.
254 /// Setting this to true can actually change how browsers render the page. Default is false.
255 /// </summary>
256 public bool OptionAutoCloseOnEnd = false; // close errors at the end
258 /// <summary>
259 /// Defines if LI, TR, TH, TD tags must be partially fixed when nesting errors are detected. Default is false.
260 /// </summary>
261 public bool OptionFixNestedTags = false; // fix li, tr, th, td tags
263 /// <summary>
264 /// Defines the maximum length of source text or parse errors. Default is 100.
265 /// </summary>
266 public int OptionExtractErrorSourceTextMaxLength = 100;
268 /// <summary>
269 /// Defines the default stream encoding to use. Default is System.Text.Encoding.Default.
270 /// </summary>
271 public System.Text.Encoding OptionDefaultStreamEncoding = System.Text.Encoding.Default;
/// <summary>
/// Gets the list of parse errors collected for the document.
/// The returned list is the document's own list, not a copy.
/// </summary>
public ArrayList ParseErrors
{
    get { return _parseerrors; }
}
/// <summary>
/// Gets the encoding of the stream the document was read from,
/// or null when the document was not read through a StreamReader.
/// </summary>
public System.Text.Encoding StreamEncoding
{
    get { return _streamencoding; }
}
/// <summary>
/// Gets the encoding the document declares for itself.
/// The declaration is taken from the meta http-equiv="content-type" content="text/html;charset=XXXXX" html node.
/// </summary>
public System.Text.Encoding DeclaredEncoding
{
    get { return _declaredencoding; }
}
/// <summary>
/// Initializes a new, empty HTML document that contains only its root document node.
/// </summary>
public HtmlDocument()
{
    HtmlNode root = CreateNode(HtmlNodeType.Document, 0);
    _documentnode = root;
}
// Returns the first direct child of the document node whose name is exactly
// "?xml", or null when there is no such child (or no children at all).
internal HtmlNode GetXmlDeclaration()
{
    HtmlNode declaration = null;
    if (_documentnode.HasChildNodes)
    {
        foreach (HtmlNode child in _documentnode._childnodes)
        {
            // an exact, case-sensitive comparison is intended here
            if (child.Name == "?xml")
            {
                declaration = child;
                break;
            }
        }
    }
    return declaration;
}
// Matches an ampersand that does not already begin one of the four entities
// produced by HtmlEncode (amp, lt, gt, quot; case-insensitive). Built once and
// shared, instead of constructing a new Regex on every call.
private static readonly Regex _unescapedAmpersand = new Regex("&(?!(amp;)|(lt;)|(gt;)|(quot;))", RegexOptions.IgnoreCase);

/// <summary>
/// Applies HTML encoding to a specified string.
/// Ampersands, angle brackets and double quotes are replaced by their entities.
/// An ampersand that already starts one of those four entities is left alone,
/// so text that is already encoded is not encoded a second time.
/// </summary>
/// <param name="html">The input string to encode. May not be null.</param>
/// <returns>The encoded string.</returns>
/// <exception cref="ArgumentNullException"><paramref name="html"/> is null.</exception>
public static string HtmlEncode(string html)
{
    if (html == null)
    {
        throw new ArgumentNullException("html");
    }

    // Ampersands must be handled first: the later replacements introduce new ones.
    string encoded = _unescapedAmpersand.Replace(html, "&amp;");
    encoded = encoded.Replace("<", "&lt;");
    encoded = encoded.Replace(">", "&gt;");
    encoded = encoded.Replace("\"", "&quot;");
    return encoded;
}
/// <summary>
/// Detects the encoding of the HTML read from a stream.
/// </summary>
/// <param name="stream">The input stream. May not be null.</param>
/// <returns>The detected encoding, or null when none was found.</returns>
public Encoding DetectEncoding(Stream stream)
{
    if (stream == null)
    {
        throw new ArgumentNullException("stream");
    }

    StreamReader streamReader = new StreamReader(stream);
    return DetectEncoding(streamReader);
}
/// <summary>
/// Detects the encoding of an HTML file.
/// The file is opened with OptionDefaultStreamEncoding.
/// </summary>
/// <param name="path">Path for the file containing the HTML document to detect. May not be null.</param>
/// <returns>The detected encoding, or null when none was found.</returns>
public Encoding DetectEncoding(string path)
{
    if (path == null)
    {
        throw new ArgumentNullException("path");
    }

    // "using" releases the file handle even when detection throws.
    using (StreamReader sr = new StreamReader(path, OptionDefaultStreamEncoding))
    {
        return DetectEncoding(sr);
    }
}
/// <summary>
/// Detects the encoding declared by an HTML text held in a string.
/// </summary>
/// <param name="html">The input html text. May not be null.</param>
/// <returns>The detected encoding, or null when none was found.</returns>
public Encoding DetectEncodingHtml(string html)
{
    if (html == null)
    {
        throw new ArgumentNullException("html");
    }

    // "using" closes the reader on every path, including when detection throws.
    using (StringReader sr = new StringReader(html))
    {
        return DetectEncoding(sr);
    }
}
/// <summary>
/// Detects the encoding of an HTML text provided on a TextReader.
/// The whole text is read and parsed with the document switched to
/// detect-only mode; existing document state is replaced in the process.
/// </summary>
/// <param name="reader">The TextReader used to feed the HTML. May not be null.</param>
/// <returns>The detected encoding, or null when parsing completed without finding one.</returns>
public Encoding DetectEncoding(TextReader reader)
{
    if (reader == null)
    {
        throw new ArgumentNullException("reader");
    }

    _onlyDetectEncoding = true;

    // The bookkeeping tables exist only when the matching option is enabled.
    _openednodes = OptionCheckSyntax ? new Hashtable() : null;
    _nodesid = OptionUseIdAttribute ? new Hashtable() : null;

    // Only a StreamReader can report the encoding of the underlying stream.
    StreamReader streamReader = reader as StreamReader;
    _streamencoding = (streamReader != null) ? streamReader.CurrentEncoding : null;
    _declaredencoding = null;

    _text = reader.ReadToEnd();
    _documentnode = CreateNode(HtmlNodeType.Document, 0);

    // This is a hack that leaves the parsing code untouched: a detected
    // encoding reaches us as an EncodingFoundException thrown out of Parse().
    Encoding detected = null;
    try
    {
        Parse();
    }
    catch (EncodingFoundException found)
    {
        detected = found.Encoding;
    }
    return detected;
}
/// <summary>
/// Loads an HTML document from a stream, decoding it with OptionDefaultStreamEncoding.
/// </summary>
/// <param name="stream">The input stream.</param>
public void Load(Stream stream)
{
    StreamReader streamReader = new StreamReader(stream, OptionDefaultStreamEncoding);
    Load(streamReader);
}
/// <summary>
/// Loads an HTML document from a stream, optionally honouring byte order marks.
/// </summary>
/// <param name="stream">The input stream.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
public void Load(Stream stream, bool detectEncodingFromByteOrderMarks)
{
    StreamReader streamReader = new StreamReader(stream, detectEncodingFromByteOrderMarks);
    Load(streamReader);
}
/// <summary>
/// Loads an HTML document from a stream using the given character encoding.
/// </summary>
/// <param name="stream">The input stream.</param>
/// <param name="encoding">The character encoding to use.</param>
public void Load(Stream stream, Encoding encoding)
{
    StreamReader streamReader = new StreamReader(stream, encoding);
    Load(streamReader);
}
/// <summary>
/// Loads an HTML document from a stream using the given character encoding,
/// optionally honouring byte order marks.
/// </summary>
/// <param name="stream">The input stream.</param>
/// <param name="encoding">The character encoding to use.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
public void Load(Stream stream, Encoding encoding, bool detectEncodingFromByteOrderMarks)
{
    StreamReader streamReader = new StreamReader(stream, encoding, detectEncodingFromByteOrderMarks);
    Load(streamReader);
}
/// <summary>
/// Loads an HTML document from a stream using the given character encoding,
/// byte order mark handling and reader buffer size.
/// </summary>
/// <param name="stream">The input stream.</param>
/// <param name="encoding">The character encoding to use.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
/// <param name="buffersize">The minimum buffer size.</param>
public void Load(Stream stream, Encoding encoding, bool detectEncodingFromByteOrderMarks, int buffersize)
{
    StreamReader streamReader = new StreamReader(stream, encoding, detectEncodingFromByteOrderMarks, buffersize);
    Load(streamReader);
}
/// <summary>
/// Loads an HTML document from a file, decoding it with OptionDefaultStreamEncoding.
/// </summary>
/// <param name="path">The complete file path to be read. May not be null.</param>
public void Load(string path)
{
    if (path == null)
    {
        throw new ArgumentNullException("path");
    }

    // "using" releases the file handle even when loading throws.
    using (StreamReader sr = new StreamReader(path, OptionDefaultStreamEncoding))
    {
        Load(sr);
    }
}
/// <summary>
/// Loads an HTML document from a file, optionally honouring byte order marks.
/// </summary>
/// <param name="path">The complete file path to be read. May not be null.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
public void Load(string path, bool detectEncodingFromByteOrderMarks)
{
    if (path == null)
    {
        throw new ArgumentNullException("path");
    }

    // "using" releases the file handle even when loading throws.
    using (StreamReader sr = new StreamReader(path, detectEncodingFromByteOrderMarks))
    {
        Load(sr);
    }
}
/// <summary>
/// Loads an HTML document from a file using the given character encoding.
/// </summary>
/// <param name="path">The complete file path to be read. May not be null.</param>
/// <param name="encoding">The character encoding to use. May not be null.</param>
public void Load(string path, Encoding encoding)
{
    if (path == null)
    {
        throw new ArgumentNullException("path");
    }
    if (encoding == null)
    {
        throw new ArgumentNullException("encoding");
    }

    // "using" releases the file handle even when loading throws.
    using (StreamReader sr = new StreamReader(path, encoding))
    {
        Load(sr);
    }
}
/// <summary>
/// Loads an HTML document from a file using the given character encoding,
/// optionally honouring byte order marks.
/// </summary>
/// <param name="path">The complete file path to be read. May not be null.</param>
/// <param name="encoding">The character encoding to use. May not be null.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
public void Load(string path, Encoding encoding, bool detectEncodingFromByteOrderMarks)
{
    if (path == null)
    {
        throw new ArgumentNullException("path");
    }
    if (encoding == null)
    {
        throw new ArgumentNullException("encoding");
    }

    // "using" releases the file handle even when loading throws.
    using (StreamReader sr = new StreamReader(path, encoding, detectEncodingFromByteOrderMarks))
    {
        Load(sr);
    }
}
/// <summary>
/// Loads an HTML document from a file using the given character encoding,
/// byte order mark handling and reader buffer size.
/// </summary>
/// <param name="path">The complete file path to be read. May not be null.</param>
/// <param name="encoding">The character encoding to use. May not be null.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
/// <param name="buffersize">The minimum buffer size.</param>
public void Load(string path, Encoding encoding, bool detectEncodingFromByteOrderMarks, int buffersize)
{
    if (path == null)
    {
        throw new ArgumentNullException("path");
    }
    if (encoding == null)
    {
        throw new ArgumentNullException("encoding");
    }

    // "using" releases the file handle even when loading throws.
    using (StreamReader sr = new StreamReader(path, encoding, detectEncodingFromByteOrderMarks, buffersize))
    {
        Load(sr);
    }
}
/// <summary>
/// Loads the HTML document from the specified string.
/// </summary>
/// <param name="html">String containing the HTML document to load. May not be null.</param>
public void LoadHtml(string html)
{
    if (html == null)
    {
        throw new ArgumentNullException("html");
    }

    // "using" closes the reader on every path, including when loading throws.
    using (StringReader sr = new StringReader(html))
    {
        Load(sr);
    }
}
/// <summary>
/// Detects the encoding of an HTML file and then loads that file.
/// Equivalent to calling the two-argument overload with detection enabled.
/// </summary>
/// <param name="path">The complete file path to be read.</param>
public void DetectEncodingAndLoad(string path)
{
    const bool detect = true;
    DetectEncodingAndLoad(path, detect);
}
/// <summary>
/// Optionally detects the encoding of an HTML file, then loads the file.
/// When detection is disabled or finds nothing, the file is loaded with the
/// default Load(path) behaviour; otherwise it is loaded with the detected encoding.
/// </summary>
/// <param name="path">The complete file path to be read. May not be null.</param>
/// <param name="detectEncoding">true to detect encoding, false otherwise.</param>
public void DetectEncodingAndLoad(string path, bool detectEncoding)
{
    if (path == null)
    {
        throw new ArgumentNullException("path");
    }

    System.Text.Encoding detected = detectEncoding ? DetectEncoding(path) : null;
    if (detected != null)
    {
        Load(path, detected);
        return;
    }
    Load(path);
}
/// <summary>
/// Loads the HTML document from the specified TextReader.
/// The reader is consumed to its end, the text is parsed, and, when
/// OptionCheckSyntax is set, a TagNotClosed error is recorded for every node
/// that is still open after parsing.
/// </summary>
/// <param name="reader">The TextReader used to feed the HTML data into the document. May not be null.</param>
public void Load(TextReader reader)
{
    // Every other Load overload ends up here.
    if (reader == null)
    {
        throw new ArgumentNullException("reader");
    }

    _onlyDetectEncoding = false;

    // The bookkeeping tables exist only when the matching option is enabled.
    _openednodes = OptionCheckSyntax ? new Hashtable() : null;
    _nodesid = OptionUseIdAttribute ? new Hashtable() : null;

    StreamReader streamReader = reader as StreamReader;
    if (streamReader == null)
    {
        _streamencoding = null;
    }
    else
    {
        try
        {
            // peek so that the reader processes a byte order mark, if needed
            streamReader.Peek();
        }
        catch
        {
            // ignored on purpose
        }
        _streamencoding = streamReader.CurrentEncoding;
    }
    _declaredencoding = null;

    _text = reader.ReadToEnd();
    _documentnode = CreateNode(HtmlNodeType.Document, 0);
    Parse();

    if (!OptionCheckSyntax)
    {
        return;
    }

    foreach (HtmlNode openNode in _openednodes.Values)
    {
        // nodes without the start-tag flag have already been reported
        if (openNode._starttag)
        {
            string excerpt = string.Empty;
            if (OptionExtractErrorSourceText)
            {
                excerpt = openNode.OuterHtml;
                if (excerpt.Length > OptionExtractErrorSourceTextMaxLength)
                {
                    excerpt = excerpt.Substring(0, OptionExtractErrorSourceTextMaxLength);
                }
            }

            AddError(
                HtmlParseErrorCode.TagNotClosed,
                openNode._line, openNode._lineposition,
                openNode._streamposition, excerpt,
                "End tag </" + openNode.Name + "> was not found");
        }
    }

    // the table is of no further use once the errors are recorded
    _openednodes.Clear();
}
// Chooses the encoding used for output: the declared encoding when there is
// one, otherwise the stream encoding, otherwise OptionDefaultStreamEncoding.
internal System.Text.Encoding GetOutEncoding()
{
    if (_declaredencoding != null)
    {
        return _declaredencoding;
    }

    return (_streamencoding != null) ? _streamencoding : OptionDefaultStreamEncoding;
}
/// <summary>
/// Gets the encoding the document uses for output, as chosen by GetOutEncoding.
/// </summary>
public System.Text.Encoding Encoding
{
    get { return GetOutEncoding(); }
}
/// <summary>
/// Saves the HTML document to the specified stream, using the document's output encoding.
/// The stream is left open; only the buffered text is flushed into it.
/// </summary>
/// <param name="outStream">The stream to which you want to save. May not be null.</param>
public void Save(Stream outStream)
{
    if (outStream == null)
    {
        throw new ArgumentNullException("outStream");
    }

    StreamWriter sw = new StreamWriter(outStream, GetOutEncoding());
    Save(sw);

    // StreamWriter buffers its output; without a flush the tail of the document
    // (or all of a small one) may never reach the stream. The writer is not
    // closed because that would also close the caller's stream.
    sw.Flush();
}
/// <summary>
/// Saves the HTML document to the specified stream, using the given encoding.
/// The stream is left open; only the buffered text is flushed into it.
/// </summary>
/// <param name="outStream">The stream to which you want to save. May not be null.</param>
/// <param name="encoding">The character encoding to use. May not be null.</param>
public void Save(Stream outStream, System.Text.Encoding encoding)
{
    if (outStream == null)
    {
        throw new ArgumentNullException("outStream");
    }
    if (encoding == null)
    {
        throw new ArgumentNullException("encoding");
    }

    StreamWriter sw = new StreamWriter(outStream, encoding);
    Save(sw);

    // StreamWriter buffers its output; without a flush the tail of the document
    // (or all of a small one) may never reach the stream. The writer is not
    // closed because that would also close the caller's stream.
    sw.Flush();
}
/// <summary>
/// Saves the document to the specified file, using the document's output encoding.
/// An existing file is overwritten.
/// </summary>
/// <param name="filename">The location of the file where you want to save the document. May not be null.</param>
public void Save(string filename)
{
    if (filename == null)
    {
        throw new ArgumentNullException("filename");
    }

    // "using" flushes and closes the file even when writing throws.
    using (StreamWriter sw = new StreamWriter(filename, false, GetOutEncoding()))
    {
        Save(sw);
    }
}
/// <summary>
/// Saves the document to the specified file, using the given encoding.
/// An existing file is overwritten.
/// </summary>
/// <param name="filename">The location of the file where you want to save the document. May not be null.</param>
/// <param name="encoding">The character encoding to use. May not be null.</param>
public void Save(string filename, System.Text.Encoding encoding)
{
    if (filename == null)
    {
        throw new ArgumentNullException("filename");
    }
    if (encoding == null)
    {
        throw new ArgumentNullException("encoding");
    }

    // "using" flushes and closes the file even when writing throws.
    using (StreamWriter sw = new StreamWriter(filename, false, encoding))
    {
        Save(sw);
    }
}
/// <summary>
/// Saves the HTML document to the specified StreamWriter.
/// This simply forwards to the TextWriter overload.
/// </summary>
/// <param name="writer">The StreamWriter to which you want to save.</param>
public void Save(StreamWriter writer)
{
    TextWriter textWriter = writer;
    Save(textWriter);
}
/// <summary>
/// Saves the HTML document to the specified TextWriter.
/// The writer is flushed afterwards but left open.
/// </summary>
/// <param name="writer">The TextWriter to which you want to save. May not be null.</param>
public void Save(TextWriter writer)
{
    if (writer == null)
    {
        throw new ArgumentNullException("writer");
    }

    DocumentNode.WriteTo(writer);

    // Flush explicitly, as the XmlWriter overload does, so that callers handing
    // in a buffered writer (including the Stream overloads) do not lose output.
    writer.Flush();
}
/// <summary>
/// Saves the HTML document to the specified XmlWriter and flushes it.
/// </summary>
/// <param name="writer">The XmlWriter to which you want to save. May not be null.</param>
public void Save(XmlWriter writer)
{
    // Fail with the same argument exception as the TextWriter overload.
    if (writer == null)
    {
        throw new ArgumentNullException("writer");
    }

    DocumentNode.WriteTo(writer);
    writer.Flush();
}
/// <summary>
/// Creates a new XPathNavigator object for navigating this HTML document.
/// </summary>
/// <returns>An XPathNavigator created for this document and its root node.</returns>
public XPathNavigator CreateNavigator()
{
    HtmlNodeNavigator navigator = new HtmlNodeNavigator(this, _documentnode);
    return navigator;
}
// Registers a node under an id in the id table, or removes the entry when
// node is null. Ids are stored lower-cased. Does nothing when
// OptionUseIdAttribute is off, when there is no id table, or when id is null.
internal void SetIdForNode(HtmlNode node, string id)
{
    if (!OptionUseIdAttribute || _nodesid == null || id == null)
    {
        return;
    }

    string key = id.ToLower();
    if (node != null)
    {
        _nodesid[key] = node;
    }
    else
    {
        _nodesid.Remove(key);
    }
}
/// <summary>
/// Gets the HTML node with the specified 'id' attribute value.
/// The lookup uses the lower-cased id.
/// </summary>
/// <param name="id">The attribute id to match. May not be null.</param>
/// <returns>The HTML node with the matching id or null if not found.</returns>
public HtmlNode GetElementbyId(string id)
{
    if (id == null)
    {
        throw new ArgumentNullException("id");
    }

    // there is no id table to search when id tracking is switched off
    if (_nodesid == null)
    {
        throw new Exception(HtmlExceptionUseIdAttributeFalse);
    }

    string key = id.ToLower();
    return _nodesid[key] as HtmlNode;
}
/// <summary>
/// Creates an HTML element node with the specified name.
/// </summary>
/// <param name="name">The qualified name of the element. May not be null.</param>
/// <returns>The new HTML node.</returns>
public HtmlNode CreateElement(string name)
{
    if (name == null)
    {
        throw new ArgumentNullException("name");
    }

    HtmlNode element = CreateNode(HtmlNodeType.Element);
    element._name = name;
    return element;
}
/// <summary>
/// Creates an HTML comment node.
/// </summary>
/// <returns>The new HTML comment node.</returns>
public HtmlCommentNode CreateComment()
{
    HtmlNode created = CreateNode(HtmlNodeType.Comment);
    return (HtmlCommentNode)created;
}
/// <summary>
/// Creates an HTML comment node with the specified comment text.
/// </summary>
/// <param name="comment">The comment text. May not be null.</param>
/// <returns>The new HTML comment node.</returns>
public HtmlCommentNode CreateComment(string comment)
{
    if (comment == null)
    {
        throw new ArgumentNullException("comment");
    }

    HtmlCommentNode commentNode = CreateComment();
    commentNode.Comment = comment;
    return commentNode;
}
/// <summary>
/// Creates an HTML text node.
/// </summary>
/// <returns>The new HTML text node.</returns>
public HtmlTextNode CreateTextNode()
{
    HtmlNode created = CreateNode(HtmlNodeType.Text);
    return (HtmlTextNode)created;
}
/// <summary>
/// Creates an HTML text node with the specified text.
/// </summary>
/// <param name="text">The text of the node. May not be null.</param>
/// <returns>The new HTML text node.</returns>
public HtmlTextNode CreateTextNode(string text)
{
    if (text == null)
    {
        throw new ArgumentNullException("text");
    }

    HtmlTextNode textNode = CreateTextNode();
    textNode.Text = text;
    return textNode;
}
// Creates a node of the given type, passing -1 as its index.
internal HtmlNode CreateNode(HtmlNodeType type)
{
    const int noIndex = -1;
    return CreateNode(type, noIndex);
}
// Creates a node of the given type for this document with the given index.
// Comment and text nodes get their dedicated classes; every other type is a
// plain HtmlNode.
internal HtmlNode CreateNode(HtmlNodeType type, int index)
{
    if (type == HtmlNodeType.Comment)
    {
        return new HtmlCommentNode(this, index);
    }

    if (type == HtmlNodeType.Text)
    {
        return new HtmlTextNode(this, index);
    }

    return new HtmlNode(type, this, index);
}
// Creates an attribute for this document; no name or value is set here.
internal HtmlAttribute CreateAttribute()
{
    HtmlAttribute attribute = new HtmlAttribute(this);
    return attribute;
}
/// <summary>
/// Creates an HTML attribute with the specified name.
/// </summary>
/// <param name="name">The name of the attribute. May not be null.</param>
/// <returns>The new HTML attribute.</returns>
public HtmlAttribute CreateAttribute(string name)
{
    if (name == null)
    {
        throw new ArgumentNullException("name");
    }

    HtmlAttribute attribute = CreateAttribute();
    attribute.Name = name;
    return attribute;
}
/// <summary>
/// Creates an HTML attribute with the specified name and value.
/// </summary>
/// <param name="name">The name of the attribute. May not be null.</param>
/// <param name="value">The value of the attribute.</param>
/// <returns>The new HTML attribute.</returns>
public HtmlAttribute CreateAttribute(string name, string value)
{
    if (name == null)
    {
        throw new ArgumentNullException("name");
    }

    HtmlAttribute attribute = CreateAttribute(name);
    attribute.Value = value;
    return attribute;
}
/// <summary>
/// Gets the root node of the document.
/// </summary>
public HtmlNode DocumentNode
{
    get { return _documentnode; }
}
/// <summary>
/// Gets the document CRC32 checksum if OptionComputeChecksum was set to true before parsing, 0 otherwise.
/// </summary>
public int CheckSum
{
    get
    {
        // no checksum object means no checksum was computed
        return (_crc32 == null) ? 0 : (int)_crc32.CheckSum;
    }
}
// Builds a parse error from the given details, appends it to the document's
// error list and returns it.
private HtmlParseError AddError(
    HtmlParseErrorCode code,
    int line,
    int linePosition,
    int streamPosition,
    string sourceText,
    string reason)
{
    HtmlParseError error = new HtmlParseError(
        code,
        line,
        linePosition,
        streamPosition,
        sourceText,
        reason);
    _parseerrors.Add(error);
    return error;
}
// The states of the parser's state machine. The parser keeps the current
// state, and a previously saved one, in fields of this type.
private enum ParseState
{
    Text,
    WhichTag,
    Tag,
    BetweenAttributes,
    EmptyTag,
    AttributeName,
    AttributeBeforeEquals,
    AttributeAfterEquals,
    AttributeValue,
    Comment,
    QuotedAttributeValue,
    ServerSideCode,
    PcData
}
// Advances the parse cursor past the current character (_c): feeds the
// character to the checksum when one is being computed, moves the index on,
// remembers the column in _maxlineposition, and updates the line and column
// counters (a line feed, code 10, starts a new line at column 1).
private void IncrementPosition()
{
    if (_crc32 != null)
    {
        // REVIEW: should we add some checksum code in DecrementPosition too?
        _crc32.AddToCRC32(_c);
    }

    _index++;
    _maxlineposition = _lineposition;

    if (_c != 10)
    {
        _lineposition++;
        return;
    }

    _line++;
    _lineposition = 1;
}
// Moves the parse cursor back by one character. When the cursor is at
// column 1 it returns to the previous line, restoring the column remembered
// in _maxlineposition; otherwise it just steps one column back.
private void DecrementPosition()
{
    _index--;

    if (_lineposition != 1)
    {
        _lineposition--;
        return;
    }

    _line--;
    _lineposition = _maxlineposition;
}
1132 private void Parse()
1134 int lastquote = 0;
1135 if (OptionComputeChecksum)
1137 _crc32 = new Crc32();
1140 _lastnodes = new Hashtable();
1141 _c = 0;
1142 _fullcomment = false;
1143 _parseerrors = new ArrayList();
1144 _line = 1;
1145 _lineposition = 1;
1146 _maxlineposition = 1;
1148 _state = ParseState.Text;
1149 _oldstate = _state;
1150 _documentnode._innerlength = _text.Length;
1151 _documentnode._outerlength = _text.Length;
1153 _lastparentnode = _documentnode;
1154 _currentnode = CreateNode(HtmlNodeType.Text, 0);
1155 _currentattribute = null;
1157 _index = 0;
1158 PushNodeStart(HtmlNodeType.Text, 0);
1159 while (_index<_text.Length)
1161 _c = _text[_index];
1162 IncrementPosition();
1164 switch(_state)
1166 case ParseState.Text:
1167 if (NewCheck())
1168 continue;
1169 break;
1171 case ParseState.WhichTag:
1172 if (NewCheck())
1173 continue;
1174 if (_c == '/')
1176 PushNodeNameStart(false, _index);
1178 else
1180 PushNodeNameStart(true, _index-1);
1181 DecrementPosition();
1183 _state = ParseState.Tag;
1184 break;
1186 case ParseState.Tag:
1187 if (NewCheck())
1188 continue;
1189 if (IsWhiteSpace(_c))
1191 PushNodeNameEnd(_index-1);
1192 if (_state != ParseState.Tag)
1193 continue;
1194 _state = ParseState.BetweenAttributes;
1195 continue;
1197 if (_c == '/')
1199 PushNodeNameEnd(_index-1);
1200 if (_state != ParseState.Tag)
1201 continue;
1202 _state = ParseState.EmptyTag;
1203 continue;
1205 if (_c == '>')
1207 PushNodeNameEnd(_index-1);
1208 if (_state != ParseState.Tag)
1209 continue;
1210 PushNodeEnd(_index, false);
1211 if (_state != ParseState.Tag)
1212 continue;
1213 _state = ParseState.Text;
1214 PushNodeStart(HtmlNodeType.Text, _index);
1216 break;
1218 case ParseState.BetweenAttributes:
1219 if (NewCheck())
1220 continue;
1222 if (IsWhiteSpace(_c))
1223 continue;
1225 if ((_c == '/') || (_c == '?'))
1227 _state = ParseState.EmptyTag;
1228 continue;
1231 if (_c == '>')
1233 PushNodeEnd(_index, false);
1234 if (_state != ParseState.BetweenAttributes)
1235 continue;
1236 _state = ParseState.Text;
1237 PushNodeStart(HtmlNodeType.Text, _index);
1238 continue;
1241 PushAttributeNameStart(_index-1);
1242 _state = ParseState.AttributeName;
1243 break;
1245 case ParseState.EmptyTag:
1246 if (NewCheck())
1247 continue;
1249 if (_c == '>')
1251 PushNodeEnd(_index, true);
1252 if (_state != ParseState.EmptyTag)
1253 continue;
1254 _state = ParseState.Text;
1255 PushNodeStart(HtmlNodeType.Text, _index);
1256 continue;
1258 _state = ParseState.BetweenAttributes;
1259 break;
1261 case ParseState.AttributeName:
1262 if (NewCheck())
1263 continue;
1265 if (IsWhiteSpace(_c))
1267 PushAttributeNameEnd(_index-1);
1268 _state = ParseState.AttributeBeforeEquals;
1269 continue;
1271 if (_c == '=')
1273 PushAttributeNameEnd(_index-1);
1274 _state = ParseState.AttributeAfterEquals;
1275 continue;
1277 if (_c == '>')
1279 PushAttributeNameEnd(_index-1);
1280 PushNodeEnd(_index, false);
1281 if (_state != ParseState.AttributeName)
1282 continue;
1283 _state = ParseState.Text;
1284 PushNodeStart(HtmlNodeType.Text, _index);
1285 continue;
1287 break;
1289 case ParseState.AttributeBeforeEquals:
1290 if (NewCheck())
1291 continue;
1293 if (IsWhiteSpace(_c))
1294 continue;
1295 if (_c == '>')
1297 PushNodeEnd(_index, false);
1298 if (_state != ParseState.AttributeBeforeEquals)
1299 continue;
1300 _state = ParseState.Text;
1301 PushNodeStart(HtmlNodeType.Text, _index);
1302 continue;
1304 if (_c == '=')
1306 _state = ParseState.AttributeAfterEquals;
1307 continue;
1309 // no equals, no whitespace, it's a new attribute starting
1310 _state = ParseState.BetweenAttributes;
1311 DecrementPosition();
1312 break;
1314 case ParseState.AttributeAfterEquals:
1315 if (NewCheck())
1316 continue;
1318 if (IsWhiteSpace(_c))
1319 continue;
1321 if ((_c == '\'') || (_c == '"'))
1323 _state = ParseState.QuotedAttributeValue;
1324 PushAttributeValueStart(_index);
1325 lastquote = _c;
1326 continue;
1328 if (_c == '>')
1330 PushNodeEnd(_index, false);
1331 if (_state != ParseState.AttributeAfterEquals)
1332 continue;
1333 _state = ParseState.Text;
1334 PushNodeStart(HtmlNodeType.Text, _index);
1335 continue;
1337 PushAttributeValueStart(_index-1);
1338 _state = ParseState.AttributeValue;
1339 break;
1341 case ParseState.AttributeValue:
1342 if (NewCheck())
1343 continue;
1345 if (IsWhiteSpace(_c))
1347 PushAttributeValueEnd(_index-1);
1348 _state = ParseState.BetweenAttributes;
1349 continue;
1352 if (_c == '>')
1354 PushAttributeValueEnd(_index-1);
1355 PushNodeEnd(_index, false);
1356 if (_state != ParseState.AttributeValue)
1357 continue;
1358 _state = ParseState.Text;
1359 PushNodeStart(HtmlNodeType.Text, _index);
1360 continue;
1362 break;
1364 case ParseState.QuotedAttributeValue:
1365 if (_c == lastquote)
1367 PushAttributeValueEnd(_index-1);
1368 _state = ParseState.BetweenAttributes;
1369 continue;
1371 if (_c == '<')
1373 if (_index<_text.Length)
1375 if (_text[_index] == '%')
1377 _oldstate = _state;
1378 _state = ParseState.ServerSideCode;
1379 continue;
1383 break;
1385 case ParseState.Comment:
1386 if (_c == '>')
1388 if (_fullcomment)
1390 if ((_text[_index-2] != '-') ||
1391 (_text[_index-3] != '-'))
1393 continue;
1396 PushNodeEnd(_index, false);
1397 _state = ParseState.Text;
1398 PushNodeStart(HtmlNodeType.Text, _index);
1399 continue;
1401 break;
1403 case ParseState.ServerSideCode:
1404 if (_c == '%')
1406 if (_index<_text.Length)
1408 if (_text[_index] == '>')
1410 switch(_oldstate)
1412 case ParseState.AttributeAfterEquals:
1413 _state = ParseState.AttributeValue;
1414 break;
1416 case ParseState.BetweenAttributes:
1417 PushAttributeNameEnd(_index+1);
1418 _state = ParseState.BetweenAttributes;
1419 break;
1421 default:
1422 _state = _oldstate;
1423 break;
1425 IncrementPosition();
1429 break;
1431 case ParseState.PcData:
1432 // look for </tag + 1 char
1434 // check buffer end
1435 if ((_currentnode._namelength+3)<=(_text.Length-(_index-1)))
1437 if (string.Compare(_text.Substring(_index-1, _currentnode._namelength+2),
1438 "</" + _currentnode.Name, true) == 0)
1440 int c = _text[_index-1 + 2 + _currentnode.Name.Length];
1441 if ((c == '>') || (IsWhiteSpace(c)))
1443 // add the script as a text node
1444 HtmlNode script = CreateNode(HtmlNodeType.Text,
1445 _currentnode._outerstartindex + _currentnode._outerlength);
1446 script._outerlength = _index-1 - script._outerstartindex;
1447 _currentnode.AppendChild(script);
1450 PushNodeStart(HtmlNodeType.Element, _index-1);
1451 PushNodeNameStart(false, _index-1 +2);
1452 _state = ParseState.Tag;
1453 IncrementPosition();
1457 break;
1461 // finish the current work
1462 if (_currentnode._namestartindex > 0)
1464 PushNodeNameEnd(_index);
1466 PushNodeEnd(_index, false);
1468 // we don't need this anymore
1469 _lastnodes.Clear();
/// <summary>
/// Reacts to a '&lt;' held in the current character (_c): enters server-side
/// code on "&lt;%", opens a comment node on "&lt;!", or opens a new element node.
/// </summary>
/// <returns>
/// false when _c is not '&lt;' (nothing is touched); true once the parser
/// state has been switched for the new construct.
/// </returns>
private bool NewCheck()
{
    if (_c != '<')
    {
        return false;
    }

    // "<%": finish whatever bookkeeping the current state needs, then
    // remember that state and jump into server-side code
    if ((_index < _text.Length) && (_text[_index] == '%'))
    {
        if (_state == ParseState.AttributeAfterEquals)
        {
            PushAttributeValueStart(_index - 1);
        }
        else if (_state == ParseState.BetweenAttributes)
        {
            PushAttributeNameStart(_index - 1);
        }
        else if (_state == ParseState.WhichTag)
        {
            PushNodeNameStart(true, _index - 1);
            _state = ParseState.Tag;
        }

        _oldstate = _state;
        _state = ParseState.ServerSideCode;
        return true;
    }

    // any other '<' ends the node in progress
    PushNodeEnd(_index - 1, true);
    _state = ParseState.WhichTag;

    bool startsWithBang = ((_index - 1) <= (_text.Length - 2)) && (_text[_index] == '!');
    if (!startsWithBang)
    {
        PushNodeStart(HtmlNodeType.Element, _index - 1);
        return true;
    }

    // "<!": the node is handled as a comment
    PushNodeStart(HtmlNodeType.Comment, _index - 1);
    PushNodeNameStart(true, _index);
    PushNodeNameEnd(_index + 1);
    _state = ParseState.Comment;

    // _fullcomment is only refreshed when two more characters are available;
    // otherwise it keeps its previous value
    if (_index < (_text.Length - 2))
    {
        _fullcomment = (_text[_index + 1] == '-') && (_text[_index + 2] == '-');
    }
    return true;
}
/// <summary>
/// Looks for an encoding declaration of the form
/// &lt;meta http-equiv="content-type" content="text/html;charset=iso-8859-1" /&gt;
/// on the given node and, when one is found, stores it in _declaredencoding.
/// Does nothing when OptionReadEncoding is off.
/// </summary>
/// <param name="node">The node to inspect. It is called from node-end handling,
/// so its attributes are already populated.</param>
/// <exception cref="EncodingFoundException">Thrown on purpose, carrying the
/// declared encoding, when _onlyDetectEncoding is set.</exception>
private void ReadDocumentEncoding(HtmlNode node)
{
    if (!OptionReadEncoding)
    {
        return;
    }

    // quick length check first, avoids a string allocation for most nodes
    if (node._namelength != 4)
    {
        return;
    }

    // all node names are lowercase
    if (node.Name != "meta")
    {
        return;
    }

    HtmlAttribute att = node.Attributes["http-equiv"];
    if (att == null)
    {
        return;
    }
    if (string.Compare(att.Value, "content-type", true) != 0)
    {
        return;
    }

    HtmlAttribute content = node.Attributes["content"];
    if (content == null)
    {
        return;
    }

    string charset = NameValuePairList.GetNameValuePairsValue(content.Value, "charset");
    if (charset == null)
    {
        return;
    }

    // The charset comes straight from the document and may be empty, misspelled
    // or unknown to the runtime. Encoding.GetEncoding throws for such names; a
    // bad declaration must not abort parsing, so it is treated as if no charset
    // had been declared and _declaredencoding keeps its previous value.
    try
    {
        _declaredencoding = Encoding.GetEncoding(charset);
    }
    catch (ArgumentException)
    {
        return;
    }
    catch (NotSupportedException)
    {
        // documented for the code-page overload; caught defensively in case a
        // runtime reports unsupported names this way
        return;
    }

    if (_onlyDetectEncoding)
    {
        throw new EncodingFoundException(_declaredencoding);
    }

    if (_streamencoding == null)
    {
        return;
    }

    if (_declaredencoding.WindowsCodePage != _streamencoding.WindowsCodePage)
    {
        AddError(
            HtmlParseErrorCode.CharsetMismatch,
            _line, _lineposition,
            _index, node.OuterHtml,
            "Encoding mismatch between StreamEncoding: " +
            _streamencoding.WebName + " and DeclaredEncoding: " + _declaredencoding.WebName);
    }
}
/// <summary>
/// Creates a fresh attribute, stamps it with the given name start index and the
/// current line / line position, and makes it the current attribute.
/// </summary>
/// <param name="index">Index stored both as name start and as stream position.</param>
private void PushAttributeNameStart(int index)
{
    HtmlAttribute started = CreateAttribute();
    started._namestartindex = index;
    started._streamposition = index;
    started._line = _line;
    started._lineposition = _lineposition;
    _currentattribute = started;
}
/// <summary>
/// Records the name length of the current attribute (end index minus name
/// start) and appends the attribute to the current node's attributes.
/// </summary>
/// <param name="index">Index at which the attribute name ends.</param>
private void PushAttributeNameEnd(int index)
{
    int nameLength = index - _currentattribute._namestartindex;
    _currentattribute._namelength = nameLength;
    _currentnode.Attributes.Append(_currentattribute);
}
/// <summary>
/// Stores where the value of the current attribute starts.
/// </summary>
/// <param name="index">Start index of the attribute value.</param>
private void PushAttributeValueStart(int index)
{
    HtmlAttribute target = _currentattribute;
    target._valuestartindex = index;
}
/// <summary>
/// Stores the value length of the current attribute, computed as the given end
/// index minus the previously recorded value start.
/// </summary>
/// <param name="index">Index at which the attribute value ends.</param>
private void PushAttributeValueEnd(int index)
{
    int valueLength = index - _currentattribute._valuestartindex;
    _currentattribute._valuelength = valueLength;
}
/// <summary>
/// Creates a node of the given type at the given index, stamps it with the
/// current line information and makes it the current node.
/// </summary>
/// <param name="type">Type of node to create.</param>
/// <param name="index">Index passed to CreateNode and stored as stream position.</param>
private void PushNodeStart(HtmlNodeType type, int index)
{
    HtmlNode started = CreateNode(type, index);
    started._line = _line;

    // element nodes get a line position one less than the current one
    if (type == HtmlNodeType.Element)
    {
        started._lineposition = _lineposition - 1;
    }
    else
    {
        started._lineposition = _lineposition;
    }

    started._streamposition = index;
    _currentnode = started;
}
/// <summary>
/// Finishes the current node at the given index: fixes its outer length,
/// attaches it to the last parent node and, for start tags, registers it in
/// _lastnodes and possibly makes it the new parent. Finally closes the node
/// when requested or when it is not a start tag.
/// </summary>
/// <param name="index">Index at which the node ends.</param>
/// <param name="close">true to close the node; forced to true for start tags
/// that HtmlNode reports as closed or empty elements.</param>
private void PushNodeEnd(int index, bool close)
{
    _currentnode._outerlength = index - _currentnode._outerstartindex;

    bool textOrComment =
        (_currentnode._nodetype == HtmlNodeType.Text) ||
        (_currentnode._nodetype == HtmlNodeType.Comment);

    if (textOrComment)
    {
        // zero-length text/comment nodes are simply dropped
        if (_currentnode._outerlength > 0)
        {
            _currentnode._innerlength = _currentnode._outerlength;
            _currentnode._innerstartindex = _currentnode._outerstartindex;
            if (_lastparentnode != null)
            {
                _lastparentnode.AppendChild(_currentnode);
            }
        }
    }
    else if ((_currentnode._starttag) && (_lastparentnode != _currentnode))
    {
        // attach to the parent
        if (_lastparentnode != null)
        {
            _lastparentnode.AppendChild(_currentnode);
        }

        ReadDocumentEncoding(_currentnode);

        // chain this node to the previous one with the same name
        HtmlNode sameNameBefore = (HtmlNode)_lastnodes[_currentnode.Name];
        _currentnode._prevwithsamename = sameNameBefore;
        _lastnodes[_currentnode.Name] = _currentnode;

        // documents and elements become the parent of what follows
        if ((_currentnode.NodeType == HtmlNodeType.Document) ||
            (_currentnode.NodeType == HtmlNodeType.Element))
        {
            _lastparentnode = _currentnode;
        }

        // CDATA elements: switch to PcData and leave without closing
        if (HtmlNode.IsCDataElement(CurrentNodeName()))
        {
            _state = ParseState.PcData;
            return;
        }

        if ((HtmlNode.IsClosedElement(_currentnode.Name)) ||
            (HtmlNode.IsEmptyElement(_currentnode.Name)))
        {
            close = true;
        }
    }

    if (close || !_currentnode._starttag)
    {
        CloseCurrentNode();
    }
}
/// <summary>
/// Records on the current node whether it is a start tag and where its name begins.
/// </summary>
/// <param name="starttag">Value stored in the node's start-tag flag.</param>
/// <param name="index">Start index of the node name.</param>
private void PushNodeNameStart(bool starttag, int index)
{
    HtmlNode target = _currentnode;
    target._namestartindex = index;
    target._starttag = starttag;
}
/// <summary>
/// Returns the names of the "resetter" elements for a tag name: "ul" for "li",
/// "table" for "tr", and "tr" + "table" for "th" / "td".
/// </summary>
/// <param name="name">Tag name to look up (compared case-sensitively).</param>
/// <returns>A new array of resetter names, or null for any other name.</returns>
private string[] GetResetters(string name)
{
    if (name == "li")
    {
        return new string[] { "ul" };
    }

    if (name == "tr")
    {
        return new string[] { "table" };
    }

    if ((name == "th") || (name == "td"))
    {
        return new string[] { "tr", "table" };
    }

    return null;
}
/// <summary>
/// For a start tag, looks up the resetters of the lowercased current node name
/// and hands both to FixNestedTag. Closing tags are ignored.
/// </summary>
private void FixNestedTags()
{
    // only start tags are of interest here
    if (_currentnode._starttag)
    {
        string lowered = CurrentNodeName().ToLower();
        string[] resetters = GetResetters(lowered);
        FixNestedTag(lowered, resetters);
    }
}
/// <summary>
/// Closes the last registered node with the given name when it is still open
/// and no resetter node is found for it.
/// </summary>
/// <param name="name">Key used to look up the previous node in _lastnodes.</param>
/// <param name="resetters">Resetter names; null means there is nothing to fix.</param>
private void FixNestedTag(string name, string[] resetters)
{
    if (resetters == null)
    {
        return;
    }

    // nothing to do without a previous, still-open node of the same name
    HtmlNode unclosed = (HtmlNode)_lastnodes[name];
    if ((unclosed == null) || unclosed.Closed)
    {
        return;
    }

    // a resetter node was found: leave the previous node alone
    if (FindResetterNodes(unclosed, resetters))
    {
        return;
    }

    // close the previous node with a fake closer node that is its own end node
    HtmlNode fakeCloser = new HtmlNode(unclosed.NodeType, this, -1);
    fakeCloser._endnode = fakeCloser;
    unclosed.CloseNode(fakeCloser);
}
/// <summary>
/// Tells whether FindResetterNode yields a node for at least one of the given names.
/// </summary>
/// <param name="node">Node passed through to FindResetterNode.</param>
/// <param name="names">Resetter names to try, in order; may be null.</param>
/// <returns>true on the first name that yields a resetter node; false when
/// names is null or none matches.</returns>
private bool FindResetterNodes(HtmlNode node, string[] names)
{
    if (names != null)
    {
        foreach (string candidate in names)
        {
            if (FindResetterNode(node, candidate) != null)
            {
                return true;
            }
        }
    }
    return false;
}
/// <summary>
/// Returns the last registered node with the given name, provided it is still
/// open and its stream position is not before that of the given node.
/// </summary>
/// <param name="node">Node whose stream position is the lower bound.</param>
/// <param name="name">Key used to look up the candidate in _lastnodes.</param>
/// <returns>The resetter node, or null when there is none that qualifies.</returns>
private HtmlNode FindResetterNode(HtmlNode node, string name)
{
    HtmlNode candidate = (HtmlNode)_lastnodes[name];

    bool unusable =
        (candidate == null) ||
        candidate.Closed ||
        (candidate._streamposition < node._streamposition);

    return unusable ? null : candidate;
}
/// <summary>
/// Records the name length of the current node (end index minus name start)
/// and, when OptionFixNestedTags is on, runs the nested-tag fix-up.
/// </summary>
/// <param name="index">Index at which the node name ends.</param>
private void PushNodeNameEnd(int index)
{
    int nameLength = index - _currentnode._namestartindex;
    _currentnode._namelength = nameLength;

    if (!OptionFixNestedTags)
    {
        return;
    }
    FixNestedTags();
}
/// <summary>
/// Closes the current node. When a previous node with the same name is
/// registered in _lastnodes, that node is closed with the current one (unless
/// nested-tag fixing finds a resetter, which is reported as an error).
/// Otherwise the current node is an end tag without a matching start tag and
/// is handled according to what HtmlNode reports for its name: closed element,
/// overlapping element, empty element, or plain error.
/// On success the last parent node is updated.
/// </summary>
private void CloseCurrentNode()
{
    // text or document nodes are closed by definition
    if (_currentnode.Closed)
    {
        return;
    }

    bool failed = false;

    // last node of this kind
    HtmlNode opener = (HtmlNode)_lastnodes[_currentnode.Name];

    if (opener != null)
    {
        if (OptionFixNestedTags &&
            FindResetterNodes(opener, GetResetters(_currentnode.Name)))
        {
            AddError(
                HtmlParseErrorCode.EndTagInvalidHere,
                _currentnode._line, _currentnode._lineposition,
                _currentnode._streamposition, _currentnode.OuterHtml,
                "End tag </" + _currentnode.Name + "> invalid here");
            failed = true;
        }
        else
        {
            // unregister the opener and close it with the current node
            _lastnodes[_currentnode.Name] = opener._prevwithsamename;
            opener.CloseNode(_currentnode);
        }
    }
    else if (HtmlNode.IsClosedElement(_currentnode.Name))
    {
        // </br> will be seen as <br>
        _currentnode.CloseNode(_currentnode);

        if (_lastparentnode != null)
        {
            // walk the parent's children backwards looking for a childless
            // node with the same name, stacking everything passed on the way
            HtmlNode match = null;
            Stack passed = new Stack();
            HtmlNode sibling = _lastparentnode.LastChild;
            while (sibling != null)
            {
                if ((sibling.Name == _currentnode.Name) && (!sibling.HasChildNodes))
                {
                    match = sibling;
                    break;
                }
                passed.Push(sibling);
                sibling = sibling.PreviousSibling;
            }

            if (match == null)
            {
                // no such node: add the current node to the parent
                _lastparentnode.AppendChild(_currentnode);
            }
            else
            {
                // move the stacked nodes from the parent into the match
                while (passed.Count != 0)
                {
                    HtmlNode moved = (HtmlNode)passed.Pop();
                    _lastparentnode.RemoveChild(moved);
                    match.AppendChild(moved);
                }
            }
        }
    }
    else if (HtmlNode.CanOverlapElement(_currentnode.Name))
    {
        // this is a hack: add it as a (lowercased) text node
        HtmlNode closenode = CreateNode(HtmlNodeType.Text, _currentnode._outerstartindex);
        closenode._outerlength = _currentnode._outerlength;
        HtmlTextNode asText = (HtmlTextNode)closenode;
        asText.Text = asText.Text.ToLower();
        if (_lastparentnode != null)
        {
            _lastparentnode.AppendChild(closenode);
        }
    }
    else if (HtmlNode.IsEmptyElement(_currentnode.Name))
    {
        // reported, but not treated as a failure
        AddError(
            HtmlParseErrorCode.EndTagNotRequired,
            _currentnode._line, _currentnode._lineposition,
            _currentnode._streamposition, _currentnode.OuterHtml,
            "End tag </" + _currentnode.Name + "> is not required");
    }
    else
    {
        // node cannot overlap, node is not empty
        AddError(
            HtmlParseErrorCode.TagNotOpened,
            _currentnode._line, _currentnode._lineposition,
            _currentnode._streamposition, _currentnode.OuterHtml,
            "Start tag <" + _currentnode.Name + "> was not found");
        failed = true;
    }

    if (failed)
    {
        return;
    }

    // the node is closed: move the parent pointer up where needed
    if (_lastparentnode == null)
    {
        return;
    }
    if ((!HtmlNode.IsClosedElement(_currentnode.Name)) || _currentnode._starttag)
    {
        UpdateLastParentNode();
    }
}
/// <summary>
/// Moves _lastparentnode up through its ancestors until it points at a node
/// that is not closed, falling back to the document node when the chain runs out.
/// </summary>
internal void UpdateLastParentNode()
{
    if (_lastparentnode.Closed)
    {
        _lastparentnode = _lastparentnode.ParentNode;

        // keep climbing while the ancestor is closed as well
        while ((_lastparentnode != null) && (_lastparentnode.Closed))
        {
            _lastparentnode = _lastparentnode.ParentNode;
        }
    }

    if (_lastparentnode == null)
    {
        _lastparentnode = _documentnode;
    }
}
/// <summary>
/// Extracts the name of the current attribute from the document text.
/// </summary>
/// <returns>The slice of _text delimited by the attribute's name start and name length.</returns>
private string CurrentAttributeName()
{
    int start = _currentattribute._namestartindex;
    int length = _currentattribute._namelength;
    return _text.Substring(start, length);
}
/// <summary>
/// Extracts the value of the current attribute from the document text.
/// </summary>
/// <returns>The slice of _text delimited by the attribute's value start and value length.</returns>
private string CurrentAttributeValue()
{
    int start = _currentattribute._valuestartindex;
    int length = _currentattribute._valuelength;
    return _text.Substring(start, length);
}
/// <summary>
/// Extracts the name of the current node from the document text, with its
/// original casing.
/// </summary>
/// <returns>The slice of _text delimited by the node's name start and name length.</returns>
private string CurrentNodeName()
{
    int start = _currentnode._namestartindex;
    int length = _currentnode._namelength;
    return _text.Substring(start, length);
}
/// <summary>
/// Extracts the outer text of the current node from the document text.
/// </summary>
/// <returns>The slice of _text delimited by the node's outer start and outer length.</returns>
private string CurrentNodeOuter()
{
    int start = _currentnode._outerstartindex;
    int length = _currentnode._outerlength;
    return _text.Substring(start, length);
}
/// <summary>
/// Extracts the inner text of the current node from the document text.
/// </summary>
/// <returns>The slice of _text delimited by the node's inner start and inner length.</returns>
private string CurrentNodeInner()
{
    int start = _currentnode._innerstartindex;
    int length = _currentnode._innerlength;
    return _text.Substring(start, length);
}
/// <summary>
/// Determines whether the specified character is considered a whitespace character.
/// Only tab (9), line feed (10), carriage return (13) and space (32) qualify.
/// </summary>
/// <param name="c">The character code to check.</param>
/// <returns>true if the specified character is considered a whitespace character; otherwise false.</returns>
public static bool IsWhiteSpace(int c)
{
    switch (c)
    {
        case 9:     // tab
        case 10:    // line feed
        case 13:    // carriage return
        case 32:    // space
            return true;

        default:
            return false;
    }
}
/// <summary>
/// Exception that carries an encoding. ReadDocumentEncoding throws it with the
/// declared encoding when only encoding detection was requested.
/// </summary>
internal class EncodingFoundException: Exception
{
    // the encoding handed to the constructor; never changed afterwards
    private readonly Encoding _found;

    /// <summary>
    /// Creates the exception for the given encoding.
    /// </summary>
    /// <param name="encoding">The encoding to carry; stored as is.</param>
    internal EncodingFoundException(Encoding encoding)
    {
        _found = encoding;
    }

    /// <summary>
    /// Gets the encoding this exception was created with.
    /// </summary>
    internal Encoding Encoding
    {
        get
        {
            return _found;
        }
    }
}