1 // HtmlAgilityPack V1.0 - Simon Mourier <simonm@microsoft.com>
4 Copyright (C) 2003 Simon Mourier <simonm@microsoft.com>
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions
10 1. Redistributions of source code must retain the above copyright
11 notice, this list of conditions and the following disclaimer.
12 2. Redistributions in binary form must reproduce the above copyright
13 notice, this list of conditions and the following disclaimer in the
14 documentation and/or other materials provided with the distribution.
15 3. The name of the author may not be used to endorse or promote products
16 derived from this software without specific prior written permission.
18 THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
19 IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
20 OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
21 IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
22 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
23 NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
27 THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 using System
.Diagnostics
;
34 using System
.Collections
;
35 using System
.Text
.RegularExpressions
;
37 using System
.Xml
.XPath
;
40 // Legend: SLIM=Comment added describing changes to original HtmlAgilityPack
41 // to reduce memory consumption
42 // Once the parser is free of bugs, the comments will be taken out
43 namespace HtmlAgilityPack
46 /// Represents the type of parsing error.
48 public enum HtmlParseErrorCode
51 /// A tag was not closed.
56 /// A tag was not opened.
61 /// There is a charset mismatch between stream and declared (META) encoding.
66 /// An end tag was not required.
71 /// An end tag is invalid at this position.
77 /// Represents a parsing error found during document parsing.
79 public class HtmlParseError
81 private HtmlParseErrorCode _code
;
83 private int _linePosition
;
84 private int _streamPosition
;
85 private string _sourceText
;
86 private string _reason
;
88 internal HtmlParseError(
89 HtmlParseErrorCode code
,
98 _linePosition
= linePosition
;
99 _streamPosition
= streamPosition
;
100 _sourceText
= sourceText
;
105 /// Gets the type of error.
107 public HtmlParseErrorCode Code
116 /// Gets the line number of this error in the document.
127 /// Gets the column number of this error in the document.
129 public int LinePosition
133 return _linePosition
;
138 /// Gets the absolute stream position of this error in the document, relative to the start of the document.
140 public int StreamPosition
144 return _streamPosition
;
149 /// Gets the full text of the line containing the error.
151 public string SourceText
160 /// Gets a description for the error.
171 // SLIM: creating this class to wrap around a textreader
172 // to emulate ReadToEnd () behaviour
173 class StreamAsArray
{
174 private StreamReader _reader
;
176 private int _position
;
178 private char[] _buf_previous
; // could have used only one array
179 private char[] _buf_current
; // but, this is cleaner
180 private int _block_size
;
182 public StreamAsArray (StreamReader r
)
190 _buf_previous
= new char [_block_size
];
191 _buf_current
= new char [_block_size
];
196 private void Read (bool initial
)
199 Array
.Copy (_buf_current
, _buf_previous
, _block_size
);
200 _position
+= _block_size
;
202 HtmlDocument
.Debug ("Debug: Read in buffer at:" + _position
);
204 int num_read
= _reader
.Read (_buf_current
, 0, _block_size
);
205 if (num_read
< _block_size
) {
207 _length
= _position
+ num_read
;
209 HtmlDocument
.Debug ("[" + new string (_buf_current
, 0, num_read
) + "]");
212 public bool Eof (int index
) {
214 return (index
== _length
);
216 if (index
>= _position
+ _block_size
&&
217 index
< _position
+ _block_size
+ _block_size
)
220 return (index
== _length
);
226 public new char this[int index
] {
228 if (index
>= _position
&&
229 index
< _position
+ _block_size
)
230 return _buf_current
[index
% _block_size
];
231 if (index
>= _position
- _block_size
&&
233 return _buf_previous
[ index
% _block_size
];
234 if (index
>= _position
+ _block_size
&&
235 index
< _position
+ _block_size
+ _block_size
) {
237 return _buf_current
[index
% _block_size
];
239 Console
.WriteLine ("EXCEPTION!!!");
240 throw new Exception (String
.Format ("{0} is out of current bounds:[{1}-{2}] and further than read-ahead",
242 _position
- _block_size
,
243 _position
+ _block_size
- 1));
247 // evil function ... you get what you pay for!
248 private string OutOfBandRead (int startindex
, int length
)
250 HtmlDocument
.Debug ("Out of band read! From " + startindex
+ " to " + (startindex
+ length
- 1));
251 ResetPosition (startindex
);
252 // ahh.. now we are at the correct place
253 // create a buffer of required length
254 // who cares if the buffer size does not align well
255 // with page boundary
256 char[] temp_buf
= new char [length
];
257 int num_read
= _reader
.Read (temp_buf
, 0, length
);
258 if (num_read
< length
) {
261 _length
= startindex
+ num_read
;
263 // discard data and reset stream position
264 int t
= (_eof
? _length
:_position
+ _block_size
);
266 return new String (temp_buf
);
269 // streamreader does not allow seeking
270 // seek on its basestream does not reflect the position
271 // of the reader - it is governed by the buffer size
272 // of the underlying stream
273 // :( so, read character by character from beginning ...
274 private void ResetPosition (int pos
)
276 _reader
.DiscardBufferedData ();
277 _reader
.BaseStream
.Position
= 0;
278 // read in chunks of block_size
279 int n1
= pos
/ _block_size
;
280 int n2
= pos
% _block_size
;
281 char[] tmp
= new char [_block_size
];
282 // yo ho... start reading till we have reach pos
283 // hopefully, reader will buffer itself, so we can be mean and get one char at a time
284 for (int i
= 0; i
< n1
; ++i
)
285 _reader
.Read (tmp
, 0, _block_size
);
286 for (int i
= 0; i
< n2
; ++i
)
291 public string Substring (int startindex
, int length
)
294 HtmlDocument
.Debug ("substring:" + startindex
+ " " + length
+ " " + _position
+ ":");
297 if (length
> _block_size
|| startindex
< _position
- _block_size
) {
298 return OutOfBandRead (startindex
, length
);
300 if (startindex
+ length
- 1 >= _position
+ _block_size
) {
304 if (startindex
< _position
) {
305 int len_1
= _position
- startindex
;
307 substr
= new String (_buf_previous
, _block_size
- len_1
, length
);
309 substr
= new String (_buf_previous
, _block_size
- len_1
, len_1
);
310 substr
+= new String (_buf_current
, 0, length
- len_1
);
313 substr
= new String (_buf_current
, startindex
- _position
, length
);
318 // FIXME: Is this costly ?
319 public int FullLength
{
321 return (int)_reader
.BaseStream
.Length
;
327 /// Represents a complete HTML document.
329 public class HtmlDocument
: IXPathNavigable
331 // SLIM: Make the parser event driven
332 // callback for FilterHtml
333 // return value is a way for the callback to signal to continue or stop parsing
334 public delegate bool NodeHandler (HtmlNode node
);
335 public NodeHandler ReportNode
;
336 // misnomer ... should be called event_driven_mode
337 private bool _streammode
= false;
338 private bool _stop_parsing
= false;
340 internal static readonly string HtmlExceptionRefNotChild
= "Reference node must be a child of this node";
341 internal static readonly string HtmlExceptionUseIdAttributeFalse
= "You need to set UseIdAttribute property to true to enable this feature";
343 internal Hashtable _openednodes
;
344 internal Hashtable _lastnodes
= new Hashtable();
345 internal Hashtable _nodesid
;
346 private HtmlNode _documentnode
;
347 //SLIM: internal string _text;
348 internal StreamAsArray _text
;
349 private HtmlNode _currentnode
;
350 private HtmlNode _lastparentnode
;
351 private HtmlAttribute _currentattribute
;
354 private int _lineposition
, _maxlineposition
;
356 private bool _fullcomment
;
357 private System
.Text
.Encoding _streamencoding
;
358 private System
.Text
.Encoding _declaredencoding
;
359 private ArrayList _parseerrors
= new ArrayList();
360 private ParseState _state
, _oldstate
;
361 private Crc32 _crc32
= null;
362 private bool _onlyDetectEncoding
= false;
363 private int _pcdata_quote_char
= '\0';
365 private static bool _debug
= false;
366 internal static void Debug (string s
)
369 Console
.WriteLine (s
);
375 /// Defines if a checksum must be computed for the document while parsing. Default is false.
377 public bool OptionComputeChecksum
= false;
380 /// Defines if declared encoding must be read from the document.
381 /// Declared encoding is determined using the meta http-equiv="content-type" content="text/html;charset=XXXXX" html node.
384 public bool OptionReadEncoding
= true;
388 /// Defines if non closed nodes will be checked at the end of parsing. Default is true.
390 public bool OptionCheckSyntax
= true;
393 /// Defines if the 'id' attribute must be specifically used. Default is true.
395 public bool OptionUseIdAttribute
= true;
398 /// Defines if empty nodes must be written as closed during output. Default is false.
400 public bool OptionWriteEmptyNodes
= false;
403 /// Defines if output must conform to XML, instead of HTML.
405 public bool OptionOutputAsXml
= false;
408 /// Defines if name must be output in uppercase. Default is false.
410 public bool OptionOutputUpperCase
= false;
413 /// Defines if attribute value output must be optimized (not bound with double quotes if it is possible). Default is false.
415 public bool OptionOutputOptimizeAttributeValues
= false;
418 /// Adds Debugging attributes to node. Default is false.
420 public bool OptionAddDebuggingAttributes
= false;
423 /// Defines if source text must be extracted while parsing errors.
424 /// If the document has a lot of errors, or cascading errors, parsing performance can be dramatically affected if set to true.
425 /// Default is false.
427 public bool OptionExtractErrorSourceText
= false; // turning this on can dramatically slow performance if a lot of errors are detected
430 /// Defines if closing for non closed nodes must be done at the end or directly in the document.
431 /// Setting this to true can actually change how browsers render the page. Default is false.
433 public bool OptionAutoCloseOnEnd
= false; // close errors at the end
436 /// Defines if LI, TR, TH, TD tags must be partially fixed when nesting errors are detected. Default is false.
438 public bool OptionFixNestedTags
= false; // fix li, tr, th, td tags
441 /// Defines the maximum length of source text or parse errors. Default is 100.
443 public int OptionExtractErrorSourceTextMaxLength
= 100;
446 /// Defines the default stream encoding to use. Default is System.Text.Encoding.Default.
448 // From http://www.w3.org/TR/REC-html40/charset.html
449 // The HTTP protocol ([RFC2616], section 3.7.1) mentions ISO-8859-1 as a default character encoding when the "charset" parameter is absent from the "Content-Type" header field.
450 // So, however we are still using UTF-8 for some unknown reason
451 //FIXME: Fix the default encoding!
452 public System
.Text
.Encoding OptionDefaultStreamEncoding
= Encoding
.UTF8
;
455 /// Gets a list of parse errors found in the document.
457 public ArrayList ParseErrors
/// <summary>
/// Gets the encoding recorded for the stream the document was read from.
/// </summary>
/// <remarks>
/// Returns the value of the private <c>_streamencoding</c> field as-is; the
/// property performs no computation of its own.
/// </remarks>
public System.Text.Encoding StreamEncoding
{
    get { return _streamencoding; }
}
/// <summary>
/// Gets the encoding the document declares for itself.
/// The declared encoding is taken from the
/// meta http-equiv="content-type" content="text/html;charset=XXXXX" html node.
/// </summary>
/// <remarks>
/// Returns the value of the private <c>_declaredencoding</c> field as-is.
/// </remarks>
public System.Text.Encoding DeclaredEncoding
{
    get { return _declaredencoding; }
}
/// <summary>
/// Initializes a new, empty HTML document.
/// </summary>
/// <remarks>
/// The document starts out with a single root node of type
/// <c>HtmlNodeType.Document</c> created at index 0.
/// </remarks>
public HtmlDocument()
{
    HtmlNode root = CreateNode(HtmlNodeType.Document, 0);
    _documentnode = root;
}
496 internal HtmlNode
GetXmlDeclaration()
498 if (!_documentnode
.HasChildNodes
)
503 foreach(HtmlNode node
in _documentnode
._childnodes
)
505 if (node
.Name
== "?xml") // it's ok, names are case sensitive
/// <summary>
/// Applies HTML encoding to a specified string.
/// </summary>
/// <remarks>
/// The characters <c>&amp;</c>, <c>&lt;</c>, <c>&gt;</c> and <c>"</c> are replaced by
/// their entity references. An ampersand that already starts one of the
/// entities <c>&amp;amp;</c>, <c>&amp;lt;</c>, <c>&amp;gt;</c> or <c>&amp;quot;</c>
/// (compared case-insensitively) is left alone, so text that is already encoded
/// is not encoded a second time.
/// </remarks>
/// <param name="html">The input string to encode. May not be null.</param>
/// <returns>The encoded string.</returns>
/// <exception cref="ArgumentNullException"><paramref name="html"/> is null.</exception>
public static string HtmlEncode(string html)
{
    if (html == null)
    {
        throw new ArgumentNullException("html");
    }

    // Replace '&' by "&amp;" but only once: the negative lookahead skips
    // ampersands that already introduce one of the four entities we emit.
    Regex rx = new Regex("&(?!(amp;)|(lt;)|(gt;)|(quot;))", RegexOptions.IgnoreCase);

    // The ampersand pass must run first; otherwise the '&' of the entities
    // produced for '<', '>' and '"' would be candidates for re-encoding.
    return rx.Replace(html, "&amp;")
             .Replace("<", "&lt;")
             .Replace(">", "&gt;")
             .Replace("\"", "&quot;");
}
/// <summary>
/// Detects the encoding of an HTML stream.
/// </summary>
/// <remarks>
/// The stream is wrapped in a <see cref="StreamReader"/> created with its
/// default settings, and the work is handed to the TextReader-based overload.
/// </remarks>
/// <param name="stream">The input stream. May not be null.</param>
/// <returns>The detected encoding.</returns>
/// <exception cref="ArgumentNullException"><paramref name="stream"/> is null.</exception>
public Encoding DetectEncoding(Stream stream)
{
    if (stream == null)
    {
        throw new ArgumentNullException("stream");
    }

    StreamReader wrapped = new StreamReader(stream);
    return DetectEncoding(wrapped);
}
544 /// Detects the encoding of an HTML file.
546 /// <param name="path">Path for the file containing the HTML document to detect. May not be null.</param>
547 /// <returns>The detected encoding.</returns>
548 public Encoding
DetectEncoding(string path
)
552 throw new ArgumentNullException("path");
554 StreamReader sr
= new StreamReader(path
, OptionDefaultStreamEncoding
);
555 Encoding encoding
= DetectEncoding(sr
);
561 /// Detects the encoding of an HTML text.
563 /// <param name="html">The input html text. May not be null.</param>
564 /// <returns>The detected encoding.</returns>
565 public Encoding
DetectEncodingHtml(string html
)
569 throw new ArgumentNullException("html");
571 StringReader sr
= new StringReader(html
);
572 Encoding encoding
= DetectEncoding(sr
);
578 /// Detects the encoding of an HTML text provided on a TextReader.
580 /// <param name="reader">The TextReader used to feed the HTML. May not be null.</param>
581 /// <returns>The detected encoding.</returns>
582 public Encoding
DetectEncoding(TextReader reader
)
586 throw new ArgumentNullException("reader");
588 _onlyDetectEncoding
= true;
589 if (OptionCheckSyntax
)
591 _openednodes
= new Hashtable();
598 if (OptionUseIdAttribute
)
600 _nodesid
= new Hashtable();
607 StreamReader sr
= reader
as StreamReader
;
610 _streamencoding
= sr
.CurrentEncoding
;
614 _streamencoding
= null;
616 _declaredencoding
= null;
618 // SLIM: _text = reader.ReadToEnd();
619 _text
= new StreamAsArray (sr
);
620 _documentnode
= CreateNode(HtmlNodeType
.Document
, 0);
622 // this is a hack, but it allows us not to muck with the original parsing code
627 catch(EncodingFoundException ex
)
/// <summary>
/// Loads an HTML document from a stream, decoding it with
/// <c>OptionDefaultStreamEncoding</c>.
/// </summary>
/// <param name="stream">The input stream.</param>
public void Load(Stream stream)
{
    StreamReader source = new StreamReader(stream, OptionDefaultStreamEncoding);
    Load(source);
}

/// <summary>
/// Loads an HTML document from a stream.
/// </summary>
/// <param name="stream">The input stream.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
public void Load(Stream stream, bool detectEncodingFromByteOrderMarks)
{
    StreamReader source = new StreamReader(stream, detectEncodingFromByteOrderMarks);
    Load(source);
}

/// <summary>
/// Loads an HTML document from a stream using an explicit character encoding.
/// </summary>
/// <param name="stream">The input stream.</param>
/// <param name="encoding">The character encoding to use.</param>
public void Load(Stream stream, Encoding encoding)
{
    StreamReader source = new StreamReader(stream, encoding);
    Load(source);
}

/// <summary>
/// Loads an HTML document from a stream using an explicit character encoding.
/// </summary>
/// <param name="stream">The input stream.</param>
/// <param name="encoding">The character encoding to use.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
public void Load(Stream stream, Encoding encoding, bool detectEncodingFromByteOrderMarks)
{
    StreamReader source = new StreamReader(stream, encoding, detectEncodingFromByteOrderMarks);
    Load(source);
}

/// <summary>
/// Loads an HTML document from a stream using an explicit character encoding
/// and reader buffer size.
/// </summary>
/// <param name="stream">The input stream.</param>
/// <param name="encoding">The character encoding to use.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
/// <param name="buffersize">The minimum buffer size.</param>
public void Load(Stream stream, Encoding encoding, bool detectEncodingFromByteOrderMarks, int buffersize)
{
    StreamReader source = new StreamReader(stream, encoding, detectEncodingFromByteOrderMarks, buffersize);
    Load(source);
}
688 /// Loads an HTML document from a file.
690 /// <param name="path">The complete file path to be read. May not be null.</param>
691 public void Load(string path
)
695 throw new ArgumentNullException("path");
697 StreamReader sr
= new StreamReader(path
, OptionDefaultStreamEncoding
);
703 /// Loads an HTML document from a file.
705 /// <param name="path">The complete file path to be read. May not be null.</param>
706 /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
707 public void Load(string path
, bool detectEncodingFromByteOrderMarks
)
711 throw new ArgumentNullException("path");
713 StreamReader sr
= new StreamReader(path
, detectEncodingFromByteOrderMarks
);
719 /// Loads an HTML document from a file.
721 /// <param name="path">The complete file path to be read. May not be null.</param>
722 /// <param name="encoding">The character encoding to use. May not be null.</param>
723 public void Load(string path
, Encoding encoding
)
727 throw new ArgumentNullException("path");
729 if (encoding
== null)
731 throw new ArgumentNullException("encoding");
733 StreamReader sr
= new StreamReader(path
, encoding
);
739 /// Loads an HTML document from a file.
741 /// <param name="path">The complete file path to be read. May not be null.</param>
742 /// <param name="encoding">The character encoding to use. May not be null.</param>
743 /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
744 public void Load(string path
, Encoding encoding
, bool detectEncodingFromByteOrderMarks
)
748 throw new ArgumentNullException("path");
750 if (encoding
== null)
752 throw new ArgumentNullException("encoding");
754 StreamReader sr
= new StreamReader(path
, encoding
, detectEncodingFromByteOrderMarks
);
760 /// Loads an HTML document from a file.
762 /// <param name="path">The complete file path to be read. May not be null.</param>
763 /// <param name="encoding">The character encoding to use. May not be null.</param>
764 /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
765 /// <param name="buffersize">The minimum buffer size.</param>
766 public void Load(string path
, Encoding encoding
, bool detectEncodingFromByteOrderMarks
, int buffersize
)
770 throw new ArgumentNullException("path");
772 if (encoding
== null)
774 throw new ArgumentNullException("encoding");
776 StreamReader sr
= new StreamReader(path
, encoding
, detectEncodingFromByteOrderMarks
, buffersize
);
782 /// Loads the HTML document from the specified string.
784 /// <param name="html">String containing the HTML document to load. May not be null.</param>
785 public void LoadHtml(string html
)
789 throw new ArgumentNullException("html");
791 StringReader sr
= new StringReader(html
);
/// <summary>
/// Detects the encoding of an HTML document from a file first, and then loads the file.
/// </summary>
/// <remarks>
/// Shorthand for calling the two-argument overload with encoding detection
/// switched on.
/// </remarks>
/// <param name="path">The complete file path to be read.</param>
public void DetectEncodingAndLoad(string path)
{
    const bool detectEncoding = true;
    DetectEncodingAndLoad(path, detectEncoding);
}
806 /// Detects the encoding of an HTML document from a file first, and then loads the file.
808 /// <param name="path">The complete file path to be read. May not be null.</param>
809 /// <param name="detectEncoding">true to detect encoding, false otherwise.</param>
810 public void DetectEncodingAndLoad(string path
, bool detectEncoding
)
814 throw new ArgumentNullException("path");
816 System
.Text
.Encoding enc
;
819 enc
= DetectEncoding(path
);
837 /// Loads the HTML document from the specified TextReader.
839 /// <param name="reader">The TextReader used to feed the HTML data into the document. May not be null.</param>
840 public void Load(TextReader reader
)
842 // all Load methods pass down to this one
845 throw new ArgumentNullException("reader");
848 _onlyDetectEncoding
= false;
850 if (OptionCheckSyntax
)
852 _openednodes
= new Hashtable();
859 if (OptionUseIdAttribute
)
861 _nodesid
= new Hashtable();
868 StreamReader sr
= reader
as StreamReader
;
873 // trigger bom read if needed
880 _streamencoding
= sr
.CurrentEncoding
;
884 _streamencoding
= null;
886 _declaredencoding
= null;
888 // SLIM: _text = reader.ReadToEnd();
889 _text
= new StreamAsArray (sr
);
890 _documentnode
= CreateNode(HtmlNodeType
.Document
, 0);
893 if (OptionCheckSyntax
)
895 foreach(HtmlNode node
in _openednodes
.Values
)
897 if (!node
._starttag
) // already reported
903 if (OptionExtractErrorSourceText
)
905 html
= node
.OuterHtml
;
906 if (html
.Length
> OptionExtractErrorSourceTextMaxLength
)
908 html
= html
.Substring(0, OptionExtractErrorSourceTextMaxLength
);
916 HtmlParseErrorCode
.TagNotClosed
,
917 node
._line
, node
._lineposition
,
918 node
._streamposition
, html
,
919 "End tag </" + node
.Name
+ "> was not found");
922 // we don't need this anymore
923 _openednodes
.Clear();
/// <summary>
/// Picks the encoding to use when the document is written out.
/// </summary>
/// <returns>
/// The declared encoding when one is set; otherwise the stream encoding when
/// one is set; otherwise <c>OptionDefaultStreamEncoding</c>.
/// </returns>
internal System.Text.Encoding GetOutEncoding()
{
    // Candidates are tried in priority order; the first non-null one wins.
    System.Text.Encoding chosen = _declaredencoding;
    if (chosen == null)
    {
        chosen = _streamencoding;
    }
    if (chosen == null)
    {
        chosen = OptionDefaultStreamEncoding;
    }
    return chosen;
}
/// <summary>
/// Gets the document's output encoding.
/// </summary>
/// <remarks>
/// The value is resolved on every access by calling <c>GetOutEncoding()</c>.
/// </remarks>
public System.Text.Encoding Encoding
{
    get
    {
        System.Text.Encoding output = GetOutEncoding();
        return output;
    }
}
957 /// Saves the HTML document to the specified stream.
959 /// <param name="outStream">The stream to which you want to save.</param>
960 public void Save(Stream outStream
)
962 StreamWriter sw
= new StreamWriter(outStream
, GetOutEncoding());
967 /// Saves the HTML document to the specified stream.
969 /// <param name="outStream">The stream to which you want to save. May not be null.</param>
970 /// <param name="encoding">The character encoding to use. May not be null.</param>
971 public void Save(Stream outStream
, System
.Text
.Encoding encoding
)
973 if (outStream
== null)
975 throw new ArgumentNullException("outStream");
977 if (encoding
== null)
979 throw new ArgumentNullException("encoding");
981 StreamWriter sw
= new StreamWriter(outStream
, encoding
);
986 /// Saves the mixed document to the specified file.
988 /// <param name="filename">The location of the file where you want to save the document.</param>
989 public void Save(string filename
)
991 StreamWriter sw
= new StreamWriter(filename
, false, GetOutEncoding());
997 /// Saves the mixed document to the specified file.
999 /// <param name="filename">The location of the file where you want to save the document. May not be null.</param>
1000 /// <param name="encoding">The character encoding to use. May not be null.</param>
1001 public void Save(string filename
, System
.Text
.Encoding encoding
)
1003 if (filename
== null)
1005 throw new ArgumentNullException("filename");
1007 if (encoding
== null)
1009 throw new ArgumentNullException("encoding");
1011 StreamWriter sw
= new StreamWriter(filename
, false, encoding
);
/// <summary>
/// Saves the HTML document to the specified StreamWriter.
/// </summary>
/// <remarks>
/// The writer is treated as a plain <see cref="TextWriter"/> and passed to the
/// TextWriter-based overload.
/// </remarks>
/// <param name="writer">The StreamWriter to which you want to save.</param>
public void Save(StreamWriter writer)
{
    // Typing the local as TextWriter selects the TextWriter overload
    // instead of recursing into this one.
    TextWriter target = writer;
    Save(target);
}
/// <summary>
/// Saves the HTML document to the specified TextWriter.
/// </summary>
/// <remarks>
/// Writing is done by the document's root node (<c>DocumentNode.WriteTo</c>).
/// </remarks>
/// <param name="writer">The TextWriter to which you want to save. May not be null.</param>
/// <exception cref="ArgumentNullException"><paramref name="writer"/> is null.</exception>
public void Save(TextWriter writer)
{
    if (writer == null)
    {
        throw new ArgumentNullException("writer");
    }

    HtmlNode root = DocumentNode;
    root.WriteTo(writer);
}
1039 /// Saves the HTML document to the specified XmlWriter.
1041 /// <param name="writer">The XmlWriter to which you want to save.</param>
1042 public void Save(XmlWriter writer
)
1044 DocumentNode
.WriteTo(writer
);
/// <summary>
/// Creates a new XPathNavigator object for navigating this HTML document.
/// </summary>
/// <returns>
/// An XPathNavigator object. The XPathNavigator is positioned on the root of the document.
/// </returns>
public XPathNavigator CreateNavigator()
{
    // The navigator is handed both the owning document and its root node.
    HtmlNodeNavigator navigator = new HtmlNodeNavigator(this, _documentnode);
    return navigator;
}
1057 internal void SetIdForNode(HtmlNode node
, string id
)
1059 if (!OptionUseIdAttribute
)
1064 if ((_nodesid
== null) || (id
== null))
1071 _nodesid
.Remove(id
.ToLower());
1075 _nodesid
[id
.ToLower()] = node
;
/// <summary>
/// Gets the HTML node with the specified 'id' attribute value.
/// </summary>
/// <remarks>
/// The id is lower-cased before the lookup in the id table.
/// </remarks>
/// <param name="id">The attribute id to match. May not be null.</param>
/// <returns>The HTML node with the matching id or null if not found.</returns>
/// <exception cref="ArgumentNullException"><paramref name="id"/> is null.</exception>
/// <exception cref="Exception">The id table does not exist (the message asks for UseIdAttribute to be enabled).</exception>
public HtmlNode GetElementbyId(string id)
{
    if (id == null)
    {
        throw new ArgumentNullException("id");
    }

    if (_nodesid == null)
    {
        throw new Exception(HtmlExceptionUseIdAttributeFalse);
    }

    string key = id.ToLower();
    object match = _nodesid[key];
    return match as HtmlNode;
}
1099 /// Creates an HTML element node with the specified name.
1101 /// <param name="name">The qualified name of the element. May not be null.</param>
1102 /// <returns>The new HTML node.</returns>
1103 public HtmlNode
CreateElement(string name
)
1107 throw new ArgumentNullException("name");
1109 HtmlNode node
= CreateNode(HtmlNodeType
.Element
);
/// <summary>
/// Creates an HTML comment node.
/// </summary>
/// <returns>The new HTML comment node.</returns>
public HtmlCommentNode CreateComment()
{
    HtmlNode created = CreateNode(HtmlNodeType.Comment);
    return (HtmlCommentNode)created;
}
1124 /// Creates an HTML comment node with the specified comment text.
1126 /// <param name="comment">The comment text. May not be null.</param>
1127 /// <returns>The new HTML comment node.</returns>
1128 public HtmlCommentNode
CreateComment(string comment
)
1130 if (comment
== null)
1132 throw new ArgumentNullException("comment");
1134 HtmlCommentNode c
= CreateComment();
1135 c
.Comment
= comment
;
/// <summary>
/// Creates an HTML text node.
/// </summary>
/// <returns>The new HTML text node.</returns>
public HtmlTextNode CreateTextNode()
{
    HtmlNode created = CreateNode(HtmlNodeType.Text);
    return (HtmlTextNode)created;
}
1149 /// Creates an HTML text node with the specified text.
1151 /// <param name="text">The text of the node. May not be null.</param>
1152 /// <returns>The new HTML text node.</returns>
1153 public HtmlTextNode
CreateTextNode(string text
)
1157 throw new ArgumentNullException("text");
1159 HtmlTextNode t
= CreateTextNode();
/// <summary>
/// Creates a node of the given type, passing -1 as its index.
/// </summary>
/// <param name="type">The kind of node to create.</param>
/// <returns>The new node.</returns>
internal HtmlNode CreateNode(HtmlNodeType type)
{
    const int noIndex = -1;
    return CreateNode(type, noIndex);
}

/// <summary>
/// Creates a node of the given type with the given index.
/// </summary>
/// <remarks>
/// Comment and text nodes get their dedicated classes; every other type is
/// represented by a plain <c>HtmlNode</c>.
/// </remarks>
/// <param name="type">The kind of node to create.</param>
/// <param name="index">The index handed to the node's constructor.</param>
/// <returns>The new node, owned by this document.</returns>
internal HtmlNode CreateNode(HtmlNodeType type, int index)
{
    if (type == HtmlNodeType.Comment)
    {
        return new HtmlCommentNode(this, index);
    }

    if (type == HtmlNodeType.Text)
    {
        return new HtmlTextNode(this, index);
    }

    return new HtmlNode(type, this, index);
}
/// <summary>
/// Creates a new attribute owned by this document.
/// </summary>
/// <returns>The new attribute.</returns>
internal HtmlAttribute CreateAttribute()
{
    HtmlAttribute created = new HtmlAttribute(this);
    return created;
}
1190 /// Creates an HTML attribute with the specified name.
1192 /// <param name="name">The name of the attribute. May not be null.</param>
1193 /// <returns>The new HTML attribute.</returns>
1194 public HtmlAttribute
CreateAttribute(string name
)
1198 throw new ArgumentNullException("name");
1200 HtmlAttribute att
= CreateAttribute();
1206 /// Creates an HTML attribute with the specified name.
1208 /// <param name="name">The name of the attribute. May not be null.</param>
1209 /// <param name="value">The value of the attribute.</param>
1210 /// <returns>The new HTML attribute.</returns>
1211 public HtmlAttribute
CreateAttribute(string name
, string value)
1215 throw new ArgumentNullException("name");
1217 HtmlAttribute att
= CreateAttribute(name
);
/// <summary>
/// Gets the root node of the document.
/// </summary>
/// <remarks>
/// Returns the value of the private <c>_documentnode</c> field as-is.
/// </remarks>
public HtmlNode DocumentNode
{
    get { return _documentnode; }
}
1234 /// Gets the document CRC32 checksum if OptionComputeChecksum was set to true before parsing, 0 otherwise.
1246 return (int)_crc32
.CheckSum
;
1251 public bool StreamMode
1259 _streammode
= value;
1263 private HtmlParseError
AddError(
1264 HtmlParseErrorCode code
,
1271 HtmlParseError err
= new HtmlParseError(code
, line
, linePosition
, streamPosition
, sourceText
, reason
);
1272 _parseerrors
.Add(err
);
1276 private enum ParseState
1284 AttributeBeforeEquals
,
1285 AttributeAfterEquals
,
1288 QuotedAttributeValue
,
1294 private void IncrementPosition()
1298 // REVIEW: should we add some checksum code in DecrementPosition too?
1299 _crc32
.AddToCRC32(_c
);
1303 _maxlineposition
= _lineposition
;
1315 private void DecrementPosition()
1318 if (_lineposition
== 1)
1320 _lineposition
= _maxlineposition
;
1329 private void Parse()
1332 if (OptionComputeChecksum
)
1334 _crc32
= new Crc32();
1337 _lastnodes
= new Hashtable();
1339 _fullcomment
= false;
1340 _parseerrors
= new ArrayList();
1343 _maxlineposition
= 1;
1345 _state
= ParseState
.Text
;
1347 _documentnode
._innerlength
= _text
.FullLength
;
1348 _documentnode
._outerlength
= _text
.FullLength
;
1350 _lastparentnode
= _documentnode
;
1351 _currentnode
= CreateNode(HtmlNodeType
.Text
, 0);
1352 _currentattribute
= null;
1355 PushNodeStart(HtmlNodeType
.Text
, 0);
1356 // SLIM: while (_index<_text.Length)
1357 while (! _stop_parsing
&& ! _text
.Eof (_index
))
1360 IncrementPosition();
1364 case ParseState
.Text
:
1369 case ParseState
.WhichTag
:
1374 PushNodeNameStart(false, _index
);
1378 PushNodeNameStart(true, _index
-1);
1379 DecrementPosition();
1381 _state
= ParseState
.Tag
;
1384 case ParseState
.Tag
:
1387 if (IsWhiteSpace(_c
))
1389 PushNodeNameEnd(_index
-1);
1390 if (_state
!= ParseState
.Tag
)
1392 _state
= ParseState
.BetweenAttributes
;
1397 PushNodeNameEnd(_index
-1);
1398 if (_state
!= ParseState
.Tag
)
1400 _state
= ParseState
.EmptyTag
;
1405 PushNodeNameEnd(_index
-1);
1406 if (_state
!= ParseState
.Tag
)
1408 PushNodeEnd(_index
, false);
1409 if (_state
!= ParseState
.Tag
)
1411 _state
= ParseState
.Text
;
1412 PushNodeStart(HtmlNodeType
.Text
, _index
);
1416 case ParseState
.BetweenAttributes
:
1420 if (IsWhiteSpace(_c
))
1423 if ((_c
== '/') || (_c
== '?'))
1425 _state
= ParseState
.EmptyTag
;
1431 PushNodeEnd(_index
, false);
1432 if (_state
!= ParseState
.BetweenAttributes
)
1434 _state
= ParseState
.Text
;
1435 PushNodeStart(HtmlNodeType
.Text
, _index
);
1439 PushAttributeNameStart(_index
-1);
1440 _state
= ParseState
.AttributeName
;
1443 case ParseState
.EmptyTag
:
1449 PushNodeEnd(_index
, true);
1450 if (_state
!= ParseState
.EmptyTag
)
1452 _state
= ParseState
.Text
;
1453 PushNodeStart(HtmlNodeType
.Text
, _index
);
1456 _state
= ParseState
.BetweenAttributes
;
1459 case ParseState
.AttributeName
:
1463 if (IsWhiteSpace(_c
))
1465 PushAttributeNameEnd(_index
-1);
1466 _state
= ParseState
.AttributeBeforeEquals
;
1471 PushAttributeNameEnd(_index
-1);
1472 _state
= ParseState
.AttributeAfterEquals
;
1477 PushAttributeNameEnd(_index
-1);
1478 PushNodeEnd(_index
, false);
1479 if (_state
!= ParseState
.AttributeName
)
1481 _state
= ParseState
.Text
;
1482 PushNodeStart(HtmlNodeType
.Text
, _index
);
1487 case ParseState
.AttributeBeforeEquals
:
1491 if (IsWhiteSpace(_c
))
1495 PushNodeEnd(_index
, false);
1496 if (_state
!= ParseState
.AttributeBeforeEquals
)
1498 _state
= ParseState
.Text
;
1499 PushNodeStart(HtmlNodeType
.Text
, _index
);
1504 _state
= ParseState
.AttributeAfterEquals
;
1507 // no equals, no whitespace, it's a new attribute starting
1508 _state
= ParseState
.BetweenAttributes
;
1509 DecrementPosition();
1512 case ParseState
.AttributeAfterEquals
:
1516 if (IsWhiteSpace(_c
))
1519 if ((_c
== '\'') || (_c
== '"'))
1521 _state
= ParseState
.QuotedAttributeValue
;
1522 PushAttributeValueStart(_index
);
1528 PushNodeEnd(_index
, false);
1529 if (_state
!= ParseState
.AttributeAfterEquals
)
1531 _state
= ParseState
.Text
;
1532 PushNodeStart(HtmlNodeType
.Text
, _index
);
1535 PushAttributeValueStart(_index
-1);
1536 _state
= ParseState
.AttributeValue
;
1539 case ParseState
.AttributeValue
:
1543 if (IsWhiteSpace(_c
))
1545 PushAttributeValueEnd(_index
-1);
1546 _state
= ParseState
.BetweenAttributes
;
1552 PushAttributeValueEnd(_index
-1);
1553 PushNodeEnd(_index
, false);
1554 if (_state
!= ParseState
.AttributeValue
)
1556 _state
= ParseState
.Text
;
1557 PushNodeStart(HtmlNodeType
.Text
, _index
);
1562 case ParseState
.QuotedAttributeValue
:
1563 if (_c
== lastquote
)
1565 PushAttributeValueEnd(_index
-1);
1566 _state
= ParseState
.BetweenAttributes
;
1571 //SLIM: if (_index<_text.Length)
1572 if (!_text
.Eof (_index
))
1574 if (_text
[_index
] == '%')
1577 _state
= ParseState
.ServerSideCode
;
1584 case ParseState
.Comment
:
1589 if ((_text
[_index
-2] != '-') ||
1590 (_text
[_index
-3] != '-'))
1595 PushNodeEnd(_index
, false);
1596 _state
= ParseState
.Text
;
1597 PushNodeStart(HtmlNodeType
.Text
, _index
);
1602 case ParseState
.ServerSideCode
:
1605 //SLIM: if (_index<_text.Length)
1606 if (! _text
.Eof (_index
))
1608 if (_text
[_index
] == '>')
1612 case ParseState
.AttributeAfterEquals
:
1613 _state
= ParseState
.AttributeValue
;
1616 case ParseState
.BetweenAttributes
:
1617 PushAttributeNameEnd(_index
+1);
1618 _state
= ParseState
.BetweenAttributes
;
1625 IncrementPosition();
1631 // handle <script>a="</script>"</script>
1632 case ParseState
.PcDataQuote
:
1633 if ((_c
== _pcdata_quote_char
) && (_text
[_index
- 2] != '\\')) {
1634 _pcdata_quote_char
= '\0';
1635 _state
= ParseState
.PcData
;
1639 case ParseState
.PcData
:
1640 Debug ("PCDATA " + _currentnode
.Name
+ " " + _text
.Substring(_index
-1, _currentnode
._namelength
+2));
1641 if (_c
== '\"' || _c
== '\''){
1642 _pcdata_quote_char
= _c
;
1643 _state
= ParseState
.PcDataQuote
;
1646 // look for </tag + 1 char
1649 //SLIM: if ((_currentnode._namelength+3)<=(_text.Length-(_index-1)))
1650 if (! _text
.Eof (_currentnode
._namelength
+ _index
+ 1))
1652 if (string.Compare(_text
.Substring(_index
-1, _currentnode
._namelength
+2),
1653 "</" + _currentnode
.Name
, true) == 0)
1655 int c
= _text
[_index
-1 + 2 + _currentnode
.Name
.Length
];
1656 if ((c
== '>') || (IsWhiteSpace(c
)))
1658 // add the script as a text node
1659 HtmlNode script
= CreateNode(HtmlNodeType
.Text
,
1660 _currentnode
._outerstartindex
+ _currentnode
._outerlength
);
1661 script
._outerlength
= _index
-1 - script
._outerstartindex
;
1662 if (_streammode
&& ReportNode
!= null)
1663 _stop_parsing
= ReportNode (script
);
1665 _currentnode
.AppendChild(script
);
1666 Debug ("Found script: [" + script
.InnerText
+ "]");
1668 PushNodeStart(HtmlNodeType
.Element
, _index
-1);
1669 PushNodeNameStart(false, _index
-1 +2);
1670 _state
= ParseState
.Tag
;
1671 IncrementPosition();
1679 // finish the current work
1680 if (_currentnode
._namestartindex
> 0)
1682 PushNodeNameEnd(_index
);
1684 PushNodeEnd(_index
, false);
1686 // we don't need this anymore
// Tag-start handling. Two paths are visible below:
//  - the next character is '%': a start offset is recorded for whatever the
//    current state is building and the parser switches to ServerSideCode;
//  - otherwise the node in progress is ended, the parser moves to WhichTag and a
//    new Comment node (next character '!') or Element node is started.
// NOTE(review): the switch header, the else branches and the return statements are
// not visible here; confirm the exact branch structure and what the bool result
// tells the caller.
1690 private bool NewCheck()
1696 //SLIM: if (_index<_text.Length)
1697 if (! _text
.Eof (_index
))
// '%' directly at _index: presumably a "<%" server-side block -- TODO confirm.
1699 if (_text
[_index
] == '%')
// Each visible case records _index-1 as the start of the item that state builds.
1703 case ParseState
.AttributeAfterEquals
:
1704 PushAttributeValueStart(_index
-1);
1707 case ParseState
.BetweenAttributes
:
1708 PushAttributeNameStart(_index
-1);
1711 case ParseState
.WhichTag
:
1712 PushNodeNameStart(true, _index
-1);
1713 _state
= ParseState
.Tag
;
1717 _state
= ParseState
.ServerSideCode
;
// Ordinary tag: end the node in progress at _index-1 (close == true).
1722 PushNodeEnd(_index
-1, true);
1723 _state
= ParseState
.WhichTag
;
1724 //SLIM: if ((_index-1) <= (_text.Length-2))
1725 if (!_text
.Eof (_index
))
// '!' starts a Comment node whose name is the single character at _index
// (name start _index, name end _index+1).
1727 if (_text
[_index
] == '!')
1729 PushNodeStart(HtmlNodeType
.Comment
, _index
-1);
1730 PushNodeNameStart(true, _index
);
1731 PushNodeNameEnd(_index
+1);
1732 _state
= ParseState
.Comment
;
1733 //SLIM: if (_index<(_text.Length-2))
1734 if (! _text
.Eof (_index
+ 2))
// _fullcomment is set to true when the two characters after '!' are "--".
1736 if ((_text
[_index
+1] == '-') &&
1737 (_text
[_index
+2] == '-'))
1739 _fullcomment
= true;
// Presumably the else branch of the "--" test -- TODO confirm.
1743 _fullcomment
= false;
// Not a '!' tag: start an Element node at _index-1.
1749 PushNodeStart(HtmlNodeType
.Element
, _index
-1);
// Looks for a <meta http-equiv="content-type" content="...charset=..."> declaration
// on the given node and, when one is found, stores the declared encoding in
// _declaredencoding. EncodingFoundException is thrown as a signal in two visible
// places: with null when a four-character element name other than the expected
// ones is seen, and with the declared encoding when _onlyDetectEncoding is set.
// When a stream encoding is known and its WindowsCodePage differs from the declared
// one, CharsetMismatch error details are built (the call receiving them is not
// visible here).
1753 private void ReadDocumentEncoding(HtmlNode node
)
// Presumably returns immediately when encoding detection is disabled -- TODO confirm.
1755 if (!OptionReadEncoding
)
1758 // <meta http-equiv="content-type" content="text/html;charset=iso-8859-1" />
1760 // when we append a child, we are in node end, so attributes are already populated
1761 if (node
._namelength
== 4) // quick check, avoids string alloc
1763 // only these nodes can occur before meta
1764 // if we started seeing any other node, we will never see a meta node
// NOTE(review): "head" is tested twice in this condition (here and two lines
// below); the second test is redundant.
1765 if (node
.NodeType
== HtmlNodeType
.Element
&&
1766 (node
.Name
!= "head" && node
.Name
!= "script" &&
1767 node
.Name
!= "style" && node
.Name
!= "title" &&
1768 node
.Name
!= "head" && node
.Name
!= "link" &&
1769 node
.Name
!= "html" && node
.Name
!= "meta"))
1770 throw new EncodingFoundException (null);
1771 else if (node
.Name
== "meta") // all nodes names are lowercase
1773 HtmlAttribute att
= node
.Attributes
["http-equiv"];
// NOTE(review): att.Value is dereferenced below; a null check on att is not
// visible here -- confirm one exists.
// Case-insensitive comparison (third argument true).
1776 if (string.Compare(att
.Value
, "content-type", true) == 0)
1778 HtmlAttribute content
= node
.Attributes
["content"];
1779 if (content
!= null)
1781 string charset
= NameValuePairList
.GetNameValuePairsValue(content
.Value
, "charset");
1782 if (charset
!= null)
// NOTE(review): Encoding.GetEncoding throws ArgumentException for a name it
// does not recognise; no handler is visible here -- confirm how a bogus
// charset in the document is meant to be handled.
1784 _declaredencoding
= Encoding
.GetEncoding(charset
);
1785 if (_onlyDetectEncoding
)
1787 throw new EncodingFoundException(_declaredencoding
);
1790 if (_streamencoding
!= null)
1792 if (_declaredencoding
.WindowsCodePage
!= _streamencoding
.WindowsCodePage
)
// Arguments describing the mismatch: error code, current line/position/index,
// the node's outer HTML and a message naming both encodings.
1795 HtmlParseErrorCode
.CharsetMismatch
,
1796 _line
, _lineposition
,
1797 _index
, node
.OuterHtml
,
1798 "Encoding mismatch between StreamEncoding: " +
1799 _streamencoding
.WebName
+ " and DeclaredEncoding: " + _declaredencoding
.WebName
);
/// <summary>
/// Starts a new attribute and records where its name begins.
/// </summary>
/// <param name="index">Offset in the parsed text of the first character of the attribute name.</param>
private void PushAttributeNameStart(int index)
{
    _currentattribute = CreateAttribute();

    // Source location, taken from the parser's current line counters.
    _currentattribute._line = _line;
    _currentattribute._lineposition = _lineposition;

    // The name offset is also stored as the attribute's stream position.
    _currentattribute._streamposition = index;
    _currentattribute._namestartindex = index;
}
/// <summary>
/// Completes the current attribute's name and appends the attribute to the
/// current node's attribute collection.
/// </summary>
/// <param name="index">Exclusive end offset of the attribute name in the parsed text.</param>
private void PushAttributeNameEnd(int index)
{
    // Length is measured from the offset saved when the name started.
    _currentattribute._namelength = index - _currentattribute._namestartindex;

    _currentnode.Attributes.Append(_currentattribute);
}
/// <summary>
/// Records where the current attribute's value begins.
/// </summary>
/// <param name="index">Offset in the parsed text stored as the value start.</param>
private void PushAttributeValueStart(int index)
{
    _currentattribute._valuestartindex = index;
}
/// <summary>
/// Completes the current attribute's value by computing its length.
/// </summary>
/// <param name="index">Exclusive end offset of the attribute value in the parsed text.</param>
private void PushAttributeValueEnd(int index)
{
    // Length is measured from the offset saved by PushAttributeValueStart.
    _currentattribute._valuelength = index - _currentattribute._valuestartindex;
}
/// <summary>
/// Creates a node of the given type at <paramref name="index"/>, makes it the
/// current node and stamps it with the parser's current source location.
/// </summary>
/// <param name="type">Kind of node to create.</param>
/// <param name="index">Offset in the parsed text where the node starts.</param>
private void PushNodeStart(HtmlNodeType type, int index)
{
    _currentnode = CreateNode(type, index);
    _currentnode._streamposition = index;

    _currentnode._line = _line;
    _currentnode._lineposition = _lineposition;
    if (type == HtmlNodeType.Element)
    {
        // Elements are reported one column earlier than the current position.
        // NOTE(review): presumably to point at the opening '<' -- confirm.
        _currentnode._lineposition--;
    }
}
// Finishes the current node at the given offset.
//   index - exclusive end offset; the node's outer length is index minus its outer start.
//   close - when true (or when the node is not a start tag) the closing branch near
//           the end of the method is taken.
// Visible effects: optional ReportNode callback in stream mode, debug tracing,
// attaching the node to _lastparentnode when not in stream mode, encoding detection
// for start tags, bookkeeping in _lastnodes, and a switch to PcData for CDATA elements.
// NOTE(review): else branches and several block bodies are not visible here; the
// notes below describe only the statements that are.
1847 private void PushNodeEnd(int index
, bool close
)
1849 _currentnode
._outerlength
= index
- _currentnode
._outerstartindex
;
1851 //SLIM: inform caller
// The callback's result is stored in _stop_parsing.
1852 if (_streammode
&& ReportNode
!= null)
1853 _stop_parsing
= ReportNode (_currentnode
);
// Debug trace: text content for text nodes, otherwise "Start-"/"End-" plus the name.
1856 if (_currentnode
._nodetype
== HtmlNodeType
.Text
)
1857 Debug ("Text:" + _currentnode
.InnerText
);
1859 Debug ((_currentnode
.StartTag
? "Start-" : "End-") + _currentnode
.Name
);
// Text and comment nodes: inner span equals outer span.
1861 if ((_currentnode
._nodetype
== HtmlNodeType
.Text
) ||
1862 (_currentnode
._nodetype
== HtmlNodeType
.Comment
))
1864 // forget about void nodes
1865 if (_currentnode
._outerlength
>0)
1867 _currentnode
._innerlength
= _currentnode
._outerlength
;
1868 _currentnode
._innerstartindex
= _currentnode
._outerstartindex
;
1869 // SLIM: no need to append child in stream mode
1870 // SLIM: whatever the caller needs to do, tell it to do now
1871 if (!_streammode
&& _lastparentnode
!= null)
1873 _lastparentnode
.AppendChild(_currentnode
);
// Start tags that are not already the current parent.
1879 if ((_currentnode
._starttag
) && (_lastparentnode
!= _currentnode
))
1881 // add to parent node
1882 // SLIM: no need to append child in stream mode
1883 // SLIM: whatever the caller needs to do, tell it to do now
1884 if (!_streammode
&& _lastparentnode
!= null)
1886 _lastparentnode
.AppendChild(_currentnode
);
1889 ReadDocumentEncoding(_currentnode
);
1891 // remember last node of this kind
1892 // SLIM: we still need to store _currentnode to help other tags in the same level
// _lastnodes maps a tag name to the most recent node with that name; the previous
// holder is linked through _prevwithsamename.
1893 HtmlNode prev
= (HtmlNode
)_lastnodes
[_currentnode
.Name
];
1894 _currentnode
._prevwithsamename
= prev
;
1895 _lastnodes
[_currentnode
.Name
] = _currentnode
;
// Document and element nodes become the parent for nodes that follow.
1898 if ((_currentnode
.NodeType
== HtmlNodeType
.Document
) ||
1899 (_currentnode
.NodeType
== HtmlNodeType
.Element
))
1901 _lastparentnode
= _currentnode
;
// CDATA-style elements switch the parser to PcData.
1904 if (HtmlNode
.IsCDataElement(CurrentNodeName()))
1906 _state
= ParseState
.PcData
;
// NOTE(review): the body of this closed/empty-element test is not visible here.
1910 if ((HtmlNode
.IsClosedElement(_currentnode
.Name
)) ||
1911 (HtmlNode
.IsEmptyElement(_currentnode
.Name
)))
// NOTE(review): the body of this test is not visible here; presumably it closes
// the current node -- TODO confirm.
1918 if ((close
) || (!_currentnode
._starttag
))
// Text and comment nodes are dropped as the current node once finished.
1921 if ((_currentnode
._nodetype
== HtmlNodeType
.Text
) ||
1922 (_currentnode
._nodetype
== HtmlNodeType
.Comment
))
1923 _currentnode
= null;
/// <summary>
/// Records where the current node's name begins and whether the node is a start tag.
/// </summary>
/// <param name="starttag">Value stored in the node's start-tag flag.</param>
/// <param name="index">Offset in the parsed text of the first character of the name.</param>
private void PushNodeNameStart(bool starttag, int index)
{
    _currentnode._namestartindex = index;
    _currentnode._starttag = starttag;
}
// Returns the tag names treated as "resetters" for the given tag name; callers pass
// the result to FixNestedTag / FindResetterNodes.
// NOTE(review): the case labels that select each array, and the fallback result,
// are not visible here. FixNestedTag tests the result against null, so a null
// fallback is presumably possible -- TODO confirm.
1933 private string[] GetResetters(string name
)
1938 return new string[]{"ul"}
;
1941 return new string[]{"table"}
;
1945 return new string[]{"tr", "table"}
;
/// <summary>
/// Runs the nested-tag fix for the current node, using the resetter list that
/// belongs to its (lower-cased) name. Closing tags are ignored.
/// </summary>
private void FixNestedTags()
{
    // Only start tags are of interest; closing tags fall straight through.
    if (_currentnode._starttag)
    {
        // NOTE(review): ToLower() is culture-sensitive; confirm whether an
        // invariant lower-casing is intended for tag names.
        string tagName = CurrentNodeName().ToLower();
        FixNestedTag(tagName, GetResetters(tagName));
    }
}
// Closes a previous, still-open node with the same name unless one of the resetter
// nodes is found for it.
//   name      - tag name used as the key into _lastnodes.
//   resetters - tag names whose presence (per FindResetterNodes) prevents the close.
// NOTE(review): the early-return statements and the declaration of prev are not
// visible here.
1962 private void FixNestedTag(string name
, string[] resetters
)
// Presumably nothing to do when the tag has no resetters -- TODO confirm.
1964 if (resetters
== null)
1969 // if we find a previous unclosed same name node, without a resetter node between, we must close it
1970 prev
= (HtmlNode
)_lastnodes
[name
];
1971 if ((prev
!= null) && (!prev
.Closed
))
1974 // try to find a resetter node, if found, we do nothing
1975 if (FindResetterNodes(prev
, resetters
))
1980 // ok we need to close the prev now
1981 // create a fake closer node
// The synthetic closer has the same node type as prev, index -1, and is its own
// end node.
1982 HtmlNode close
= new HtmlNode(prev
.NodeType
, this, -1);
1983 close
._endnode
= close
;
1984 prev
.CloseNode(close
);
// Checks each name in names against FindResetterNode for the given node.
// NOTE(review): the return statements (and any guard for a null names array) are
// not visible here; presumably returns true on the first non-null match and false
// otherwise -- TODO confirm. CloseCurrentNode passes GetResetters(...) directly, so
// a null array must be tolerated if GetResetters can return null.
1989 private bool FindResetterNodes(HtmlNode node
, string[] names
)
1995 for(int i
=0;i
<names
.Length
;i
++)
1997 if (FindResetterNode(node
, names
[i
]) != null)
// Looks up the most recent node registered in _lastnodes under the given name and
// tests it against three conditions: missing, closed, and positioned before node
// in the stream.
// NOTE(review): the return statement for each condition is not visible here, so
// which cases yield null versus the resetter cannot be confirmed from this listing.
2005 private HtmlNode
FindResetterNode(HtmlNode node
, string name
)
2007 HtmlNode resetter
= (HtmlNode
)_lastnodes
[name
];
2008 if (resetter
== null)
2010 if (resetter
.Closed
)
// Compares stream positions: is the candidate earlier than node?
2014 if (resetter
._streamposition
<node
._streamposition
)
// Completes the current node's name: its length is index minus the name start
// offset saved earlier.
//   index - exclusive end offset of the node name in the parsed text.
// NOTE(review): the statement guarded by OptionFixNestedTags is not visible here;
// presumably it invokes the nested-tag fix -- TODO confirm.
2021 private void PushNodeNameEnd(int index
)
2023 _currentnode
._namelength
= index
- _currentnode
._namestartindex
;
2024 if (OptionFixNestedTags
)
// Closes the current node. The visible statements cover these cases:
//  - a "closed element" end tag is closed against itself and attached to the parent,
//    possibly re-parenting trailing siblings under an earlier same-name node;
//  - an end tag for an element that can overlap is re-added as a lower-cased text node;
//  - EndTagNotRequired / TagNotOpened / EndTagInvalidHere error details are built;
//  - otherwise the previous same-name node is closed with the current node and
//    _lastnodes is rolled back to the one before it.
// Finally the parent pointer is moved up via UpdateLastParentNode.
// NOTE(review): else branches, early returns and the opening lines of the
// error-reporting calls are not visible here; the notes below describe only the
// statements that are.
2030 private void CloseCurrentNode()
2032 if (_currentnode
.Closed
) // text or document are by def closed
2037 // find last node of this kind
2038 HtmlNode prev
= (HtmlNode
)_lastnodes
[_currentnode
.Name
];
2041 if (HtmlNode
.IsClosedElement(_currentnode
.Name
))
2043 // </br> will be seen as <br>
2044 _currentnode
.CloseNode(_currentnode
);
2046 // add to parent node
2047 if (_lastparentnode
!= null)
2049 HtmlNode foundNode
= null;
2050 Stack futureChild
= new Stack();
// Walks the parent's children from last to first, looking for a childless node
// with the same name; visited nodes are pushed on futureChild.
// NOTE(review): the assignment of foundNode and the loop exit are not visible here.
2051 for (HtmlNode node
= _lastparentnode
.LastChild
; node
!= null; node
= node
.PreviousSibling
)
2053 if ((node
.Name
== _currentnode
.Name
) && (! node
.HasChildNodes
))
2058 futureChild
.Push(node
);
// When a match was found, every stacked node is moved from the parent to it.
2060 if (foundNode
!= null)
2062 HtmlNode node
= null;
2063 while(futureChild
.Count
!= 0)
2065 node
= (HtmlNode
)futureChild
.Pop();
2066 _lastparentnode
.RemoveChild(node
);
2067 foundNode
.AppendChild(node
);
2072 _lastparentnode
.AppendChild(_currentnode
);
2079 // node has no parent
2080 // node is not a closed node
2082 if (HtmlNode
.CanOverlapElement(_currentnode
.Name
))
2084 // this is a hack: add it as a text node
// The text node covers the same span as the current node; its text is lower-cased.
2085 HtmlNode closenode
= CreateNode(HtmlNodeType
.Text
, _currentnode
._outerstartindex
);
2086 closenode
._outerlength
= _currentnode
._outerlength
;
2087 ((HtmlTextNode
)closenode
).Text
= ((HtmlTextNode
)closenode
).Text
.ToLower();
2088 if (_lastparentnode
!= null)
2090 _lastparentnode
.AppendChild(closenode
);
2096 if (HtmlNode
.IsEmptyElement(_currentnode
.Name
))
// Error details for an end tag on an empty element.
2099 HtmlParseErrorCode
.EndTagNotRequired
,
2100 _currentnode
._line
, _currentnode
._lineposition
,
2101 _currentnode
._streamposition
, _currentnode
.OuterHtml
,
2102 "End tag </" + _currentnode
.Name
+ "> is not required");
2106 // node cannot overlap, node is not empty
2108 HtmlParseErrorCode
.TagNotOpened
,
2109 _currentnode
._line
, _currentnode
._lineposition
,
2110 _currentnode
._streamposition
, _currentnode
.OuterHtml
,
2111 "Start tag <" + _currentnode
.Name
+ "> was not found");
2119 if (OptionFixNestedTags
)
// A resetter found for prev makes this end tag invalid at this position.
2121 if (FindResetterNodes(prev
, GetResetters(_currentnode
.Name
)))
2124 HtmlParseErrorCode
.EndTagInvalidHere
,
2125 _currentnode
._line
, _currentnode
._lineposition
,
2126 _currentnode
._streamposition
, _currentnode
.OuterHtml
,
2127 "End tag </" + _currentnode
.Name
+ "> invalid here");
// Roll _lastnodes back to the node before prev, then close prev with this end tag.
2134 _lastnodes
[_currentnode
.Name
] = prev
._prevwithsamename
;
2135 prev
.CloseNode(_currentnode
);
2140 // we close this node, get grandparent
2143 if ((_lastparentnode
!= null) &&
2144 ((!HtmlNode
.IsClosedElement(_currentnode
.Name
)) ||
2145 (_currentnode
._starttag
)))
2147 UpdateLastParentNode();
/// <summary>
/// Moves <c>_lastparentnode</c> up the tree until it refers to a node that is not
/// closed. When the walk runs off the top of the tree (or the field was already
/// null), the document node becomes the parent.
/// </summary>
internal void UpdateLastParentNode()
{
    // Null is tested before Closed is read, so a null parent on entry falls
    // through to the document-node fallback instead of throwing
    // NullReferenceException. For a non-null parent the walk is unchanged:
    // climb while the node is closed.
    while ((_lastparentnode != null) && (_lastparentnode.Closed))
    {
        _lastparentnode = _lastparentnode.ParentNode;
    }

    if (_lastparentnode == null)
    {
        _lastparentnode = _documentnode;
    }
}
/// <summary>
/// Extracts the current attribute's name from the parsed text.
/// </summary>
/// <returns>The text span described by the attribute's name start offset and name length.</returns>
private string CurrentAttributeName()
{
    return _text.Substring(
        _currentattribute._namestartindex,
        _currentattribute._namelength);
}
/// <summary>
/// Extracts the current attribute's value from the parsed text.
/// </summary>
/// <returns>The text span described by the attribute's value start offset and value length.</returns>
private string CurrentAttributeValue()
{
    return _text.Substring(
        _currentattribute._valuestartindex,
        _currentattribute._valuelength);
}
/// <summary>
/// Extracts the current node's name from the parsed text.
/// </summary>
/// <returns>The text span described by the node's name start offset and name length.</returns>
private string CurrentNodeName()
{
    return _text.Substring(
        _currentnode._namestartindex,
        _currentnode._namelength);
}
/// <summary>
/// Extracts the current node's outer text from the parsed text.
/// </summary>
/// <returns>The text span described by the node's outer start offset and outer length.</returns>
private string CurrentNodeOuter()
{
    return _text.Substring(
        _currentnode._outerstartindex,
        _currentnode._outerlength);
}
/// <summary>
/// Extracts the current node's inner text from the parsed text.
/// </summary>
/// <returns>The text span described by the node's inner start offset and inner length.</returns>
private string CurrentNodeInner()
{
    return _text.Substring(
        _currentnode._innerstartindex,
        _currentnode._innerlength);
}
/// <summary>
/// Determines if the specified character is considered as a whitespace character.
/// </summary>
/// <param name="c">The character to check.</param>
/// <returns>true if the specified character is considered as a whitespace character.</returns>
public static bool IsWhiteSpace(int c)
{
    switch (c)
    {
        case 9:   // horizontal tab
        case 10:  // line feed
        case 13:  // carriage return
        case 32:  // space
            return true;

        default:
            return false;
    }
}
2209 internal class EncodingFoundException
: Exception
2211 private Encoding _encoding
;
2213 internal EncodingFoundException(Encoding encoding
)
2215 _encoding
= encoding
;
2218 internal Encoding Encoding