1 // HtmlAgilityPack V1.0 - Simon Mourier <simonm@microsoft.com>
4 Copyright (C) 2003 Simon Mourier <simonm@microsoft.com>
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions
10 1. Redistributions of source code must retain the above copyright
11 notice, this list of conditions and the following disclaimer.
12 2. Redistributions in binary form must reproduce the above copyright
13 notice, this list of conditions and the following disclaimer in the
14 documentation and/or other materials provided with the distribution.
15 3. The name of the author may not be used to endorse or promote products
16 derived from this software without specific prior written permission.
18 THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
19 IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
20 OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
21 IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
22 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
23 NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
27 THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 using System
.Diagnostics
;
34 using System
.Collections
;
35 using System
.Text
.RegularExpressions
;
37 using System
.Xml
.XPath
;
39 namespace HtmlAgilityPack
42 /// Represents the type of parsing error.
44 public enum HtmlParseErrorCode
47 /// A tag was not closed.
52 /// A tag was not opened.
57 /// There is a charset mismatch between stream and declared (META) encoding.
62 /// An end tag was not required.
67 /// An end tag is invalid at this position.
73 /// Represents a parsing error found during document parsing.
75 public class HtmlParseError
77 private HtmlParseErrorCode _code
;
79 private int _linePosition
;
80 private int _streamPosition
;
81 private string _sourceText
;
82 private string _reason
;
84 internal HtmlParseError(
85 HtmlParseErrorCode code
,
94 _linePosition
= linePosition
;
95 _streamPosition
= streamPosition
;
96 _sourceText
= sourceText
;
101 /// Gets the type of error.
103 public HtmlParseErrorCode Code
112 /// Gets the line number of this error in the document.
123 /// Gets the column number of this error in the document.
125 public int LinePosition
129 return _linePosition
;
134 /// Gets the absolute stream position of this error in the document, relative to the start of the document.
136 public int StreamPosition
140 return _streamPosition
;
145 /// Gets the full text of the line containing the error.
147 public string SourceText
156 /// Gets a description for the error.
168 /// Represents a complete HTML document.
170 public class HtmlDocument
: IXPathNavigable
172 internal static readonly string HtmlExceptionRefNotChild
= "Reference node must be a child of this node";
173 internal static readonly string HtmlExceptionUseIdAttributeFalse
= "You need to set UseIdAttribute property to true to enable this feature";
175 internal Hashtable _openednodes
;
176 internal Hashtable _lastnodes
= new Hashtable();
177 internal Hashtable _nodesid
;
178 private HtmlNode _documentnode
;
179 internal string _text
;
180 private HtmlNode _currentnode
;
181 private HtmlNode _lastparentnode
;
182 private HtmlAttribute _currentattribute
;
185 private int _lineposition
, _maxlineposition
;
187 private bool _fullcomment
;
188 private System
.Text
.Encoding _streamencoding
;
189 private System
.Text
.Encoding _declaredencoding
;
190 private ArrayList _parseerrors
= new ArrayList();
191 private ParseState _state
, _oldstate
;
192 private Crc32 _crc32
= null;
193 private bool _onlyDetectEncoding
= false;
198 /// Defines if a checksum must be computed for the document while parsing. Default is false.
200 public bool OptionComputeChecksum
= false;
203 /// Defines if declared encoding must be read from the document.
204 /// Declared encoding is determined using the meta http-equiv="content-type" content="text/html;charset=XXXXX" html node.
207 public bool OptionReadEncoding
= true;
211 /// Defines if non closed nodes will be checked at the end of parsing. Default is true.
213 public bool OptionCheckSyntax
= true;
216 /// Defines if the 'id' attribute must be specifically used. Default is true.
218 public bool OptionUseIdAttribute
= true;
221 /// Defines if empty nodes must be written as closed during output. Default is false.
223 public bool OptionWriteEmptyNodes
= false;
226 /// Defines if output must conform to XML, instead of HTML.
228 public bool OptionOutputAsXml
= false;
231 /// Defines if name must be output in uppercase. Default is false.
233 public bool OptionOutputUpperCase
= false;
236 /// Defines if attribute value output must be optimized (not bound with double quotes if it is possible). Default is false.
238 public bool OptionOutputOptimizeAttributeValues
= false;
241 /// Adds Debugging attributes to node. Default is false.
243 public bool OptionAddDebuggingAttributes
= false;
246 /// Defines if source text must be extracted while parsing errors.
247 /// If the document has a lot of errors, or cascading errors, parsing performance can be dramatically affected if set to true.
248 /// Default is false.
250 public bool OptionExtractErrorSourceText
= false; // turning this on can dramatically slow performance if a lot of errors are detected
253 /// Defines if closing for non closed nodes must be done at the end or directly in the document.
254 /// Setting this to true can actually change how browsers render the page. Default is false.
256 public bool OptionAutoCloseOnEnd
= false; // close errors at the end
259 /// Defines if LI, TR, TH, TD tags must be partially fixed when nesting errors are detected. Default is false.
261 public bool OptionFixNestedTags
= false; // fix li, tr, th, td tags
264 /// Defines the maximum length of the source text extracted for parse errors. Default is 100.
266 public int OptionExtractErrorSourceTextMaxLength
= 100;
269 /// Defines the default stream encoding to use. Default is System.Text.Encoding.Default.
271 public System
.Text
.Encoding OptionDefaultStreamEncoding
= System
.Text
.Encoding
.Default
;
274 /// Gets a list of parse errors found in the document.
276 public ArrayList ParseErrors
285 /// Gets the document's stream encoding.
287 public System
.Text
.Encoding StreamEncoding
291 return _streamencoding
;
296 /// Gets the document's declared encoding.
297 /// Declared encoding is determined using the meta http-equiv="content-type" content="text/html;charset=XXXXX" html node.
299 public System
.Text
.Encoding DeclaredEncoding
303 return _declaredencoding
;
308 /// Creates an instance of an HTML document.
310 public HtmlDocument()
312 _documentnode
= CreateNode(HtmlNodeType
.Document
, 0);
315 internal HtmlNode
GetXmlDeclaration()
317 if (!_documentnode
.HasChildNodes
)
322 foreach(HtmlNode node
in _documentnode
._childnodes
)
324 if (node
.Name
== "?xml") // it's ok, names are case sensitive
/// <summary>
/// Applies HTML encoding to a specified string.
/// </summary>
/// <remarks>
/// Escapes the four characters that are significant in HTML markup:
/// <c>&amp;</c>, <c>&lt;</c>, <c>&gt;</c> and the double quote. An ampersand that
/// already starts one of the entities <c>&amp;amp;</c>, <c>&amp;lt;</c>,
/// <c>&amp;gt;</c> or <c>&amp;quot;</c> (matched case-insensitively) is left
/// untouched, so encoding an already-encoded string does not double-encode it.
/// </remarks>
/// <param name="html">The input string to encode. May not be null.</param>
/// <returns>The encoded string.</returns>
/// <exception cref="ArgumentNullException"><paramref name="html"/> is null.</exception>
public static string HtmlEncode(string html)
{
    if (html == null)
    {
        throw new ArgumentNullException("html");
    }

    // Replace & by &amp; but only once: the negative lookahead skips ampersands
    // that already introduce one of the entities this method produces.
    Regex rx = new Regex("&(?!(amp;)|(lt;)|(gt;)|(quot;))", RegexOptions.IgnoreCase);

    // Ampersands must be handled first; otherwise the '&' of the entities
    // emitted for '<', '>' and '"' below would need to be recognised again.
    return rx.Replace(html, "&amp;")
        .Replace("<", "&lt;")
        .Replace(">", "&gt;")
        .Replace("\"", "&quot;");
}
349 /// Detects the encoding of an HTML stream.
351 /// <param name="stream">The input stream. May not be null.</param>
352 /// <returns>The detected encoding.</returns>
353 public Encoding
DetectEncoding(Stream stream
)
357 throw new ArgumentNullException("stream");
359 return DetectEncoding(new StreamReader(stream
));
363 /// Detects the encoding of an HTML file.
365 /// <param name="path">Path for the file containing the HTML document to detect. May not be null.</param>
366 /// <returns>The detected encoding.</returns>
367 public Encoding
DetectEncoding(string path
)
371 throw new ArgumentNullException("path");
373 StreamReader sr
= new StreamReader(path
, OptionDefaultStreamEncoding
);
374 Encoding encoding
= DetectEncoding(sr
);
380 /// Detects the encoding of an HTML text.
382 /// <param name="html">The input html text. May not be null.</param>
383 /// <returns>The detected encoding.</returns>
384 public Encoding
DetectEncodingHtml(string html
)
388 throw new ArgumentNullException("html");
390 StringReader sr
= new StringReader(html
);
391 Encoding encoding
= DetectEncoding(sr
);
397 /// Detects the encoding of an HTML text provided on a TextReader.
399 /// <param name="reader">The TextReader used to feed the HTML. May not be null.</param>
400 /// <returns>The detected encoding.</returns>
401 public Encoding
DetectEncoding(TextReader reader
)
405 throw new ArgumentNullException("reader");
407 _onlyDetectEncoding
= true;
408 if (OptionCheckSyntax
)
410 _openednodes
= new Hashtable();
417 if (OptionUseIdAttribute
)
419 _nodesid
= new Hashtable();
426 StreamReader sr
= reader
as StreamReader
;
429 _streamencoding
= sr
.CurrentEncoding
;
433 _streamencoding
= null;
435 _declaredencoding
= null;
437 _text
= reader
.ReadToEnd();
438 _documentnode
= CreateNode(HtmlNodeType
.Document
, 0);
440 // this is a hack, but it allows us not to muck with the original parsing code
445 catch(EncodingFoundException ex
)
453 /// Loads an HTML document from a stream.
455 /// <param name="stream">The input stream.</param>
456 public void Load(Stream stream
)
458 Load(new StreamReader(stream
, OptionDefaultStreamEncoding
));
462 /// Loads an HTML document from a stream.
464 /// <param name="stream">The input stream.</param>
465 /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
466 public void Load(Stream stream
, bool detectEncodingFromByteOrderMarks
)
468 Load(new StreamReader(stream
, detectEncodingFromByteOrderMarks
));
472 /// Loads an HTML document from a stream.
474 /// <param name="stream">The input stream.</param>
475 /// <param name="encoding">The character encoding to use.</param>
476 public void Load(Stream stream
, Encoding encoding
)
478 Load(new StreamReader(stream
, encoding
));
482 /// Loads an HTML document from a stream.
484 /// <param name="stream">The input stream.</param>
485 /// <param name="encoding">The character encoding to use.</param>
486 /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
487 public void Load(Stream stream
, Encoding encoding
, bool detectEncodingFromByteOrderMarks
)
489 Load(new StreamReader(stream
, encoding
, detectEncodingFromByteOrderMarks
));
493 /// Loads an HTML document from a stream.
495 /// <param name="stream">The input stream.</param>
496 /// <param name="encoding">The character encoding to use.</param>
497 /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
498 /// <param name="buffersize">The minimum buffer size.</param>
499 public void Load(Stream stream
, Encoding encoding
, bool detectEncodingFromByteOrderMarks
, int buffersize
)
501 Load(new StreamReader(stream
, encoding
, detectEncodingFromByteOrderMarks
, buffersize
));
505 /// Loads an HTML document from a file.
507 /// <param name="path">The complete file path to be read. May not be null.</param>
508 public void Load(string path
)
512 throw new ArgumentNullException("path");
514 StreamReader sr
= new StreamReader(path
, OptionDefaultStreamEncoding
);
520 /// Loads an HTML document from a file.
522 /// <param name="path">The complete file path to be read. May not be null.</param>
523 /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
524 public void Load(string path
, bool detectEncodingFromByteOrderMarks
)
528 throw new ArgumentNullException("path");
530 StreamReader sr
= new StreamReader(path
, detectEncodingFromByteOrderMarks
);
536 /// Loads an HTML document from a file.
538 /// <param name="path">The complete file path to be read. May not be null.</param>
539 /// <param name="encoding">The character encoding to use. May not be null.</param>
540 public void Load(string path
, Encoding encoding
)
544 throw new ArgumentNullException("path");
546 if (encoding
== null)
548 throw new ArgumentNullException("encoding");
550 StreamReader sr
= new StreamReader(path
, encoding
);
556 /// Loads an HTML document from a file.
558 /// <param name="path">The complete file path to be read. May not be null.</param>
559 /// <param name="encoding">The character encoding to use. May not be null.</param>
560 /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
561 public void Load(string path
, Encoding encoding
, bool detectEncodingFromByteOrderMarks
)
565 throw new ArgumentNullException("path");
567 if (encoding
== null)
569 throw new ArgumentNullException("encoding");
571 StreamReader sr
= new StreamReader(path
, encoding
, detectEncodingFromByteOrderMarks
);
577 /// Loads an HTML document from a file.
579 /// <param name="path">The complete file path to be read. May not be null.</param>
580 /// <param name="encoding">The character encoding to use. May not be null.</param>
581 /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
582 /// <param name="buffersize">The minimum buffer size.</param>
583 public void Load(string path
, Encoding encoding
, bool detectEncodingFromByteOrderMarks
, int buffersize
)
587 throw new ArgumentNullException("path");
589 if (encoding
== null)
591 throw new ArgumentNullException("encoding");
593 StreamReader sr
= new StreamReader(path
, encoding
, detectEncodingFromByteOrderMarks
, buffersize
);
599 /// Loads the HTML document from the specified string.
601 /// <param name="html">String containing the HTML document to load. May not be null.</param>
602 public void LoadHtml(string html
)
606 throw new ArgumentNullException("html");
608 StringReader sr
= new StringReader(html
);
614 /// Detects the encoding of an HTML document from a file first, and then loads the file.
616 /// <param name="path">The complete file path to be read.</param>
617 public void DetectEncodingAndLoad(string path
)
619 DetectEncodingAndLoad(path
, true);
623 /// Detects the encoding of an HTML document from a file first, and then loads the file.
625 /// <param name="path">The complete file path to be read. May not be null.</param>
626 /// <param name="detectEncoding">true to detect encoding, false otherwise.</param>
627 public void DetectEncodingAndLoad(string path
, bool detectEncoding
)
631 throw new ArgumentNullException("path");
633 System
.Text
.Encoding enc
;
636 enc
= DetectEncoding(path
);
654 /// Loads the HTML document from the specified TextReader.
656 /// <param name="reader">The TextReader used to feed the HTML data into the document. May not be null.</param>
657 public void Load(TextReader reader
)
659 // all Load methods pass down to this one
662 throw new ArgumentNullException("reader");
665 _onlyDetectEncoding
= false;
667 if (OptionCheckSyntax
)
669 _openednodes
= new Hashtable();
676 if (OptionUseIdAttribute
)
678 _nodesid
= new Hashtable();
685 StreamReader sr
= reader
as StreamReader
;
690 // trigger bom read if needed
697 _streamencoding
= sr
.CurrentEncoding
;
701 _streamencoding
= null;
703 _declaredencoding
= null;
705 _text
= reader
.ReadToEnd();
706 _documentnode
= CreateNode(HtmlNodeType
.Document
, 0);
709 if (OptionCheckSyntax
)
711 foreach(HtmlNode node
in _openednodes
.Values
)
713 if (!node
._starttag
) // already reported
719 if (OptionExtractErrorSourceText
)
721 html
= node
.OuterHtml
;
722 if (html
.Length
> OptionExtractErrorSourceTextMaxLength
)
724 html
= html
.Substring(0, OptionExtractErrorSourceTextMaxLength
);
732 HtmlParseErrorCode
.TagNotClosed
,
733 node
._line
, node
._lineposition
,
734 node
._streamposition
, html
,
735 "End tag </" + node
.Name
+ "> was not found");
738 // we don't need this anymore
739 _openednodes
.Clear();
743 internal System
.Text
.Encoding
GetOutEncoding()
745 // prefer the declared encoding; when unspecified, fall back to the stream encoding, then to the default
746 if (_declaredencoding
!= null)
748 return _declaredencoding
;
752 if (_streamencoding
!= null)
754 return _streamencoding
;
757 return OptionDefaultStreamEncoding
;
762 /// Gets the document's output encoding.
764 public System
.Text
.Encoding Encoding
768 return GetOutEncoding();
773 /// Saves the HTML document to the specified stream.
775 /// <param name="outStream">The stream to which you want to save.</param>
776 public void Save(Stream outStream
)
778 StreamWriter sw
= new StreamWriter(outStream
, GetOutEncoding());
783 /// Saves the HTML document to the specified stream.
785 /// <param name="outStream">The stream to which you want to save. May not be null.</param>
786 /// <param name="encoding">The character encoding to use. May not be null.</param>
787 public void Save(Stream outStream
, System
.Text
.Encoding encoding
)
789 if (outStream
== null)
791 throw new ArgumentNullException("outStream");
793 if (encoding
== null)
795 throw new ArgumentNullException("encoding");
797 StreamWriter sw
= new StreamWriter(outStream
, encoding
);
802 /// Saves the HTML document to the specified file.
804 /// <param name="filename">The location of the file where you want to save the document.</param>
805 public void Save(string filename
)
807 StreamWriter sw
= new StreamWriter(filename
, false, GetOutEncoding());
813 /// Saves the HTML document to the specified file.
815 /// <param name="filename">The location of the file where you want to save the document. May not be null.</param>
816 /// <param name="encoding">The character encoding to use. May not be null.</param>
817 public void Save(string filename
, System
.Text
.Encoding encoding
)
819 if (filename
== null)
821 throw new ArgumentNullException("filename");
823 if (encoding
== null)
825 throw new ArgumentNullException("encoding");
827 StreamWriter sw
= new StreamWriter(filename
, false, encoding
);
833 /// Saves the HTML document to the specified StreamWriter.
835 /// <param name="writer">The StreamWriter to which you want to save.</param>
836 public void Save(StreamWriter writer
)
838 Save((TextWriter
)writer
);
842 /// Saves the HTML document to the specified TextWriter.
844 /// <param name="writer">The TextWriter to which you want to save. May not be null.</param>
845 public void Save(TextWriter writer
)
849 throw new ArgumentNullException("writer");
851 DocumentNode
.WriteTo(writer
);
855 /// Saves the HTML document to the specified XmlWriter.
857 /// <param name="writer">The XmlWriter to which you want to save.</param>
858 public void Save(XmlWriter writer
)
860 DocumentNode
.WriteTo(writer
);
865 /// Creates a new XPathNavigator object for navigating this HTML document.
867 /// <returns>An XPathNavigator object. The XPathNavigator is positioned on the root of the document.</returns>
868 public XPathNavigator
CreateNavigator()
870 return new HtmlNodeNavigator(this, _documentnode
);
873 internal void SetIdForNode(HtmlNode node
, string id
)
875 if (!OptionUseIdAttribute
)
880 if ((_nodesid
== null) || (id
== null))
887 _nodesid
.Remove(id
.ToLower());
891 _nodesid
[id
.ToLower()] = node
;
896 /// Gets the HTML node with the specified 'id' attribute value.
898 /// <param name="id">The attribute id to match. May not be null.</param>
899 /// <returns>The HTML node with the matching id or null if not found.</returns>
900 public HtmlNode
GetElementbyId(string id
)
904 throw new ArgumentNullException("id");
906 if (_nodesid
== null)
908 throw new Exception(HtmlExceptionUseIdAttributeFalse
);
911 return _nodesid
[id
.ToLower()] as HtmlNode
;
915 /// Creates an HTML element node with the specified name.
917 /// <param name="name">The qualified name of the element. May not be null.</param>
918 /// <returns>The new HTML node.</returns>
919 public HtmlNode
CreateElement(string name
)
923 throw new ArgumentNullException("name");
925 HtmlNode node
= CreateNode(HtmlNodeType
.Element
);
931 /// Creates an HTML comment node.
933 /// <returns>The new HTML comment node.</returns>
934 public HtmlCommentNode
CreateComment()
936 return (HtmlCommentNode
)CreateNode(HtmlNodeType
.Comment
);
940 /// Creates an HTML comment node with the specified comment text.
942 /// <param name="comment">The comment text. May not be null.</param>
943 /// <returns>The new HTML comment node.</returns>
944 public HtmlCommentNode
CreateComment(string comment
)
948 throw new ArgumentNullException("comment");
950 HtmlCommentNode c
= CreateComment();
956 /// Creates an HTML text node.
958 /// <returns>The new HTML text node.</returns>
959 public HtmlTextNode
CreateTextNode()
961 return (HtmlTextNode
)CreateNode(HtmlNodeType
.Text
);
965 /// Creates an HTML text node with the specified text.
967 /// <param name="text">The text of the node. May not be null.</param>
968 /// <returns>The new HTML text node.</returns>
969 public HtmlTextNode
CreateTextNode(string text
)
973 throw new ArgumentNullException("text");
975 HtmlTextNode t
= CreateTextNode();
980 internal HtmlNode
CreateNode(HtmlNodeType type
)
982 return CreateNode(type
, -1);
985 internal HtmlNode
CreateNode(HtmlNodeType type
, int index
)
989 case HtmlNodeType
.Comment
:
990 return new HtmlCommentNode(this, index
);
992 case HtmlNodeType
.Text
:
993 return new HtmlTextNode(this, index
);
996 return new HtmlNode(type
, this, index
);
1000 internal HtmlAttribute
CreateAttribute()
1002 return new HtmlAttribute(this);
1006 /// Creates an HTML attribute with the specified name.
1008 /// <param name="name">The name of the attribute. May not be null.</param>
1009 /// <returns>The new HTML attribute.</returns>
1010 public HtmlAttribute
CreateAttribute(string name
)
1014 throw new ArgumentNullException("name");
1016 HtmlAttribute att
= CreateAttribute();
1022 /// Creates an HTML attribute with the specified name.
1024 /// <param name="name">The name of the attribute. May not be null.</param>
1025 /// <param name="value">The value of the attribute.</param>
1026 /// <returns>The new HTML attribute.</returns>
1027 public HtmlAttribute
CreateAttribute(string name
, string value)
1031 throw new ArgumentNullException("name");
1033 HtmlAttribute att
= CreateAttribute(name
);
1039 /// Gets the root node of the document.
1041 public HtmlNode DocumentNode
1045 return _documentnode
;
1050 /// Gets the document CRC32 checksum if OptionComputeChecksum was set to true before parsing, 0 otherwise.
1062 return (int)_crc32
.CheckSum
;
1067 private HtmlParseError
AddError(
1068 HtmlParseErrorCode code
,
1075 HtmlParseError err
= new HtmlParseError(code
, line
, linePosition
, streamPosition
, sourceText
, reason
);
1076 _parseerrors
.Add(err
);
1080 private enum ParseState
1088 AttributeBeforeEquals
,
1089 AttributeAfterEquals
,
1092 QuotedAttributeValue
,
1097 private void IncrementPosition()
1101 // REVIEW: should we add some checksum code in DecrementPosition too?
1102 _crc32
.AddToCRC32(_c
);
1106 _maxlineposition
= _lineposition
;
1118 private void DecrementPosition()
1121 if (_lineposition
== 1)
1123 _lineposition
= _maxlineposition
;
1132 private void Parse()
1135 if (OptionComputeChecksum
)
1137 _crc32
= new Crc32();
1140 _lastnodes
= new Hashtable();
1142 _fullcomment
= false;
1143 _parseerrors
= new ArrayList();
1146 _maxlineposition
= 1;
1148 _state
= ParseState
.Text
;
1150 _documentnode
._innerlength
= _text
.Length
;
1151 _documentnode
._outerlength
= _text
.Length
;
1153 _lastparentnode
= _documentnode
;
1154 _currentnode
= CreateNode(HtmlNodeType
.Text
, 0);
1155 _currentattribute
= null;
1158 PushNodeStart(HtmlNodeType
.Text
, 0);
1159 while (_index
<_text
.Length
)
1162 IncrementPosition();
1166 case ParseState
.Text
:
1171 case ParseState
.WhichTag
:
1176 PushNodeNameStart(false, _index
);
1180 PushNodeNameStart(true, _index
-1);
1181 DecrementPosition();
1183 _state
= ParseState
.Tag
;
1186 case ParseState
.Tag
:
1189 if (IsWhiteSpace(_c
))
1191 PushNodeNameEnd(_index
-1);
1192 if (_state
!= ParseState
.Tag
)
1194 _state
= ParseState
.BetweenAttributes
;
1199 PushNodeNameEnd(_index
-1);
1200 if (_state
!= ParseState
.Tag
)
1202 _state
= ParseState
.EmptyTag
;
1207 PushNodeNameEnd(_index
-1);
1208 if (_state
!= ParseState
.Tag
)
1210 PushNodeEnd(_index
, false);
1211 if (_state
!= ParseState
.Tag
)
1213 _state
= ParseState
.Text
;
1214 PushNodeStart(HtmlNodeType
.Text
, _index
);
1218 case ParseState
.BetweenAttributes
:
1222 if (IsWhiteSpace(_c
))
1225 if ((_c
== '/') || (_c
== '?'))
1227 _state
= ParseState
.EmptyTag
;
1233 PushNodeEnd(_index
, false);
1234 if (_state
!= ParseState
.BetweenAttributes
)
1236 _state
= ParseState
.Text
;
1237 PushNodeStart(HtmlNodeType
.Text
, _index
);
1241 PushAttributeNameStart(_index
-1);
1242 _state
= ParseState
.AttributeName
;
1245 case ParseState
.EmptyTag
:
1251 PushNodeEnd(_index
, true);
1252 if (_state
!= ParseState
.EmptyTag
)
1254 _state
= ParseState
.Text
;
1255 PushNodeStart(HtmlNodeType
.Text
, _index
);
1258 _state
= ParseState
.BetweenAttributes
;
1261 case ParseState
.AttributeName
:
1265 if (IsWhiteSpace(_c
))
1267 PushAttributeNameEnd(_index
-1);
1268 _state
= ParseState
.AttributeBeforeEquals
;
1273 PushAttributeNameEnd(_index
-1);
1274 _state
= ParseState
.AttributeAfterEquals
;
1279 PushAttributeNameEnd(_index
-1);
1280 PushNodeEnd(_index
, false);
1281 if (_state
!= ParseState
.AttributeName
)
1283 _state
= ParseState
.Text
;
1284 PushNodeStart(HtmlNodeType
.Text
, _index
);
1289 case ParseState
.AttributeBeforeEquals
:
1293 if (IsWhiteSpace(_c
))
1297 PushNodeEnd(_index
, false);
1298 if (_state
!= ParseState
.AttributeBeforeEquals
)
1300 _state
= ParseState
.Text
;
1301 PushNodeStart(HtmlNodeType
.Text
, _index
);
1306 _state
= ParseState
.AttributeAfterEquals
;
1309 // no equals, no whitespace, it's a new attribute starting
1310 _state
= ParseState
.BetweenAttributes
;
1311 DecrementPosition();
1314 case ParseState
.AttributeAfterEquals
:
1318 if (IsWhiteSpace(_c
))
1321 if ((_c
== '\'') || (_c
== '"'))
1323 _state
= ParseState
.QuotedAttributeValue
;
1324 PushAttributeValueStart(_index
);
1330 PushNodeEnd(_index
, false);
1331 if (_state
!= ParseState
.AttributeAfterEquals
)
1333 _state
= ParseState
.Text
;
1334 PushNodeStart(HtmlNodeType
.Text
, _index
);
1337 PushAttributeValueStart(_index
-1);
1338 _state
= ParseState
.AttributeValue
;
1341 case ParseState
.AttributeValue
:
1345 if (IsWhiteSpace(_c
))
1347 PushAttributeValueEnd(_index
-1);
1348 _state
= ParseState
.BetweenAttributes
;
1354 PushAttributeValueEnd(_index
-1);
1355 PushNodeEnd(_index
, false);
1356 if (_state
!= ParseState
.AttributeValue
)
1358 _state
= ParseState
.Text
;
1359 PushNodeStart(HtmlNodeType
.Text
, _index
);
1364 case ParseState
.QuotedAttributeValue
:
1365 if (_c
== lastquote
)
1367 PushAttributeValueEnd(_index
-1);
1368 _state
= ParseState
.BetweenAttributes
;
1373 if (_index
<_text
.Length
)
1375 if (_text
[_index
] == '%')
1378 _state
= ParseState
.ServerSideCode
;
1385 case ParseState
.Comment
:
1390 if ((_text
[_index
-2] != '-') ||
1391 (_text
[_index
-3] != '-'))
1396 PushNodeEnd(_index
, false);
1397 _state
= ParseState
.Text
;
1398 PushNodeStart(HtmlNodeType
.Text
, _index
);
1403 case ParseState
.ServerSideCode
:
1406 if (_index
<_text
.Length
)
1408 if (_text
[_index
] == '>')
1412 case ParseState
.AttributeAfterEquals
:
1413 _state
= ParseState
.AttributeValue
;
1416 case ParseState
.BetweenAttributes
:
1417 PushAttributeNameEnd(_index
+1);
1418 _state
= ParseState
.BetweenAttributes
;
1425 IncrementPosition();
1431 case ParseState
.PcData
:
1432 // look for </tag + 1 char
1435 if ((_currentnode
._namelength
+3)<=(_text
.Length
-(_index
-1)))
1437 if (string.Compare(_text
.Substring(_index
-1, _currentnode
._namelength
+2),
1438 "</" + _currentnode
.Name
, true) == 0)
1440 int c
= _text
[_index
-1 + 2 + _currentnode
.Name
.Length
];
1441 if ((c
== '>') || (IsWhiteSpace(c
)))
1443 // add the script as a text node
1444 HtmlNode script
= CreateNode(HtmlNodeType
.Text
,
1445 _currentnode
._outerstartindex
+ _currentnode
._outerlength
);
1446 script
._outerlength
= _index
-1 - script
._outerstartindex
;
1447 _currentnode
.AppendChild(script
);
1450 PushNodeStart(HtmlNodeType
.Element
, _index
-1);
1451 PushNodeNameStart(false, _index
-1 +2);
1452 _state
= ParseState
.Tag
;
1453 IncrementPosition();
1461 // finish the current work
1462 if (_currentnode
._namestartindex
> 0)
1464 PushNodeNameEnd(_index
);
1466 PushNodeEnd(_index
, false);
1468 // we don't need this anymore
1472 private bool NewCheck()
1478 if (_index
<_text
.Length
)
1480 if (_text
[_index
] == '%')
1484 case ParseState
.AttributeAfterEquals
:
1485 PushAttributeValueStart(_index
-1);
1488 case ParseState
.BetweenAttributes
:
1489 PushAttributeNameStart(_index
-1);
1492 case ParseState
.WhichTag
:
1493 PushNodeNameStart(true, _index
-1);
1494 _state
= ParseState
.Tag
;
1498 _state
= ParseState
.ServerSideCode
;
1503 PushNodeEnd(_index
-1, true);
1504 _state
= ParseState
.WhichTag
;
1505 if ((_index
-1) <= (_text
.Length
-2))
1507 if (_text
[_index
] == '!')
1509 PushNodeStart(HtmlNodeType
.Comment
, _index
-1);
1510 PushNodeNameStart(true, _index
);
1511 PushNodeNameEnd(_index
+1);
1512 _state
= ParseState
.Comment
;
1513 if (_index
<(_text
.Length
-2))
1515 if ((_text
[_index
+1] == '-') &&
1516 (_text
[_index
+2] == '-'))
1518 _fullcomment
= true;
1522 _fullcomment
= false;
1528 PushNodeStart(HtmlNodeType
.Element
, _index
-1);
1532 private void ReadDocumentEncoding(HtmlNode node
)
1534 if (!OptionReadEncoding
)
1537 // <meta http-equiv="content-type" content="text/html;charset=iso-8859-1" />
1539 // when we append a child, we are in node end, so attributes are already populated
1540 if (node
._namelength
== 4) // quick check, avoids string alloc
1542 if (node
.Name
== "meta") // all nodes names are lowercase
1544 HtmlAttribute att
= node
.Attributes
["http-equiv"];
1547 if (string.Compare(att
.Value
, "content-type", true) == 0)
1549 HtmlAttribute content
= node
.Attributes
["content"];
1550 if (content
!= null)
1552 string charset
= NameValuePairList
.GetNameValuePairsValue(content
.Value
, "charset");
1553 if (charset
!= null)
1555 _declaredencoding
= Encoding
.GetEncoding(charset
);
1556 if (_onlyDetectEncoding
)
1558 throw new EncodingFoundException(_declaredencoding
);
1561 if (_streamencoding
!= null)
1563 if (_declaredencoding
.WindowsCodePage
!= _streamencoding
.WindowsCodePage
)
1566 HtmlParseErrorCode
.CharsetMismatch
,
1567 _line
, _lineposition
,
1568 _index
, node
.OuterHtml
,
1569 "Encoding mismatch between StreamEncoding: " +
1570 _streamencoding
.WebName
+ " and DeclaredEncoding: " + _declaredencoding
.WebName
);
1581 private void PushAttributeNameStart(int index
)
1583 _currentattribute
= CreateAttribute();
1584 _currentattribute
._namestartindex
= index
;
1585 _currentattribute
._line
= _line
;
1586 _currentattribute
._lineposition
= _lineposition
;
1587 _currentattribute
._streamposition
= index
;
1590 private void PushAttributeNameEnd(int index
)
1592 _currentattribute
._namelength
= index
- _currentattribute
._namestartindex
;
1593 _currentnode
.Attributes
.Append(_currentattribute
);
1596 private void PushAttributeValueStart(int index
)
1598 _currentattribute
._valuestartindex
= index
;
1601 private void PushAttributeValueEnd(int index
)
1603 _currentattribute
._valuelength
= index
- _currentattribute
._valuestartindex
;
/// <summary>
/// Creates a node of the given type and makes it the current node, stamping it
/// with the current line, line position and stream position.
/// </summary>
/// <param name="type">The type of node to create.</param>
/// <param name="index">Position passed to the node factory and stored as the stream position.</param>
private void PushNodeStart(HtmlNodeType type, int index)
{
    _currentnode = CreateNode(type, index);
    _currentnode._streamposition = index;
    _currentnode._line = _line;

    // element nodes are recorded one column to the left of the current position
    // (presumably to point at the opening '<' - TODO confirm)
    _currentnode._lineposition = (type == HtmlNodeType.Element)
        ? _lineposition - 1
        : _lineposition;
}
// Finishes the node currently being parsed: records its outer length and, depending
// on the node type, attaches it to the last parent node.
//   index: position used to compute the outer length (index - _outerstartindex).
//   close: tested together with !_currentnode._starttag in the final condition.
// NOTE(review): this listing shows several conditions with no visible body, and the
// numbering embedded in each line skips, so short lines appear to be missing here.
// Confirm the exact nesting and the omitted statements against the original file.
1618 private void PushNodeEnd(int index
, bool close
)
1620 _currentnode
._outerlength
= index
- _currentnode
._outerstartindex
;
// text and comment nodes: the inner span is made equal to the outer span
1622 if ((_currentnode
._nodetype
== HtmlNodeType
.Text
) ||
1623 (_currentnode
._nodetype
== HtmlNodeType
.Comment
))
1625 // forget about void nodes
1626 if (_currentnode
._outerlength
>0)
1628 _currentnode
._innerlength
= _currentnode
._outerlength
;
1629 _currentnode
._innerstartindex
= _currentnode
._outerstartindex
;
1630 if (_lastparentnode
!= null)
1632 _lastparentnode
.AppendChild(_currentnode
);
// other node types: handled only for a start tag that is not already the last parent
1638 if ((_currentnode
._starttag
) && (_lastparentnode
!= _currentnode
))
1640 // add to parent node
1641 if (_lastparentnode
!= null)
1643 _lastparentnode
.AppendChild(_currentnode
);
// check the completed node for a <meta> charset declaration
1646 ReadDocumentEncoding(_currentnode
);
1648 // remember last node of this kind
1649 HtmlNode prev
= (HtmlNode
)_lastnodes
[_currentnode
.Name
];
1650 _currentnode
._prevwithsamename
= prev
;
1651 _lastnodes
[_currentnode
.Name
] = _currentnode
;
// document and element nodes become the new last parent
1654 if ((_currentnode
.NodeType
== HtmlNodeType
.Document
) ||
1655 (_currentnode
.NodeType
== HtmlNodeType
.Element
))
1657 _lastparentnode
= _currentnode
;
// the parser state switches to PcData when the node name is a CDATA element
1660 if (HtmlNode
.IsCDataElement(CurrentNodeName()))
1662 _state
= ParseState
.PcData
;
// NOTE(review): the statement guarded by this closed/empty-element test is not visible here
1666 if ((HtmlNode
.IsClosedElement(_currentnode
.Name
)) ||
1667 (HtmlNode
.IsEmptyElement(_currentnode
.Name
)))
// NOTE(review): the action taken when this condition holds is not visible here
1674 if ((close
) || (!_currentnode
._starttag
))
/// <summary>
/// Records where the current node's name starts and whether the node is a start tag.
/// </summary>
/// <param name="starttag">Value stored in the current node's start-tag flag.</param>
/// <param name="index">Position stored as the name start index.</param>
private void PushNodeNameStart(bool starttag, int index)
{
    _currentnode._namestartindex = index;
    _currentnode._starttag = starttag;
}
// Returns an array of "resetter" tag names for the given tag name. The visible
// results are {"ul"}, {"table"} and {"tr", "table"}.
//   name: the tag name to look up (FixNestedTags passes it lower-cased).
// Callers test the result against null (see FixNestedTag), so a null return
// presumably means "no resetters for this name" - confirm.
// NOTE(review): the selection logic that maps a name to each array is not visible
// in this listing (short lines appear to be missing) - confirm against the original file.
1686 private string[] GetResetters(string name
)
1691 return new string[]{"ul"}
;
1694 return new string[]{"table"}
;
1698 return new string[]{"tr", "table"}
;
/// <summary>
/// Applies the nested-tag fix to the current node: resolves the resetter tag names
/// for the node's name and hands both to <c>FixNestedTag</c>.
/// </summary>
private void FixNestedTags()
{
    // we are only interested by start tags, not closing tags
    if (!_currentnode._starttag)
    {
        return;
    }

    // Tag names are identifiers, not linguistic text: lower-case them with the
    // invariant culture so the result does not depend on the thread culture
    // (under tr-TR, "LI".ToLower() yields a dotless i and would never match "li").
    string name = CurrentNodeName().ToLower(System.Globalization.CultureInfo.InvariantCulture);
    FixNestedTag(name, GetResetters(name));
}
/// <summary>
/// Closes the last recorded node named <paramref name="name"/> when it is still open
/// and no resetter node is found for it.
/// </summary>
/// <param name="name">The tag name to look up in the last-nodes table.</param>
/// <param name="resetters">Resetter tag names; when null nothing is done.</param>
private void FixNestedTag(string name, string[] resetters)
{
    if (resetters == null)
    {
        return;
    }

    // if we find a previous unclosed same name node, without a resetter node between, we must close it
    HtmlNode previous = (HtmlNode)_lastnodes[name];
    if ((previous == null) || previous.Closed)
    {
        return;
    }

    // try to find a resetter node, if found, we do nothing
    if (FindResetterNodes(previous, resetters))
    {
        return;
    }

    // ok we need to close the previous node now: create a fake closer node
    // that acts as its own end node
    HtmlNode fakeCloser = new HtmlNode(previous.NodeType, this, -1);
    fakeCloser._endnode = fakeCloser;
    previous.CloseNode(fakeCloser);
}
/// <summary>
/// Tells whether a resetter node can be found for <paramref name="node"/> under any
/// of the given tag names.
/// </summary>
/// <param name="node">The node the resetter is searched for.</param>
/// <param name="names">Candidate resetter tag names; may be null.</param>
/// <returns>true as soon as one name yields a resetter node; otherwise false.</returns>
private bool FindResetterNodes(HtmlNode node, string[] names)
{
    // callers pass the result of GetResetters straight through, which may be null
    if (names == null)
    {
        return false;
    }

    foreach (string candidate in names)
    {
        if (FindResetterNode(node, candidate) != null)
        {
            return true;
        }
    }
    return false;
}
/// <summary>
/// Returns the last recorded node named <paramref name="name"/> when it qualifies as a
/// resetter for <paramref name="node"/>: it must exist, still be open, and its stream
/// position must not be lower than the stream position of <paramref name="node"/>.
/// </summary>
/// <param name="node">The node the resetter is searched for.</param>
/// <param name="name">The resetter tag name to look up.</param>
/// <returns>The resetter node, or null when none qualifies.</returns>
private HtmlNode FindResetterNode(HtmlNode node, string name)
{
    HtmlNode candidate = (HtmlNode)_lastnodes[name];

    bool qualifies = (candidate != null)
        && (!candidate.Closed)
        && (candidate._streamposition >= node._streamposition);

    return qualifies ? candidate : null;
}
/// <summary>
/// Records the length of the current node's name and, when
/// <c>OptionFixNestedTags</c> is on, runs the nested-tag fix.
/// </summary>
/// <param name="index">Position at which the node name ends.</param>
private void PushNodeNameEnd(int index)
{
    int nameLength = index - _currentnode._namestartindex;
    _currentnode._namelength = nameLength;

    if (!OptionFixNestedTags)
    {
        return;
    }
    FixNestedTags();
}
// Handles the closing of the current node. It looks up the last recorded node with the
// same name in _lastnodes; on the paths visible here it closes a node (CloseNode),
// re-attaches siblings under an earlier same-named node, adds a text node for an
// overlapping end tag, or passes parse-error details (EndTagNotRequired, TagNotOpened,
// EndTagInvalidHere) to a reporting call.
// NOTE(review): this listing shows conditions with no visible body, argument lists with
// no visible call head, and the numbering embedded in each line skips, so short lines
// (else branches, flag updates, call heads) appear to be missing here. Confirm the exact
// nesting and the omitted statements against the original file.
1783 private void CloseCurrentNode()
1785 if (_currentnode
.Closed
) // text or document are by def closed
1790 // find last node of this kind
1791 HtmlNode prev
= (HtmlNode
)_lastnodes
[_currentnode
.Name
];
1794 if (HtmlNode
.IsClosedElement(_currentnode
.Name
))
1796 // </br> will be seen as <br>
1797 _currentnode
.CloseNode(_currentnode
);
1799 // add to parent node
1800 if (_lastparentnode
!= null)
1802 HtmlNode foundNode
= null;
// siblings walked over during the backwards search are stacked so they can be
// moved under foundNode afterwards
1803 Stack futureChild
= new Stack();
// walk the last parent's children from last to first
1804 for (HtmlNode node
= _lastparentnode
.LastChild
; node
!= null; node
= node
.PreviousSibling
)
// tests for a childless sibling with the same name as the current node
1806 if ((node
.Name
== _currentnode
.Name
) && (! node
.HasChildNodes
))
1811 futureChild
.Push(node
);
// a match was found: move every stacked sibling from the last parent to foundNode
1813 if (foundNode
!= null)
1815 HtmlNode node
= null;
1816 while(futureChild
.Count
!= 0)
1818 node
= (HtmlNode
)futureChild
.Pop();
1819 _lastparentnode
.RemoveChild(node
);
1820 foundNode
.AppendChild(node
);
1825 _lastparentnode
.AppendChild(_currentnode
);
1832 // node has no parent
1833 // node is not a closed node
1835 if (HtmlNode
.CanOverlapElement(_currentnode
.Name
))
1837 // this is a hack: add it as a text node
1838 HtmlNode closenode
= CreateNode(HtmlNodeType
.Text
, _currentnode
._outerstartindex
);
1839 closenode
._outerlength
= _currentnode
._outerlength
;
// NOTE(review): ToLower() with no culture argument follows the current thread culture - confirm that is intended
1840 ((HtmlTextNode
)closenode
).Text
= ((HtmlTextNode
)closenode
).Text
.ToLower();
1841 if (_lastparentnode
!= null)
1843 _lastparentnode
.AppendChild(closenode
);
1849 if (HtmlNode
.IsEmptyElement(_currentnode
.Name
))
// NOTE(review): the head of the call receiving the arguments below is not visible in this listing
1852 HtmlParseErrorCode
.EndTagNotRequired
,
1853 _currentnode
._line
, _currentnode
._lineposition
,
1854 _currentnode
._streamposition
, _currentnode
.OuterHtml
,
1855 "End tag </" + _currentnode
.Name
+ "> is not required");
1859 // node cannot overlap, node is not empty
1861 HtmlParseErrorCode
.TagNotOpened
,
1862 _currentnode
._line
, _currentnode
._lineposition
,
1863 _currentnode
._streamposition
, _currentnode
.OuterHtml
,
1864 "Start tag <" + _currentnode
.Name
+ "> was not found");
// with nested-tag fixing on, finding a resetter for prev leads to the EndTagInvalidHere details below
1872 if (OptionFixNestedTags
)
1874 if (FindResetterNodes(prev
, GetResetters(_currentnode
.Name
)))
1877 HtmlParseErrorCode
.EndTagInvalidHere
,
1878 _currentnode
._line
, _currentnode
._lineposition
,
1879 _currentnode
._streamposition
, _currentnode
.OuterHtml
,
1880 "End tag </" + _currentnode
.Name
+ "> invalid here");
// the last-nodes entry for this name is rolled back to the node recorded before prev,
// then prev is closed with the current node as its closing node
1887 _lastnodes
[_currentnode
.Name
] = prev
._prevwithsamename
;
1888 prev
.CloseNode(_currentnode
);
1893 // we close this node, get grandparent
1896 if ((_lastparentnode
!= null) &&
1897 ((!HtmlNode
.IsClosedElement(_currentnode
.Name
)) ||
1898 (_currentnode
._starttag
)))
1900 UpdateLastParentNode();
/// <summary>
/// Moves the last-parent pointer up the tree while it designates a closed node,
/// falling back to the document node when the top is passed.
/// </summary>
internal void UpdateLastParentNode()
{
    // the first check always runs, exactly like the body of a do/while loop
    bool keepClimbing = true;
    while (keepClimbing)
    {
        if (_lastparentnode.Closed)
        {
            _lastparentnode = _lastparentnode.ParentNode;
        }
        keepClimbing = (_lastparentnode != null) && (_lastparentnode.Closed);
    }

    // climbed past the root: the document node is the parent of last resort
    if (_lastparentnode == null)
    {
        _lastparentnode = _documentnode;
    }
}
/// <summary>
/// Extracts the current attribute's name from the document text.
/// </summary>
/// <returns>The substring delimited by the attribute's name start index and name length.</returns>
private string CurrentAttributeName()
{
    string source = _text;
    int start = _currentattribute._namestartindex;
    int length = _currentattribute._namelength;
    return source.Substring(start, length);
}
/// <summary>
/// Extracts the current attribute's value from the document text.
/// </summary>
/// <returns>The substring delimited by the attribute's value start index and value length.</returns>
private string CurrentAttributeValue()
{
    string source = _text;
    int start = _currentattribute._valuestartindex;
    int length = _currentattribute._valuelength;
    return source.Substring(start, length);
}
/// <summary>
/// Extracts the current node's name from the document text.
/// </summary>
/// <returns>The substring delimited by the node's name start index and name length.</returns>
private string CurrentNodeName()
{
    string source = _text;
    int start = _currentnode._namestartindex;
    int length = _currentnode._namelength;
    return source.Substring(start, length);
}
/// <summary>
/// Extracts the current node's outer text from the document text.
/// </summary>
/// <returns>The substring delimited by the node's outer start index and outer length.</returns>
private string CurrentNodeOuter()
{
    string source = _text;
    int start = _currentnode._outerstartindex;
    int length = _currentnode._outerlength;
    return source.Substring(start, length);
}
/// <summary>
/// Extracts the current node's inner text from the document text.
/// </summary>
/// <returns>The substring delimited by the node's inner start index and inner length.</returns>
private string CurrentNodeInner()
{
    string source = _text;
    int start = _currentnode._innerstartindex;
    int length = _currentnode._innerlength;
    return source.Substring(start, length);
}
/// <summary>
/// Determines if the specified character is considered as a whitespace character.
/// </summary>
/// <param name="c">The character to check.</param>
/// <returns>true if the specified character is considered as a whitespace character.</returns>
public static bool IsWhiteSpace(int c)
{
    switch (c)
    {
        case 9:  // horizontal tab
        case 10: // line feed
        case 13: // carriage return
        case 32: // space
            return true;

        default:
            return false;
    }
}
1962 internal class EncodingFoundException
: Exception
1964 private Encoding _encoding
;
1966 internal EncodingFoundException(Encoding encoding
)
1968 _encoding
= encoding
;
1971 internal Encoding Encoding