1 // HtmlAgilityPack V1.0 - Simon Mourier <simon underscore mourier at hotmail dot com>
3 using System
.Collections
;
4 using System
.Collections
.Generic
;
7 using System
.Text
.RegularExpressions
;
9 using System
.Xml
.XPath
;
11 namespace HtmlAgilityPack
14 /// Represents a complete HTML document.
16 public class HtmlDocument
: IXPathNavigable
22 private HtmlAttribute _currentattribute
;
23 private HtmlNode _currentnode
;
24 private Encoding _declaredencoding
;
25 private HtmlNode _documentnode
;
26 private bool _fullcomment
;
28 internal Hashtable _lastnodes
= new Hashtable();
29 private HtmlNode _lastparentnode
;
31 private int _lineposition
, _maxlineposition
;
32 internal Hashtable _nodesid
;
33 private ParseState _oldstate
;
34 private bool _onlyDetectEncoding
;
35 internal Hashtable _openednodes
;
36 private List
<HtmlParseError
> _parseerrors
= new List
<HtmlParseError
>();
37 private string _remainder
;
38 private int _remainderOffset
;
39 private ParseState _state
;
40 private Encoding _streamencoding
;
41 internal string _text
;
46 /// Adds Debugging attributes to node. Default is false.
48 public bool OptionAddDebuggingAttributes
;
51 /// Defines if closing for non closed nodes must be done at the end or directly in the document.
52 /// Setting this to true can actually change how browsers render the page. Default is false.
54 public bool OptionAutoCloseOnEnd
; // close errors at the end
57 /// Defines if non closed nodes will be checked at the end of parsing. Default is true.
59 public bool OptionCheckSyntax
= true;
62 /// Defines if a checksum must be computed for the document while parsing. Default is false.
64 public bool OptionComputeChecksum
;
67 /// Defines the default stream encoding to use. Default is System.Text.Encoding.Default.
69 public Encoding OptionDefaultStreamEncoding
= Encoding
.Default
;
72 /// Defines if source text must be extracted while parsing errors.
73 /// If the document has a lot of errors, or cascading errors, parsing performance can be dramatically affected if set to true.
76 public bool OptionExtractErrorSourceText
;
78 // turning this on can dramatically slow performance if a lot of errors are detected
81 /// Defines the maximum length of source text or parse errors. Default is 100.
83 public int OptionExtractErrorSourceTextMaxLength
= 100;
86 /// Defines if LI, TR, TH, TD tags must be partially fixed when nesting errors are detected. Default is false.
88 public bool OptionFixNestedTags
; // fix li, tr, th, td tags
91 /// Defines if output must conform to XML, instead of HTML.
93 public bool OptionOutputAsXml
;
96 /// Defines if attribute value output must be optimized (not bound with double quotes if it is possible). Default is false.
98 public bool OptionOutputOptimizeAttributeValues
;
101 /// Defines if name must be output with its original case. Useful for asp.net tags and attributes.
103 public bool OptionOutputOriginalCase
;
106 /// Defines if name must be output in uppercase. Default is false.
108 public bool OptionOutputUpperCase
;
111 /// Defines if declared encoding must be read from the document.
112 /// Declared encoding is determined using the meta http-equiv="content-type" content="text/html;charset=XXXXX" html node.
115 public bool OptionReadEncoding
= true;
118 /// Defines the name of a node that will throw the StopperNodeException when found as an end node. Default is null.
120 public string OptionStopperNodeName
;
123 /// Defines if the 'id' attribute must be specifically used. Default is true.
125 public bool OptionUseIdAttribute
= true;
128 /// Defines if empty nodes must be written as closed during output. Default is false.
130 public bool OptionWriteEmptyNodes
;
134 #region Static Members
136 internal static readonly string HtmlExceptionRefNotChild
= "Reference node must be a child of this node";
138 internal static readonly string HtmlExceptionUseIdAttributeFalse
=
139 "You need to set UseIdAttribute property to true to enable this feature";
146 /// Creates an instance of an HTML document.
148 public HtmlDocument()
150 _documentnode
= CreateNode(HtmlNodeType
.Document
, 0);
158 /// Gets the document CRC32 checksum if OptionComputeChecksum was set to true before parsing, 0 otherwise.
170 return (int) _crc32
.CheckSum
;
176 /// Gets the document's declared encoding.
177 /// Declared encoding is determined using the meta http-equiv="content-type" content="text/html;charset=XXXXX" html node.
179 public Encoding DeclaredEncoding
181 get { return _declaredencoding; }
185 /// Gets the root node of the document.
187 public HtmlNode DocumentNode
189 get { return _documentnode; }
193 /// Gets the document's output encoding.
195 public Encoding Encoding
197 get { return GetOutEncoding(); }
201 /// Gets a list of parse errors found in the document.
203 public IEnumerable
<HtmlParseError
> ParseErrors
205 get { return _parseerrors; }
209 /// Gets the remaining text.
210 /// Will always be null if OptionStopperNodeName is null.
212 public string Remainder
214 get { return _remainder; }
218 /// Gets the offset of Remainder in the original Html text.
219 /// If OptionStopperNodeName is null, this will return the length of the original Html text.
221 public int RemainderOffset
223 get { return _remainderOffset; }
227 /// Gets the document's stream encoding.
229 public Encoding StreamEncoding
231 get { return _streamencoding; }
236 #region IXPathNavigable Members
/// <summary>
/// Builds an <see cref="XPathNavigator"/> for navigating this HTML document.
/// </summary>
/// <returns>A navigator constructed from this document and its root document node.</returns>
public XPathNavigator CreateNavigator()
{
    HtmlNodeNavigator navigator = new HtmlNodeNavigator(this, _documentnode);
    return navigator;
}
249 #region Public Methods
252 /// Gets a valid XML name.
254 /// <param name="name">Any text.</param>
255 /// <returns>A string that is a valid XML name.</returns>
256 public static string GetXmlName(string name
)
258 string xmlname
= string.Empty
;
259 bool nameisok
= true;
260 for (int i
= 0; i
< name
.Length
; i
++)
263 // note: we are very limited here, too much?
264 if (((name
[i
] >= 'a') && (name
[i
] <= 'z')) ||
265 ((name
[i
] >= '0') && (name
[i
] <= '9')) ||
266 // (name[i]==':') || (name[i]=='_') || (name[i]=='-') || (name[i]=='.')) // these are bads in fact
267 (name
[i
] == '_') || (name
[i
] == '-') || (name
[i
] == '.'))
274 byte[] bytes
= Encoding
.UTF8
.GetBytes(new char[] {name[i]}
);
275 for (int j
= 0; j
< bytes
.Length
; j
++)
277 xmlname
+= bytes
[j
].ToString("x2");
286 return "_" + xmlname
;
/// <summary>
/// Applies HTML encoding to a specified string.
/// </summary>
/// <param name="html">The input string to encode. May not be null.</param>
/// <returns>
/// The input with <c>&amp;</c>, <c>&lt;</c>, <c>&gt;</c> and <c>"</c> replaced by their
/// named character entities.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="html"/> is null.</exception>
public static string HtmlEncode(string html)
{
    if (html == null)
    {
        throw new ArgumentNullException("html");
    }

    // Replace '&' by "&amp;" but only once: an ampersand that already starts one of
    // the four entities emitted below is left alone, so encoded input is not re-encoded.
    Regex rx = new Regex("&(?!(amp;)|(lt;)|(gt;)|(quot;))", RegexOptions.IgnoreCase);

    // The ampersand pass must run first; the later passes introduce new '&' characters.
    return rx.Replace(html, "&amp;")
        .Replace("<", "&lt;")
        .Replace(">", "&gt;")
        .Replace("\"", "&quot;");
}
/// <summary>
/// Determines if the specified character is considered as a whitespace character.
/// </summary>
/// <param name="c">The character code to check.</param>
/// <returns>true if the specified character is considered as a whitespace character.</returns>
public static bool IsWhiteSpace(int c)
{
    switch (c)
    {
        case 9:  // horizontal tab
        case 10: // line feed
        case 13: // carriage return
        case 32: // space
            return true;
        default:
            return false;
    }
}
320 /// Creates an HTML attribute with the specified name.
322 /// <param name="name">The name of the attribute. May not be null.</param>
323 /// <returns>The new HTML attribute.</returns>
324 public HtmlAttribute
CreateAttribute(string name
)
328 throw new ArgumentNullException("name");
330 HtmlAttribute att
= CreateAttribute();
336 /// Creates an HTML attribute with the specified name.
338 /// <param name="name">The name of the attribute. May not be null.</param>
339 /// <param name="value">The value of the attribute.</param>
340 /// <returns>The new HTML attribute.</returns>
341 public HtmlAttribute
CreateAttribute(string name
, string value)
345 throw new ArgumentNullException("name");
347 HtmlAttribute att
= CreateAttribute(name
);
/// <summary>
/// Creates an HTML comment node.
/// </summary>
/// <returns>The new HTML comment node.</returns>
public HtmlCommentNode CreateComment()
{
    // CreateNode returns the base node type; the comment kind is cast to its concrete class.
    HtmlNode created = CreateNode(HtmlNodeType.Comment);
    return (HtmlCommentNode) created;
}
362 /// Creates an HTML comment node with the specified comment text.
364 /// <param name="comment">The comment text. May not be null.</param>
365 /// <returns>The new HTML comment node.</returns>
366 public HtmlCommentNode
CreateComment(string comment
)
370 throw new ArgumentNullException("comment");
372 HtmlCommentNode c
= CreateComment();
378 /// Creates an HTML element node with the specified name.
380 /// <param name="name">The qualified name of the element. May not be null.</param>
381 /// <returns>The new HTML node.</returns>
382 public HtmlNode
CreateElement(string name
)
386 throw new ArgumentNullException("name");
388 HtmlNode node
= CreateNode(HtmlNodeType
.Element
);
/// <summary>
/// Creates an HTML text node.
/// </summary>
/// <returns>The new HTML text node.</returns>
public HtmlTextNode CreateTextNode()
{
    // CreateNode returns the base node type; the text kind is cast to its concrete class.
    HtmlNode created = CreateNode(HtmlNodeType.Text);
    return (HtmlTextNode) created;
}
403 /// Creates an HTML text node with the specified text.
405 /// <param name="text">The text of the node. May not be null.</param>
406 /// <returns>The new HTML text node.</returns>
407 public HtmlTextNode
CreateTextNode(string text
)
411 throw new ArgumentNullException("text");
413 HtmlTextNode t
= CreateTextNode();
419 /// Detects the encoding of an HTML stream.
421 /// <param name="stream">The input stream. May not be null.</param>
422 /// <returns>The detected encoding.</returns>
423 public Encoding
DetectEncoding(Stream stream
)
427 throw new ArgumentNullException("stream");
429 return DetectEncoding(new StreamReader(stream
));
433 /// Detects the encoding of an HTML file.
435 /// <param name="path">Path for the file containing the HTML document to detect. May not be null.</param>
436 /// <returns>The detected encoding.</returns>
437 public Encoding
DetectEncoding(string path
)
441 throw new ArgumentNullException("path");
443 StreamReader sr
= new StreamReader(path
, OptionDefaultStreamEncoding
);
444 Encoding encoding
= DetectEncoding(sr
);
450 /// Detects the encoding of an HTML text provided on a TextReader.
452 /// <param name="reader">The TextReader used to feed the HTML. May not be null.</param>
453 /// <returns>The detected encoding.</returns>
454 public Encoding
DetectEncoding(TextReader reader
)
458 throw new ArgumentNullException("reader");
460 _onlyDetectEncoding
= true;
461 if (OptionCheckSyntax
)
463 _openednodes
= new Hashtable();
470 if (OptionUseIdAttribute
)
472 _nodesid
= new Hashtable();
479 StreamReader sr
= reader
as StreamReader
;
482 _streamencoding
= sr
.CurrentEncoding
;
486 _streamencoding
= null;
488 _declaredencoding
= null;
490 _text
= reader
.ReadToEnd();
491 _documentnode
= CreateNode(HtmlNodeType
.Document
, 0);
493 // this is almost a hack, but it allows us not to muck with the original parsing code
498 catch (EncodingFoundException ex
)
/// <summary>
/// Detects the encoding of an HTML document from a file first, and then loads the file.
/// Equivalent to calling the two-argument overload with detection enabled.
/// </summary>
/// <param name="path">The complete file path to be read.</param>
public void DetectEncodingAndLoad(string path)
{
    const bool detectEncoding = true;
    DetectEncodingAndLoad(path, detectEncoding);
}
515 /// Detects the encoding of an HTML document from a file first, and then loads the file.
517 /// <param name="path">The complete file path to be read. May not be null.</param>
518 /// <param name="detectEncoding">true to detect encoding, false otherwise.</param>
519 public void DetectEncodingAndLoad(string path
, bool detectEncoding
)
523 throw new ArgumentNullException("path");
528 enc
= DetectEncoding(path
);
546 /// Detects the encoding of an HTML text.
548 /// <param name="html">The input html text. May not be null.</param>
549 /// <returns>The detected encoding.</returns>
550 public Encoding
DetectEncodingHtml(string html
)
554 throw new ArgumentNullException("html");
556 StringReader sr
= new StringReader(html
);
557 Encoding encoding
= DetectEncoding(sr
);
563 /// Gets the HTML node with the specified 'id' attribute value.
565 /// <param name="id">The attribute id to match. May not be null.</param>
566 /// <returns>The HTML node with the matching id or null if not found.</returns>
567 public HtmlNode
GetElementbyId(string id
)
571 throw new ArgumentNullException("id");
573 if (_nodesid
== null)
575 throw new Exception(HtmlExceptionUseIdAttributeFalse
);
578 return _nodesid
[id
.ToLower()] as HtmlNode
;
/// <summary>
/// Loads an HTML document from a stream, decoding it with <c>OptionDefaultStreamEncoding</c>.
/// </summary>
/// <param name="stream">The input stream.</param>
public void Load(Stream stream)
{
    StreamReader reader = new StreamReader(stream, OptionDefaultStreamEncoding);
    Load(reader);
}
/// <summary>
/// Loads an HTML document from a stream.
/// </summary>
/// <param name="stream">The input stream.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
public void Load(Stream stream, bool detectEncodingFromByteOrderMarks)
{
    StreamReader reader = new StreamReader(stream, detectEncodingFromByteOrderMarks);
    Load(reader);
}
/// <summary>
/// Loads an HTML document from a stream using the given character encoding.
/// </summary>
/// <param name="stream">The input stream.</param>
/// <param name="encoding">The character encoding to use.</param>
public void Load(Stream stream, Encoding encoding)
{
    StreamReader reader = new StreamReader(stream, encoding);
    Load(reader);
}
/// <summary>
/// Loads an HTML document from a stream using the given character encoding.
/// </summary>
/// <param name="stream">The input stream.</param>
/// <param name="encoding">The character encoding to use.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
public void Load(Stream stream, Encoding encoding, bool detectEncodingFromByteOrderMarks)
{
    StreamReader reader = new StreamReader(stream, encoding, detectEncodingFromByteOrderMarks);
    Load(reader);
}
/// <summary>
/// Loads an HTML document from a stream using the given character encoding and buffer size.
/// </summary>
/// <param name="stream">The input stream.</param>
/// <param name="encoding">The character encoding to use.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
/// <param name="buffersize">The minimum buffer size.</param>
public void Load(Stream stream, Encoding encoding, bool detectEncodingFromByteOrderMarks, int buffersize)
{
    StreamReader reader = new StreamReader(stream, encoding, detectEncodingFromByteOrderMarks, buffersize);
    Load(reader);
}
634 /// Loads an HTML document from a file.
636 /// <param name="path">The complete file path to be read. May not be null.</param>
637 public void Load(string path
)
641 throw new ArgumentNullException("path");
643 StreamReader sr
= new StreamReader(path
, OptionDefaultStreamEncoding
);
649 /// Loads an HTML document from a file.
651 /// <param name="path">The complete file path to be read. May not be null.</param>
652 /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
653 public void Load(string path
, bool detectEncodingFromByteOrderMarks
)
657 throw new ArgumentNullException("path");
659 StreamReader sr
= new StreamReader(path
, detectEncodingFromByteOrderMarks
);
665 /// Loads an HTML document from a file.
667 /// <param name="path">The complete file path to be read. May not be null.</param>
668 /// <param name="encoding">The character encoding to use. May not be null.</param>
669 public void Load(string path
, Encoding encoding
)
673 throw new ArgumentNullException("path");
675 if (encoding
== null)
677 throw new ArgumentNullException("encoding");
679 StreamReader sr
= new StreamReader(path
, encoding
);
685 /// Loads an HTML document from a file.
687 /// <param name="path">The complete file path to be read. May not be null.</param>
688 /// <param name="encoding">The character encoding to use. May not be null.</param>
689 /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
690 public void Load(string path
, Encoding encoding
, bool detectEncodingFromByteOrderMarks
)
694 throw new ArgumentNullException("path");
696 if (encoding
== null)
698 throw new ArgumentNullException("encoding");
700 StreamReader sr
= new StreamReader(path
, encoding
, detectEncodingFromByteOrderMarks
);
706 /// Loads an HTML document from a file.
708 /// <param name="path">The complete file path to be read. May not be null.</param>
709 /// <param name="encoding">The character encoding to use. May not be null.</param>
710 /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
711 /// <param name="buffersize">The minimum buffer size.</param>
712 public void Load(string path
, Encoding encoding
, bool detectEncodingFromByteOrderMarks
, int buffersize
)
716 throw new ArgumentNullException("path");
718 if (encoding
== null)
720 throw new ArgumentNullException("encoding");
722 StreamReader sr
= new StreamReader(path
, encoding
, detectEncodingFromByteOrderMarks
, buffersize
);
728 /// Loads the HTML document from the specified TextReader.
730 /// <param name="reader">The TextReader used to feed the HTML data into the document. May not be null.</param>
731 public void Load(TextReader reader
)
733 // all Load methods pass down to this one
736 throw new ArgumentNullException("reader");
739 _onlyDetectEncoding
= false;
741 if (OptionCheckSyntax
)
743 _openednodes
= new Hashtable();
750 if (OptionUseIdAttribute
)
752 _nodesid
= new Hashtable();
759 StreamReader sr
= reader
as StreamReader
;
764 // trigger bom read if needed
767 // ReSharper disable EmptyGeneralCatchClause
769 // ReSharper restore EmptyGeneralCatchClause
773 _streamencoding
= sr
.CurrentEncoding
;
777 _streamencoding
= null;
779 _declaredencoding
= null;
781 _text
= reader
.ReadToEnd();
782 _documentnode
= CreateNode(HtmlNodeType
.Document
, 0);
785 if (OptionCheckSyntax
)
787 foreach (HtmlNode node
in _openednodes
.Values
)
789 if (!node
._starttag
) // already reported
795 if (OptionExtractErrorSourceText
)
797 html
= node
.OuterHtml
;
798 if (html
.Length
> OptionExtractErrorSourceTextMaxLength
)
800 html
= html
.Substring(0, OptionExtractErrorSourceTextMaxLength
);
808 HtmlParseErrorCode
.TagNotClosed
,
809 node
._line
, node
._lineposition
,
810 node
._streamposition
, html
,
811 "End tag </" + node
.Name
+ "> was not found");
814 // we don't need this anymore
815 _openednodes
.Clear();
820 /// Loads the HTML document from the specified string.
822 /// <param name="html">String containing the HTML document to load. May not be null.</param>
823 public void LoadHtml(string html
)
827 throw new ArgumentNullException("html");
829 StringReader sr
= new StringReader(html
);
835 /// Saves the HTML document to the specified stream.
837 /// <param name="outStream">The stream to which you want to save.</param>
838 public void Save(Stream outStream
)
840 StreamWriter sw
= new StreamWriter(outStream
, GetOutEncoding());
845 /// Saves the HTML document to the specified stream.
847 /// <param name="outStream">The stream to which you want to save. May not be null.</param>
848 /// <param name="encoding">The character encoding to use. May not be null.</param>
849 public void Save(Stream outStream
, Encoding encoding
)
851 if (outStream
== null)
853 throw new ArgumentNullException("outStream");
855 if (encoding
== null)
857 throw new ArgumentNullException("encoding");
859 StreamWriter sw
= new StreamWriter(outStream
, encoding
);
864 /// Saves the mixed document to the specified file.
866 /// <param name="filename">The location of the file where you want to save the document.</param>
867 public void Save(string filename
)
869 StreamWriter sw
= new StreamWriter(filename
, false, GetOutEncoding());
875 /// Saves the mixed document to the specified file.
877 /// <param name="filename">The location of the file where you want to save the document. May not be null.</param>
878 /// <param name="encoding">The character encoding to use. May not be null.</param>
879 public void Save(string filename
, Encoding encoding
)
881 if (filename
== null)
883 throw new ArgumentNullException("filename");
885 if (encoding
== null)
887 throw new ArgumentNullException("encoding");
889 StreamWriter sw
= new StreamWriter(filename
, false, encoding
);
/// <summary>
/// Saves the HTML document to the specified StreamWriter.
/// </summary>
/// <param name="writer">The StreamWriter to which you want to save.</param>
public void Save(StreamWriter writer)
{
    // Widen to TextWriter so the call binds to the TextWriter overload rather than recursing.
    TextWriter target = writer;
    Save(target);
}
904 /// Saves the HTML document to the specified TextWriter.
906 /// <param name="writer">The TextWriter to which you want to save. May not be null.</param>
907 public void Save(TextWriter writer
)
911 throw new ArgumentNullException("writer");
913 DocumentNode
.WriteTo(writer
);
917 /// Saves the HTML document to the specified XmlWriter.
919 /// <param name="writer">The XmlWriter to which you want to save.</param>
920 public void Save(XmlWriter writer
)
922 DocumentNode
.WriteTo(writer
);
928 #region Internal Methods
/// <summary>
/// Creates a new attribute constructed with this document as its argument.
/// </summary>
/// <returns>The new HTML attribute.</returns>
internal HtmlAttribute CreateAttribute()
{
    HtmlAttribute attribute = new HtmlAttribute(this);
    return attribute;
}
/// <summary>
/// Creates a node of the given type by delegating to the two-argument overload with an index of -1.
/// </summary>
/// <param name="type">The kind of node to create.</param>
/// <returns>The new HTML node.</returns>
internal HtmlNode CreateNode(HtmlNodeType type)
{
    // NOTE(review): -1 presumably marks a node with no position in the parsed text — confirm against HtmlNode.
    const int index = -1;
    return CreateNode(type, index);
}
940 internal HtmlNode
CreateNode(HtmlNodeType type
, int index
)
944 case HtmlNodeType
.Comment
:
945 return new HtmlCommentNode(this, index
);
947 case HtmlNodeType
.Text
:
948 return new HtmlTextNode(this, index
);
951 return new HtmlNode(type
, this, index
);
/// <summary>
/// Picks the encoding used for output.
/// </summary>
/// <returns>
/// The declared encoding when one is set; otherwise the stream encoding when one is set;
/// otherwise <c>OptionDefaultStreamEncoding</c>.
/// </returns>
internal Encoding GetOutEncoding()
{
    // Precedence: declared encoding, then stream encoding, then the configured default.
    return _declaredencoding ?? _streamencoding ?? OptionDefaultStreamEncoding;
}
972 internal HtmlNode
GetXmlDeclaration()
974 if (!_documentnode
.HasChildNodes
)
979 foreach (HtmlNode node
in _documentnode
._childnodes
)
981 if (node
.Name
== "?xml") // it's ok, names are case sensitive
989 internal void SetIdForNode(HtmlNode node
, string id
)
991 if (!OptionUseIdAttribute
)
996 if ((_nodesid
== null) || (id
== null))
1003 _nodesid
.Remove(id
.ToLower());
1007 _nodesid
[id
.ToLower()] = node
;
1011 internal void UpdateLastParentNode()
1015 if (_lastparentnode
.Closed
)
1017 _lastparentnode
= _lastparentnode
.ParentNode
;
1019 } while ((_lastparentnode
!= null) && (_lastparentnode
.Closed
));
1020 if (_lastparentnode
== null)
1022 _lastparentnode
= _documentnode
;
1028 #region Private Methods
1030 private HtmlParseError
AddError(
1031 HtmlParseErrorCode code
,
1038 HtmlParseError err
= new HtmlParseError(code
, line
, linePosition
, streamPosition
, sourceText
, reason
);
1039 _parseerrors
.Add(err
);
1043 private void CloseCurrentNode()
1045 if (_currentnode
.Closed
) // text or document are by def closed
1050 // find last node of this kind
1051 HtmlNode prev
= (HtmlNode
) _lastnodes
[_currentnode
.Name
];
1054 if (HtmlNode
.IsClosedElement(_currentnode
.Name
))
1056 // </br> will be seen as <br>
1057 _currentnode
.CloseNode(_currentnode
);
1059 // add to parent node
1060 if (_lastparentnode
!= null)
1062 HtmlNode foundNode
= null;
1063 Stack futureChild
= new Stack();
1064 for (HtmlNode node
= _lastparentnode
.LastChild
; node
!= null; node
= node
.PreviousSibling
)
1066 if ((node
.Name
== _currentnode
.Name
) && (!node
.HasChildNodes
))
1071 futureChild
.Push(node
);
1073 if (foundNode
!= null)
1075 HtmlNode node
= null;
1076 while (futureChild
.Count
!= 0)
1078 node
= (HtmlNode
) futureChild
.Pop();
1079 _lastparentnode
.RemoveChild(node
);
1080 foundNode
.AppendChild(node
);
1085 _lastparentnode
.AppendChild(_currentnode
);
1091 // node has no parent
1092 // node is not a closed node
1094 if (HtmlNode
.CanOverlapElement(_currentnode
.Name
))
1096 // this is a hack: add it as a text node
1097 HtmlNode closenode
= CreateNode(HtmlNodeType
.Text
, _currentnode
._outerstartindex
);
1098 closenode
._outerlength
= _currentnode
._outerlength
;
1099 ((HtmlTextNode
) closenode
).Text
= ((HtmlTextNode
) closenode
).Text
.ToLower();
1100 if (_lastparentnode
!= null)
1102 _lastparentnode
.AppendChild(closenode
);
1107 if (HtmlNode
.IsEmptyElement(_currentnode
.Name
))
1110 HtmlParseErrorCode
.EndTagNotRequired
,
1111 _currentnode
._line
, _currentnode
._lineposition
,
1112 _currentnode
._streamposition
, _currentnode
.OuterHtml
,
1113 "End tag </" + _currentnode
.Name
+ "> is not required");
1117 // node cannot overlap, node is not empty
1119 HtmlParseErrorCode
.TagNotOpened
,
1120 _currentnode
._line
, _currentnode
._lineposition
,
1121 _currentnode
._streamposition
, _currentnode
.OuterHtml
,
1122 "Start tag <" + _currentnode
.Name
+ "> was not found");
1130 if (OptionFixNestedTags
)
1132 if (FindResetterNodes(prev
, GetResetters(_currentnode
.Name
)))
1135 HtmlParseErrorCode
.EndTagInvalidHere
,
1136 _currentnode
._line
, _currentnode
._lineposition
,
1137 _currentnode
._streamposition
, _currentnode
.OuterHtml
,
1138 "End tag </" + _currentnode
.Name
+ "> invalid here");
1145 _lastnodes
[_currentnode
.Name
] = prev
._prevwithsamename
;
1146 prev
.CloseNode(_currentnode
);
1151 // we close this node, get grandparent
1154 if ((_lastparentnode
!= null) &&
1155 ((!HtmlNode
.IsClosedElement(_currentnode
.Name
)) ||
1156 (_currentnode
._starttag
)))
1158 UpdateLastParentNode();
/// <summary>
/// Extracts the current attribute's name from the document text.
/// </summary>
/// <returns>The substring of the text delimited by the attribute's name start index and name length.</returns>
private string CurrentAttributeName()
{
    int start = _currentattribute._namestartindex;
    int length = _currentattribute._namelength;
    return _text.Substring(start, length);
}
/// <summary>
/// Extracts the current attribute's value from the document text.
/// </summary>
/// <returns>The substring of the text delimited by the attribute's value start index and value length.</returns>
private string CurrentAttributeValue()
{
    int start = _currentattribute._valuestartindex;
    int length = _currentattribute._valuelength;
    return _text.Substring(start, length);
}
/// <summary>
/// Extracts the current node's inner span from the document text.
/// </summary>
/// <returns>The substring of the text delimited by the node's inner start index and inner length.</returns>
private string CurrentNodeInner()
{
    int start = _currentnode._innerstartindex;
    int length = _currentnode._innerlength;
    return _text.Substring(start, length);
}
/// <summary>
/// Extracts the current node's name from the document text.
/// </summary>
/// <returns>The substring of the text delimited by the node's name start index and name length.</returns>
private string CurrentNodeName()
{
    int start = _currentnode._namestartindex;
    int length = _currentnode._namelength;
    return _text.Substring(start, length);
}
/// <summary>
/// Extracts the current node's outer span from the document text.
/// </summary>
/// <returns>The substring of the text delimited by the node's outer start index and outer length.</returns>
private string CurrentNodeOuter()
{
    int start = _currentnode._outerstartindex;
    int length = _currentnode._outerlength;
    return _text.Substring(start, length);
}
1189 private void DecrementPosition()
1192 if (_lineposition
== 1)
1194 _lineposition
= _maxlineposition
;
1203 private HtmlNode
FindResetterNode(HtmlNode node
, string name
)
1205 HtmlNode resetter
= (HtmlNode
) _lastnodes
[name
];
1206 if (resetter
== null)
1208 if (resetter
.Closed
)
1212 if (resetter
._streamposition
< node
._streamposition
)
1219 private bool FindResetterNodes(HtmlNode node
, string[] names
)
1225 for (int i
= 0; i
< names
.Length
; i
++)
1227 if (FindResetterNode(node
, names
[i
]) != null)
1235 private void FixNestedTag(string name
, string[] resetters
)
1237 if (resetters
== null)
1242 // if we find a previous unclosed same name node, without a resetter node between, we must close it
1243 prev
= (HtmlNode
) _lastnodes
[name
];
1244 if ((prev
!= null) && (!prev
.Closed
))
1246 // try to find a resetter node, if found, we do nothing
1247 if (FindResetterNodes(prev
, resetters
))
1252 // ok we need to close the prev now
1253 // create a fake closer node
1254 HtmlNode close
= new HtmlNode(prev
.NodeType
, this, -1);
1255 close
._endnode
= close
;
1256 prev
.CloseNode(close
);
1260 private void FixNestedTags()
1262 // we are only interested by start tags, not closing tags
1263 if (!_currentnode
._starttag
)
1266 string name
= CurrentNodeName();
1267 FixNestedTag(name
, GetResetters(name
));
1270 private string[] GetResetters(string name
)
1275 return new string[] {"ul"}
;
1278 return new string[] {"table"}
;
1282 return new string[] {"tr", "table"}
;
1289 private void IncrementPosition()
1293 // REVIEW: should we add some checksum code in DecrementPosition too?
1294 _crc32
.AddToCRC32(_c
);
1298 _maxlineposition
= _lineposition
;
1310 private bool NewCheck()
1316 if (_index
< _text
.Length
)
1318 if (_text
[_index
] == '%')
1322 case ParseState
.AttributeAfterEquals
:
1323 PushAttributeValueStart(_index
- 1);
1326 case ParseState
.BetweenAttributes
:
1327 PushAttributeNameStart(_index
- 1);
1330 case ParseState
.WhichTag
:
1331 PushNodeNameStart(true, _index
- 1);
1332 _state
= ParseState
.Tag
;
1336 _state
= ParseState
.ServerSideCode
;
1341 if (!PushNodeEnd(_index
- 1, true))
1344 _index
= _text
.Length
;
1347 _state
= ParseState
.WhichTag
;
1348 if ((_index
- 1) <= (_text
.Length
- 2))
1350 if (_text
[_index
] == '!')
1352 PushNodeStart(HtmlNodeType
.Comment
, _index
- 1);
1353 PushNodeNameStart(true, _index
);
1354 PushNodeNameEnd(_index
+ 1);
1355 _state
= ParseState
.Comment
;
1356 if (_index
< (_text
.Length
- 2))
1358 if ((_text
[_index
+ 1] == '-') &&
1359 (_text
[_index
+ 2] == '-'))
1361 _fullcomment
= true;
1365 _fullcomment
= false;
1371 PushNodeStart(HtmlNodeType
.Element
, _index
- 1);
1375 private void Parse()
1378 if (OptionComputeChecksum
)
1380 _crc32
= new Crc32();
1383 _lastnodes
= new Hashtable();
1385 _fullcomment
= false;
1386 _parseerrors
= new List
<HtmlParseError
>();
1389 _maxlineposition
= 1;
1391 _state
= ParseState
.Text
;
1393 _documentnode
._innerlength
= _text
.Length
;
1394 _documentnode
._outerlength
= _text
.Length
;
1395 _remainderOffset
= _text
.Length
;
1397 _lastparentnode
= _documentnode
;
1398 _currentnode
= CreateNode(HtmlNodeType
.Text
, 0);
1399 _currentattribute
= null;
1402 PushNodeStart(HtmlNodeType
.Text
, 0);
1403 while (_index
< _text
.Length
)
1406 IncrementPosition();
1410 case ParseState
.Text
:
1415 case ParseState
.WhichTag
:
1420 PushNodeNameStart(false, _index
);
1424 PushNodeNameStart(true, _index
- 1);
1425 DecrementPosition();
1427 _state
= ParseState
.Tag
;
1430 case ParseState
.Tag
:
1433 if (IsWhiteSpace(_c
))
1435 PushNodeNameEnd(_index
- 1);
1436 if (_state
!= ParseState
.Tag
)
1438 _state
= ParseState
.BetweenAttributes
;
1443 PushNodeNameEnd(_index
- 1);
1444 if (_state
!= ParseState
.Tag
)
1446 _state
= ParseState
.EmptyTag
;
1451 PushNodeNameEnd(_index
- 1);
1452 if (_state
!= ParseState
.Tag
)
1454 if (!PushNodeEnd(_index
, false))
1457 _index
= _text
.Length
;
1460 if (_state
!= ParseState
.Tag
)
1462 _state
= ParseState
.Text
;
1463 PushNodeStart(HtmlNodeType
.Text
, _index
);
1467 case ParseState
.BetweenAttributes
:
1471 if (IsWhiteSpace(_c
))
1474 if ((_c
== '/') || (_c
== '?'))
1476 _state
= ParseState
.EmptyTag
;
1482 if (!PushNodeEnd(_index
, false))
1485 _index
= _text
.Length
;
1489 if (_state
!= ParseState
.BetweenAttributes
)
1491 _state
= ParseState
.Text
;
1492 PushNodeStart(HtmlNodeType
.Text
, _index
);
1496 PushAttributeNameStart(_index
- 1);
1497 _state
= ParseState
.AttributeName
;
1500 case ParseState
.EmptyTag
:
1506 if (!PushNodeEnd(_index
, true))
1509 _index
= _text
.Length
;
1513 if (_state
!= ParseState
.EmptyTag
)
1515 _state
= ParseState
.Text
;
1516 PushNodeStart(HtmlNodeType
.Text
, _index
);
1519 _state
= ParseState
.BetweenAttributes
;
1522 case ParseState
.AttributeName
:
1526 if (IsWhiteSpace(_c
))
1528 PushAttributeNameEnd(_index
- 1);
1529 _state
= ParseState
.AttributeBeforeEquals
;
1534 PushAttributeNameEnd(_index
- 1);
1535 _state
= ParseState
.AttributeAfterEquals
;
1540 PushAttributeNameEnd(_index
- 1);
1541 if (!PushNodeEnd(_index
, false))
1544 _index
= _text
.Length
;
1547 if (_state
!= ParseState
.AttributeName
)
1549 _state
= ParseState
.Text
;
1550 PushNodeStart(HtmlNodeType
.Text
, _index
);
1555 case ParseState
.AttributeBeforeEquals
:
1559 if (IsWhiteSpace(_c
))
1563 if (!PushNodeEnd(_index
, false))
1566 _index
= _text
.Length
;
1569 if (_state
!= ParseState
.AttributeBeforeEquals
)
1571 _state
= ParseState
.Text
;
1572 PushNodeStart(HtmlNodeType
.Text
, _index
);
1577 _state
= ParseState
.AttributeAfterEquals
;
1580 // no equals, no whitespace, it's a new attribute starting
1581 _state
= ParseState
.BetweenAttributes
;
1582 DecrementPosition();
1585 case ParseState
.AttributeAfterEquals
:
1589 if (IsWhiteSpace(_c
))
1592 if ((_c
== '\'') || (_c
== '"'))
1594 _state
= ParseState
.QuotedAttributeValue
;
1595 PushAttributeValueStart(_index
, _c
);
1601 if (!PushNodeEnd(_index
, false))
1604 _index
= _text
.Length
;
1607 if (_state
!= ParseState
.AttributeAfterEquals
)
1609 _state
= ParseState
.Text
;
1610 PushNodeStart(HtmlNodeType
.Text
, _index
);
1613 PushAttributeValueStart(_index
- 1);
1614 _state
= ParseState
.AttributeValue
;
1617 case ParseState
.AttributeValue
:
1621 if (IsWhiteSpace(_c
))
1623 PushAttributeValueEnd(_index
- 1);
1624 _state
= ParseState
.BetweenAttributes
;
1630 PushAttributeValueEnd(_index
- 1);
1631 if (!PushNodeEnd(_index
, false))
1634 _index
= _text
.Length
;
1637 if (_state
!= ParseState
.AttributeValue
)
1639 _state
= ParseState
.Text
;
1640 PushNodeStart(HtmlNodeType
.Text
, _index
);
1645 case ParseState
.QuotedAttributeValue
:
1646 if (_c
== lastquote
)
1648 PushAttributeValueEnd(_index
- 1);
1649 _state
= ParseState
.BetweenAttributes
;
1654 if (_index
< _text
.Length
)
1656 if (_text
[_index
] == '%')
1659 _state
= ParseState
.ServerSideCode
;
1666 case ParseState
.Comment
:
1671 if ((_text
[_index
- 2] != '-') ||
1672 (_text
[_index
- 3] != '-'))
1677 if (!PushNodeEnd(_index
, false))
1680 _index
= _text
.Length
;
1683 _state
= ParseState
.Text
;
1684 PushNodeStart(HtmlNodeType
.Text
, _index
);
1689 case ParseState
.ServerSideCode
:
1692 if (_index
< _text
.Length
)
1694 if (_text
[_index
] == '>')
1698 case ParseState
.AttributeAfterEquals
:
1699 _state
= ParseState
.AttributeValue
;
1702 case ParseState
.BetweenAttributes
:
1703 PushAttributeNameEnd(_index
+ 1);
1704 _state
= ParseState
.BetweenAttributes
;
1711 IncrementPosition();
1717 case ParseState
.PcData
:
1718 // look for </tag + 1 char
1721 if ((_currentnode
._namelength
+ 3) <= (_text
.Length
- (_index
- 1)))
1723 if (string.Compare(_text
.Substring(_index
- 1, _currentnode
._namelength
+ 2),
1724 "</" + _currentnode
.Name
, true) == 0)
1726 int c
= _text
[_index
- 1 + 2 + _currentnode
.Name
.Length
];
1727 if ((c
== '>') || (IsWhiteSpace(c
)))
1729 // add the script as a text node
1730 HtmlNode script
= CreateNode(HtmlNodeType
.Text
,
1731 _currentnode
._outerstartindex
+
1732 _currentnode
._outerlength
);
1733 script
._outerlength
= _index
- 1 - script
._outerstartindex
;
1734 _currentnode
.AppendChild(script
);
1737 PushNodeStart(HtmlNodeType
.Element
, _index
- 1);
1738 PushNodeNameStart(false, _index
- 1 + 2);
1739 _state
= ParseState
.Tag
;
1740 IncrementPosition();
1748 // finish the current work
1749 if (_currentnode
._namestartindex
> 0)
1751 PushNodeNameEnd(_index
);
1753 PushNodeEnd(_index
, false);
1755 // we don't need this anymore
/// <summary>
/// Closes the name of the attribute being parsed and appends that attribute to the current node.
/// </summary>
/// <param name="index">Index at which the name ends; the stored name length is this index minus the name's start index.</param>
private void PushAttributeNameEnd(int index)
{
    int nameStart = _currentattribute._namestartindex;
    _currentattribute._namelength = index - nameStart;

    // the attribute is attached to the node as soon as its name is complete
    HtmlAttribute named = _currentattribute;
    _currentnode.Attributes.Append(named);
}
/// <summary>
/// Creates a new attribute, makes it the current attribute and records where it starts.
/// </summary>
/// <param name="index">Index in the text where the attribute name starts; also stored as the attribute's stream position.</param>
private void PushAttributeNameStart(int index)
{
    _currentattribute = CreateAttribute();
    HtmlAttribute started = _currentattribute;

    // position inside the text
    started._namestartindex = index;
    started._streamposition = index;

    // position as line / column, taken from the parser's current counters
    started.Line = _line;
    started._lineposition = _lineposition;
}
/// <summary>
/// Records the length of the current attribute's value.
/// </summary>
/// <param name="index">Index at which the value ends; the stored length is this index minus the value's start index.</param>
private void PushAttributeValueEnd(int index)
{
    int valueStart = _currentattribute._valuestartindex;
    _currentattribute._valuelength = index - valueStart;
}
/// <summary>
/// Marks the start of the current attribute's value when the caller has no quote character to report.
/// </summary>
/// <param name="index">Index in the text where the value starts.</param>
private void PushAttributeValueStart(int index)
{
    // 0 is forwarded as the quote argument of the two-parameter overload
    const int noQuote = 0;
    PushAttributeValueStart(index, noQuote);
}
/// <summary>
/// Marks the start of the current attribute's value and records a single-quote delimiter when one was used.
/// </summary>
/// <param name="index">Index in the text where the value starts.</param>
/// <param name="quote">
/// Character that opened the value (the parser passes <c>'\''</c> or <c>'"'</c>),
/// or 0 when the one-parameter overload is used.
/// </param>
private void PushAttributeValueStart(int index, int quote)
{
    _currentattribute._valuestartindex = index;

    // Only a value opened with an apostrophe is single-quoted. For any other
    // opener the attribute keeps whatever QuoteType it was created with.
    if (quote == '\'')
    {
        _currentattribute.QuoteType = AttributeValueQuote.SingleQuote;
    }
}
/// <summary>
/// Finishes the node currently being parsed: fixes its outer length and, depending on its
/// type, attaches it to the last parent node and updates the parser's bookkeeping.
/// </summary>
/// <param name="index">End position of the node; the outer length is this index minus the node's outer start index.</param>
/// <param name="close">Tested, together with the node's start-tag flag, in the final check that guards the stopper-node handling.</param>
/// <returns>
/// <c>false</c> when the stopper node (OptionStopperNodeName) has been reached and parsing must stop.
/// NOTE(review): the remaining return paths are not visible in this view - confirm against the full method.
/// </returns>
1791 private bool PushNodeEnd(int index
, bool close
)
// the outer span of the node runs from its recorded start up to index
1793 _currentnode
._outerlength
= index
- _currentnode
._outerstartindex
;
// text and comment nodes: inner span equals outer span, and empty ones are not appended
1795 if ((_currentnode
._nodetype
== HtmlNodeType
.Text
) ||
1796 (_currentnode
._nodetype
== HtmlNodeType
.Comment
))
1798 // forget about void nodes
1799 if (_currentnode
._outerlength
> 0)
1801 _currentnode
._innerlength
= _currentnode
._outerlength
;
1802 _currentnode
._innerstartindex
= _currentnode
._outerstartindex
;
1803 if (_lastparentnode
!= null)
1805 _lastparentnode
.AppendChild(_currentnode
);
// NOTE(review): presumably the alternative branch for non text/comment nodes - the branch structure is not visible here, confirm.
// a start tag that is not already the last parent node
1811 if ((_currentnode
._starttag
) && (_lastparentnode
!= _currentnode
))
1813 // add to parent node
1814 if (_lastparentnode
!= null)
1816 _lastparentnode
.AppendChild(_currentnode
);
// give the node a chance to declare the document encoding (see ReadDocumentEncoding)
1819 ReadDocumentEncoding(_currentnode
);
1821 // remember last node of this kind
// _lastnodes is keyed by node name; the previous holder is linked from the new node before being replaced
1822 HtmlNode prev
= (HtmlNode
) _lastnodes
[_currentnode
.Name
];
1823 _currentnode
._prevwithsamename
= prev
;
1824 _lastnodes
[_currentnode
.Name
] = _currentnode
;
// document and element nodes become the parent for the nodes that follow
1827 if ((_currentnode
.NodeType
== HtmlNodeType
.Document
) ||
1828 (_currentnode
.NodeType
== HtmlNodeType
.Element
))
1830 _lastparentnode
= _currentnode
;
// CDATA-style elements switch the parser to the PcData state
1833 if (HtmlNode
.IsCDataElement(CurrentNodeName()))
1835 _state
= ParseState
.PcData
;
// NOTE(review): what happens for closed / empty elements is not visible in this view - confirm.
1839 if ((HtmlNode
.IsClosedElement(_currentnode
.Name
)) ||
1840 (HtmlNode
.IsEmptyElement(_currentnode
.Name
)))
// reached for an explicit close request or for a node that is not a start tag
1847 if ((close
) || (!_currentnode
._starttag
))
// stopper node: compared case-insensitively, and only honoured while no remainder has been captured yet
1849 if ((OptionStopperNodeName
!= null) && (_remainder
== null) &&
1850 (string.Compare(_currentnode
.Name
, OptionStopperNodeName
, true) == 0))
// keep everything from index onwards as the unparsed remainder
1852 _remainderOffset
= index
;
1853 _remainder
= _text
.Substring(_remainderOffset
);
1855 return false; // stop parsing
/// <summary>
/// Records the length of the current node's name.
/// </summary>
/// <param name="index">Index at which the name ends; the stored length is this index minus the name's start index.</param>
1862 private void PushNodeNameEnd(int index
)
1864 _currentnode
._namelength
= index
- _currentnode
._namestartindex
;
// NOTE(review): the statement governed by this option check is not visible in this view - confirm what runs when OptionFixNestedTags is set.
1865 if (OptionFixNestedTags
)
/// <summary>
/// Records where the current node's name starts and whether the node is a start tag.
/// </summary>
/// <param name="starttag">Value stored in the node's start-tag flag.</param>
/// <param name="index">Index in the text where the node name starts.</param>
private void PushNodeNameStart(bool starttag, int index)
{
    HtmlNode node = _currentnode;
    node._namestartindex = index;
    node._starttag = starttag;
}
/// <summary>
/// Creates a node of the given type at the given position, makes it the current node
/// and stamps it with the parser's current line information.
/// </summary>
/// <param name="type">Type of node to create.</param>
/// <param name="index">Index passed to node creation; also stored as the node's stream position.</param>
private void PushNodeStart(HtmlNodeType type, int index)
{
    _currentnode = CreateNode(type, index);
    HtmlNode node = _currentnode;

    node._line = _line;

    // element nodes are given a line position one less than the parser's current one
    bool isElement = type == HtmlNodeType.Element;
    node._lineposition = isElement ? _lineposition - 1 : _lineposition;

    node._streamposition = index;
}
/// <summary>
/// Reads the encoding declared by a
/// <c>&lt;meta http-equiv="content-type" content="text/html;charset=iso-8859-1" /&gt;</c> element
/// and stores it as the document's declared encoding.
/// </summary>
/// <remarks>
/// Does nothing when <c>OptionReadEncoding</c> is off, when <paramref name="node"/> is not a
/// <c>meta</c> element, or when the element carries no usable <c>http-equiv</c> / <c>content</c> /
/// <c>charset</c> information. When a stream encoding is known and its Windows code page differs
/// from the declared one, a <c>CharsetMismatch</c> parse error is recorded.
/// </remarks>
/// <param name="node">Node that has just been completed; its attributes are expected to be populated.</param>
/// <exception cref="EncodingFoundException">
/// Thrown, carrying the declared encoding, when the document is only being scanned to detect its encoding.
/// </exception>
private void ReadDocumentEncoding(HtmlNode node)
{
    if (!OptionReadEncoding)
    {
        return;
    }

    // when we append a child, we are in node end, so attributes are already populated
    if (node._namelength != 4) // quick check, avoids string alloc
    {
        return;
    }

    if (node.Name != "meta") // all nodes names are lowercase
    {
        return;
    }

    // a meta element without http-equiv cannot declare the content type
    HtmlAttribute att = node.Attributes["http-equiv"];
    if (att == null)
    {
        return;
    }

    if (string.Compare(att.Value, "content-type", true) != 0)
    {
        return;
    }

    HtmlAttribute content = node.Attributes["content"];
    if (content == null)
    {
        return;
    }

    string charset = NameValuePairList.GetNameValuePairsValue(content.Value, "charset");
    if (charset == null)
    {
        return;
    }

    // trim once and reuse the trimmed name
    charset = charset.Trim();
    if (charset.Length == 0)
    {
        return;
    }

    _declaredencoding = Encoding.GetEncoding(charset);
    if (_onlyDetectEncoding)
    {
        // detection mode: the encoding is handed back to the caller through the exception
        throw new EncodingFoundException(_declaredencoding);
    }

    if (_streamencoding == null)
    {
        return;
    }

    if (_declaredencoding.WindowsCodePage != _streamencoding.WindowsCodePage)
    {
        AddError(
            HtmlParseErrorCode.CharsetMismatch,
            _line, _lineposition,
            _index, node.OuterHtml,
            "Encoding mismatch between StreamEncoding: " +
            _streamencoding.WebName + " and DeclaredEncoding: " +
            _declaredencoding.WebName);
    }
}
1941 #region Nested type: ParseState
1943 private enum ParseState
1951 AttributeBeforeEquals
,
1952 AttributeAfterEquals
,
1955 QuotedAttributeValue
,