1 // HtmlAgilityPack V1.0 - Simon Mourier <simonm@microsoft.com>
4 Copyright (C) 2003 Simon Mourier <simonm@microsoft.com>
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions
10 1. Redistributions of source code must retain the above copyright
11 notice, this list of conditions and the following disclaimer.
12 2. Redistributions in binary form must reproduce the above copyright
13 notice, this list of conditions and the following disclaimer in the
14 documentation and/or other materials provided with the distribution.
15 3. The name of the author may not be used to endorse or promote products
16 derived from this software without specific prior written permission.
18 THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
19 IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
20 OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
21 IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
22 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
23 NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
27 THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 using System
.Diagnostics
;
34 using System
.Collections
;
35 using System
.Text
.RegularExpressions
;
37 using System
.Xml
.XPath
;
40 // Legend: SLIM=Comment added describing changes to original HtmlAgilityPack
41 // to reduce memory consumption
42 // Once the parser is free of bugs, the comments will be taken out
43 namespace HtmlAgilityPack
46 /// Represents the type of parsing error.
48 public enum HtmlParseErrorCode
51 /// A tag was not closed.
56 /// A tag was not opened.
61 /// There is a charset mismatch between stream and declared (META) encoding.
66 /// An end tag was not required.
71 /// An end tag is invalid at this position.
77 /// Represents a parsing error found during document parsing.
79 public class HtmlParseError
81 private HtmlParseErrorCode _code
;
83 private int _linePosition
;
84 private int _streamPosition
;
85 private string _sourceText
;
86 private string _reason
;
88 internal HtmlParseError(
89 HtmlParseErrorCode code
,
98 _linePosition
= linePosition
;
99 _streamPosition
= streamPosition
;
100 _sourceText
= sourceText
;
105 /// Gets the type of error.
107 public HtmlParseErrorCode Code
116 /// Gets the line number of this error in the document.
127 /// Gets the column number of this error in the document.
129 public int LinePosition
133 return _linePosition
;
138 /// Gets the absolute stream position of this error in the document, relative to the start of the document.
140 public int StreamPosition
144 return _streamPosition
;
149 /// Gets the full text of the line containing the error.
151 public string SourceText
160 /// Gets a description for the error.
172 abstract class StreamAsArray
{
173 public abstract bool Eof (int index
);
174 public abstract char this [int index
] { get;}
175 public abstract string Substring (int startindex
, int length
);
176 public abstract int FullLength { get;}
179 // SLIM: creating this class to wrap around a textreader
180 // to emulate ReadToEnd () behaviour
181 class ImplStreamAsArray
: StreamAsArray
{
182 private StreamReader _reader
;
184 private int _position
;
186 private char[] _buf_previous
; // could have used only one array
187 private char[] _buf_current
; // but, this is cleaner
188 private int _block_size
;
190 public ImplStreamAsArray (StreamReader r
)
198 _buf_previous
= new char [_block_size
];
199 _buf_current
= new char [_block_size
];
204 private void Read (bool initial
)
207 Array
.Copy (_buf_current
, _buf_previous
, _block_size
);
208 _position
+= _block_size
;
210 HtmlDocument
.Debug ("Debug: Read in buffer at:" + _position
);
212 int num_read
= _reader
.Read (_buf_current
, 0, _block_size
);
213 if (num_read
< _block_size
) {
215 _length
= _position
+ num_read
;
217 HtmlDocument
.Debug ("[" + new string (_buf_current
, 0, num_read
) + "]");
220 public override bool Eof (int index
) {
222 return (index
== _length
);
224 if (index
>= _position
+ _block_size
&&
225 index
< _position
+ _block_size
+ _block_size
)
228 return (index
== _length
);
234 public override char this[int index
] {
236 if (index
>= _position
&&
237 index
< _position
+ _block_size
)
238 return _buf_current
[index
% _block_size
];
239 if (index
>= _position
- _block_size
&&
241 return _buf_previous
[ index
% _block_size
];
242 if (index
>= _position
+ _block_size
&&
243 index
< _position
+ _block_size
+ _block_size
) {
245 return _buf_current
[index
% _block_size
];
247 Console
.WriteLine ("EXCEPTION!!!");
248 throw new Exception (String
.Format ("{0} is out of current bounds:[{1}-{2}] and further than read-ahead",
250 _position
- _block_size
,
251 _position
+ _block_size
- 1));
255 // evil function ... you get what you pay for!
256 private string OutOfBandRead (int startindex
, int length
)
258 HtmlDocument
.Debug ("Out of band read! From " + startindex
+ " to " + (startindex
+ length
- 1));
259 ResetPosition (startindex
);
260 // ahh.. now we are at the correct place
261 // create a buffer of required length
262 // who cares if the buffer size does not align well
263 // with page boundary
264 char[] temp_buf
= new char [length
];
265 int num_read
= _reader
.Read (temp_buf
, 0, length
);
266 if (num_read
< length
) {
269 _length
= startindex
+ num_read
;
271 // discard data and reset stream position
272 int t
= (_eof
? _length
:_position
+ _block_size
);
274 return new String (temp_buf
);
277 // streamreader does not allow seeking
278 // seek on its basestream does not reflect the position
279 // of the reader - it is governed by the buffer size
280 // of the underlying stream
281 // :( so, read character by character from beginning ...
282 private void ResetPosition (int pos
)
284 _reader
.DiscardBufferedData ();
285 _reader
.BaseStream
.Position
= 0;
286 // read in chunks of block_size
287 int n1
= pos
/ _block_size
;
288 int n2
= pos
% _block_size
;
289 char[] tmp
= new char [_block_size
];
290 // yo ho... start reading until we have reached pos
291 // hopefully, reader will buffer itself, so we can be mean and get one char at a time
292 for (int i
= 0; i
< n1
; ++i
)
293 _reader
.Read (tmp
, 0, _block_size
);
294 for (int i
= 0; i
< n2
; ++i
)
299 public override string Substring (int startindex
, int length
)
302 HtmlDocument
.Debug ("substring:" + startindex
+ " " + length
+ " " + _position
+ ":");
305 if (length
> _block_size
|| startindex
< _position
- _block_size
) {
306 return OutOfBandRead (startindex
, length
);
308 if (startindex
+ length
- 1 >= _position
+ _block_size
) {
312 if (startindex
< _position
) {
313 int len_1
= _position
- startindex
;
315 substr
= new String (_buf_previous
, _block_size
- len_1
, length
);
317 substr
= new String (_buf_previous
, _block_size
- len_1
, len_1
);
318 substr
+= new String (_buf_current
, 0, length
- len_1
);
321 substr
= new String (_buf_current
, startindex
- _position
, length
);
326 // FIXME: Is this costly ?
327 public override int FullLength
{
329 return (int)_reader
.BaseStream
.Length
;
334 // A dummy StreamAsArray wrapper around a string
335 class DummyStreamAsArray
: StreamAsArray
{
336 private string _base_string
;
339 public DummyStreamAsArray(string str
)
342 _length
= str
.Length
;
345 public override bool Eof(int index
)
347 return (index
>= _length
);
350 public new char this[int index
] {
351 get { return _base_string [index]; }
354 public override string Substring (int startindex
, int length
)
356 return _base_string
.Substring (startindex
, length
);
359 public override int FullLength
{
360 get { return _length; }
365 /// Represents a complete HTML document.
367 public class HtmlDocument
: IXPathNavigable
369 // SLIM: Make the parser event driven
370 // callback for FilterHtml
371 // return value is a way for the callback to signal to continue or stop parsing
372 public delegate bool NodeHandler (HtmlNode node
);
373 public NodeHandler ReportNode
;
374 // misnomer ... should be called event_driven_mode
375 private bool _streammode
= false;
376 private bool _stop_parsing
= false;
378 internal static readonly string HtmlExceptionRefNotChild
= "Reference node must be a child of this node";
379 internal static readonly string HtmlExceptionUseIdAttributeFalse
= "You need to set UseIdAttribute property to true to enable this feature";
381 internal Hashtable _openednodes
;
382 internal Hashtable _lastnodes
= new Hashtable();
383 internal Hashtable _nodesid
;
384 private HtmlNode _documentnode
;
385 //SLIM: internal string _text;
386 internal StreamAsArray _text
;
387 private HtmlNode _currentnode
;
388 private HtmlNode _lastparentnode
;
389 private HtmlAttribute _currentattribute
;
392 private int _lineposition
, _maxlineposition
;
394 private bool _fullcomment
;
395 private System
.Text
.Encoding _streamencoding
;
396 private System
.Text
.Encoding _declaredencoding
;
397 private ArrayList _parseerrors
= new ArrayList();
398 private ParseState _state
, _oldstate
;
399 private Crc32 _crc32
= null;
400 private bool _onlyDetectEncoding
= false;
401 private int _pcdata_quote_char
= '\0';
403 private static bool _debug
= false;
404 internal static void Debug (string s
)
407 Console
.WriteLine (s
);
413 /// Defines if a checksum must be computed for the document while parsing. Default is false.
415 public bool OptionComputeChecksum
= false;
418 /// Defines if declared encoding must be read from the document.
419 /// Declared encoding is determined using the meta http-equiv="content-type" content="text/html;charset=XXXXX" html node.
422 public bool OptionReadEncoding
= true;
426 /// Defines if non closed nodes will be checked at the end of parsing. Default is true.
428 public bool OptionCheckSyntax
= true;
431 /// Defines if the 'id' attribute must be specifically used. Default is true.
433 public bool OptionUseIdAttribute
= true;
436 /// Defines if empty nodes must be written as closed during output. Default is false.
438 public bool OptionWriteEmptyNodes
= false;
441 /// Defines if output must conform to XML, instead of HTML.
443 public bool OptionOutputAsXml
= false;
446 /// Defines if name must be output in uppercase. Default is false.
448 public bool OptionOutputUpperCase
= false;
451 /// Defines if attribute value output must be optimized (not bound with double quotes if it is possible). Default is false.
453 public bool OptionOutputOptimizeAttributeValues
= false;
456 /// Adds Debugging attributes to node. Default is false.
458 public bool OptionAddDebuggingAttributes
= false;
461 /// Defines if source text must be extracted while parsing errors.
462 /// If the document has a lot of errors, or cascading errors, parsing performance can be dramatically affected if set to true.
463 /// Default is false.
465 public bool OptionExtractErrorSourceText
= false; // turning this on can dramatically slow performance if a lot of errors are detected
468 /// Defines if closing for non closed nodes must be done at the end or directly in the document.
469 /// Setting this to true can actually change how browsers render the page. Default is false.
471 public bool OptionAutoCloseOnEnd
= false; // close errors at the end
474 /// Defines if LI, TR, TH, TD tags must be partially fixed when nesting errors are detected. Default is false.
476 public bool OptionFixNestedTags
= false; // fix li, tr, th, td tags
479 /// Defines the maximum length of source text or parse errors. Default is 100.
481 public int OptionExtractErrorSourceTextMaxLength
= 100;
484 /// Defines the default stream encoding to use. Default is System.Text.Encoding.Default.
486 // From http://www.w3.org/TR/REC-html40/charset.html
487 // The HTTP protocol ([RFC2616], section 3.7.1) mentions ISO-8859-1 as a default character encoding when the "charset" parameter is absent from the "Content-Type" header field.
488 // However, we are still using UTF-8 as the default for some unknown reason
489 //FIXME: Fix the default encoding!
490 public System
.Text
.Encoding OptionDefaultStreamEncoding
= Encoding
.UTF8
;
493 /// Gets a list of parse errors found in the document.
495 public ArrayList ParseErrors
504 /// Gets the document's stream encoding.
506 public System
.Text
.Encoding StreamEncoding
510 return _streamencoding
;
515 /// Gets the document's declared encoding.
516 /// Declared encoding is determined using the meta http-equiv="content-type" content="text/html;charset=XXXXX" html node.
518 public System
.Text
.Encoding DeclaredEncoding
522 return _declaredencoding
;
527 /// Creates an instance of an HTML document.
529 public HtmlDocument()
531 _documentnode
= CreateNode(HtmlNodeType
.Document
, 0);
534 internal HtmlNode
GetXmlDeclaration()
536 if (!_documentnode
.HasChildNodes
)
541 foreach(HtmlNode node
in _documentnode
._childnodes
)
543 if (node
.Name
== "?xml") // it's ok, names are case sensitive
552 /// Applies HTML encoding to a specified string.
554 /// <param name="html">The input string to encode. May not be null.</param>
555 /// <returns>The encoded string.</returns>
556 public static string HtmlEncode(string html
)
560 throw new ArgumentNullException("html");
562 // replace & by &amp; but only once!
563 Regex rx
= new Regex("&(?!(amp;)|(lt;)|(gt;)|(quot;))", RegexOptions
.IgnoreCase
);
564 return rx
.Replace(html
, "&").Replace("<", "<").Replace(">", ">").Replace("\"", """);
568 /// Detects the encoding of an HTML stream.
570 /// <param name="stream">The input stream. May not be null.</param>
571 /// <returns>The detected encoding.</returns>
572 public Encoding
DetectEncoding(Stream stream
)
576 throw new ArgumentNullException("stream");
578 return DetectEncoding(new StreamReader(stream
));
582 /// Detects the encoding of an HTML file.
584 /// <param name="path">Path for the file containing the HTML document to detect. May not be null.</param>
585 /// <returns>The detected encoding.</returns>
586 public Encoding
DetectEncoding(string path
)
590 throw new ArgumentNullException("path");
592 StreamReader sr
= new StreamReader(path
, OptionDefaultStreamEncoding
);
593 Encoding encoding
= DetectEncoding(sr
);
599 /// Detects the encoding of an HTML text.
601 /// <param name="html">The input html text. May not be null.</param>
602 /// <returns>The detected encoding.</returns>
603 public Encoding
DetectEncodingHtml(string html
)
607 throw new ArgumentNullException("html");
609 StringReader sr
= new StringReader(html
);
610 Encoding encoding
= DetectEncoding(sr
);
616 /// Detects the encoding of an HTML text provided on a TextReader.
618 /// <param name="reader">The TextReader used to feed the HTML. May not be null.</param>
619 /// <returns>The detected encoding.</returns>
620 public Encoding
DetectEncoding(TextReader reader
)
624 throw new ArgumentNullException("reader");
626 _onlyDetectEncoding
= true;
627 if (OptionCheckSyntax
)
629 _openednodes
= new Hashtable();
636 if (OptionUseIdAttribute
)
638 _nodesid
= new Hashtable();
645 StreamReader sr
= reader
as StreamReader
;
648 _streamencoding
= sr
.CurrentEncoding
;
649 _text
= new ImplStreamAsArray (sr
);
653 _streamencoding
= null;
654 // Expensive, but unavoidable since TextReader doesn't expose the length of the underlying data
655 _text
= new DummyStreamAsArray (reader
.ReadToEnd());
657 _declaredencoding
= null;
659 // SLIM: _text = reader.ReadToEnd();
660 _documentnode
= CreateNode(HtmlNodeType
.Document
, 0);
662 // this is a hack, but it allows us not to muck with the original parsing code
667 catch(EncodingFoundException ex
)
676 /// Loads an HTML document from a stream.
678 /// <param name="stream">The input stream.</param>
679 public void Load(Stream stream
)
681 Load(new StreamReader(stream
, OptionDefaultStreamEncoding
));
685 /// Loads an HTML document from a stream.
687 /// <param name="stream">The input stream.</param>
688 /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
689 public void Load(Stream stream
, bool detectEncodingFromByteOrderMarks
)
691 Load(new StreamReader(stream
, detectEncodingFromByteOrderMarks
));
695 /// Loads an HTML document from a stream.
697 /// <param name="stream">The input stream.</param>
698 /// <param name="encoding">The character encoding to use.</param>
699 public void Load(Stream stream
, Encoding encoding
)
701 Load(new StreamReader(stream
, encoding
));
705 /// Loads an HTML document from a stream.
707 /// <param name="stream">The input stream.</param>
708 /// <param name="encoding">The character encoding to use.</param>
709 /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
710 public void Load(Stream stream
, Encoding encoding
, bool detectEncodingFromByteOrderMarks
)
712 Load(new StreamReader(stream
, encoding
, detectEncodingFromByteOrderMarks
));
716 /// Loads an HTML document from a stream.
718 /// <param name="stream">The input stream.</param>
719 /// <param name="encoding">The character encoding to use.</param>
720 /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
721 /// <param name="buffersize">The minimum buffer size.</param>
722 public void Load(Stream stream
, Encoding encoding
, bool detectEncodingFromByteOrderMarks
, int buffersize
)
724 Load(new StreamReader(stream
, encoding
, detectEncodingFromByteOrderMarks
, buffersize
));
728 /// Loads an HTML document from a file.
730 /// <param name="path">The complete file path to be read. May not be null.</param>
731 public void Load(string path
)
735 throw new ArgumentNullException("path");
737 StreamReader sr
= new StreamReader(path
, OptionDefaultStreamEncoding
);
743 /// Loads an HTML document from a file.
745 /// <param name="path">The complete file path to be read. May not be null.</param>
746 /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
747 public void Load(string path
, bool detectEncodingFromByteOrderMarks
)
751 throw new ArgumentNullException("path");
753 StreamReader sr
= new StreamReader(path
, detectEncodingFromByteOrderMarks
);
759 /// Loads an HTML document from a file.
761 /// <param name="path">The complete file path to be read. May not be null.</param>
762 /// <param name="encoding">The character encoding to use. May not be null.</param>
763 public void Load(string path
, Encoding encoding
)
767 throw new ArgumentNullException("path");
769 if (encoding
== null)
771 throw new ArgumentNullException("encoding");
773 StreamReader sr
= new StreamReader(path
, encoding
);
779 /// Loads an HTML document from a file.
781 /// <param name="path">The complete file path to be read. May not be null.</param>
782 /// <param name="encoding">The character encoding to use. May not be null.</param>
783 /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
784 public void Load(string path
, Encoding encoding
, bool detectEncodingFromByteOrderMarks
)
788 throw new ArgumentNullException("path");
790 if (encoding
== null)
792 throw new ArgumentNullException("encoding");
794 StreamReader sr
= new StreamReader(path
, encoding
, detectEncodingFromByteOrderMarks
);
800 /// Loads an HTML document from a file.
802 /// <param name="path">The complete file path to be read. May not be null.</param>
803 /// <param name="encoding">The character encoding to use. May not be null.</param>
804 /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
805 /// <param name="buffersize">The minimum buffer size.</param>
806 public void Load(string path
, Encoding encoding
, bool detectEncodingFromByteOrderMarks
, int buffersize
)
810 throw new ArgumentNullException("path");
812 if (encoding
== null)
814 throw new ArgumentNullException("encoding");
816 StreamReader sr
= new StreamReader(path
, encoding
, detectEncodingFromByteOrderMarks
, buffersize
);
822 /// Loads the HTML document from the specified string.
824 /// <param name="html">String containing the HTML document to load. May not be null.</param>
825 public void LoadHtml(string html
)
829 throw new ArgumentNullException("html");
831 StringReader sr
= new StringReader(html
);
837 /// Detects the encoding of an HTML document from a file first, and then loads the file.
839 /// <param name="path">The complete file path to be read.</param>
840 public void DetectEncodingAndLoad(string path
)
842 DetectEncodingAndLoad(path
, true);
846 /// Detects the encoding of an HTML document from a file first, and then loads the file.
848 /// <param name="path">The complete file path to be read. May not be null.</param>
849 /// <param name="detectEncoding">true to detect encoding, false otherwise.</param>
850 public void DetectEncodingAndLoad(string path
, bool detectEncoding
)
854 throw new ArgumentNullException("path");
856 System
.Text
.Encoding enc
;
859 enc
= DetectEncoding(path
);
877 /// Loads the HTML document from the specified TextReader.
879 /// <param name="reader">The TextReader used to feed the HTML data into the document. May not be null.</param>
880 public void Load(TextReader reader
)
882 // all Load methods pass down to this one
885 throw new ArgumentNullException("reader");
888 _onlyDetectEncoding
= false;
890 if (OptionCheckSyntax
)
892 _openednodes
= new Hashtable();
899 if (OptionUseIdAttribute
)
901 _nodesid
= new Hashtable();
908 StreamReader sr
= reader
as StreamReader
;
913 // trigger bom read if needed
920 _streamencoding
= sr
.CurrentEncoding
;
921 _text
= new ImplStreamAsArray (sr
);
925 _streamencoding
= null;
926 // Expensive, but unavoidable since TextReader doesn't expose the length of the underlying data
927 _text
= new DummyStreamAsArray (reader
.ReadToEnd());
929 _declaredencoding
= null;
931 // SLIM: _text = reader.ReadToEnd();
932 _documentnode
= CreateNode(HtmlNodeType
.Document
, 0);
935 if (OptionCheckSyntax
)
937 foreach(HtmlNode node
in _openednodes
.Values
)
939 if (!node
._starttag
) // already reported
945 if (OptionExtractErrorSourceText
)
947 html
= node
.OuterHtml
;
948 if (html
.Length
> OptionExtractErrorSourceTextMaxLength
)
950 html
= html
.Substring(0, OptionExtractErrorSourceTextMaxLength
);
958 HtmlParseErrorCode
.TagNotClosed
,
959 node
._line
, node
._lineposition
,
960 node
._streamposition
, html
,
961 "End tag </" + node
.Name
+ "> was not found");
964 // we don't need this anymore
965 _openednodes
.Clear();
969 internal System
.Text
.Encoding
GetOutEncoding()
971 // prefer the declared encoding; when unspecified, fall back to the stream encoding, then to the default
972 if (_declaredencoding
!= null)
974 return _declaredencoding
;
978 if (_streamencoding
!= null)
980 return _streamencoding
;
983 return OptionDefaultStreamEncoding
;
988 /// Gets the document's output encoding.
990 public System
.Text
.Encoding Encoding
994 return GetOutEncoding();
999 /// Saves the HTML document to the specified stream.
1001 /// <param name="outStream">The stream to which you want to save.</param>
1002 public void Save(Stream outStream
)
1004 StreamWriter sw
= new StreamWriter(outStream
, GetOutEncoding());
1009 /// Saves the HTML document to the specified stream.
1011 /// <param name="outStream">The stream to which you want to save. May not be null.</param>
1012 /// <param name="encoding">The character encoding to use. May not be null.</param>
1013 public void Save(Stream outStream
, System
.Text
.Encoding encoding
)
1015 if (outStream
== null)
1017 throw new ArgumentNullException("outStream");
1019 if (encoding
== null)
1021 throw new ArgumentNullException("encoding");
1023 StreamWriter sw
= new StreamWriter(outStream
, encoding
);
1028 /// Saves the mixed document to the specified file.
1030 /// <param name="filename">The location of the file where you want to save the document.</param>
1031 public void Save(string filename
)
1033 StreamWriter sw
= new StreamWriter(filename
, false, GetOutEncoding());
1039 /// Saves the mixed document to the specified file.
1041 /// <param name="filename">The location of the file where you want to save the document. May not be null.</param>
1042 /// <param name="encoding">The character encoding to use. May not be null.</param>
1043 public void Save(string filename
, System
.Text
.Encoding encoding
)
1045 if (filename
== null)
1047 throw new ArgumentNullException("filename");
1049 if (encoding
== null)
1051 throw new ArgumentNullException("encoding");
1053 StreamWriter sw
= new StreamWriter(filename
, false, encoding
);
1059 /// Saves the HTML document to the specified StreamWriter.
1061 /// <param name="writer">The StreamWriter to which you want to save.</param>
1062 public void Save(StreamWriter writer
)
1064 Save((TextWriter
)writer
);
1068 /// Saves the HTML document to the specified TextWriter.
1070 /// <param name="writer">The TextWriter to which you want to save. May not be null.</param>
1071 public void Save(TextWriter writer
)
1075 throw new ArgumentNullException("writer");
1077 DocumentNode
.WriteTo(writer
);
1081 /// Saves the HTML document to the specified XmlWriter.
1083 /// <param name="writer">The XmlWriter to which you want to save.</param>
1084 public void Save(XmlWriter writer
)
1086 DocumentNode
.WriteTo(writer
);
1091 /// Creates a new XPathNavigator object for navigating this HTML document.
1093 /// <returns>An XPathNavigator object. The XPathNavigator is positioned on the root of the document.</returns>
1094 public XPathNavigator
CreateNavigator()
1096 return new HtmlNodeNavigator(this, _documentnode
);
1099 internal void SetIdForNode(HtmlNode node
, string id
)
1101 if (!OptionUseIdAttribute
)
1106 if ((_nodesid
== null) || (id
== null))
1113 _nodesid
.Remove(id
.ToLower());
1117 _nodesid
[id
.ToLower()] = node
;
1122 /// Gets the HTML node with the specified 'id' attribute value.
1124 /// <param name="id">The attribute id to match. May not be null.</param>
1125 /// <returns>The HTML node with the matching id or null if not found.</returns>
1126 public HtmlNode
GetElementbyId(string id
)
1130 throw new ArgumentNullException("id");
1132 if (_nodesid
== null)
1134 throw new Exception(HtmlExceptionUseIdAttributeFalse
);
1137 return _nodesid
[id
.ToLower()] as HtmlNode
;
1141 /// Creates an HTML element node with the specified name.
1143 /// <param name="name">The qualified name of the element. May not be null.</param>
1144 /// <returns>The new HTML node.</returns>
1145 public HtmlNode
CreateElement(string name
)
1149 throw new ArgumentNullException("name");
1151 HtmlNode node
= CreateNode(HtmlNodeType
.Element
);
1157 /// Creates an HTML comment node.
1159 /// <returns>The new HTML comment node.</returns>
1160 public HtmlCommentNode
CreateComment()
1162 return (HtmlCommentNode
)CreateNode(HtmlNodeType
.Comment
);
1166 /// Creates an HTML comment node with the specified comment text.
1168 /// <param name="comment">The comment text. May not be null.</param>
1169 /// <returns>The new HTML comment node.</returns>
1170 public HtmlCommentNode
CreateComment(string comment
)
1172 if (comment
== null)
1174 throw new ArgumentNullException("comment");
1176 HtmlCommentNode c
= CreateComment();
1177 c
.Comment
= comment
;
1182 /// Creates an HTML text node.
1184 /// <returns>The new HTML text node.</returns>
1185 public HtmlTextNode
CreateTextNode()
1187 return (HtmlTextNode
)CreateNode(HtmlNodeType
.Text
);
1191 /// Creates an HTML text node with the specified text.
1193 /// <param name="text">The text of the node. May not be null.</param>
1194 /// <returns>The new HTML text node.</returns>
1195 public HtmlTextNode
CreateTextNode(string text
)
1199 throw new ArgumentNullException("text");
1201 HtmlTextNode t
= CreateTextNode();
1206 internal HtmlNode
CreateNode(HtmlNodeType type
)
1208 return CreateNode(type
, -1);
1211 internal HtmlNode
CreateNode(HtmlNodeType type
, int index
)
1215 case HtmlNodeType
.Comment
:
1216 return new HtmlCommentNode(this, index
);
1218 case HtmlNodeType
.Text
:
1219 return new HtmlTextNode(this, index
);
1222 return new HtmlNode(type
, this, index
);
1226 internal HtmlAttribute
CreateAttribute()
1228 return new HtmlAttribute(this);
1232 /// Creates an HTML attribute with the specified name.
1234 /// <param name="name">The name of the attribute. May not be null.</param>
1235 /// <returns>The new HTML attribute.</returns>
1236 public HtmlAttribute
CreateAttribute(string name
)
1240 throw new ArgumentNullException("name");
1242 HtmlAttribute att
= CreateAttribute();
1248 /// Creates an HTML attribute with the specified name.
1250 /// <param name="name">The name of the attribute. May not be null.</param>
1251 /// <param name="value">The value of the attribute.</param>
1252 /// <returns>The new HTML attribute.</returns>
1253 public HtmlAttribute
CreateAttribute(string name
, string value)
1257 throw new ArgumentNullException("name");
1259 HtmlAttribute att
= CreateAttribute(name
);
1265 /// Gets the root node of the document.
1267 public HtmlNode DocumentNode
1271 return _documentnode
;
1276 /// Gets the document CRC32 checksum if OptionComputeChecksum was set to true before parsing, 0 otherwise.
1288 return (int)_crc32
.CheckSum
;
1293 public bool StreamMode
1301 _streammode
= value;
1305 private HtmlParseError
AddError(
1306 HtmlParseErrorCode code
,
1313 HtmlParseError err
= new HtmlParseError(code
, line
, linePosition
, streamPosition
, sourceText
, reason
);
1314 _parseerrors
.Add(err
);
1318 private enum ParseState
1326 AttributeBeforeEquals
,
1327 AttributeAfterEquals
,
1330 QuotedAttributeValue
,
1336 private void IncrementPosition()
1340 // REVIEW: should we add some checksum code in DecrementPosition too?
1341 _crc32
.AddToCRC32(_c
);
1345 _maxlineposition
= _lineposition
;
1357 private void DecrementPosition()
1360 if (_lineposition
== 1)
1362 _lineposition
= _maxlineposition
;
// Main parsing loop. Resets the parser bookkeeping, opens an initial text node
// at offset 0, then repeatedly calls IncrementPosition() and dispatches on
// _state until _text.Eof(_index) is reached or _stop_parsing is set.
// NOTE(review): several conditions and control-flow statements of this method
// are not visible in this excerpt; comments below describe only what is shown.
1371 private void Parse()
// start a fresh CRC32 when checksum computation is enabled
1374 if (OptionComputeChecksum
)
1376 _crc32
= new Crc32();
// reset the per-parse bookkeeping
1379 _lastnodes
= new Hashtable();
1381 _fullcomment
= false;
1382 _parseerrors
= new ArrayList();
1385 _maxlineposition
= 1;
1387 _state
= ParseState
.Text
;
// the document node covers the full length of the input text
1389 _documentnode
._innerlength
= _text
.FullLength
;
1390 _documentnode
._outerlength
= _text
.FullLength
;
1392 _lastparentnode
= _documentnode
;
1393 _currentnode
= CreateNode(HtmlNodeType
.Text
, 0);
1394 _currentattribute
= null;
// parsing begins inside a text node that starts at offset 0
1397 PushNodeStart(HtmlNodeType
.Text
, 0);
1398 // SLIM: while (_index<_text.Length)
1399 while (! _stop_parsing
&& ! _text
.Eof (_index
))
1402 IncrementPosition();
1406 case ParseState
.Text
:
// WhichTag: the node name starts either at _index with starttag=false or at
// _index-1 with starttag=true (the selecting condition is not visible here);
// either way the parser continues in the Tag state.
1411 case ParseState
.WhichTag
:
1416 PushNodeNameStart(false, _index
);
1420 PushNodeNameStart(true, _index
-1);
1421 DecrementPosition();
1423 _state
= ParseState
.Tag
;
// Tag: reading the node name. Each visible branch ends the name at _index-1 and
// then leaves the branch early if PushNodeNameEnd changed _state.
1426 case ParseState
.Tag
:
// whitespace after the name: attributes may follow
1429 if (IsWhiteSpace(_c
))
1431 PushNodeNameEnd(_index
-1);
1432 if (_state
!= ParseState
.Tag
)
1434 _state
= ParseState
.BetweenAttributes
;
1439 PushNodeNameEnd(_index
-1);
1440 if (_state
!= ParseState
.Tag
)
1442 _state
= ParseState
.EmptyTag
;
1447 PushNodeNameEnd(_index
-1);
1448 if (_state
!= ParseState
.Tag
)
// the tag ends here: close the node and resume with a new text node
1450 PushNodeEnd(_index
, false);
1451 if (_state
!= ParseState
.Tag
)
1453 _state
= ParseState
.Text
;
1454 PushNodeStart(HtmlNodeType
.Text
, _index
);
// BetweenAttributes: skip whitespace, detect '/' or '?' (-> EmptyTag), end the
// node, or start a new attribute name at _index-1.
1458 case ParseState
.BetweenAttributes
:
1462 if (IsWhiteSpace(_c
))
1465 if ((_c
== '/') || (_c
== '?'))
1467 _state
= ParseState
.EmptyTag
;
1473 PushNodeEnd(_index
, false);
1474 if (_state
!= ParseState
.BetweenAttributes
)
1476 _state
= ParseState
.Text
;
1477 PushNodeStart(HtmlNodeType
.Text
, _index
);
1481 PushAttributeNameStart(_index
-1);
1482 _state
= ParseState
.AttributeName
;
// EmptyTag: either the node is ended with close=true and text parsing resumes,
// or the parser falls back to BetweenAttributes.
1485 case ParseState
.EmptyTag
:
1491 PushNodeEnd(_index
, true);
1492 if (_state
!= ParseState
.EmptyTag
)
1494 _state
= ParseState
.Text
;
1495 PushNodeStart(HtmlNodeType
.Text
, _index
);
1498 _state
= ParseState
.BetweenAttributes
;
// AttributeName: every visible branch ends the attribute name at _index-1.
1501 case ParseState
.AttributeName
:
1505 if (IsWhiteSpace(_c
))
1507 PushAttributeNameEnd(_index
-1);
1508 _state
= ParseState
.AttributeBeforeEquals
;
1513 PushAttributeNameEnd(_index
-1);
1514 _state
= ParseState
.AttributeAfterEquals
;
1519 PushAttributeNameEnd(_index
-1);
1520 PushNodeEnd(_index
, false);
1521 if (_state
!= ParseState
.AttributeName
)
1523 _state
= ParseState
.Text
;
1524 PushNodeStart(HtmlNodeType
.Text
, _index
);
// AttributeBeforeEquals: whitespace seen after an attribute name.
1529 case ParseState
.AttributeBeforeEquals
:
1533 if (IsWhiteSpace(_c
))
1537 PushNodeEnd(_index
, false);
1538 if (_state
!= ParseState
.AttributeBeforeEquals
)
1540 _state
= ParseState
.Text
;
1541 PushNodeStart(HtmlNodeType
.Text
, _index
);
1546 _state
= ParseState
.AttributeAfterEquals
;
1549 // no equals, no whitespace, it's a new attribute starting
1550 _state
= ParseState
.BetweenAttributes
;
1551 DecrementPosition();
// AttributeAfterEquals: a quote character starts a quoted value at _index;
// otherwise an unquoted value starts at _index-1.
1554 case ParseState
.AttributeAfterEquals
:
1558 if (IsWhiteSpace(_c
))
1561 if ((_c
== '\'') || (_c
== '"'))
1563 _state
= ParseState
.QuotedAttributeValue
;
1564 PushAttributeValueStart(_index
);
1570 PushNodeEnd(_index
, false);
1571 if (_state
!= ParseState
.AttributeAfterEquals
)
1573 _state
= ParseState
.Text
;
1574 PushNodeStart(HtmlNodeType
.Text
, _index
);
1577 PushAttributeValueStart(_index
-1);
1578 _state
= ParseState
.AttributeValue
;
// AttributeValue: an unquoted value ends at whitespace or when the node ends.
1581 case ParseState
.AttributeValue
:
1585 if (IsWhiteSpace(_c
))
1587 PushAttributeValueEnd(_index
-1);
1588 _state
= ParseState
.BetweenAttributes
;
1594 PushAttributeValueEnd(_index
-1);
1595 PushNodeEnd(_index
, false);
1596 if (_state
!= ParseState
.AttributeValue
)
1598 _state
= ParseState
.Text
;
1599 PushNodeStart(HtmlNodeType
.Text
, _index
);
// QuotedAttributeValue: the value ends at the character equal to lastquote;
// a following '%' switches to ServerSideCode.
1604 case ParseState
.QuotedAttributeValue
:
1605 if (_c
== lastquote
)
1607 PushAttributeValueEnd(_index
-1);
1608 _state
= ParseState
.BetweenAttributes
;
1613 //SLIM: if (_index<_text.Length)
1614 if (!_text
.Eof (_index
))
1616 if (_text
[_index
] == '%')
1619 _state
= ParseState
.ServerSideCode
;
// Comment: inspects _text[_index-2] and _text[_index-3] for '-' (presumably the
// "--" that closes a full comment -- TODO confirm), then ends the comment node
// and resumes text parsing.
1626 case ParseState
.Comment
:
1631 if ((_text
[_index
-2] != '-') ||
1632 (_text
[_index
-3] != '-'))
1637 PushNodeEnd(_index
, false);
1638 _state
= ParseState
.Text
;
1639 PushNodeStart(HtmlNodeType
.Text
, _index
);
// ServerSideCode: looks for a following '>' and then selects the next state
// from a nested set of case labels (the switched-on value is not visible here).
1644 case ParseState
.ServerSideCode
:
1647 //SLIM: if (_index<_text.Length)
1648 if (! _text
.Eof (_index
))
1650 if (_text
[_index
] == '>')
1654 case ParseState
.AttributeAfterEquals
:
1655 _state
= ParseState
.AttributeValue
;
1658 case ParseState
.BetweenAttributes
:
1659 PushAttributeNameEnd(_index
+1);
1660 _state
= ParseState
.BetweenAttributes
;
1667 IncrementPosition();
1673 // handle <script>a="</script>"</script>
// PcDataQuote: inside a quoted string within PCDATA; the string ends at the
// matching quote character unless _text[_index - 2] is a backslash.
1674 case ParseState
.PcDataQuote
:
1675 if ((_c
== _pcdata_quote_char
) && (_text
[_index
- 2] != '\\')) {
1676 _pcdata_quote_char
= '\0';
1677 _state
= ParseState
.PcData
;
// PcData: raw content of the current node; a quote character enters
// PcDataQuote, otherwise the text is scanned for the node's own closing tag.
1681 case ParseState
.PcData
:
1682 Debug ("PCDATA " + _currentnode
.Name
+ " " + _text
.Substring(_index
-1, _currentnode
._namelength
+2));
1683 if (_c
== '\"' || _c
== '\''){
1684 _pcdata_quote_char
= _c
;
1685 _state
= ParseState
.PcDataQuote
;
1688 // look for </tag + 1 char
1691 //SLIM: if ((_currentnode._namelength+3)<=(_text.Length-(_index-1)))
1692 if (! _text
.Eof (_currentnode
._namelength
+ _index
+ 1))
// case-insensitive comparison against "</" + the current node name
1694 if (string.Compare(_text
.Substring(_index
-1, _currentnode
._namelength
+2),
1695 "</" + _currentnode
.Name
, true) == 0)
// the character right after "</name" must be '>' or whitespace
1697 int c
= _text
[_index
-1 + 2 + _currentnode
.Name
.Length
];
1698 if ((c
== '>') || (IsWhiteSpace(c
)))
1700 // add the script as a text node
1701 HtmlNode script
= CreateNode(HtmlNodeType
.Text
,
1702 _currentnode
._outerstartindex
+ _currentnode
._outerlength
);
1703 script
._outerlength
= _index
-1 - script
._outerstartindex
;
// in stream mode the text node is handed to ReportNode; a false result stops parsing
1704 if (_streammode
&& ReportNode
!= null)
1705 _stop_parsing
= ! ReportNode (script
);
1707 _currentnode
.AppendChild(script
);
1708 Debug ("Found script: [" + script
.InnerText
+ "]");
// start the closing element node at the '<' (_index-1); its name begins after "</"
1710 PushNodeStart(HtmlNodeType
.Element
, _index
-1);
1711 PushNodeNameStart(false, _index
-1 +2);
1712 _state
= ParseState
.Tag
;
1713 IncrementPosition();
1721 // finish the current work
1722 if (_currentnode
._namestartindex
> 0)
1724 PushNodeNameEnd(_index
);
1726 PushNodeEnd(_index
, false);
1728 // we don't need this anymore
// Checks whether new markup starts at the current position and, if so, sets up
// the corresponding node and parser state.
// NOTE(review): the entry condition and the return statements are not visible
// in this excerpt; presumably the result tells the caller whether the current
// character was consumed here -- TODO confirm.
1732 private bool NewCheck()
1738 //SLIM: if (_index<_text.Length)
1739 if (! _text
.Eof (_index
))
// a following '%' introduces server-side code: depending on the current state an
// attribute value, attribute name or node name is started at _index-1 first
1741 if (_text
[_index
] == '%')
1745 case ParseState
.AttributeAfterEquals
:
1746 PushAttributeValueStart(_index
-1);
1749 case ParseState
.BetweenAttributes
:
1750 PushAttributeNameStart(_index
-1);
1753 case ParseState
.WhichTag
:
1754 PushNodeNameStart(true, _index
-1);
1755 _state
= ParseState
.Tag
;
1759 _state
= ParseState
.ServerSideCode
;
// otherwise the current node is ended at _index-1 (close=true) and a tag begins
1764 PushNodeEnd(_index
-1, true);
1765 _state
= ParseState
.WhichTag
;
1766 //SLIM: if ((_index-1) <= (_text.Length-2))
1767 if (!_text
.Eof (_index
))
// '!' after the tag opener starts a comment node
1769 if (_text
[_index
] == '!')
1771 PushNodeStart(HtmlNodeType
.Comment
, _index
-1);
1772 PushNodeNameStart(true, _index
);
1773 PushNodeNameEnd(_index
+1);
1774 _state
= ParseState
.Comment
;
1775 //SLIM: if (_index<(_text.Length-2))
1776 if (! _text
.Eof (_index
+ 2))
// _fullcomment records whether the '!' is followed by "--"
1778 if ((_text
[_index
+1] == '-') &&
1779 (_text
[_index
+2] == '-'))
1781 _fullcomment
= true;
1785 _fullcomment
= false;
// not a comment: start an element node at _index-1
1791 PushNodeStart(HtmlNodeType
.Element
, _index
-1);
// Looks for a <meta http-equiv="content-type" content="...charset=..."> node and
// stores the declared encoding in _declaredencoding. Does nothing useful unless
// OptionReadEncoding is set (the statement guarded by that test is not visible
// in this excerpt).
// Throws EncodingFoundException(null) when an element with a 4-character name
// outside the allowed list is seen, and EncodingFoundException(_declaredencoding)
// when _onlyDetectEncoding is set and a charset was found.
// NOTE(review): Encoding.GetEncoding(charset) is called on the raw charset text;
// no handling for an unsupported charset name is visible here.
1795 private void ReadDocumentEncoding(HtmlNode node
)
1797 if (!OptionReadEncoding
)
1800 // <meta http-equiv="content-type" content="text/html;charset=iso-8859-1" />
1802 // when we append a child, we are in node end, so attributes are already populated
1803 if (node
._namelength
== 4) // quick check, avoids string alloc
1805 // only these nodes can occur before meta
1806 // if we started seeing any other node, we will never see a meta node
// NOTE(review): "head" is tested twice below, and "script", "style" and "title"
// cannot match if this test sits inside the 4-character name check above.
1807 if (node
.NodeType
== HtmlNodeType
.Element
&&
1808 (node
.Name
!= "head" && node
.Name
!= "script" &&
1809 node
.Name
!= "style" && node
.Name
!= "title" &&
1810 node
.Name
!= "head" && node
.Name
!= "link" &&
1811 node
.Name
!= "html" && node
.Name
!= "meta"))
1812 throw new EncodingFoundException (null);
1813 else if (node
.Name
== "meta") // all nodes names are lowercase
1815 HtmlAttribute att
= node
.Attributes
["http-equiv"];
// case-insensitive match on the http-equiv value
1818 if (string.Compare(att
.Value
, "content-type", true) == 0)
1820 HtmlAttribute content
= node
.Attributes
["content"];
1821 if (content
!= null)
// extract the "charset" entry from the content attribute value
1823 string charset
= NameValuePairList
.GetNameValuePairsValue(content
.Value
, "charset");
1824 if (charset
!= null)
1826 _declaredencoding
= Encoding
.GetEncoding(charset
);
// detection-only mode: hand the encoding back to the caller via the exception
1827 if (_onlyDetectEncoding
)
1829 throw new EncodingFoundException(_declaredencoding
);
1832 if (_streamencoding
!= null)
// a differing Windows code page is reported as a CharsetMismatch
1834 if (_declaredencoding
.WindowsCodePage
!= _streamencoding
.WindowsCodePage
)
1837 HtmlParseErrorCode
.CharsetMismatch
,
1838 _line
, _lineposition
,
1839 _index
, node
.OuterHtml
,
1840 "Encoding mismatch between StreamEncoding: " +
1841 _streamencoding
.WebName
+ " and DeclaredEncoding: " + _declaredencoding
.WebName
);
// Begins a new attribute: creates it through CreateAttribute(), records the
// offset at which its name starts, and stamps it with the parser's current
// line, line position and the given stream position.
private void PushAttributeNameStart(int index)
{
    _currentattribute = CreateAttribute();

    // location of the attribute in the input
    _currentattribute._streamposition = index;
    _currentattribute._lineposition = _lineposition;
    _currentattribute._line = _line;

    // offset of the first character of the attribute name
    _currentattribute._namestartindex = index;
}
// Ends the name of the current attribute at the given offset (exclusive) and
// appends the attribute to the current node's attribute collection.
private void PushAttributeNameEnd(int index)
{
    int nameStart = _currentattribute._namestartindex;
    _currentattribute._namelength = index - nameStart;

    _currentnode.Attributes.Append(_currentattribute);
}
// Records the offset at which the value of the current attribute starts.
private void PushAttributeValueStart(int index)
{
    this._currentattribute._valuestartindex = index;
}
// Ends the value of the current attribute at the given offset (exclusive) by
// storing its length relative to the recorded value start.
private void PushAttributeValueEnd(int index)
{
    int valueStart = _currentattribute._valuestartindex;
    _currentattribute._valuelength = index - valueStart;
}
// Makes a freshly created node of the given type the current node and stamps
// it with the parser's current line, line position and the given stream
// position. Element nodes are given the current line position minus one.
private void PushNodeStart(HtmlNodeType type, int index)
{
    _currentnode = CreateNode(type, index);
    _currentnode._line = _line;

    if (type == HtmlNodeType.Element)
    {
        _currentnode._lineposition = _lineposition - 1;
    }
    else
    {
        _currentnode._lineposition = _lineposition;
    }

    _currentnode._streamposition = index;
}
// Ends the current node at the given offset (exclusive). The node is reported
// to ReportNode in stream mode, attached to _lastparentnode otherwise, and for
// start tags registered in _lastnodes and made the new parent candidate.
// index: offset just past the node's outer text.
// close: when true (or when the node is not a start tag) the closing branch at
//        the end of the method runs.
// NOTE(review): several statements of this method (else branches, the body of
// the closed/empty element test, the final closing call) are not visible here.
1889 private void PushNodeEnd(int index
, bool close
)
1891 _currentnode
._outerlength
= index
- _currentnode
._outerstartindex
;
1893 //SLIM: inform caller
// a false result from ReportNode stops the parser
1894 if (_streammode
&& ReportNode
!= null)
1895 _stop_parsing
= ! ReportNode (_currentnode
);
1898 if (_currentnode
._nodetype
== HtmlNodeType
.Text
)
1899 Debug ("Text:" + _currentnode
.InnerText
);
1901 Debug ((_currentnode
.StartTag
? "Start-" : "End-") + _currentnode
.Name
);
// text and comment nodes: inner span equals outer span
1903 if ((_currentnode
._nodetype
== HtmlNodeType
.Text
) ||
1904 (_currentnode
._nodetype
== HtmlNodeType
.Comment
))
1906 // forget about void nodes
1907 if (_currentnode
._outerlength
>0)
1909 _currentnode
._innerlength
= _currentnode
._outerlength
;
1910 _currentnode
._innerstartindex
= _currentnode
._outerstartindex
;
1911 // SLIM: no need to append child in stream mode
1912 // SLIM: whatever the caller needs to do, tell it to do now
1913 if (!_streammode
&& _lastparentnode
!= null)
1915 _lastparentnode
.AppendChild(_currentnode
);
// start tags that are not already the last parent node
1921 if ((_currentnode
._starttag
) && (_lastparentnode
!= _currentnode
))
1923 // add to parent node
1924 // SLIM: no need to append child in stream mode
1925 // SLIM: whatever the caller needs to do, tell it to do now
1926 if (!_streammode
&& _lastparentnode
!= null)
1928 _lastparentnode
.AppendChild(_currentnode
);
// attributes are populated at this point, so a <meta> charset can be read
1931 ReadDocumentEncoding(_currentnode
);
1933 // remember last node of this kind
1934 // SLIM: we still need to store _currentnode to help other tags in the same level
1935 HtmlNode prev
= (HtmlNode
)_lastnodes
[_currentnode
.Name
];
1936 _currentnode
._prevwithsamename
= prev
;
1937 _lastnodes
[_currentnode
.Name
] = _currentnode
;
// document and element nodes become the parent for subsequent nodes
1940 if ((_currentnode
.NodeType
== HtmlNodeType
.Document
) ||
1941 (_currentnode
.NodeType
== HtmlNodeType
.Element
))
1943 _lastparentnode
= _currentnode
;
// CDATA-style elements switch the parser to the PcData state
1946 if (HtmlNode
.IsCDataElement(CurrentNodeName()))
1948 _state
= ParseState
.PcData
;
1952 if ((HtmlNode
.IsClosedElement(_currentnode
.Name
)) ||
1953 (HtmlNode
.IsEmptyElement(_currentnode
.Name
)))
1960 if ((close
) || (!_currentnode
._starttag
))
// text and comment nodes are dropped by clearing _currentnode (any additional
// condition on this branch is not visible in this excerpt)
1963 if ((_currentnode
._nodetype
== HtmlNodeType
.Text
) ||
1964 (_currentnode
._nodetype
== HtmlNodeType
.Comment
))
1965 _currentnode
= null;
// Records where the name of the current node begins and whether the node is a
// start tag (true) or an end tag (false).
private void PushNodeNameStart(bool starttag, int index)
{
    this._currentnode._starttag = starttag;
    this._currentnode._namestartindex = index;
}
// Returns the names of the "resetter" elements for the given tag name; the
// result is passed to FixNestedTag / FindResetterNodes.
// NOTE(review): the tag names selecting each result are not visible in this
// excerpt; only the returned arrays are.
1975 private string[] GetResetters(string name
)
1980 return new string[]{"ul"}
;
1983 return new string[]{"table"}
;
1987 return new string[]{"tr", "table"}
;
// Applies the nested-tag fix for the current node: lower-cases the current node
// name and calls FixNestedTag with the resetters for that name.
// NOTE(review): the statement guarded by the start-tag test is not visible in
// this excerpt; per the comment below, closing tags are presumably skipped.
1994 private void FixNestedTags()
1996 // we are only interested in start tags, not closing tags
1997 if (!_currentnode
._starttag
)
2000 string name
= CurrentNodeName().ToLower();
2001 FixNestedTag(name
, GetResetters(name
));
// Closes a previous, still-open node with the same name when no resetter node
// is found for it.
// name: tag name used as the key into _lastnodes.
// resetters: names of elements that make the fix unnecessary; the null test
//            below presumably bails out early -- TODO confirm (its body is not
//            visible in this excerpt).
2004 private void FixNestedTag(string name
, string[] resetters
)
2006 if (resetters
== null)
2011 // if we find a previous unclosed same name node, without a resetter node between, we must close it
2012 prev
= (HtmlNode
)_lastnodes
[name
];
2013 if ((prev
!= null) && (!prev
.Closed
))
2016 // try to find a resetter node, if found, we do nothing
2017 if (FindResetterNodes(prev
, resetters
))
2022 // ok we need to close the prev now
2023 // create a fake closer node
// the fake closer has prev's node type, index -1, and is its own end node
2024 HtmlNode close
= new HtmlNode(prev
.NodeType
, this, -1);
2025 close
._endnode
= close
;
2026 prev
.CloseNode(close
);
// Tries each name in turn with FindResetterNode(node, name) and tests for a
// non-null result.
// NOTE(review): the return statements are not visible in this excerpt;
// presumably returns true on the first match and false otherwise -- TODO confirm.
2031 private bool FindResetterNodes(HtmlNode node
, string[] names
)
2037 for(int i
=0;i
<names
.Length
;i
++)
2039 if (FindResetterNode(node
, names
[i
]) != null)
// Looks up the last node registered under the given name in _lastnodes and
// examines it as a resetter candidate for the given node.
// NOTE(review): the return statements are not visible in this excerpt; the
// three tests below (missing, closed, stream position before the node's)
// presumably each reject the candidate -- TODO confirm.
2047 private HtmlNode
FindResetterNode(HtmlNode node
, string name
)
2049 HtmlNode resetter
= (HtmlNode
)_lastnodes
[name
];
2050 if (resetter
== null)
2052 if (resetter
.Closed
)
// compares where the candidate and the node were found in the stream
2056 if (resetter
._streamposition
<node
._streamposition
)
// Ends the name of the current node at the given offset (exclusive) by storing
// its length relative to the recorded name start.
// NOTE(review): the statement guarded by OptionFixNestedTags is not visible in
// this excerpt; presumably it triggers the nested-tag fix -- TODO confirm.
2063 private void PushNodeNameEnd(int index
)
2065 _currentnode
._namelength
= index
- _currentnode
._namestartindex
;
2066 if (OptionFixNestedTags
)
// Closes the current node: pairs it with the last open node of the same name
// from _lastnodes, or handles the cases where no such node exists (closed
// elements, overlapping elements, unnecessary or unmatched end tags), and
// finally moves _lastparentnode up via UpdateLastParentNode().
// NOTE(review): several branch heads, else keywords and the heads of the
// error-reporting calls are not visible in this excerpt.
2072 private void CloseCurrentNode()
2074 if (_currentnode
.Closed
) // text or document are by def closed
2079 // find last node of this kind
2080 HtmlNode prev
= (HtmlNode
)_lastnodes
[_currentnode
.Name
];
2083 if (HtmlNode
.IsClosedElement(_currentnode
.Name
))
2085 // </br> will be seen as <br>
2086 _currentnode
.CloseNode(_currentnode
);
2088 // add to parent node
2089 if (_lastparentnode
!= null)
// walk the parent's children backwards, collecting candidates on a stack
2091 HtmlNode foundNode
= null;
2092 Stack futureChild
= new Stack();
2093 for (HtmlNode node
= _lastparentnode
.LastChild
; node
!= null; node
= node
.PreviousSibling
)
// a childless sibling with the same name as the current node
2095 if ((node
.Name
== _currentnode
.Name
) && (! node
.HasChildNodes
))
2100 futureChild
.Push(node
);
2102 if (foundNode
!= null)
// move the collected siblings from the parent under the found node
2104 HtmlNode node
= null;
2105 while(futureChild
.Count
!= 0)
2107 node
= (HtmlNode
)futureChild
.Pop();
2108 _lastparentnode
.RemoveChild(node
);
2109 foundNode
.AppendChild(node
);
2114 _lastparentnode
.AppendChild(_currentnode
);
2121 // node has no parent
2122 // node is not a closed node
2124 if (HtmlNode
.CanOverlapElement(_currentnode
.Name
))
2126 // this is a hack: add it as a text node
2127 HtmlNode closenode
= CreateNode(HtmlNodeType
.Text
, _currentnode
._outerstartindex
);
2128 closenode
._outerlength
= _currentnode
._outerlength
;
// the text of the substitute node is lower-cased
2129 ((HtmlTextNode
)closenode
).Text
= ((HtmlTextNode
)closenode
).Text
.ToLower();
2130 if (_lastparentnode
!= null)
2132 _lastparentnode
.AppendChild(closenode
);
// an end tag for an empty element is reported as EndTagNotRequired
2138 if (HtmlNode
.IsEmptyElement(_currentnode
.Name
))
2141 HtmlParseErrorCode
.EndTagNotRequired
,
2142 _currentnode
._line
, _currentnode
._lineposition
,
2143 _currentnode
._streamposition
, _currentnode
.OuterHtml
,
2144 "End tag </" + _currentnode
.Name
+ "> is not required");
2148 // node cannot overlap, node is not empty
2150 HtmlParseErrorCode
.TagNotOpened
,
2151 _currentnode
._line
, _currentnode
._lineposition
,
2152 _currentnode
._streamposition
, _currentnode
.OuterHtml
,
2153 "Start tag <" + _currentnode
.Name
+ "> was not found");
2161 if (OptionFixNestedTags
)
// a resetter found for the previous node makes this end tag invalid here
2163 if (FindResetterNodes(prev
, GetResetters(_currentnode
.Name
)))
2166 HtmlParseErrorCode
.EndTagInvalidHere
,
2167 _currentnode
._line
, _currentnode
._lineposition
,
2168 _currentnode
._streamposition
, _currentnode
.OuterHtml
,
2169 "End tag </" + _currentnode
.Name
+ "> invalid here");
// unregister prev from _lastnodes (its predecessor takes its place) and close it
2176 _lastnodes
[_currentnode
.Name
] = prev
._prevwithsamename
;
2177 prev
.CloseNode(_currentnode
);
2182 // we close this node, get grandparent
2185 if ((_lastparentnode
!= null) &&
2186 ((!HtmlNode
.IsClosedElement(_currentnode
.Name
)) ||
2187 (_currentnode
._starttag
)))
2189 UpdateLastParentNode();
// Moves _lastparentnode up to its ParentNode while it is closed, falling back
// to _documentnode when no parent is left.
// NOTE(review): the head of the loop that the while condition below belongs to
// is not visible in this excerpt.
2194 internal void UpdateLastParentNode()
2198 if (_lastparentnode
.Closed
)
2200 _lastparentnode
= _lastparentnode
.ParentNode
;
2203 while ((_lastparentnode
!= null) && (_lastparentnode
.Closed
));
// ran off the top of the tree: use the document node as parent
2204 if (_lastparentnode
== null)
2206 _lastparentnode
= _documentnode
;
// Returns the name of the current attribute, cut out of the parsed text using
// the recorded name start offset and name length.
private string CurrentAttributeName()
{
    int start = _currentattribute._namestartindex;
    int length = _currentattribute._namelength;
    return _text.Substring(start, length);
}
// Returns the value of the current attribute, cut out of the parsed text using
// the recorded value start offset and value length.
private string CurrentAttributeValue()
{
    int start = _currentattribute._valuestartindex;
    int length = _currentattribute._valuelength;
    return _text.Substring(start, length);
}
// Returns the name of the current node, cut out of the parsed text using the
// recorded name start offset and name length.
private string CurrentNodeName()
{
    int start = _currentnode._namestartindex;
    int length = _currentnode._namelength;
    return _text.Substring(start, length);
}
// Returns the outer text of the current node, cut out of the parsed text using
// the recorded outer start offset and outer length.
private string CurrentNodeOuter()
{
    int start = _currentnode._outerstartindex;
    int length = _currentnode._outerlength;
    return _text.Substring(start, length);
}
// Returns the inner text of the current node, cut out of the parsed text using
// the recorded inner start offset and inner length.
private string CurrentNodeInner()
{
    int start = _currentnode._innerstartindex;
    int length = _currentnode._innerlength;
    return _text.Substring(start, length);
}
/// <summary>
/// Determines if the specified character is considered as a whitespace character.
/// </summary>
/// <param name="c">The character code to check.</param>
/// <returns>true if the specified character is a tab (9), line feed (10), carriage return (13) or space (32); otherwise false.</returns>
public static bool IsWhiteSpace(int c)
{
    switch (c)
    {
        case 9:  // tab
        case 10: // line feed
        case 13: // carriage return
        case 32: // space
            return true;

        default:
            return false;
    }
}
2251 internal class EncodingFoundException
: Exception
2253 private Encoding _encoding
;
2255 internal EncodingFoundException(Encoding encoding
)
2257 _encoding
= encoding
;
2260 internal Encoding Encoding