1 // HtmlAgilityPack V1.0 - Simon Mourier <simonm@microsoft.com>
4 Copyright (C) 2003 Simon Mourier <simonm@microsoft.com>
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions
10 1. Redistributions of source code must retain the above copyright
11 notice, this list of conditions and the following disclaimer.
12 2. Redistributions in binary form must reproduce the above copyright
13 notice, this list of conditions and the following disclaimer in the
14 documentation and/or other materials provided with the distribution.
15 3. The name of the author may not be used to endorse or promote products
16 derived from this software without specific prior written permission.
18 THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
19 IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
20 OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
21 IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
22 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
23 NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
27 THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 using System
.Diagnostics
;
34 using System
.Collections
;
35 using System
.Text
.RegularExpressions
;
37 using System
.Xml
.XPath
;
40 // Legend: SLIM=Comment added describing changes to original HtmlAgilityPack
41 // to reduce memory consumption
42 // Once the parser is free of bugs, the comments will be taken out
43 namespace HtmlAgilityPack
46 /// Represents the type of parsing error.
48 public enum HtmlParseErrorCode
51 /// A tag was not closed.
56 /// A tag was not opened.
61 /// There is a charset mismatch between stream and declared (META) encoding.
66 /// An end tag was not required.
71 /// An end tag is invalid at this position.
77 /// Represents a parsing error found during document parsing.
79 public class HtmlParseError
81 private HtmlParseErrorCode _code
;
83 private int _linePosition
;
84 private int _streamPosition
;
85 private string _sourceText
;
86 private string _reason
;
88 internal HtmlParseError(
89 HtmlParseErrorCode code
,
98 _linePosition
= linePosition
;
99 _streamPosition
= streamPosition
;
100 _sourceText
= sourceText
;
105 /// Gets the type of error.
107 public HtmlParseErrorCode Code
116 /// Gets the line number of this error in the document.
127 /// Gets the column number of this error in the document.
129 public int LinePosition
133 return _linePosition
;
138 /// Gets the absolute stream position of this error in the document, relative to the start of the document.
140 public int StreamPosition
144 return _streamPosition
;
149 /// Gets the full text of the line containing the error.
151 public string SourceText
160 /// Gets a description for the error.
// Abstract, index-addressable view over the document text.
// HtmlDocument keeps one of these in _text and reads the input through it
// (Eof, the indexer and FullLength) instead of holding the whole text as a string.
172 abstract class StreamAsArray
{
// Reports whether the given absolute index is the end of the text.
173 public abstract bool Eof (int index
);
// Gets the character at the given absolute index.
174 public abstract char this [int index
] { get;}
// Gets 'length' characters starting at absolute index 'startindex'.
175 public abstract string Substring (int startindex
, int length
);
// Gets the total length of the underlying text.
// NOTE(review): the unit (chars vs. bytes) depends on the subclass - confirm before relying on it.
176 public abstract int FullLength { get;}
179 // SLIM: creating this class to wrap around a textreader
180 // to emulate ReadToEnd () behaviour
181 class ImplStreamAsArray
: StreamAsArray
{
182 private StreamReader _reader
;
184 private int _position
;
186 private char[] _buf_previous
; // could have used only one array
187 private char[] _buf_current
; // but, this is cleaner
188 private int _block_size
;
190 public ImplStreamAsArray (StreamReader r
)
198 _buf_previous
= new char [_block_size
];
199 _buf_current
= new char [_block_size
];
204 private void Read (bool initial
)
207 Array
.Copy (_buf_current
, _buf_previous
, _block_size
);
208 _position
+= _block_size
;
210 HtmlDocument
.Debug ("Debug: Read in buffer at:" + _position
);
212 int num_read
= _reader
.Read (_buf_current
, 0, _block_size
);
213 if (num_read
< _block_size
) {
215 _length
= _position
+ num_read
;
217 HtmlDocument
.Debug ("[" + new string (_buf_current
, 0, num_read
) + "]");
220 public override bool Eof (int index
) {
222 return (index
== _length
);
224 if (index
>= _position
+ _block_size
&&
225 index
< _position
+ _block_size
+ _block_size
)
228 return (index
== _length
);
234 public override char this[int index
] {
236 if (index
>= _position
&&
237 index
< _position
+ _block_size
)
238 return _buf_current
[index
% _block_size
];
239 if (index
>= _position
- _block_size
&&
241 return _buf_previous
[ index
% _block_size
];
242 if (index
>= _position
+ _block_size
&&
243 index
< _position
+ _block_size
+ _block_size
) {
245 return _buf_current
[index
% _block_size
];
247 return OutOfBandRead (index
, 1) [0];
251 // evil function ... you get what you pay for!
252 private string OutOfBandRead (int startindex
, int length
)
254 HtmlDocument
.Debug ("Out of band read! From " + startindex
+ " to " + (startindex
+ length
- 1));
255 ResetPosition (startindex
);
256 // ahh.. now we are at the correct place
257 // create a buffer of required length
258 // who cares if the buffer size does not align well
259 // with page boundary
260 char[] temp_buf
= new char [length
];
261 int num_read
= _reader
.Read (temp_buf
, 0, length
);
262 if (num_read
< length
) {
265 _length
= startindex
+ num_read
;
267 // discard data and reset stream position
268 int t
= (_eof
? _length
:_position
+ _block_size
);
270 return new String (temp_buf
);
273 // streamreader does not allow seeking
274 // seek on its basestream does not reflect the position
275 // of the reader - it is governed by the buffer size
276 // of the underlying stream
277 // :( so, read character by character from beginning ...
278 private void ResetPosition (int pos
)
280 _reader
.DiscardBufferedData ();
281 _reader
.BaseStream
.Position
= 0;
282 // read in chunks of block_size
283 int n1
= pos
/ _block_size
;
284 int n2
= pos
% _block_size
;
285 char[] tmp
= new char [_block_size
];
286 // yo ho... start reading till we have reached pos
287 // hopefully, reader will buffer itself, so we can be mean and get one char at a time
288 for (int i
= 0; i
< n1
; ++i
)
289 _reader
.Read (tmp
, 0, _block_size
);
290 for (int i
= 0; i
< n2
; ++i
)
295 public override string Substring (int startindex
, int length
)
298 HtmlDocument
.Debug ("substring:" + startindex
+ " " + length
+ " " + _position
+ ":");
301 if (length
> _block_size
|| startindex
< _position
- _block_size
) {
302 return OutOfBandRead (startindex
, length
);
304 while (startindex
+ length
- 1 >= _position
+ _block_size
) {
308 if (startindex
< _position
) {
309 int len_1
= _position
- startindex
;
311 substr
= new String (_buf_previous
, _block_size
- len_1
, length
);
313 substr
= new String (_buf_previous
, _block_size
- len_1
, len_1
);
314 substr
+= new String (_buf_current
, 0, length
- len_1
);
317 substr
= new String (_buf_current
, startindex
- _position
, length
);
322 // FIXME: Is this costly ?
323 public override int FullLength
{
325 return (int)_reader
.BaseStream
.Length
;
330 // A dummy StreamAsArray wrapper around a string
class DummyStreamAsArray : StreamAsArray
{
    // The complete text being wrapped; every member below reads from it.
    private string _base_string;

    // Cached length of _base_string, captured once in the constructor.
    private int _length;

    // Wraps an in-memory string. 'str' must not be null (str.Length is read here).
    public DummyStreamAsArray(string str)
    {
        _base_string = str;
        _length = str.Length;
    }

    // True once 'index' is at or past the end of the wrapped string.
    public override bool Eof(int index)
    {
        return (index >= _length);
    }

    // Gets the character at 'index'.
    // This must be 'override', not 'new': the base indexer is abstract, and a
    // 'new' member would only hide it, leaving the abstract member unimplemented
    // and unreachable through a StreamAsArray reference.
    public override char this[int index]
    {
        get { return _base_string[index]; }
    }

    // Gets 'length' characters starting at 'startindex'; bounds checking is
    // delegated to String.Substring.
    public override string Substring(int startindex, int length)
    {
        return _base_string.Substring(startindex, length);
    }

    // Gets the number of characters in the wrapped string.
    public override int FullLength
    {
        get { return _length; }
    }
}
361 /// Represents a complete HTML document.
363 public class HtmlDocument
: IXPathNavigable
365 // SLIM: Make the parser event driven
366 // callback for FilterHtml
367 // return value is a way for the callback to signal to continue or stop parsing
368 public delegate bool NodeHandler (HtmlNode node
);
369 public NodeHandler ReportNode
;
370 // misnomer ... should be called event_driven_mode
371 private bool _streammode
= false;
372 private bool _stop_parsing
= false;
374 internal static readonly string HtmlExceptionRefNotChild
= "Reference node must be a child of this node";
375 internal static readonly string HtmlExceptionUseIdAttributeFalse
= "You need to set UseIdAttribute property to true to enable this feature";
377 internal Hashtable _openednodes
;
378 internal Hashtable _lastnodes
= new Hashtable();
379 internal Hashtable _nodesid
;
380 private HtmlNode _documentnode
;
381 //SLIM: internal string _text;
382 internal StreamAsArray _text
;
383 private HtmlNode _currentnode
;
384 private HtmlNode _lastparentnode
;
385 private HtmlAttribute _currentattribute
;
388 private int _lineposition
, _maxlineposition
;
390 private bool _fullcomment
;
391 private System
.Text
.Encoding _streamencoding
;
392 private System
.Text
.Encoding _declaredencoding
;
393 private ArrayList _parseerrors
= new ArrayList();
394 private ParseState _state
, _oldstate
;
395 private Crc32 _crc32
= null;
396 private bool _onlyDetectEncoding
= false;
397 private int _pcdata_quote_char
= '\0';
399 private static bool _debug
= false;
400 internal static void Debug (string s
)
403 Console
.WriteLine (s
);
409 /// Defines if a checksum must be computed for the document while parsing. Default is false.
411 public bool OptionComputeChecksum
= false;
414 /// Defines if declared encoding must be read from the document.
415 /// Declared encoding is determined using the meta http-equiv="content-type" content="text/html;charset=XXXXX" html node.
418 public bool OptionReadEncoding
= true;
422 /// Defines if non closed nodes will be checked at the end of parsing. Default is true.
424 public bool OptionCheckSyntax
= true;
427 /// Defines if the 'id' attribute must be specifically used. Default is true.
429 public bool OptionUseIdAttribute
= true;
432 /// Defines if empty nodes must be written as closed during output. Default is false.
434 public bool OptionWriteEmptyNodes
= false;
437 /// Defines if output must conform to XML, instead of HTML.
439 public bool OptionOutputAsXml
= false;
442 /// Defines if name must be output in uppercase. Default is false.
444 public bool OptionOutputUpperCase
= false;
447 /// Defines if attribute value output must be optimized (not bound with double quotes if it is possible). Default is false.
449 public bool OptionOutputOptimizeAttributeValues
= false;
452 /// Adds Debugging attributes to node. Default is false.
454 public bool OptionAddDebuggingAttributes
= false;
457 /// Defines if source text must be extracted while parsing errors.
458 /// If the document has a lot of errors, or cascading errors, parsing performance can be dramatically affected if set to true.
459 /// Default is false.
461 public bool OptionExtractErrorSourceText
= false; // turning this on can dramatically slow performance if a lot of errors are detected
464 /// Defines if closing for non closed nodes must be done at the end or directly in the document.
465 /// Setting this to true can actually change how browsers render the page. Default is false.
467 public bool OptionAutoCloseOnEnd
= false; // close errors at the end
470 /// Defines if LI, TR, TH, TD tags must be partially fixed when nesting errors are detected. Default is false.
472 public bool OptionFixNestedTags
= false; // fix li, tr, th, td tags
475 /// Defines the maximum length of source text or parse errors. Default is 100.
477 public int OptionExtractErrorSourceTextMaxLength
= 100;
480 /// Defines the default stream encoding to use. Default is System.Text.Encoding.UTF8.
482 // From http://www.w3.org/TR/REC-html40/charset.html
483 // The HTTP protocol ([RFC2616], section 3.7.1) mentions ISO-8859-1 as a default character encoding when the "charset" parameter is absent from the "Content-Type" header field.
484 // However, we are still using UTF-8 for some unknown reason
485 //FIXME: Fix the default encoding!
486 public System
.Text
.Encoding OptionDefaultStreamEncoding
= Encoding
.UTF8
;
489 /// Gets a list of parse errors found in the document.
491 public ArrayList ParseErrors
500 /// Gets the document's stream encoding.
502 public System
.Text
.Encoding StreamEncoding
506 return _streamencoding
;
511 /// Gets the document's declared encoding.
512 /// Declared encoding is determined using the meta http-equiv="content-type" content="text/html;charset=XXXXX" html node.
514 public System
.Text
.Encoding DeclaredEncoding
518 return _declaredencoding
;
523 /// Creates an instance of an HTML document.
525 public HtmlDocument()
527 _documentnode
= CreateNode(HtmlNodeType
.Document
, 0);
530 internal HtmlNode
GetXmlDeclaration()
532 if (!_documentnode
.HasChildNodes
)
537 foreach(HtmlNode node
in _documentnode
._childnodes
)
539 if (node
.Name
== "?xml") // it's ok, names are case sensitive
548 /// Applies HTML encoding to a specified string.
550 /// <param name="html">The input string to encode. May not be null.</param>
551 /// <returns>The encoded string.</returns>
public static string HtmlEncode(string html)
{
    if (html == null)
    {
        throw new ArgumentNullException("html");
    }

    // Escape '&' first, but skip any ampersand that already begins one of the
    // four entities emitted below (&amp; &lt; &gt; &quot;, matched
    // case-insensitively), so text that is already encoded is not encoded twice.
    Regex rx = new Regex("&(?!(amp;)|(lt;)|(gt;)|(quot;))", RegexOptions.IgnoreCase);

    // The remaining replacements cannot create new '&'-sequences that need
    // escaping, so they can simply be chained after the ampersand pass.
    return rx.Replace(html, "&amp;")
             .Replace("<", "&lt;")
             .Replace(">", "&gt;")
             .Replace("\"", "&quot;");
}
564 /// Detects the encoding of an HTML stream.
566 /// <param name="stream">The input stream. May not be null.</param>
567 /// <returns>The detected encoding.</returns>
568 public Encoding
DetectEncoding(Stream stream
)
572 throw new ArgumentNullException("stream");
574 return DetectEncoding(new StreamReader(stream
));
578 /// Detects the encoding of an HTML file.
580 /// <param name="path">Path for the file containing the HTML document to detect. May not be null.</param>
581 /// <returns>The detected encoding.</returns>
582 public Encoding
DetectEncoding(string path
)
586 throw new ArgumentNullException("path");
588 StreamReader sr
= new StreamReader(path
, OptionDefaultStreamEncoding
);
589 Encoding encoding
= DetectEncoding(sr
);
595 /// Detects the encoding of an HTML text.
597 /// <param name="html">The input html text. May not be null.</param>
598 /// <returns>The detected encoding.</returns>
599 public Encoding
DetectEncodingHtml(string html
)
603 throw new ArgumentNullException("html");
605 StringReader sr
= new StringReader(html
);
606 Encoding encoding
= DetectEncoding(sr
);
612 /// Detects the encoding of an HTML text provided on a TextReader.
614 /// <param name="reader">The TextReader used to feed the HTML. May not be null.</param>
615 /// <returns>The detected encoding.</returns>
616 public Encoding
DetectEncoding(TextReader reader
)
620 throw new ArgumentNullException("reader");
622 _onlyDetectEncoding
= true;
623 if (OptionCheckSyntax
)
625 _openednodes
= new Hashtable();
632 if (OptionUseIdAttribute
)
634 _nodesid
= new Hashtable();
641 StreamReader sr
= reader
as StreamReader
;
644 _streamencoding
= sr
.CurrentEncoding
;
645 _text
= new ImplStreamAsArray (sr
);
649 _streamencoding
= null;
650 // Expensive, but cannot avoid since TextReader doesn't have any length of the underlying data
651 _text
= new DummyStreamAsArray (reader
.ReadToEnd());
653 _declaredencoding
= null;
655 // SLIM: _text = reader.ReadToEnd();
656 _documentnode
= CreateNode(HtmlNodeType
.Document
, 0);
658 // this is a hack, but it allows us not to muck with the original parsing code
663 catch(EncodingFoundException ex
)
672 /// Loads an HTML document from a stream.
674 /// <param name="stream">The input stream.</param>
675 public void Load(Stream stream
)
677 Load(new StreamReader(stream
, OptionDefaultStreamEncoding
));
681 /// Loads an HTML document from a stream.
683 /// <param name="stream">The input stream.</param>
684 /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
685 public void Load(Stream stream
, bool detectEncodingFromByteOrderMarks
)
687 Load(new StreamReader(stream
, detectEncodingFromByteOrderMarks
));
691 /// Loads an HTML document from a stream.
693 /// <param name="stream">The input stream.</param>
694 /// <param name="encoding">The character encoding to use.</param>
695 public void Load(Stream stream
, Encoding encoding
)
697 Load(new StreamReader(stream
, encoding
));
701 /// Loads an HTML document from a stream.
703 /// <param name="stream">The input stream.</param>
704 /// <param name="encoding">The character encoding to use.</param>
705 /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
706 public void Load(Stream stream
, Encoding encoding
, bool detectEncodingFromByteOrderMarks
)
708 Load(new StreamReader(stream
, encoding
, detectEncodingFromByteOrderMarks
));
712 /// Loads an HTML document from a stream.
714 /// <param name="stream">The input stream.</param>
715 /// <param name="encoding">The character encoding to use.</param>
716 /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
717 /// <param name="buffersize">The minimum buffer size.</param>
718 public void Load(Stream stream
, Encoding encoding
, bool detectEncodingFromByteOrderMarks
, int buffersize
)
720 Load(new StreamReader(stream
, encoding
, detectEncodingFromByteOrderMarks
, buffersize
));
724 /// Loads an HTML document from a file.
726 /// <param name="path">The complete file path to be read. May not be null.</param>
727 public void Load(string path
)
731 throw new ArgumentNullException("path");
733 StreamReader sr
= new StreamReader(path
, OptionDefaultStreamEncoding
);
739 /// Loads an HTML document from a file.
741 /// <param name="path">The complete file path to be read. May not be null.</param>
742 /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
743 public void Load(string path
, bool detectEncodingFromByteOrderMarks
)
747 throw new ArgumentNullException("path");
749 StreamReader sr
= new StreamReader(path
, detectEncodingFromByteOrderMarks
);
755 /// Loads an HTML document from a file.
757 /// <param name="path">The complete file path to be read. May not be null.</param>
758 /// <param name="encoding">The character encoding to use. May not be null.</param>
759 public void Load(string path
, Encoding encoding
)
763 throw new ArgumentNullException("path");
765 if (encoding
== null)
767 throw new ArgumentNullException("encoding");
769 StreamReader sr
= new StreamReader(path
, encoding
);
775 /// Loads an HTML document from a file.
777 /// <param name="path">The complete file path to be read. May not be null.</param>
778 /// <param name="encoding">The character encoding to use. May not be null.</param>
779 /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
780 public void Load(string path
, Encoding encoding
, bool detectEncodingFromByteOrderMarks
)
784 throw new ArgumentNullException("path");
786 if (encoding
== null)
788 throw new ArgumentNullException("encoding");
790 StreamReader sr
= new StreamReader(path
, encoding
, detectEncodingFromByteOrderMarks
);
796 /// Loads an HTML document from a file.
798 /// <param name="path">The complete file path to be read. May not be null.</param>
799 /// <param name="encoding">The character encoding to use. May not be null.</param>
800 /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
801 /// <param name="buffersize">The minimum buffer size.</param>
802 public void Load(string path
, Encoding encoding
, bool detectEncodingFromByteOrderMarks
, int buffersize
)
806 throw new ArgumentNullException("path");
808 if (encoding
== null)
810 throw new ArgumentNullException("encoding");
812 StreamReader sr
= new StreamReader(path
, encoding
, detectEncodingFromByteOrderMarks
, buffersize
);
818 /// Loads the HTML document from the specified string.
820 /// <param name="html">String containing the HTML document to load. May not be null.</param>
821 public void LoadHtml(string html
)
825 throw new ArgumentNullException("html");
827 StringReader sr
= new StringReader(html
);
833 /// Detects the encoding of an HTML document from a file first, and then loads the file.
835 /// <param name="path">The complete file path to be read.</param>
836 public void DetectEncodingAndLoad(string path
)
838 DetectEncodingAndLoad(path
, true);
842 /// Detects the encoding of an HTML document from a file first, and then loads the file.
844 /// <param name="path">The complete file path to be read. May not be null.</param>
845 /// <param name="detectEncoding">true to detect encoding, false otherwise.</param>
846 public void DetectEncodingAndLoad(string path
, bool detectEncoding
)
850 throw new ArgumentNullException("path");
852 System
.Text
.Encoding enc
;
855 enc
= DetectEncoding(path
);
873 /// Loads the HTML document from the specified TextReader.
875 /// <param name="reader">The TextReader used to feed the HTML data into the document. May not be null.</param>
876 public void Load(TextReader reader
)
878 // all Load methods pass down to this one
881 throw new ArgumentNullException("reader");
884 _onlyDetectEncoding
= false;
886 if (OptionCheckSyntax
)
888 _openednodes
= new Hashtable();
895 if (OptionUseIdAttribute
)
897 _nodesid
= new Hashtable();
904 StreamReader sr
= reader
as StreamReader
;
909 // trigger bom read if needed
916 _streamencoding
= sr
.CurrentEncoding
;
917 _text
= new ImplStreamAsArray (sr
);
921 _streamencoding
= null;
922 // Expensive, but cannot avoid since TextReader doesn't have any length of the underlying data
923 _text
= new DummyStreamAsArray (reader
.ReadToEnd());
925 _declaredencoding
= null;
927 // SLIM: _text = reader.ReadToEnd();
928 _documentnode
= CreateNode(HtmlNodeType
.Document
, 0);
931 if (OptionCheckSyntax
)
933 foreach(HtmlNode node
in _openednodes
.Values
)
935 if (!node
._starttag
) // already reported
941 if (OptionExtractErrorSourceText
)
943 html
= node
.OuterHtml
;
944 if (html
.Length
> OptionExtractErrorSourceTextMaxLength
)
946 html
= html
.Substring(0, OptionExtractErrorSourceTextMaxLength
);
954 HtmlParseErrorCode
.TagNotClosed
,
955 node
._line
, node
._lineposition
,
956 node
._streamposition
, html
,
957 "End tag </" + node
.Name
+ "> was not found");
960 // we don't need this anymore
961 _openednodes
.Clear();
965 internal System
.Text
.Encoding
GetOutEncoding()
967 // when unspecified, prefer the declared encoding, then the stream encoding, then the default
968 if (_declaredencoding
!= null)
970 return _declaredencoding
;
974 if (_streamencoding
!= null)
976 return _streamencoding
;
979 return OptionDefaultStreamEncoding
;
984 /// Gets the document's output encoding.
986 public System
.Text
.Encoding Encoding
990 return GetOutEncoding();
995 /// Saves the HTML document to the specified stream.
997 /// <param name="outStream">The stream to which you want to save.</param>
998 public void Save(Stream outStream
)
1000 StreamWriter sw
= new StreamWriter(outStream
, GetOutEncoding());
1005 /// Saves the HTML document to the specified stream.
1007 /// <param name="outStream">The stream to which you want to save. May not be null.</param>
1008 /// <param name="encoding">The character encoding to use. May not be null.</param>
1009 public void Save(Stream outStream
, System
.Text
.Encoding encoding
)
1011 if (outStream
== null)
1013 throw new ArgumentNullException("outStream");
1015 if (encoding
== null)
1017 throw new ArgumentNullException("encoding");
1019 StreamWriter sw
= new StreamWriter(outStream
, encoding
);
1024 /// Saves the HTML document to the specified file.
1026 /// <param name="filename">The location of the file where you want to save the document.</param>
1027 public void Save(string filename
)
1029 StreamWriter sw
= new StreamWriter(filename
, false, GetOutEncoding());
1035 /// Saves the HTML document to the specified file.
1037 /// <param name="filename">The location of the file where you want to save the document. May not be null.</param>
1038 /// <param name="encoding">The character encoding to use. May not be null.</param>
1039 public void Save(string filename
, System
.Text
.Encoding encoding
)
1041 if (filename
== null)
1043 throw new ArgumentNullException("filename");
1045 if (encoding
== null)
1047 throw new ArgumentNullException("encoding");
1049 StreamWriter sw
= new StreamWriter(filename
, false, encoding
);
1055 /// Saves the HTML document to the specified StreamWriter.
1057 /// <param name="writer">The StreamWriter to which you want to save.</param>
1058 public void Save(StreamWriter writer
)
1060 Save((TextWriter
)writer
);
1064 /// Saves the HTML document to the specified TextWriter.
1066 /// <param name="writer">The TextWriter to which you want to save. May not be null.</param>
1067 public void Save(TextWriter writer
)
1071 throw new ArgumentNullException("writer");
1073 DocumentNode
.WriteTo(writer
);
1077 /// Saves the HTML document to the specified XmlWriter.
1079 /// <param name="writer">The XmlWriter to which you want to save.</param>
1080 public void Save(XmlWriter writer
)
1082 DocumentNode
.WriteTo(writer
);
1087 /// Creates a new XPathNavigator object for navigating this HTML document.
1089 /// <returns>An XPathNavigator object. The XPathNavigator is positioned on the root of the document.</returns>
1090 public XPathNavigator
CreateNavigator()
1092 return new HtmlNodeNavigator(this, _documentnode
);
1095 internal void SetIdForNode(HtmlNode node
, string id
)
1097 if (!OptionUseIdAttribute
)
1102 if ((_nodesid
== null) || (id
== null))
1109 _nodesid
.Remove(id
.ToLower());
1113 _nodesid
[id
.ToLower()] = node
;
1118 /// Gets the HTML node with the specified 'id' attribute value.
1120 /// <param name="id">The attribute id to match. May not be null.</param>
1121 /// <returns>The HTML node with the matching id or null if not found.</returns>
1122 public HtmlNode
GetElementbyId(string id
)
1126 throw new ArgumentNullException("id");
1128 if (_nodesid
== null)
1130 throw new Exception(HtmlExceptionUseIdAttributeFalse
);
1133 return _nodesid
[id
.ToLower()] as HtmlNode
;
1137 /// Creates an HTML element node with the specified name.
1139 /// <param name="name">The qualified name of the element. May not be null.</param>
1140 /// <returns>The new HTML node.</returns>
1141 public HtmlNode
CreateElement(string name
)
1145 throw new ArgumentNullException("name");
1147 HtmlNode node
= CreateNode(HtmlNodeType
.Element
);
1153 /// Creates an HTML comment node.
1155 /// <returns>The new HTML comment node.</returns>
1156 public HtmlCommentNode
CreateComment()
1158 return (HtmlCommentNode
)CreateNode(HtmlNodeType
.Comment
);
1162 /// Creates an HTML comment node with the specified comment text.
1164 /// <param name="comment">The comment text. May not be null.</param>
1165 /// <returns>The new HTML comment node.</returns>
1166 public HtmlCommentNode
CreateComment(string comment
)
1168 if (comment
== null)
1170 throw new ArgumentNullException("comment");
1172 HtmlCommentNode c
= CreateComment();
1173 c
.Comment
= comment
;
1178 /// Creates an HTML text node.
1180 /// <returns>The new HTML text node.</returns>
1181 public HtmlTextNode
CreateTextNode()
1183 return (HtmlTextNode
)CreateNode(HtmlNodeType
.Text
);
1187 /// Creates an HTML text node with the specified text.
1189 /// <param name="text">The text of the node. May not be null.</param>
1190 /// <returns>The new HTML text node.</returns>
1191 public HtmlTextNode
CreateTextNode(string text
)
1195 throw new ArgumentNullException("text");
1197 HtmlTextNode t
= CreateTextNode();
1202 internal HtmlNode
CreateNode(HtmlNodeType type
)
1204 return CreateNode(type
, -1);
1207 internal HtmlNode
CreateNode(HtmlNodeType type
, int index
)
1211 case HtmlNodeType
.Comment
:
1212 return new HtmlCommentNode(this, index
);
1214 case HtmlNodeType
.Text
:
1215 return new HtmlTextNode(this, index
);
1218 return new HtmlNode(type
, this, index
);
1222 internal HtmlAttribute
CreateAttribute()
1224 return new HtmlAttribute(this);
1228 /// Creates an HTML attribute with the specified name.
1230 /// <param name="name">The name of the attribute. May not be null.</param>
1231 /// <returns>The new HTML attribute.</returns>
1232 public HtmlAttribute
CreateAttribute(string name
)
1236 throw new ArgumentNullException("name");
1238 HtmlAttribute att
= CreateAttribute();
1244 /// Creates an HTML attribute with the specified name.
1246 /// <param name="name">The name of the attribute. May not be null.</param>
1247 /// <param name="value">The value of the attribute.</param>
1248 /// <returns>The new HTML attribute.</returns>
1249 public HtmlAttribute
CreateAttribute(string name
, string value)
1253 throw new ArgumentNullException("name");
1255 HtmlAttribute att
= CreateAttribute(name
);
1261 /// Gets the root node of the document.
1263 public HtmlNode DocumentNode
1267 return _documentnode
;
1272 /// Gets the document CRC32 checksum if OptionComputeChecksum was set to true before parsing, 0 otherwise.
1284 return (int)_crc32
.CheckSum
;
1289 public bool StreamMode
1297 _streammode
= value;
1301 private HtmlParseError
AddError(
1302 HtmlParseErrorCode code
,
1309 HtmlParseError err
= new HtmlParseError(code
, line
, linePosition
, streamPosition
, sourceText
, reason
);
1310 _parseerrors
.Add(err
);
1314 private enum ParseState
1322 AttributeBeforeEquals
,
1323 AttributeAfterEquals
,
1326 QuotedAttributeValue
,
1332 private void IncrementPosition()
1336 // REVIEW: should we add some checksum code in DecrementPosition too?
1337 _crc32
.AddToCRC32(_c
);
1341 _maxlineposition
= _lineposition
;
1353 private void DecrementPosition()
1356 if (_lineposition
== 1)
1358 _lineposition
= _maxlineposition
;
// Parse: main parsing routine. The visible code resets the per-parse bookkeeping,
// sizes the document node to the whole input (_text.FullLength), opens an initial
// text node at offset 0 and then loops over the input, dispatching on ParseState
// until _stop_parsing is set or _text.Eof(_index) reports the end. After the loop
// it closes any pending node name and node.
// NOTE(review): this excerpt omits lines of the original method (braces, the switch
// header, else branches, break/continue statements), so the exact branch structure
// between the visible statements must be confirmed against the complete source.
1367 private void Parse()
1370 if (OptionComputeChecksum
)
1372 _crc32
= new Crc32();
// Fresh bookkeeping for this parse run.
1375 _lastnodes
= new Hashtable();
1377 _fullcomment
= false;
1378 _parseerrors
= new ArrayList();
1381 _maxlineposition
= 1;
1383 _state
= ParseState
.Text
;
// The document node covers the full length of the input.
1385 _documentnode
._innerlength
= _text
.FullLength
;
1386 _documentnode
._outerlength
= _text
.FullLength
;
1388 _lastparentnode
= _documentnode
;
1389 _currentnode
= CreateNode(HtmlNodeType
.Text
, 0);
1390 _currentattribute
= null;
// Open the initial text node at offset 0.
1393 PushNodeStart(HtmlNodeType
.Text
, 0);
1394 // SLIM: while (_index<_text.Length)
// Main loop: ends at end of input or once _stop_parsing has been set
// (it is assigned from the negated ReportNode result elsewhere in this file).
1395 while (! _stop_parsing
&& ! _text
.Eof (_index
))
1398 IncrementPosition();
1402 case ParseState
.Text
:
1407 case ParseState
.WhichTag
:
// The node name start is recorded with starttag=false at _index in one branch and
// with starttag=true at _index-1 (followed by DecrementPosition) in the other;
// the condition selecting between them is not visible in this excerpt.
1412 PushNodeNameStart(false, _index
);
1416 PushNodeNameStart(true, _index
-1);
1417 DecrementPosition();
1419 _state
= ParseState
.Tag
;
1422 case ParseState
.Tag
:
// Tag: three visible exits, each ending the node name at _index-1:
// whitespace (-> BetweenAttributes), a path to EmptyTag, and a path that also ends
// the node and returns to Text with a new text node. Each re-checks _state after
// the Push* call; the action taken when it changed is not visible here.
1425 if (IsWhiteSpace(_c
))
1427 PushNodeNameEnd(_index
-1);
1428 if (_state
!= ParseState
.Tag
)
1430 _state
= ParseState
.BetweenAttributes
;
1435 PushNodeNameEnd(_index
-1);
1436 if (_state
!= ParseState
.Tag
)
1438 _state
= ParseState
.EmptyTag
;
1443 PushNodeNameEnd(_index
-1);
1444 if (_state
!= ParseState
.Tag
)
1446 PushNodeEnd(_index
, false);
1447 if (_state
!= ParseState
.Tag
)
1449 _state
= ParseState
.Text
;
1450 PushNodeStart(HtmlNodeType
.Text
, _index
);
1454 case ParseState
.BetweenAttributes
:
// BetweenAttributes: '/' or '?' leads to EmptyTag; one path ends the node and
// returns to Text; another starts an attribute name at _index-1.
1458 if (IsWhiteSpace(_c
))
1461 if ((_c
== '/') || (_c
== '?'))
1463 _state
= ParseState
.EmptyTag
;
1469 PushNodeEnd(_index
, false);
1470 if (_state
!= ParseState
.BetweenAttributes
)
1472 _state
= ParseState
.Text
;
1473 PushNodeStart(HtmlNodeType
.Text
, _index
);
1477 PushAttributeNameStart(_index
-1);
1478 _state
= ParseState
.AttributeName
;
1481 case ParseState
.EmptyTag
:
// EmptyTag: one path ends the node with close=true and returns to Text;
// another falls back to BetweenAttributes.
1487 PushNodeEnd(_index
, true);
1488 if (_state
!= ParseState
.EmptyTag
)
1490 _state
= ParseState
.Text
;
1491 PushNodeStart(HtmlNodeType
.Text
, _index
);
1494 _state
= ParseState
.BetweenAttributes
;
1497 case ParseState
.AttributeName
:
// AttributeName: the name always ends at _index-1; whitespace moves to
// AttributeBeforeEquals, another path to AttributeAfterEquals, and a third also
// ends the node and returns to Text.
1501 if (IsWhiteSpace(_c
))
1503 PushAttributeNameEnd(_index
-1);
1504 _state
= ParseState
.AttributeBeforeEquals
;
1509 PushAttributeNameEnd(_index
-1);
1510 _state
= ParseState
.AttributeAfterEquals
;
1515 PushAttributeNameEnd(_index
-1);
1516 PushNodeEnd(_index
, false);
1517 if (_state
!= ParseState
.AttributeName
)
1519 _state
= ParseState
.Text
;
1520 PushNodeStart(HtmlNodeType
.Text
, _index
);
1525 case ParseState
.AttributeBeforeEquals
:
1529 if (IsWhiteSpace(_c
))
1533 PushNodeEnd(_index
, false);
1534 if (_state
!= ParseState
.AttributeBeforeEquals
)
1536 _state
= ParseState
.Text
;
1537 PushNodeStart(HtmlNodeType
.Text
, _index
);
1542 _state
= ParseState
.AttributeAfterEquals
;
1545 // no equals, no whitespace, it's a new attribute starting
1546 _state
= ParseState
.BetweenAttributes
;
1547 DecrementPosition();
1550 case ParseState
.AttributeAfterEquals
:
// AttributeAfterEquals: a single or double quote starts a quoted value at _index;
// another path starts an unquoted value at _index-1.
1554 if (IsWhiteSpace(_c
))
1557 if ((_c
== '\'') || (_c
== '"'))
1559 _state
= ParseState
.QuotedAttributeValue
;
1560 PushAttributeValueStart(_index
);
1566 PushNodeEnd(_index
, false);
1567 if (_state
!= ParseState
.AttributeAfterEquals
)
1569 _state
= ParseState
.Text
;
1570 PushNodeStart(HtmlNodeType
.Text
, _index
);
1573 PushAttributeValueStart(_index
-1);
1574 _state
= ParseState
.AttributeValue
;
1577 case ParseState
.AttributeValue
:
// AttributeValue (unquoted): whitespace ends the value at _index-1 and returns to
// BetweenAttributes; another path ends both the value and the node.
1581 if (IsWhiteSpace(_c
))
1583 PushAttributeValueEnd(_index
-1);
1584 _state
= ParseState
.BetweenAttributes
;
1590 PushAttributeValueEnd(_index
-1);
1591 PushNodeEnd(_index
, false);
1592 if (_state
!= ParseState
.AttributeValue
)
1594 _state
= ParseState
.Text
;
1595 PushNodeStart(HtmlNodeType
.Text
, _index
);
1600 case ParseState
.QuotedAttributeValue
:
// The value ends when the current character equals lastquote
// (where lastquote is assigned is not visible in this excerpt).
1601 if (_c
== lastquote
)
1603 PushAttributeValueEnd(_index
-1);
1604 _state
= ParseState
.BetweenAttributes
;
1609 //SLIM: if (_index<_text.Length)
1610 if (!_text
.Eof (_index
))
// A '%' at _index switches to ServerSideCode.
1612 if (_text
[_index
] == '%')
1615 _state
= ParseState
.ServerSideCode
;
1622 case ParseState
.Comment
:
// The two characters at _index-2 and _index-3 are tested against '-';
// NOTE(review): presumably this only applies to "<!--" style comments
// (_fullcomment) - the guarding condition is not visible here.
1627 if ((_text
[_index
-2] != '-') ||
1628 (_text
[_index
-3] != '-'))
1633 PushNodeEnd(_index
, false);
1634 _state
= ParseState
.Text
;
1635 PushNodeStart(HtmlNodeType
.Text
, _index
);
1640 case ParseState
.ServerSideCode
:
1643 //SLIM: if (_index<_text.Length)
1644 if (! _text
.Eof (_index
))
// A '>' at _index is checked, then the state to resume is chosen through nested
// case labels; the expression being switched on is not visible in this excerpt.
1646 if (_text
[_index
] == '>')
1650 case ParseState
.AttributeAfterEquals
:
1651 _state
= ParseState
.AttributeValue
;
1654 case ParseState
.BetweenAttributes
:
1655 PushAttributeNameEnd(_index
+1);
1656 _state
= ParseState
.BetweenAttributes
;
1663 IncrementPosition();
1669 // handle <script>a="</script>"</script>
1670 case ParseState
.PcDataQuote
:
// Leave the quoted section when the opening quote character is seen again and the
// character at _index-2 is not a backslash.
1671 if ((_c
== _pcdata_quote_char
) && (_text
[_index
- 2] != '\\')) {
1672 _pcdata_quote_char
= '\0';
1673 _state
= ParseState
.PcData
;
1677 case ParseState
.PcData
:
1678 Debug ("PCDATA " + _currentnode
.Name
+ " " + _text
.Substring(_index
-1, _currentnode
._namelength
+2));
// A quote character is remembered and switches to PcDataQuote.
1679 if (_c
== '\"' || _c
== '\''){
1680 _pcdata_quote_char
= _c
;
1681 _state
= ParseState
.PcDataQuote
;
1684 // look for </tag + 1 char
1687 //SLIM: if ((_currentnode._namelength+3)<=(_text.Length-(_index-1)))
1688 if (! _text
.Eof (_currentnode
._namelength
+ _index
+ 1))
// Case-insensitive comparison of the text at _index-1 with "</" + current node name.
1690 if (string.Compare(_text
.Substring(_index
-1, _currentnode
._namelength
+2),
1691 "</" + _currentnode
.Name
, true) == 0)
// The character right after "</name" must be '>' or whitespace.
1693 int c
= _text
[_index
-1 + 2 + _currentnode
.Name
.Length
];
1694 if ((c
== '>') || (IsWhiteSpace(c
)))
1696 // add the script as a text node
1697 HtmlNode script
= CreateNode(HtmlNodeType
.Text
,
1698 _currentnode
._outerstartindex
+ _currentnode
._outerlength
);
1699 script
._outerlength
= _index
-1 - script
._outerstartindex
;
// In stream mode the callback may request that parsing stops.
1700 if (_streammode
&& ReportNode
!= null)
1701 _stop_parsing
= ! ReportNode (script
);
1703 _currentnode
.AppendChild(script
);
1704 Debug ("Found script: [" + script
.InnerText
+ "]");
// Start the closing element node at the position of "</".
1706 PushNodeStart(HtmlNodeType
.Element
, _index
-1);
1707 PushNodeNameStart(false, _index
-1 +2);
1708 _state
= ParseState
.Tag
;
1709 IncrementPosition();
1717 // finish the current work
1718 if (_currentnode
._namestartindex
> 0)
1720 PushNodeNameEnd(_index
);
1722 PushNodeEnd(_index
, false);
1724 // we don't need this anymore
// NewCheck: inspects the input at _index for the start of a new construct.
// Visible behavior: if the character at _index is '%', the start of an attribute
// value, attribute name or node name is recorded at _index-1 (chosen through
// ParseState case labels) and the state becomes ServerSideCode. On the other
// visible path the current node is ended at _index-1 with close=true, the state
// becomes WhichTag, and either a Comment node (when the character at _index is '!')
// or an Element node is started at _index-1.
// NOTE(review): the initial guard, the expression being switched on and the bool
// return statements are not visible in this excerpt - confirm against the full source.
1728 private bool NewCheck()
1734 //SLIM: if (_index<_text.Length)
1735 if (! _text
.Eof (_index
))
1737 if (_text
[_index
] == '%')
1741 case ParseState
.AttributeAfterEquals
:
1742 PushAttributeValueStart(_index
-1);
1745 case ParseState
.BetweenAttributes
:
1746 PushAttributeNameStart(_index
-1);
1749 case ParseState
.WhichTag
:
1750 PushNodeNameStart(true, _index
-1);
1751 _state
= ParseState
.Tag
;
1755 _state
= ParseState
.ServerSideCode
;
// End the node in progress just before the current position.
1760 PushNodeEnd(_index
-1, true);
1761 _state
= ParseState
.WhichTag
;
1762 //SLIM: if ((_index-1) <= (_text.Length-2))
1763 if (!_text
.Eof (_index
))
// '!' at _index starts a Comment node whose name spans [_index, _index+1).
1765 if (_text
[_index
] == '!')
1767 PushNodeStart(HtmlNodeType
.Comment
, _index
-1);
1768 PushNodeNameStart(true, _index
);
1769 PushNodeNameEnd(_index
+1);
1770 _state
= ParseState
.Comment
;
1771 //SLIM: if (_index<(_text.Length-2))
1772 if (! _text
.Eof (_index
+ 2))
// "--" right after the '!' marks a full comment (_fullcomment = true).
1774 if ((_text
[_index
+1] == '-') &&
1775 (_text
[_index
+2] == '-'))
1777 _fullcomment
= true;
1781 _fullcomment
= false;
// Not a comment: start an Element node at _index-1.
1787 PushNodeStart(HtmlNodeType
.Element
, _index
-1);
// ReadDocumentEncoding: examines a node for a declared document encoding.
// Does nothing visible unless OptionReadEncoding is set. For a "meta" node whose
// http-equiv attribute equals "content-type" (case-insensitive) it reads the
// "charset" value from the content attribute, stores the resulting Encoding in
// _declaredencoding, throws EncodingFoundException when _onlyDetectEncoding is set,
// and otherwise reports a CharsetMismatch error when the declared and stream
// encodings have different WindowsCodePage values.
// NOTE(review): this excerpt omits lines (braces, returns, a probable null check on
// the http-equiv attribute, the error-reporting call name) - confirm the nesting
// against the full source.
1791 private void ReadDocumentEncoding(HtmlNode node
)
1793 if (!OptionReadEncoding
)
1796 // <meta http-equiv="content-type" content="text/html;charset=iso-8859-1" />
1798 // when we append a child, we are in node end, so attributes are already populated
1799 if (node
._namelength
== 4) // quick check, avoids string alloc
1801 // only these nodes can occur before meta
1802 // if we started seeing any other node, we will never see a meta node
// NOTE(review): "head" is compared twice below; the second comparison is redundant.
// NOTE(review): if this test is nested inside the length-4 check above (braces are
// not visible here), the "script", "style" and "title" comparisons can never
// differ in outcome for a 4-character name - confirm the intended scope.
1803 if (node
.NodeType
== HtmlNodeType
.Element
&&
1804 (node
.Name
!= "head" && node
.Name
!= "script" &&
1805 node
.Name
!= "style" && node
.Name
!= "title" &&
1806 node
.Name
!= "head" && node
.Name
!= "link" &&
1807 node
.Name
!= "html" && node
.Name
!= "meta")) {
// No meta declaration can follow: clear the declared encoding and, in
// detect-only mode, signal "nothing found" through the exception.
1808 _declaredencoding
= null;
1809 if (_onlyDetectEncoding
)
1810 throw new EncodingFoundException (null);
1813 // FIXME: Should also handle declaredencoding mismatch with detected
1814 // encoding, as done below. None of the current filters run in error
1815 // detection mode currently, so its not needed now.
1817 else if (node
.Name
== "meta") // all nodes names are lowercase
1819 HtmlAttribute att
= node
.Attributes
["http-equiv"];
1822 if (string.Compare(att
.Value
, "content-type", true) == 0)
1824 HtmlAttribute content
= node
.Attributes
["content"];
1825 if (content
!= null)
1827 string charset
= NameValuePairList
.GetNameValuePairsValue(content
.Value
, "charset");
1828 if (charset
!= null)
// NOTE(review): Encoding.GetEncoding throws ArgumentException for names it does
// not recognize; no handler is visible in this excerpt.
1830 _declaredencoding
= Encoding
.GetEncoding(charset
);
1831 if (_onlyDetectEncoding
)
1833 throw new EncodingFoundException(_declaredencoding
);
// Compare the declared encoding with the stream encoding, when one is known.
1836 if (_streamencoding
!= null)
1838 if (_declaredencoding
.WindowsCodePage
!= _streamencoding
.WindowsCodePage
)
1841 HtmlParseErrorCode
.CharsetMismatch
,
1842 _line
, _lineposition
,
1843 _index
, node
.OuterHtml
,
1844 "Encoding mismatch between StreamEncoding: " +
1845 _streamencoding
.WebName
+ " and DeclaredEncoding: " + _declaredencoding
.WebName
);
/// <summary>
/// Begins a new attribute whose name starts at the given offset.
/// </summary>
/// <param name="index">Offset of the first character of the attribute name; it is also stored as the attribute's stream position.</param>
private void PushAttributeNameStart(int index)
{
    _currentattribute = CreateAttribute();

    // Source location, copied from the parser's current line counters.
    _currentattribute._lineposition = _lineposition;
    _currentattribute._line = _line;

    // The name offset doubles as the stream position.
    _currentattribute._streamposition = index;
    _currentattribute._namestartindex = index;
}
/// <summary>
/// Finishes the name of the attribute in progress and attaches the attribute to the current node.
/// </summary>
/// <param name="index">Offset just past the last character of the attribute name.</param>
private void PushAttributeNameEnd(int index)
{
    int nameStart = _currentattribute._namestartindex;
    _currentattribute._namelength = index - nameStart;

    // The attribute joins the node as soon as its name is complete.
    _currentnode.Attributes.Append(_currentattribute);
}
/// <summary>
/// Records where the value of the attribute in progress begins.
/// </summary>
/// <param name="index">Offset stored as the start of the attribute value.</param>
private void PushAttributeValueStart(int index)
{
    this._currentattribute._valuestartindex = index;
}
/// <summary>
/// Finishes the value of the attribute in progress by computing its length.
/// </summary>
/// <param name="index">Offset just past the last character of the attribute value.</param>
private void PushAttributeValueEnd(int index)
{
    int valueStart = _currentattribute._valuestartindex;
    _currentattribute._valuelength = index - valueStart;
}
/// <summary>
/// Creates a node of the given type and makes it the current node.
/// </summary>
/// <param name="type">Type of node to create.</param>
/// <param name="index">Offset passed to CreateNode and stored as the node's stream position.</param>
private void PushNodeStart(HtmlNodeType type, int index)
{
    _currentnode = CreateNode(type, index);
    _currentnode._streamposition = index;
    _currentnode._line = _line;

    // Element nodes are recorded one column before the parser's current line position.
    if (type == HtmlNodeType.Element)
    {
        _currentnode._lineposition = _lineposition - 1;
    }
    else
    {
        _currentnode._lineposition = _lineposition;
    }
}
// PushNodeEnd: finishes the current node at the given offset.
//   index - offset used to compute the node's outer length.
//   close - when true the node is treated as closed (see the final test below).
// Visible behavior: computes _outerlength, reports the node to ReportNode in stream
// mode, appends text/comment nodes with a non-zero length to _lastparentnode, and
// for start tags appends the node, checks it for an encoding declaration, records
// it in _lastnodes, makes it the new parent and switches to PcData for CDATA
// elements.
// NOTE(review): this excerpt omits lines (braces, else branches, the statements
// guarded by the last few conditions), so the nesting must be confirmed against
// the full source.
1893 private void PushNodeEnd(int index
, bool close
)
1895 _currentnode
._outerlength
= index
- _currentnode
._outerstartindex
;
1897 //SLIM: inform caller
// The callback's negated result decides whether parsing stops.
1898 if (_streammode
&& ReportNode
!= null)
1899 _stop_parsing
= ! ReportNode (_currentnode
);
// Debug trace of what was just ended.
1902 if (_currentnode
._nodetype
== HtmlNodeType
.Text
)
1903 Debug ("Text:" + _currentnode
.InnerText
);
1905 Debug ((_currentnode
.StartTag
? "Start-" : "End-") + _currentnode
.Name
);
// Text and comment nodes: inner span equals outer span.
1907 if ((_currentnode
._nodetype
== HtmlNodeType
.Text
) ||
1908 (_currentnode
._nodetype
== HtmlNodeType
.Comment
))
1910 // forget about void nodes
1911 if (_currentnode
._outerlength
>0)
1913 _currentnode
._innerlength
= _currentnode
._outerlength
;
1914 _currentnode
._innerstartindex
= _currentnode
._outerstartindex
;
1915 // SLIM: no need to append child in stream mode
1916 // SLIM: whatever the caller needs to do, tell it to do now
1917 if (!_streammode
&& _lastparentnode
!= null)
1919 _lastparentnode
.AppendChild(_currentnode
);
// Start tags that are not already the current parent.
1925 if ((_currentnode
._starttag
) && (_lastparentnode
!= _currentnode
))
1927 // add to parent node
1928 // SLIM: no need to append child in stream mode
1929 // SLIM: whatever the caller needs to do, tell it to do now
1930 if (!_streammode
&& _lastparentnode
!= null)
1932 _lastparentnode
.AppendChild(_currentnode
);
1935 ReadDocumentEncoding(_currentnode
);
1937 // remember last node of this kind
1938 // SLIM: we still to store _currentnode to help other tags in the same level
// _lastnodes maps a node name to the most recent node with that name; the previous
// entry is chained through _prevwithsamename.
1939 HtmlNode prev
= (HtmlNode
)_lastnodes
[_currentnode
.Name
];
1940 _currentnode
._prevwithsamename
= prev
;
1941 _lastnodes
[_currentnode
.Name
] = _currentnode
;
// Document and element nodes become the parent for what follows.
1944 if ((_currentnode
.NodeType
== HtmlNodeType
.Document
) ||
1945 (_currentnode
.NodeType
== HtmlNodeType
.Element
))
1947 _lastparentnode
= _currentnode
;
// CDATA elements switch the parser to PcData.
1950 if (HtmlNode
.IsCDataElement(CurrentNodeName()))
1952 _state
= ParseState
.PcData
;
// NOTE(review): the statement guarded by this test is not visible here;
// presumably it forces the node to be closed - confirm.
1956 if ((HtmlNode
.IsClosedElement(_currentnode
.Name
)) ||
1957 (HtmlNode
.IsEmptyElement(_currentnode
.Name
)))
// Closing path: taken for explicit close requests and for end tags.
1964 if ((close
) || (!_currentnode
._starttag
))
// Text and comment nodes are dropped as current node once handled.
1967 if ((_currentnode
._nodetype
== HtmlNodeType
.Text
) ||
1968 (_currentnode
._nodetype
== HtmlNodeType
.Comment
))
1969 _currentnode
= null;
/// <summary>
/// Records where the current node's name begins and whether the node is a start tag.
/// </summary>
/// <param name="starttag">Value stored in the node's start-tag flag.</param>
/// <param name="index">Offset of the first character of the node name.</param>
private void PushNodeNameStart(bool starttag, int index)
{
    this._currentnode._namestartindex = index;
    this._currentnode._starttag = starttag;
}
// GetResetters: returns the names of the "resetter" elements for the given tag
// name, i.e. the elements FixNestedTag looks for between two same-name tags before
// deciding to close the earlier one.
// Visible results: {"ul"}, {"table"} and {"tr", "table"}.
// NOTE(review): the conditions selecting each result and any fallback return are
// not visible in this excerpt; callers in this file do test the result for null.
1979 private string[] GetResetters(string name
)
1984 return new string[]{"ul"}
;
1987 return new string[]{"table"}
;
1991 return new string[]{"tr", "table"}
;
/// <summary>
/// Applies the nested-tag fix for the current node, using the resetters that belong to its lower-cased name.
/// </summary>
private void FixNestedTags()
{
    // Closing tags never trigger the fix; only start tags do.
    if (_currentnode._starttag)
    {
        string lowered = CurrentNodeName().ToLower();
        string[] resetters = GetResetters(lowered);
        FixNestedTag(lowered, resetters);
    }
}
/// <summary>
/// Closes the previous still-open node with the same name, unless a resetter node is found for it.
/// </summary>
/// <param name="name">Node name used to look up the previous node in _lastnodes.</param>
/// <param name="resetters">Names of the resetter elements; when null nothing is done.</param>
private void FixNestedTag(string name, string[] resetters)
{
    if (resetters == null)
    {
        return;
    }

    // Only a previous, still unclosed node with the same name is of interest.
    HtmlNode unclosed = (HtmlNode)_lastnodes[name];
    if (unclosed == null || unclosed.Closed)
    {
        return;
    }

    // A resetter node means the nesting is legitimate: leave it alone.
    if (FindResetterNodes(unclosed, resetters))
    {
        return;
    }

    // Close the previous node with a synthetic closer that is its own end node.
    HtmlNode closer = new HtmlNode(unclosed.NodeType, this, -1);
    closer._endnode = closer;
    unclosed.CloseNode(closer);
}
// FindResetterNodes: checks each name in `names`, in order, by calling
// FindResetterNode(node, name) and testing the result against null.
// NOTE(review): the lines before the loop and the bool return statements are not
// visible in this excerpt. Presumably a match returns true and a null `names`
// is handled before the loop - confirm against the full source.
2035 private bool FindResetterNodes(HtmlNode node
, string[] names
)
2041 for(int i
=0;i
<names
.Length
;i
++)
2043 if (FindResetterNode(node
, names
[i
]) != null)
/// <summary>
/// Looks up the last node recorded under the given name and returns it when it qualifies as a resetter for the given node.
/// </summary>
/// <param name="node">Node whose stream position the candidate is compared against.</param>
/// <param name="name">Name used to look up the candidate in _lastnodes.</param>
/// <returns>The candidate when it exists, is not closed and does not come before <paramref name="node"/> in the stream; otherwise null.</returns>
private HtmlNode FindResetterNode(HtmlNode node, string name)
{
    HtmlNode candidate = (HtmlNode)_lastnodes[name];

    // Tests run in this order and short-circuit: existence, still open, position.
    bool qualifies = candidate != null
        && !candidate.Closed
        && candidate._streamposition >= node._streamposition;

    return qualifies ? candidate : null;
}
// PushNodeNameEnd: finishes the current node's name by computing its length from
// the given end offset, then performs extra work when OptionFixNestedTags is set.
//   index - offset just past the last character of the node name.
// NOTE(review): the statement guarded by OptionFixNestedTags is not visible in this
// excerpt; presumably it runs the nested-tag fix (FixNestedTags) - confirm.
2067 private void PushNodeNameEnd(int index
)
2069 _currentnode
._namelength
= index
- _currentnode
._namestartindex
;
2070 if (OptionFixNestedTags
)
// CloseCurrentNode: handles the closing of _currentnode.
// Visible behavior: looks up the last node with the same name in _lastnodes (prev).
// On the paths where no usable opener exists it either closes the node against
// itself (closed elements such as </br>), re-parents trailing siblings, emits the
// end tag as a text node (overlappable elements), or records EndTagNotRequired /
// TagNotOpened errors. Otherwise it may record EndTagInvalidHere when
// OptionFixNestedTags finds a resetter, or pops prev from _lastnodes and closes it
// with the current node. Finally it updates _lastparentnode.
// NOTE(review): this excerpt omits lines (braces, else branches, the test on prev,
// the error-reporting call name, the assignment of foundNode), so the nesting must
// be confirmed against the full source.
2076 private void CloseCurrentNode()
2078 if (_currentnode
.Closed
) // text or document are by def closed
2083 // find last node of this kind
2084 HtmlNode prev
= (HtmlNode
)_lastnodes
[_currentnode
.Name
];
2087 if (HtmlNode
.IsClosedElement(_currentnode
.Name
))
2089 // </br> will be seen as <br>
2090 _currentnode
.CloseNode(_currentnode
);
2092 // add to parent node
2093 if (_lastparentnode
!= null)
2095 HtmlNode foundNode
= null;
// Walk the parent's children backwards, collecting them on a stack.
// NOTE(review): presumably the walk stops at the first childless node with the
// same name and stores it in foundNode - those statements are not visible here.
2096 Stack futureChild
= new Stack();
2097 for (HtmlNode node
= _lastparentnode
.LastChild
; node
!= null; node
= node
.PreviousSibling
)
2099 if ((node
.Name
== _currentnode
.Name
) && (! node
.HasChildNodes
))
2104 futureChild
.Push(node
);
// When a target was found, move the collected nodes from the parent into it.
2106 if (foundNode
!= null)
2108 HtmlNode node
= null;
2109 while(futureChild
.Count
!= 0)
2111 node
= (HtmlNode
)futureChild
.Pop();
2112 _lastparentnode
.RemoveChild(node
);
2113 foundNode
.AppendChild(node
);
2118 _lastparentnode
.AppendChild(_currentnode
);
2125 // node has no parent
2126 // node is not a closed node
2128 if (HtmlNode
.CanOverlapElement(_currentnode
.Name
))
2130 // this is a hack: add it as a text node
2131 HtmlNode closenode
= CreateNode(HtmlNodeType
.Text
, _currentnode
._outerstartindex
);
2132 closenode
._outerlength
= _currentnode
._outerlength
;
// The text of the substituted node is lower-cased.
2133 ((HtmlTextNode
)closenode
).Text
= ((HtmlTextNode
)closenode
).Text
.ToLower();
2134 if (_lastparentnode
!= null)
2136 _lastparentnode
.AppendChild(closenode
);
// Empty elements: the end tag is reported as not required.
2142 if (HtmlNode
.IsEmptyElement(_currentnode
.Name
))
2145 HtmlParseErrorCode
.EndTagNotRequired
,
2146 _currentnode
._line
, _currentnode
._lineposition
,
2147 _currentnode
._streamposition
, _currentnode
.OuterHtml
,
2148 "End tag </" + _currentnode
.Name
+ "> is not required");
2152 // node cannot overlap, node is not empty
2154 HtmlParseErrorCode
.TagNotOpened
,
2155 _currentnode
._line
, _currentnode
._lineposition
,
2156 _currentnode
._streamposition
, _currentnode
.OuterHtml
,
2157 "Start tag <" + _currentnode
.Name
+ "> was not found");
// With nested-tag fixing enabled, a resetter found for prev makes this end tag invalid.
2165 if (OptionFixNestedTags
)
2167 if (FindResetterNodes(prev
, GetResetters(_currentnode
.Name
)))
2170 HtmlParseErrorCode
.EndTagInvalidHere
,
2171 _currentnode
._line
, _currentnode
._lineposition
,
2172 _currentnode
._streamposition
, _currentnode
.OuterHtml
,
2173 "End tag </" + _currentnode
.Name
+ "> invalid here");
// Pop prev from the per-name chain and close it with the current (end) node.
2180 _lastnodes
[_currentnode
.Name
] = prev
._prevwithsamename
;
2181 prev
.CloseNode(_currentnode
);
2186 // we close this node, get grandparent
2189 if ((_lastparentnode
!= null) &&
2190 ((!HtmlNode
.IsClosedElement(_currentnode
.Name
)) ||
2191 (_currentnode
._starttag
)))
2193 UpdateLastParentNode();
/// <summary>
/// Moves _lastparentnode up through its ancestors until it reaches one that is not closed, falling back to the document node when the chain runs out.
/// </summary>
internal void UpdateLastParentNode()
{
    // First step is unconditional on purpose: _lastparentnode is dereferenced without a null check.
    if (_lastparentnode.Closed)
    {
        _lastparentnode = _lastparentnode.ParentNode;
    }

    // Keep climbing while the ancestor exists and is closed.
    while (_lastparentnode != null && _lastparentnode.Closed)
    {
        _lastparentnode = _lastparentnode.ParentNode;
    }

    if (_lastparentnode == null)
    {
        _lastparentnode = _documentnode;
    }
}
/// <summary>
/// Extracts the name of the attribute in progress from the source text.
/// </summary>
/// <returns>The substring of _text covered by the attribute's name start and name length.</returns>
private string CurrentAttributeName()
{
    int start = _currentattribute._namestartindex;
    int length = _currentattribute._namelength;
    return _text.Substring(start, length);
}
/// <summary>
/// Extracts the value of the attribute in progress from the source text.
/// </summary>
/// <returns>The substring of _text covered by the attribute's value start and value length.</returns>
private string CurrentAttributeValue()
{
    int start = _currentattribute._valuestartindex;
    int length = _currentattribute._valuelength;
    return _text.Substring(start, length);
}
/// <summary>
/// Extracts the name of the current node from the source text.
/// </summary>
/// <returns>The substring of _text covered by the node's name start and name length.</returns>
private string CurrentNodeName()
{
    int start = _currentnode._namestartindex;
    int length = _currentnode._namelength;
    return _text.Substring(start, length);
}
/// <summary>
/// Extracts the outer text of the current node from the source text.
/// </summary>
/// <returns>The substring of _text covered by the node's outer start and outer length.</returns>
private string CurrentNodeOuter()
{
    int start = _currentnode._outerstartindex;
    int length = _currentnode._outerlength;
    return _text.Substring(start, length);
}
/// <summary>
/// Extracts the inner text of the current node from the source text.
/// </summary>
/// <returns>The substring of _text covered by the node's inner start and inner length.</returns>
private string CurrentNodeInner()
{
    int start = _currentnode._innerstartindex;
    int length = _currentnode._innerlength;
    return _text.Substring(start, length);
}
/// <summary>
/// Determines whether the specified character code is treated as whitespace by the parser.
/// </summary>
/// <param name="c">The character code to check.</param>
/// <returns>true if the code is 9, 10, 13 or 32; otherwise false.</returns>
public static bool IsWhiteSpace(int c)
{
    switch (c)
    {
        case 9:
        case 10:
        case 13:
        case 32:
            return true;

        default:
            return false;
    }
}
2255 internal class EncodingFoundException
: Exception
2257 private Encoding _encoding
;
2259 internal EncodingFoundException(Encoding encoding
)
2261 _encoding
= encoding
;
2264 internal Encoding Encoding