1 // HtmlAgilityPack V1.0
4 Copyright (C) 2003 Simon Mourier <simonm@microsoft.com>
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions
10 1. Redistributions of source code must retain the above copyright
11 notice, this list of conditions and the following disclaimer.
12 2. Redistributions in binary form must reproduce the above copyright
13 notice, this list of conditions and the following disclaimer in the
14 documentation and/or other materials provided with the distribution.
15 3. The name of the author may not be used to endorse or promote products
16 derived from this software without specific prior written permission.
18 THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
19 IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
20 OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
21 IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
22 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
23 NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
27 THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 using System
.Xml
.Serialization
;
36 using Microsoft
.Win32
;
38 namespace HtmlAgilityPack
41 /// A utility class to get HTML document from HTTP.
/// <summary>
/// Signature of the callback stored in the PreRequest hook.
/// </summary>
/// <param name="request">The request object that is about to be executed.</param>
/// <returns>
/// A flag handed back to the caller of the hook; the private Get method
/// stops and returns HttpStatusCode.ResetContent when it is false.
/// </returns>
public delegate bool PreRequestHandler(HttpWebRequest request);
/// <summary>
/// Signature of the callback stored in the PostResponse hook.
/// </summary>
/// <param name="request">The request that was executed.</param>
/// <param name="response">The response obtained for that request.</param>
public delegate void PostResponseHandler(HttpWebRequest request, HttpWebResponse response);
/// <summary>
/// Signature of the callback stored in the PreHandleDocument hook.
/// </summary>
/// <param name="document">The document that has just been loaded.</param>
public delegate void PreHandleDocumentHandler(HtmlDocument document);
// ---- instance state ----

// Chunk size handed to SaveStream when a response is written to the cache.
private int _streamBufferSize = 1024;

// Cache configuration and per-request cache flags.
private string _cachePath;
private bool _usingCache;
private bool _fromCache;
private bool _cacheOnly;

// When set, a fresh CookieContainer is attached to the outgoing request.
private bool _useCookies;

// Elapsed Environment.TickCount delta of the last request.
private int _requestDuration;

// Forwarded to DetectEncodingAndLoad when a file:// document is loaded.
private bool _autoDetectEncoding = true;

// Outcome of the last request.
private HttpStatusCode _statusCode = HttpStatusCode.OK;
private Uri _responseUri;
/// <summary>
/// Hook invoked with the request object before it is executed.
/// </summary>
public PreRequestHandler PreRequest;

/// <summary>
/// Hook invoked with the request and its response once the request has been executed.
/// </summary>
public PostResponseHandler PostResponse;

/// <summary>
/// Hook invoked with a freshly loaded document before it is returned by Load.
/// </summary>
public PreHandleDocumentHandler PreHandleDocument;
87 /// Creates an instance of an HtmlWeb class.
/// <summary>
/// Retrieves a document from an Internet resource with the "GET" method and
/// stores it in the given file.
/// </summary>
/// <param name="url">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
/// <param name="path">The location of the file where you want to save the document.</param>
public void Get(string url, string path)
{
    // Thin convenience overload: delegate to the three-argument form.
    const string defaultMethod = "GET";
    Get(url, path, defaultMethod);
}
/// <summary>
/// Retrieves a document from an Internet resource with the given HTTP method
/// and stores it in the given file.
/// </summary>
/// <param name="url">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
/// <param name="path">The location of the file where you want to save the document.</param>
/// <param name="method">The HTTP method used to open the connection, such as GET, POST, PUT, or PROPFIND.</param>
/// <exception cref="HtmlWebException">The URL scheme is neither http nor https.</exception>
public void Get(string url, string path, string method)
{
    Uri target = new Uri(url);

    bool isWebScheme = (target.Scheme == Uri.UriSchemeHttps)
        || (target.Scheme == Uri.UriSchemeHttp);

    // Reject anything that is not plain http/https up front.
    if (!isWebScheme)
    {
        throw new HtmlWebException("Unsupported uri scheme: '" + target.Scheme + "'.");
    }

    // No in-memory document is requested here, only the file copy.
    Get(target, method, path, null);
}
/// <summary>
/// Loads an HTML document from an Internet resource using the "GET" method.
/// </summary>
/// <param name="url">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
/// <returns>A new HTML document.</returns>
public HtmlDocument Load(string url)
{
    // Thin convenience overload: delegate to Load(url, method).
    const string defaultMethod = "GET";
    return Load(url, defaultMethod);
}
/// <summary>
/// Loads an HTML document from an Internet resource (http/https) or from a file.
/// </summary>
/// <param name="url">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
/// <param name="method">The HTTP method used to open the connection, such as GET, POST, PUT, or PROPFIND. Only used for http/https URLs.</param>
/// <returns>A new HTML document.</returns>
/// <exception cref="HtmlWebException">The URL scheme is not http, https or file.</exception>
public HtmlDocument Load(string url, string method)
{
    Uri uri = new Uri(url);
    HtmlDocument doc;

    if ((uri.Scheme == Uri.UriSchemeHttps) ||
        (uri.Scheme == Uri.UriSchemeHttp))
    {
        doc = LoadUrl(uri, method);
    }
    else if (uri.Scheme == Uri.UriSchemeFile)
    {
        doc = new HtmlDocument();

        // Use the same parser options as LoadUrl. The previous code assigned
        // OptionAutoCloseOnEnd twice (false, then true), which left
        // OptionFixNestedTags unset and made the first assignment a dead store.
        doc.OptionAutoCloseOnEnd = false;
        doc.OptionFixNestedTags = true;

        // NOTE(review): the raw url string (not uri.LocalPath) is passed on, as before;
        // confirm DetectEncodingAndLoad accepts "file://" style strings.
        doc.DetectEncodingAndLoad(url, _autoDetectEncoding);
    }
    else
    {
        throw new HtmlWebException("Unsupported uri scheme: '" + uri.Scheme + "'.");
    }

    // Give the caller a chance to inspect or alter the document before it is returned.
    if (PreHandleDocument != null)
    {
        PreHandleDocument(doc);
    }

    return doc;
}
// Tells whether the file at the given path counts as HTML, judged by the
// content type that GetContentTypeForExtension reports for its extension.
private bool IsCacheHtmlContent(string path)
{
    string extension = Path.GetExtension(path);
    string contentType = GetContentTypeForExtension(extension, null);
    return IsHtmlContent(contentType);
}
// Returns true when the given content type starts with "text/html",
// ignoring case. A null content type is treated as "not HTML":
// IsCacheHtmlContent asks GetContentTypeForExtension for a null default,
// so null can legitimately arrive here and must not be dereferenced.
private bool IsHtmlContent(string contentType)
{
    if (contentType == null)
    {
        return false;
    }

    // MIME types are protocol tokens, not linguistic text: lower-case with the
    // invariant culture and compare ordinally so the result does not depend on
    // the current thread culture.
    const string htmlPrefix = "text/html";
    string lowered = contentType.ToLower(System.Globalization.CultureInfo.InvariantCulture);
    return string.CompareOrdinal(lowered, 0, htmlPrefix, 0, htmlPrefix.Length) == 0;
}
// Builds the path of the file holding the cached response headers for a URI.
// The ".h.xml" suffix is appended to the cached document's path as plain text
// (it is not joined with Path.Combine).
private string GetCacheHeadersPath(Uri uri)
{
    string documentPath = GetCachePath(uri);
    return documentPath + ".h.xml";
}
/// <summary>
/// Gets the cache file path for a specified url.
/// </summary>
/// <param name="uri">The url for which to retrieve the cache path. May not be null.</param>
/// <returns>The cache file path.</returns>
/// <exception cref="ArgumentNullException"><paramref name="uri"/> is null.</exception>
/// <exception cref="HtmlWebException">The cache is not enabled.</exception>
public string GetCachePath(Uri uri)
{
    if (uri == null)
    {
        throw new ArgumentNullException("uri");
    }
    if (!UsingCache)
    {
        throw new HtmlWebException("Cache is not enabled. Set UsingCache to true first.");
    }

    string relativePath;
    if (uri.AbsolutePath == "/")
    {
        // Root document: keep the host in the file name. Previously every
        // host's root page mapped to the same "<cache>\.htm" file, so the
        // root pages of different sites overwrote each other in the cache.
        relativePath = uri.Host + ".htm";
    }
    else
    {
        // Mirror the URL hierarchy below the cache directory, using the
        // platform's separator ('\\' on Windows, as before).
        relativePath = (uri.Host + uri.AbsolutePath).Replace('/', Path.DirectorySeparatorChar);
    }

    return Path.Combine(_cachePath, relativePath);
}
215 /// Gets a value indicating if the last document was retrieved from the cache.
// NOTE(review): the accessor body is not part of this listing; presumably a
// read-only getter over the _fromCache field — confirm.
217 public bool FromCache
226 /// Gets the URI of the Internet resource that actually responded to the request.
// NOTE(review): the accessor body is not part of this listing; presumably a
// read-only getter over _responseUri, which the private Get method assigns
// from resp.ResponseUri — confirm.
228 public Uri ResponseUri
237 /// Gets or Sets a value indicating whether to get document only from the cache.
238 /// If this is set to true and document is not found in the cache, nothing will be loaded.
// NOTE(review): the private Get method contains a "File was not found at cache path"
// HtmlWebException that appears to belong to the cache-only path; confirm whether
// "nothing will be loaded" above should instead mention that exception.
240 public bool CacheOnly
// Guard: a true value is rejected with an HtmlWebException while UsingCache is false.
// The getter and the assignment to the backing field are not part of this listing.
248 if ((value) && !UsingCache
)
250 throw new HtmlWebException("Cache is not enabled. Set UsingCache to true first.");
257 /// Gets or Sets a value indicating if cookies will be stored.
// NOTE(review): the accessor bodies are not part of this listing; presumably they
// read and write the _useCookies field — confirm.
259 public bool UseCookies
/// <summary>
/// Gets the duration of the last request, as the difference between two
/// Environment.TickCount readings (milliseconds).
/// </summary>
public int RequestDuration
{
    get { return _requestDuration; }
}
/// <summary>
/// Gets or sets whether the document encoding is detected automatically.
/// The value is forwarded to DetectEncodingAndLoad when a file is loaded.
/// </summary>
public bool AutoDetectEncoding
{
    get { return _autoDetectEncoding; }
    set { _autoDetectEncoding = value; }
}
298 /// Gets the last request status.
// NOTE(review): the accessor body is not part of this listing; presumably a
// read-only getter over _statusCode, which LoadUrl assigns from Get — confirm.
300 public HttpStatusCode StatusCode
/// <summary>
/// Gets or Sets the size of the buffer used for memory operations.
/// </summary>
/// <exception cref="ArgumentException">The assigned value is zero or negative.</exception>
public int StreamBufferSize
{
    get
    {
        return _streamBufferSize;
    }
    set
    {
        // Validate the incoming value. The previous check tested the current
        // field (always positive), so zero or negative sizes were silently
        // accepted and only failed later when the buffer was used.
        if (value <= 0)
        {
            throw new ArgumentException("Size must be greater than zero.");
        }
        _streamBufferSize = value;
    }
}
// Fetches the given URI with the given HTTP method into a new document and
// records the resulting status code in _statusCode. When the status is
// NotModified the document is instead read from the cached file.
private HtmlDocument LoadUrl(Uri uri, string method)
{
    HtmlDocument result = new HtmlDocument();
    result.OptionAutoCloseOnEnd = false;
    result.OptionFixNestedTags = true;

    // No target file: the response is parsed straight into 'result'.
    _statusCode = Get(uri, method, null, result);

    bool notModified = (_statusCode == HttpStatusCode.NotModified);
    if (notModified)
    {
        // read cached encoding
        string cachedFile = GetCachePath(uri);
        result.DetectEncodingAndLoad(cachedFile);
    }

    return result;
}
// Executes the HTTP request for 'uri' and returns the resulting status code.
// Along the way it can consult/refresh the file cache (cachePath), copy the
// cached file to 'path', run the PreRequest/PostResponse hooks, and record
// _requestDuration and _responseUri.
// NOTE(review): several control-flow lines (conditions, try/else, braces) are not
// part of this listing; the comments below describe only the visible statements.
341 private HttpStatusCode
Get(Uri uri
, string method
, string path
, HtmlDocument doc
)
343 string cachePath
= null;
345 bool oldFile
= false;
347 req
= WebRequest
.Create(uri
) as HttpWebRequest
;
// Timing starts here; the duration is later computed as a TickCount difference.
351 _requestDuration
= 0;
352 int tc
= Environment
.TickCount
;
355 cachePath
= GetCachePath(req
.RequestUri
);
356 if (File
.Exists(cachePath
))
// A cached copy exists: make the request conditional (If-Modified-Since).
// NOTE(review): the stamp used is the cache file's last *access* time, while
// SaveStream stamps the file's last *write* time from the response's LastModified
// and the copies below propagate the write time — confirm GetLastAccessTime is intended.
358 req
.IfModifiedSince
= File
.GetLastAccessTime(cachePath
);
365 if (!File
.Exists(cachePath
))
367 throw new HtmlWebException("File was not found at cache path: '" + cachePath
+ "'");
// Serve from cache: copy the cached file to 'path' and give the copy the same
// last-write time as the cached file.
372 IOLibrary
.CopyAlways(cachePath
, path
);
374 File
.SetLastWriteTime(path
, File
.GetLastWriteTime(cachePath
));
377 return HttpStatusCode
.NotModified
;
382 req
.CookieContainer
= new CookieContainer();
385 if (PreRequest
!= null)
387 // allow our user to change the request at will
// A false result from the hook aborts the call with ResetContent.
388 if (!PreRequest(req
))
390 return HttpStatusCode
.ResetContent
;
396 // foreach(Cookie cookie in req.CookieContainer.GetCookies(req.RequestUri))
398 // HtmlLibrary.Trace("Cookie " + cookie.Name + "=" + cookie.Value + " path=" + cookie.Path + " domain=" + cookie.Domain);
403 HttpWebResponse resp
;
407 resp
= req
.GetResponse() as HttpWebResponse
;
409 catch (WebException we
)
// On a WebException the duration is still recorded and the response attached
// to the exception (if any) is used from here on.
411 _requestDuration
= Environment
.TickCount
- tc
;
412 resp
= (HttpWebResponse
)we
.Response
;
// Fallback to the cached copy; the guarding conditions are not visible here
// (presumably "no response" and "oldFile" — confirm).
419 IOLibrary
.CopyAlways(cachePath
, path
);
421 File
.SetLastWriteTime(path
, File
.GetLastWriteTime(cachePath
));
423 return HttpStatusCode
.NotModified
;
430 _requestDuration
= Environment
.TickCount
- tc
;
434 // allow our user to get some info from the response
435 if (PostResponse
!= null)
437 PostResponse(req
, resp
);
440 _requestDuration
= Environment
.TickCount
- tc
;
441 _responseUri
= resp
.ResponseUri
;
443 bool html
= IsHtmlContent(resp
.ContentType
);
444 System
.Text
.Encoding respenc
;
// NOTE(review): HttpWebResponse.ContentEncoding is passed to Encoding.GetEncoding.
// Verify against the HttpWebResponse documentation whether CharacterSet (the
// charset of the body) was intended rather than ContentEncoding.
446 if ((resp
.ContentEncoding
!= null) && (resp
.ContentEncoding
.Length
>0))
448 respenc
= System
.Text
.Encoding
.GetEncoding(resp
.ContentEncoding
);
455 if (resp
.StatusCode
== HttpStatusCode
.NotModified
)
// Server confirmed the cached copy is current: hand out the cached file.
462 IOLibrary
.CopyAlways(cachePath
, path
);
464 File
.SetLastWriteTime(path
, File
.GetLastWriteTime(cachePath
));
466 return resp
.StatusCode
;
470 // this should *never* happen...
471 throw new HtmlWebException("Server has send a NotModifed code, without cache enabled.");
474 Stream s
= resp
.GetResponseStream();
479 // NOTE: LastModified does not contain milliseconds, so we remove them to the file
480 SaveStream(s
, cachePath
, RemoveMilliseconds(resp
.LastModified
), _streamBufferSize
);
// The response headers are stored next to the cached document.
483 SaveCacheHeaders(req
.RequestUri
, resp
);
487 // copy and touch the file
488 IOLibrary
.CopyAlways(cachePath
, path
);
489 File
.SetLastWriteTime(path
, File
.GetLastWriteTime(cachePath
));
494 // try to work in-memory
// The statements that load the stream into 'doc' are not part of this listing.
495 if ((doc
!= null) && (html
))
509 return resp
.StatusCode
;
// Reads one cached response header for the given request URI from the headers
// file written by SaveCacheHeaders (elements "h" with attributes "n" = header
// name and "v" = header value). Returns 'def' when the header is not present.
// The name lookup is case-insensitive for ASCII letters.
private string GetCacheHeader(Uri requestUri, string name, string def)
{
    // note: some headers are collection (ex: www-authenticate)
    // we don't handle that here
    XmlDocument doc = new XmlDocument();
    doc.Load(GetCacheHeadersPath(requestUri));

    // The XPath translate() call upper-cases only a-z, so upper-case the
    // requested name with the invariant culture to stay consistent with it
    // regardless of the current thread culture.
    string upperName = name.ToUpper(System.Globalization.CultureInfo.InvariantCulture);
    XmlNode node = doc.SelectSingleNode("//h[translate(@n, 'abcdefghijklmnopqrstuvwxyz','ABCDEFGHIJKLMNOPQRSTUVWXYZ')='" + upperName + "']");
    if (node == null)
    {
        return def;
    }

    // The value lives in the "v" attribute. The previous code indexed the
    // attributes by the header name itself, which never exists on an "h"
    // element and therefore dereferenced null.
    XmlAttribute valueAttribute = node.Attributes["v"];
    if (valueAttribute == null)
    {
        return def;
    }
    return valueAttribute.Value;
}
// Builds an XML document describing the response headers for a cached request:
// a root element "c" containing one "h" element per header, each with the
// attributes "n" and "v".
// NOTE(review): this listing shows no assignment to the "n" attribute's value and no
// statement that writes the document to 'file'; presumably those lines exist in the
// full source (att.Value = header; doc.Save(file)) — confirm.
527 private void SaveCacheHeaders(Uri requestUri
, HttpWebResponse resp
)
529 // we cache the original headers aside the cached document.
530 string file
= GetCacheHeadersPath(requestUri
);
531 XmlDocument doc
= new XmlDocument();
532 doc
.LoadXml("<c></c>");
533 XmlNode cache
= doc
.FirstChild
;
// Enumerating the header collection yields the header names.
534 foreach(string header
in resp
.Headers
)
536 XmlNode entry
= doc
.CreateElement("h");
// "n" attribute: appended to the entry first.
537 XmlAttribute att
= doc
.CreateAttribute("n");
539 entry
.Attributes
.Append(att
);
// "v" attribute: the header's value as returned by the Headers indexer.
541 att
= doc
.CreateAttribute("v");
542 att
.Value
= resp
.Headers
[header
];
543 entry
.Attributes
.Append(att
);
545 cache
.AppendChild(entry
);
// Copies 'stream' into a newly created file at 'path', reading in chunks of
// 'streamBufferSize' bytes, then sets the file's last-write time to 'touchDate'.
// NOTE(review): the write call, the declaration of 'len'/'buffer', the cleanup of the
// reader/writer/file stream and the return statement are not part of this listing;
// presumably the accumulated byte count 'len' is returned — confirm.
550 private static long SaveStream(Stream stream
, string path
, DateTime touchDate
, int streamBufferSize
)
// Clears a read-only flag on an existing target or creates its directory.
552 FilePreparePath(path
);
553 FileStream fs
= new FileStream(path
, FileMode
.Create
, FileAccess
.Write
);
554 BinaryReader br
= null;
555 BinaryWriter bw
= null;
559 br
= new BinaryReader(stream
);
560 bw
= new BinaryWriter(fs
);
// Each pass reads up to streamBufferSize bytes and adds the chunk length to 'len'.
565 buffer
= br
.ReadBytes(streamBufferSize
);
566 len
+= buffer
.Length
;
// The loop ends once a read returns an empty array.
572 while (buffer
.Length
>0);
590 File
.SetLastWriteTime(path
, touchDate
);
// Makes 'target' writable as a file path: when no file exists there yet, its
// parent directory is created if missing; when a file already exists, its
// read-only attribute is cleared.
private static void FilePreparePath(string target)
{
    if (!File.Exists(target))
    {
        string parent = Path.GetDirectoryName(target);
        if (!Directory.Exists(parent))
        {
            Directory.CreateDirectory(parent);
        }
        return;
    }

    // Existing file: strip the read-only bit, keep every other attribute.
    FileAttributes current = File.GetAttributes(target);
    FileAttributes writable = current & ~FileAttributes.ReadOnly;
    File.SetAttributes(target, writable);
}
// Returns a DateTime rebuilt from the year, month, day, hour, minute and
// second of 't'; everything below one second is dropped.
private static DateTime RemoveMilliseconds(DateTime t)
{
    int year = t.Year;
    int month = t.Month;
    int day = t.Day;
    int hour = t.Hour;
    int minute = t.Minute;
    int second = t.Second;

    return new DateTime(year, month, day, hour, minute, second);
}
617 /// Gets the path extension for a given MIME content type.
619 /// <param name="contentType">The input MIME content type.</param>
620 /// <param name="def">The default path extension to return if any error occurs.</param>
621 /// <returns>The MIME content type's path extension.</returns>
// The lookup goes through the Windows registry (HKEY_CLASSES_ROOT).
// NOTE(review): the branch body for a null/empty input, any error handling and the
// return statement are not part of this listing. As visible here, the result of
// OpenSubKey is used without a null check and the key is never closed — confirm
// how the full source handles an unknown content type.
622 public static string GetExtensionForContentType(string contentType
, string def
)
624 if ((contentType
== null) || (contentType
.Length
== 0))
631 RegistryKey reg
= Registry
.ClassesRoot
;
// Opens the read-only sub key "MIME\Database\Content Type\<contentType>".
632 reg
= reg
.OpenSubKey(@"MIME\Database\Content Type\" + contentType
, false);
// Reads the "Extension" value, falling back to 'def' when the value is absent.
633 ext
= (string)reg
.GetValue("Extension", def
);
643 /// Gets the MIME content type for a given path extension.
645 /// <param name="extension">The input path extension.</param>
646 /// <param name="def">The default content type to return if any error occurs.</param>
647 /// <returns>The path extension's MIME content type.</returns>
// The lookup goes through the Windows registry (HKEY_CLASSES_ROOT).
// NOTE(review): the branch body for a null/empty input, any error handling and the
// return statement are not part of this listing. As visible here, the result of
// OpenSubKey is used without a null check and the key is never closed — confirm
// how the full source handles an unknown extension.
648 public static string GetContentTypeForExtension(string extension
, string def
)
650 if ((extension
== null) || (extension
.Length
== 0))
657 RegistryKey reg
= Registry
.ClassesRoot
;
// Opens the read-only sub key named after the extension itself.
658 reg
= reg
.OpenSubKey(extension
, false);
// Reads the key's default (unnamed) value, falling back to 'def'.
// NOTE(review): verify that the default value really holds the MIME type; it may be
// that a named value such as "Content Type" was intended — confirm on a real registry.
659 contentType
= (string)reg
.GetValue("", def
);
669 /// Loads an HTML document from an Internet resource and saves it to the specified XmlTextWriter.
671 /// <param name="htmlUrl">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
672 /// <param name="writer">The XmlTextWriter to which you want to save.</param>
673 public void LoadHtmlAsXml(string htmlUrl
, XmlTextWriter writer
)
// Fetch and parse the document through Load(string).
// NOTE(review): the statement that writes 'doc' to 'writer' is not part of this
// listing — confirm against the full source.
675 HtmlDocument doc
= Load(htmlUrl
);
/// <summary>
/// Loads an HTML document from an Internet resource, applies an XSLT
/// transformation and writes the result to the specified XmlTextWriter.
/// </summary>
/// <param name="htmlUrl">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
/// <param name="xsltUrl">The URL that specifies the XSLT stylesheet to load.</param>
/// <param name="xsltArgs">An XsltArgumentList containing the namespace-qualified arguments used as input to the transform.</param>
/// <param name="writer">The XmlTextWriter to which you want to save.</param>
public void LoadHtmlAsXml(string htmlUrl, string xsltUrl, XsltArgumentList xsltArgs, XmlTextWriter writer)
{
    // No intermediate XML dump is requested by this overload.
    const string noXmlPath = null;
    LoadHtmlAsXml(htmlUrl, xsltUrl, xsltArgs, writer, noXmlPath);
}
692 /// Loads an HTML document from an Internet resource and saves it to the specified XmlTextWriter, after an XSLT transformation.
694 /// <param name="htmlUrl">The requested URL, such as "http://Myserver/Mypath/Myfile.asp". May not be null.</param>
695 /// <param name="xsltUrl">The URL that specifies the XSLT stylesheet to load.</param>
696 /// <param name="xsltArgs">An XsltArgumentList containing the namespace-qualified arguments used as input to the transform.</param>
697 /// <param name="writer">The XmlTextWriter to which you want to save.</param>
698 /// <param name="xmlPath">A file path where the temporary XML before transformation will be saved. Mostly used for debugging purposes.</param>
// NOTE(review): the null check guarding the throw, the condition around the xmlPath
// writer, the use/closing of 'w' and the stylesheet load are not part of this listing.
699 public void LoadHtmlAsXml(string htmlUrl
, string xsltUrl
, XsltArgumentList xsltArgs
, XmlTextWriter writer
, string xmlPath
)
703 throw new ArgumentNullException("htmlUrl");
706 HtmlDocument doc
= Load(htmlUrl
);
// Writer for the intermediate XML file, using the document's encoding.
710 XmlTextWriter w
= new XmlTextWriter(xmlPath
, doc
.Encoding
);
// A missing argument list is replaced by an empty one.
714 if (xsltArgs
== null)
716 xsltArgs
= new XsltArgumentList();
719 // add some useful variables to the xslt doc
// NOTE(review): these parameters are added to the caller's list when one was passed
// in; check the XsltArgumentList.AddParam documentation for what happens when the
// same list is reused across calls.
720 xsltArgs
.AddParam("url", "", htmlUrl
);
721 xsltArgs
.AddParam("requestDuration", "", RequestDuration
);
722 xsltArgs
.AddParam("fromCache", "", FromCache
);
724 XslTransform xslt
= new XslTransform();
// The transformation output goes to the caller-supplied writer.
726 xslt
.Transform(doc
, xsltArgs
, writer
);
/// <summary>
/// Creates an instance of the given type from the specified Internet resource,
/// without any XSLT stylesheet or arguments.
/// </summary>
/// <param name="url">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
/// <param name="type">The requested type.</param>
/// <returns>A newly created instance.</returns>
public object CreateInstance(string url, Type type)
{
    const string noStylesheet = null;
    const XsltArgumentList noArguments = null;
    return CreateInstance(url, noStylesheet, noArguments, type);
}
/// <summary>
/// Creates an instance of the given type from the specified Internet resource,
/// without saving the intermediate XML to a file.
/// </summary>
/// <param name="htmlUrl">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
/// <param name="xsltUrl">The URL that specifies the XSLT stylesheet to load.</param>
/// <param name="xsltArgs">An XsltArgumentList containing the namespace-qualified arguments used as input to the transform.</param>
/// <param name="type">The requested type.</param>
/// <returns>A newly created instance.</returns>
public object CreateInstance(string htmlUrl, string xsltUrl, XsltArgumentList xsltArgs, Type type)
{
    const string noXmlPath = null;
    return CreateInstance(htmlUrl, xsltUrl, xsltArgs, type, noXmlPath);
}
754 /// Creates an instance of the given type from the specified Internet resource.
756 /// <param name="htmlUrl">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
757 /// <param name="xsltUrl">The URL that specifies the XSLT stylesheet to load.</param>
758 /// <param name="xsltArgs">An XsltArgumentList containing the namespace-qualified arguments used as input to the transform.</param>
759 /// <param name="type">The requested type.</param>
760 /// <param name="xmlPath">A file path where the temporary XML before transformation will be saved. Mostly used for debugging purposes.</param>
761 /// <returns>A newly created instance.</returns>
// The document is rendered as XML into an in-memory string, which is then
// deserialized into 'type' with an XmlSerializer.
// NOTE(review): the conditions selecting one of the three LoadHtmlAsXml calls below,
// the declaration of 'o', the try block and the return are not part of this listing;
// presumably the choice depends on xsltUrl / xmlPath being null — confirm.
762 public object CreateInstance(string htmlUrl
, string xsltUrl
, XsltArgumentList xsltArgs
, Type type
, string xmlPath
)
764 StringWriter sw
= new StringWriter();
765 XmlTextWriter writer
= new XmlTextWriter(sw
);
768 LoadHtmlAsXml(htmlUrl
, writer
);
774 LoadHtmlAsXml(htmlUrl
, xsltUrl
, xsltArgs
, writer
);
778 LoadHtmlAsXml(htmlUrl
, xsltUrl
, xsltArgs
, writer
, xmlPath
);
// Re-read the generated XML text and deserialize it.
782 StringReader sr
= new StringReader(sw
.ToString());
783 XmlTextReader reader
= new XmlTextReader(sr
);
784 XmlSerializer serializer
= new XmlSerializer(type
);
788 o
= serializer
.Deserialize(reader
);
790 catch(InvalidOperationException ex
)
// The failure is rethrown as a plain Exception whose message contains the original
// exception text plus the generated XML; the original exception is not attached as
// an inner exception.
792 throw new Exception(ex
.ToString() + ", --- xml:" + sw
.ToString());
798 /// Gets or Sets the cache path. If null, no caching mechanism will be used.
// NOTE(review): the accessor bodies are not part of this listing; presumably they
// read and write the _cachePath field — confirm.
800 public string CachePath
813 /// Gets or Sets a value indicating whether the caching mechanisms should be used or not.
// NOTE(review): only the two conditions and the throw are part of this listing; the
// return statements and the assignment to the backing field are not visible.
815 public bool UsingCache
// First check (presumably in the getter — confirm): is a cache path configured at all?
819 if (_cachePath
== null)
// Second check uses 'value', so it belongs to the setter: the cache cannot be
// enabled while no cache path is defined.
827 if ((value) && (_cachePath
== null))
829 throw new HtmlWebException("You need to define a CachePath first.");
837 /// Represents an exception thrown by the HtmlWeb utility class.
839 public class HtmlWebException
: Exception
842 /// Creates an instance of the HtmlWebException.
844 /// <param name="message">The exception's message.</param>
845 public HtmlWebException(string message
)