Change the GC.GetTotalMemory() threshold to 10%; otherwise there are just too many...
[beagle.git] / Filters / HtmlAgilityPack / HtmlWeb.cs
blobb7a4b40c208d5b7dbf689017ad476019857e08b6
1 // HtmlAgilityPack V1.0
3 /*
4 Copyright (C) 2003 Simon Mourier <simonm@microsoft.com>
5 All rights reserved.
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions
9 are met:
10 1. Redistributions of source code must retain the above copyright
11 notice, this list of conditions and the following disclaimer.
12 2. Redistributions in binary form must reproduce the above copyright
13 notice, this list of conditions and the following disclaimer in the
14 documentation and/or other materials provided with the distribution.
15 3. The name of the author may not be used to endorse or promote products
16 derived from this software without specific prior written permission.
18 THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
19 IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
20 OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
21 IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
22 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
23 NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
27 THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 using System;
31 using System.IO;
32 using System.Net;
33 using System.Xml;
34 using System.Xml.Serialization;
35 using System.Xml.Xsl;
36 using Microsoft.Win32;
38 namespace HtmlAgilityPack
40 /// <summary>
41 /// A utility class to get HTML document from HTTP.
42 /// </summary>
43 public class HtmlWeb
/// <summary>
/// Callback type for <see cref="PreRequest"/>: invoked with the request before it is sent.
/// </summary>
/// <param name="request">The request about to be executed; the handler may modify it.</param>
/// <returns>true to let the request proceed; false to cancel it.</returns>
public delegate bool PreRequestHandler(HttpWebRequest request);

/// <summary>
/// Callback type for <see cref="PostResponse"/>: invoked once a response has been obtained.
/// </summary>
/// <param name="request">The request that was executed.</param>
/// <param name="response">The response returned for that request.</param>
public delegate void PostResponseHandler(HttpWebRequest request, HttpWebResponse response);

/// <summary>
/// Callback type for <see cref="PreHandleDocument"/>: invoked with a loaded document
/// before it is returned to the caller.
/// </summary>
/// <param name="document">The document that has just been loaded.</param>
public delegate void PreHandleDocumentHandler(HtmlDocument document);

// Cache configuration and state.
private string _cachePath;
private bool _usingCache;
private bool _cacheOnly;
private bool _fromCache;

// Request options.
private bool _useCookies;
private bool _autoDetectEncoding = true;
private int _streamBufferSize = 1024;

// Results of the most recent request.
private int _requestDuration;
private HttpStatusCode _statusCode = HttpStatusCode.OK;
private Uri _responseUri;

/// <summary>
/// Invoked before an HTTP request is executed.
/// </summary>
public PreRequestHandler PreRequest;

/// <summary>
/// Invoked after an HTTP request has been executed.
/// </summary>
public PostResponseHandler PostResponse;

/// <summary>
/// Invoked before an HTML document is handed back to the caller.
/// </summary>
public PreHandleDocumentHandler PreHandleDocument;
/// <summary>
/// Initializes a new <see cref="HtmlWeb"/> instance; all settings keep their field defaults.
/// </summary>
public HtmlWeb()
{
}
/// <summary>
/// Downloads an Internet resource with the HTTP GET method and saves it to a file.
/// </summary>
/// <param name="url">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
/// <param name="path">The location of the file where the document is saved.</param>
public void Get(string url, string path)
{
    const string defaultMethod = "GET";
    Get(url, path, defaultMethod);
}
/// <summary>
/// Downloads an Internet resource with the given HTTP method and saves it to a file.
/// </summary>
/// <param name="url">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
/// <param name="path">The location of the file where the document is saved.</param>
/// <param name="method">The HTTP method used to open the connection, such as GET, POST, PUT, or PROPFIND.</param>
/// <exception cref="HtmlWebException">The URL scheme is neither http nor https.</exception>
public void Get(string url, string path, string method)
{
    Uri target = new Uri(url);
    bool isHttp = (target.Scheme == Uri.UriSchemeHttp) || (target.Scheme == Uri.UriSchemeHttps);
    if (!isHttp)
    {
        throw new HtmlWebException("Unsupported uri scheme: '" + target.Scheme + "'.");
    }

    Get(target, method, path, null);
}
/// <summary>
/// Loads an HTML document from an Internet resource using the HTTP GET method.
/// </summary>
/// <param name="url">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
/// <returns>A new HTML document.</returns>
public HtmlDocument Load(string url)
{
    const string defaultMethod = "GET";
    HtmlDocument loaded = Load(url, defaultMethod);
    return loaded;
}
/// <summary>
/// Loads an HTML document from an http/https resource or from a file: URL.
/// </summary>
/// <param name="url">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
/// <param name="method">The HTTP method used to open the connection, such as GET, POST, PUT, or PROPFIND. Ignored for file: URLs.</param>
/// <returns>A new HTML document.</returns>
/// <exception cref="HtmlWebException">The URL scheme is not http, https or file.</exception>
public HtmlDocument Load(string url, string method)
{
    Uri uri = new Uri(url);
    HtmlDocument doc;
    if ((uri.Scheme == Uri.UriSchemeHttps) ||
        (uri.Scheme == Uri.UriSchemeHttp))
    {
        doc = LoadUrl(uri, method);
    }
    else if (uri.Scheme == Uri.UriSchemeFile)
    {
        doc = new HtmlDocument();
        // Use the same parser options as LoadUrl. The original assigned
        // OptionAutoCloseOnEnd twice (false, then true), which left
        // OptionFixNestedTags unset and made file loads parse differently
        // from HTTP loads.
        doc.OptionAutoCloseOnEnd = false;
        doc.OptionFixNestedTags = true;
        doc.DetectEncodingAndLoad(url, _autoDetectEncoding);
    }
    else
    {
        throw new HtmlWebException("Unsupported uri scheme: '" + uri.Scheme + "'.");
    }

    // Give the caller a chance to inspect or alter the document before it is returned.
    if (PreHandleDocument != null)
    {
        PreHandleDocument(doc);
    }
    return doc;
}
// Decides whether a cached file holds HTML by mapping its file extension to a
// content type and testing that content type.
private bool IsCacheHtmlContent(string path)
{
    string extension = Path.GetExtension(path);
    string contentType = GetContentTypeForExtension(extension, null);
    return IsHtmlContent(contentType);
}
// Returns true when the content type starts with "text/html" (case-insensitive).
// A null content type is treated as "not HTML" instead of throwing; callers such as
// IsCacheHtmlContent can pass null when the type is unknown.
private bool IsHtmlContent(string contentType)
{
    if (contentType == null)
    {
        return false;
    }
    return contentType.ToLower().StartsWith("text/html");
}
// Builds the path of the file that stores the cached response headers for a URL:
// the cache file path with ".h.xml" appended.
private string GetCacheHeadersPath(Uri uri)
{
    const string headersSuffix = ".h.xml";
    string documentPath = GetCachePath(uri);
    return documentPath + headersSuffix;
}
/// <summary>
/// Gets the cache file path for a specified url.
/// </summary>
/// <param name="uri">The url for which to retrieve the cache path. May not be null.</param>
/// <returns>The cache file path.</returns>
/// <exception cref="ArgumentNullException"><paramref name="uri"/> is null.</exception>
/// <exception cref="HtmlWebException">Caching is not enabled.</exception>
public string GetCachePath(Uri uri)
{
    if (uri == null)
    {
        throw new ArgumentNullException("uri");
    }
    if (!UsingCache)
    {
        throw new HtmlWebException("Cache is not enabled. Set UsingCache to true first.");
    }

    // A root request has no file name to reuse, so it maps to a fixed ".htm" entry.
    if (uri.AbsolutePath == "/")
    {
        return Path.Combine(_cachePath, ".htm");
    }

    // Mirror the URL hierarchy under the cache directory. The platform separator is
    // used instead of a hard-coded backslash so the layout is also valid on
    // non-Windows file systems; on Windows the result is unchanged.
    string relative = (uri.Host + uri.AbsolutePath).Replace('/', Path.DirectorySeparatorChar);
    return Path.Combine(_cachePath, relative);
}
/// <summary>
/// Gets a value indicating whether the last document was served from the cache.
/// </summary>
public bool FromCache
{
    get { return _fromCache; }
}
/// <summary>
/// Gets the URI of the Internet resource that actually responded to the last request.
/// </summary>
public Uri ResponseUri
{
    get { return _responseUri; }
}
/// <summary>
/// Gets or sets a value indicating whether documents are read only from the cache.
/// If this is set to true and the document is not found in the cache, nothing will be loaded.
/// </summary>
/// <exception cref="HtmlWebException">Set to true while caching is not enabled.</exception>
public bool CacheOnly
{
    get { return _cacheOnly; }
    set
    {
        // Cache-only mode can only be switched on while the cache is enabled.
        if (value)
        {
            if (!UsingCache)
            {
                throw new HtmlWebException("Cache is not enabled. Set UsingCache to true first.");
            }
        }
        _cacheOnly = value;
    }
}
/// <summary>
/// Gets or sets a value indicating whether a cookie container is attached to requests.
/// </summary>
public bool UseCookies
{
    get { return _useCookies; }
    set { _useCookies = value; }
}
/// <summary>
/// Gets the duration of the last request, in milliseconds.
/// </summary>
public int RequestDuration
{
    get { return _requestDuration; }
}
/// <summary>
/// Gets or sets a value indicating whether the document encoding must be detected automatically.
/// </summary>
public bool AutoDetectEncoding
{
    get { return _autoDetectEncoding; }
    set { _autoDetectEncoding = value; }
}
/// <summary>
/// Gets the status of the last request.
/// </summary>
public HttpStatusCode StatusCode
{
    get { return _statusCode; }
}
/// <summary>
/// Gets or sets the size, in bytes, of the buffer used when copying response streams.
/// </summary>
/// <exception cref="ArgumentException">The assigned value is zero or negative.</exception>
public int StreamBufferSize
{
    get { return _streamBufferSize; }
    set
    {
        // Validate the incoming value. The original tested the current field,
        // so an invalid size was accepted and only later assignments failed.
        if (value <= 0)
        {
            throw new ArgumentException("Size must be greater than zero.");
        }
        _streamBufferSize = value;
    }
}
// Fetches an http/https resource into a new HtmlDocument and records the
// resulting status in _statusCode. When the request reports NotModified,
// the document is loaded from the cache file instead.
private HtmlDocument LoadUrl(Uri uri, string method)
{
    HtmlDocument result = new HtmlDocument();
    result.OptionAutoCloseOnEnd = false;
    result.OptionFixNestedTags = true;

    HttpStatusCode status = Get(uri, method, null, result);
    _statusCode = status;

    bool servedFromCache = (status == HttpStatusCode.NotModified);
    if (servedFromCache)
    {
        // read cached encoding
        result.DetectEncodingAndLoad(GetCachePath(uri));
    }
    return result;
}
// Executes the HTTP request for uri and routes the response:
//  - with caching enabled, the body is saved to the cache file (headers alongside)
//    and, when path is given, copied to path;
//  - without caching, an HTML body is parsed into doc when doc is supplied.
// Returns the response status; NotModified when the cache was used and
// ResetContent when a PreRequest handler cancelled the request.
// The response is always closed before returning or throwing.
private HttpStatusCode Get(Uri uri, string method, string path, HtmlDocument doc)
{
    string cachePath = null;
    bool oldFile = false;

    HttpWebRequest req = WebRequest.Create(uri) as HttpWebRequest;
    req.Method = method;

    _fromCache = false;
    _requestDuration = 0;
    int tc = Environment.TickCount;
    if (UsingCache)
    {
        cachePath = GetCachePath(req.RequestUri);
        if (File.Exists(cachePath))
        {
            // NOTE(review): the cache file's *write* time is set to the server's
            // Last-Modified by SaveStream, yet the *access* time is sent here - confirm intent.
            req.IfModifiedSince = File.GetLastAccessTime(cachePath);
            oldFile = true;
        }
    }

    if (_cacheOnly)
    {
        if (!File.Exists(cachePath))
        {
            throw new HtmlWebException("File was not found at cache path: '" + cachePath + "'");
        }

        if (path != null)
        {
            CopyCachedFile(cachePath, path);
        }
        _fromCache = true;
        return HttpStatusCode.NotModified;
    }

    if (_useCookies)
    {
        req.CookieContainer = new CookieContainer();
    }

    if (PreRequest != null)
    {
        // allow our user to change the request at will
        if (!PreRequest(req))
        {
            return HttpStatusCode.ResetContent;
        }
    }

    HttpWebResponse resp;
    try
    {
        resp = req.GetResponse() as HttpWebResponse;
    }
    catch (WebException we)
    {
        _requestDuration = Environment.TickCount - tc;
        // Protocol errors (4xx/5xx) still carry a response; keep processing it.
        resp = (HttpWebResponse)we.Response;
        if (resp == null)
        {
            // No response at all: fall back to a previously cached copy if one exists.
            if (oldFile)
            {
                if (path != null)
                {
                    CopyCachedFile(cachePath, path);
                }
                return HttpStatusCode.NotModified;
            }
            throw;
        }
    }
    catch (Exception)
    {
        _requestDuration = Environment.TickCount - tc;
        throw;
    }

    try
    {
        // allow our user to get some info from the response
        if (PostResponse != null)
        {
            PostResponse(req, resp);
        }

        _requestDuration = Environment.TickCount - tc;
        _responseUri = resp.ResponseUri;

        bool html = IsHtmlContent(resp.ContentType);

        // NOTE(review): ContentEncoding is the Content-Encoding header, not the charset;
        // confirm this is the intended source for the text encoding.
        System.Text.Encoding respenc = null;
        if ((resp.ContentEncoding != null) && (resp.ContentEncoding.Length > 0))
        {
            respenc = System.Text.Encoding.GetEncoding(resp.ContentEncoding);
        }

        // Capture the status while the response is still open; it is returned after Close().
        HttpStatusCode status = resp.StatusCode;

        if (status == HttpStatusCode.NotModified)
        {
            if (!UsingCache)
            {
                // this should *never* happen...
                throw new HtmlWebException("Server has send a NotModifed code, without cache enabled.");
            }

            _fromCache = true;
            if (path != null)
            {
                CopyCachedFile(cachePath, path);
            }
            return status;
        }

        Stream s = resp.GetResponseStream();
        if (s != null)
        {
            if (UsingCache)
            {
                // NOTE: LastModified does not contain milliseconds, so we remove them to the file
                SaveStream(s, cachePath, RemoveMilliseconds(resp.LastModified), _streamBufferSize);

                // save headers
                SaveCacheHeaders(req.RequestUri, resp);

                if (path != null)
                {
                    CopyCachedFile(cachePath, path);
                }
            }
            else if ((doc != null) && html)
            {
                // try to work in-memory
                if (respenc != null)
                {
                    doc.Load(s, respenc);
                }
                else
                {
                    doc.Load(s);
                }
            }
        }
        return status;
    }
    finally
    {
        // Release the connection on every exit path. The original closed it only
        // after a body was processed, leaking it on 304s and on exceptions.
        if (resp != null)
        {
            resp.Close();
        }
    }
}

// Copies the cached file to path and gives the copy the cache file's last-write time.
private static void CopyCachedFile(string cachePath, string path)
{
    IOLibrary.CopyAlways(cachePath, path);
    // touch the file
    File.SetLastWriteTime(path, File.GetLastWriteTime(cachePath));
}
// Reads one header value from the cached-headers file of a request.
// Header names are matched case-insensitively; def is returned when the header
// (or its value attribute) is absent.
// note: some headers are collection (ex: www-authenticate); we don't handle that here
private string GetCacheHeader(Uri requestUri, string name, string def)
{
    XmlDocument doc = new XmlDocument();
    doc.Load(GetCacheHeadersPath(requestUri));

    // Entries are written by SaveCacheHeaders as <h n="name" v="value"/>.
    // They are compared in code rather than through a concatenated XPath, so a
    // quote in name cannot break the query and the match is culture-independent.
    foreach (XmlNode node in doc.SelectNodes("//h"))
    {
        XmlAttribute headerName = node.Attributes["n"];
        if (headerName == null)
        {
            continue;
        }
        if (string.Compare(headerName.Value, name, true, System.Globalization.CultureInfo.InvariantCulture) != 0)
        {
            continue;
        }

        // The value lives in the "v" attribute. The original looked up an attribute
        // named after the header itself, which never exists.
        XmlAttribute headerValue = node.Attributes["v"];
        return (headerValue != null) ? headerValue.Value : def;
    }
    return def;
}
// Writes the response headers next to the cached document as an XML file of the
// form <c><h n="name" v="value"/>...</c>.
private void SaveCacheHeaders(Uri requestUri, HttpWebResponse resp)
{
    string headersFile = GetCacheHeadersPath(requestUri);

    XmlDocument headersDoc = new XmlDocument();
    headersDoc.LoadXml("<c></c>");
    XmlNode root = headersDoc.FirstChild;

    foreach (string headerName in resp.Headers)
    {
        XmlNode entry = headersDoc.CreateElement("h");

        XmlAttribute nameAttribute = headersDoc.CreateAttribute("n");
        nameAttribute.Value = headerName;
        entry.Attributes.Append(nameAttribute);

        XmlAttribute valueAttribute = headersDoc.CreateAttribute("v");
        valueAttribute.Value = resp.Headers[headerName];
        entry.Attributes.Append(valueAttribute);

        root.AppendChild(entry);
    }

    headersDoc.Save(headersFile);
}
// Copies stream into the file at path (created or overwritten) in chunks of
// streamBufferSize bytes, then sets the file's last-write time to touchDate.
// The input stream is closed by this method. Returns the number of bytes copied.
private static long SaveStream(Stream stream, string path, DateTime touchDate, int streamBufferSize)
{
    FilePreparePath(path);

    long total = 0;
    FileStream output = new FileStream(path, FileMode.Create, FileAccess.Write);
    BinaryReader reader = null;
    BinaryWriter writer = null;
    try
    {
        reader = new BinaryReader(stream);
        writer = new BinaryWriter(output);

        // ReadBytes returns an empty array once the stream is exhausted.
        while (true)
        {
            byte[] chunk = reader.ReadBytes(streamBufferSize);
            if (chunk.Length == 0)
            {
                break;
            }
            total += chunk.Length;
            writer.Write(chunk);
        }
    }
    finally
    {
        if (reader != null)
        {
            reader.Close();
        }
        if (writer != null)
        {
            writer.Flush();
            writer.Close();
        }
        output.Close();
    }

    File.SetLastWriteTime(path, touchDate);
    return total;
}
// Makes target writable: an existing file has its read-only attribute cleared;
// otherwise the file's parent directory is created when it does not exist yet.
private static void FilePreparePath(string target)
{
    if (File.Exists(target))
    {
        FileAttributes atts = File.GetAttributes(target);
        File.SetAttributes(target, atts & ~FileAttributes.ReadOnly);
        return;
    }

    // GetDirectoryName yields null for a root path and "" for a bare file name;
    // neither needs (or can have) a directory created for it.
    string dir = Path.GetDirectoryName(target);
    if ((dir != null) && (dir.Length > 0) && !Directory.Exists(dir))
    {
        Directory.CreateDirectory(dir);
    }
}
// Returns t truncated to whole seconds (the sub-second part is dropped).
private static DateTime RemoveMilliseconds(DateTime t)
{
    long wholeSecondTicks = t.Ticks - (t.Ticks % TimeSpan.TicksPerSecond);
    return new DateTime(wholeSecondTicks);
}
/// <summary>
/// Gets the path extension registered for a given MIME content type.
/// The mapping is read from the registry key
/// HKEY_CLASSES_ROOT\MIME\Database\Content Type\&lt;contentType&gt;, value "Extension".
/// </summary>
/// <param name="contentType">The input MIME content type.</param>
/// <param name="def">The default path extension to return if the type is unknown or any error occurs.</param>
/// <returns>The MIME content type's path extension, or <paramref name="def"/>.</returns>
public static string GetExtensionForContentType(string contentType, string def)
{
    if ((contentType == null) || (contentType.Length == 0))
    {
        return def;
    }

    try
    {
        RegistryKey key = Registry.ClassesRoot.OpenSubKey(@"MIME\Database\Content Type\" + contentType, false);
        if (key == null)
        {
            // Unknown content type: no need to go through an exception.
            return def;
        }
        try
        {
            string ext = key.GetValue("Extension", def) as string;
            return (ext != null) ? ext : def;
        }
        finally
        {
            key.Close();
        }
    }
    catch (Exception)
    {
        // Best effort by design: any registry failure falls back to the default.
        return def;
    }
}
/// <summary>
/// Gets the MIME content type registered for a given path extension.
/// The mapping is read from the registry key HKEY_CLASSES_ROOT\&lt;extension&gt;,
/// value "Content Type".
/// </summary>
/// <param name="extension">The input path extension, as stored in the registry (with its leading dot).</param>
/// <param name="def">The default content type to return if the extension is unknown or any error occurs.</param>
/// <returns>The path extension's MIME content type, or <paramref name="def"/>.</returns>
public static string GetContentTypeForExtension(string extension, string def)
{
    if ((extension == null) || (extension.Length == 0))
    {
        return def;
    }

    try
    {
        RegistryKey key = Registry.ClassesRoot.OpenSubKey(extension, false);
        if (key == null)
        {
            // Unknown extension: no need to go through an exception.
            return def;
        }
        try
        {
            // The MIME type is stored in the "Content Type" value. The key's default
            // value, which the original read, is the file-class ProgID instead.
            string contentType = key.GetValue("Content Type", def) as string;
            return (contentType != null) ? contentType : def;
        }
        finally
        {
            key.Close();
        }
    }
    catch (Exception)
    {
        // Best effort by design: any registry failure falls back to the default.
        return def;
    }
}
/// <summary>
/// Loads an HTML document from an Internet resource and writes it to the given XmlTextWriter.
/// </summary>
/// <param name="htmlUrl">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
/// <param name="writer">The XmlTextWriter to which the document is saved.</param>
public void LoadHtmlAsXml(string htmlUrl, XmlTextWriter writer)
{
    HtmlDocument loaded = Load(htmlUrl);
    loaded.Save(writer);
}
/// <summary>
/// Loads an HTML document from an Internet resource, applies an XSLT transformation,
/// and writes the result to the given XmlTextWriter. No intermediate XML file is saved.
/// </summary>
/// <param name="htmlUrl">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
/// <param name="xsltUrl">The URL that specifies the XSLT stylesheet to load.</param>
/// <param name="xsltArgs">An XsltArgumentList containing the namespace-qualified arguments used as input to the transform.</param>
/// <param name="writer">The XmlTextWriter to which the result is saved.</param>
public void LoadHtmlAsXml(string htmlUrl, string xsltUrl, XsltArgumentList xsltArgs, XmlTextWriter writer)
{
    string noIntermediateFile = null;
    LoadHtmlAsXml(htmlUrl, xsltUrl, xsltArgs, writer, noIntermediateFile);
}
/// <summary>
/// Loads an HTML document from an Internet resource, applies an XSLT transformation,
/// and writes the result to the given XmlTextWriter.
/// </summary>
/// <param name="htmlUrl">The requested URL, such as "http://Myserver/Mypath/Myfile.asp". May not be null.</param>
/// <param name="xsltUrl">The URL that specifies the XSLT stylesheet to load.</param>
/// <param name="xsltArgs">An XsltArgumentList containing the namespace-qualified arguments used as input to the transform.
/// May be null. The parameters "url", "requestDuration" and "fromCache" are set on this list.</param>
/// <param name="writer">The XmlTextWriter to which the result is saved.</param>
/// <param name="xmlPath">A file path where the temporary XML before transformation will be saved. Mostly used for debugging purposes.</param>
/// <exception cref="ArgumentNullException"><paramref name="htmlUrl"/> is null.</exception>
public void LoadHtmlAsXml(string htmlUrl, string xsltUrl, XsltArgumentList xsltArgs, XmlTextWriter writer, string xmlPath)
{
    if (htmlUrl == null)
    {
        throw new ArgumentNullException("htmlUrl");
    }

    HtmlDocument doc = Load(htmlUrl);

    if (xmlPath != null)
    {
        XmlTextWriter w = new XmlTextWriter(xmlPath, doc.Encoding);
        try
        {
            doc.Save(w);
        }
        finally
        {
            // Release the file handle even when saving fails.
            w.Close();
        }
    }

    if (xsltArgs == null)
    {
        xsltArgs = new XsltArgumentList();
    }

    // add some useful variables to the xslt doc
    // AddParam throws if the name is already present, which happened whenever a caller
    // reused the same argument list for a second call; clear any previous value first.
    xsltArgs.RemoveParam("url", "");
    xsltArgs.AddParam("url", "", htmlUrl);
    xsltArgs.RemoveParam("requestDuration", "");
    xsltArgs.AddParam("requestDuration", "", RequestDuration);
    xsltArgs.RemoveParam("fromCache", "");
    xsltArgs.AddParam("fromCache", "", FromCache);

    XslTransform xslt = new XslTransform();
    xslt.Load(xsltUrl);
    xslt.Transform(doc, xsltArgs, writer);
}
/// <summary>
/// Creates an instance of the given type from the specified Internet resource,
/// without applying any XSLT transformation.
/// </summary>
/// <param name="url">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
/// <param name="type">The requested type.</param>
/// <returns>A newly created instance.</returns>
public object CreateInstance(string url, Type type)
{
    object instance = CreateInstance(url, null, null, type);
    return instance;
}
/// <summary>
/// Creates an instance of the given type from the specified Internet resource,
/// optionally transformed by an XSLT stylesheet. No intermediate XML file is saved.
/// </summary>
/// <param name="htmlUrl">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
/// <param name="xsltUrl">The URL that specifies the XSLT stylesheet to load.</param>
/// <param name="xsltArgs">An XsltArgumentList containing the namespace-qualified arguments used as input to the transform.</param>
/// <param name="type">The requested type.</param>
/// <returns>A newly created instance.</returns>
public object CreateInstance(string htmlUrl, string xsltUrl, XsltArgumentList xsltArgs, Type type)
{
    string noIntermediateFile = null;
    return CreateInstance(htmlUrl, xsltUrl, xsltArgs, type, noIntermediateFile);
}
/// <summary>
/// Creates an instance of the given type from the specified Internet resource:
/// the document is converted to XML (optionally through an XSLT stylesheet) and
/// the XML is deserialized with an XmlSerializer for <paramref name="type"/>.
/// </summary>
/// <param name="htmlUrl">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
/// <param name="xsltUrl">The URL that specifies the XSLT stylesheet to load. If null, no transformation is applied.</param>
/// <param name="xsltArgs">An XsltArgumentList containing the namespace-qualified arguments used as input to the transform.</param>
/// <param name="type">The requested type.</param>
/// <param name="xmlPath">A file path where the temporary XML before transformation will be saved. Mostly used for debugging purposes.</param>
/// <returns>A newly created instance.</returns>
/// <exception cref="Exception">Deserialization failed; the message includes the XML and the
/// original error is available as the inner exception.</exception>
public object CreateInstance(string htmlUrl, string xsltUrl, XsltArgumentList xsltArgs, Type type, string xmlPath)
{
    StringWriter sw = new StringWriter();
    XmlTextWriter writer = new XmlTextWriter(sw);
    if (xsltUrl == null)
    {
        LoadHtmlAsXml(htmlUrl, writer);
    }
    else if (xmlPath == null)
    {
        LoadHtmlAsXml(htmlUrl, xsltUrl, xsltArgs, writer);
    }
    else
    {
        LoadHtmlAsXml(htmlUrl, xsltUrl, xsltArgs, writer, xmlPath);
    }
    writer.Flush();

    StringReader sr = new StringReader(sw.ToString());
    XmlTextReader reader = new XmlTextReader(sr);
    XmlSerializer serializer = new XmlSerializer(type);
    try
    {
        return serializer.Deserialize(reader);
    }
    catch (InvalidOperationException ex)
    {
        // Same exception type and message as before, but the original exception is
        // now chained so its type and stack trace are not lost.
        throw new Exception(ex.ToString() + ", --- xml:" + sw.ToString(), ex);
    }
}
/// <summary>
/// Gets or sets the cache directory path. While it is null, no caching mechanism is used.
/// </summary>
public string CachePath
{
    get { return _cachePath; }
    set { _cachePath = value; }
}
/// <summary>
/// Gets or sets a value indicating whether the caching mechanisms should be used.
/// Always reads as false while no cache path is defined.
/// </summary>
/// <exception cref="HtmlWebException">Set to true before a cache path has been defined.</exception>
public bool UsingCache
{
    get
    {
        bool hasCachePath = (_cachePath != null);
        return hasCachePath && _usingCache;
    }
    set
    {
        // Caching cannot be switched on without a directory to store files in.
        if (value)
        {
            if (_cachePath == null)
            {
                throw new HtmlWebException("You need to define a CachePath first.");
            }
        }
        _usingCache = value;
    }
}
836 /// <summary>
837 /// Represents an exception thrown by the HtmlWeb utility class.
838 /// </summary>
839 public class HtmlWebException: Exception
841 /// <summary>
842 /// Creates an instance of the HtmlWebException.
843 /// </summary>
844 /// <param name="message">The exception's message.</param>
845 public HtmlWebException(string message)
846 :base(message)