Change the GC.GetTotalMemory() threshold to 10%; otherwise there are just too many...
[beagle.git] / Filters / FilterHtml.cs
blobc00f4b938f8d5e257837efd6f0eff9bc776b5899
1 //
2 // FilterHtml.cs
3 //
4 // Copyright (C) 2005 Debajyoti Bera <dbera.web@gmail.com>
5 // Copyright (C) 2004 Novell, Inc.
6 //
8 //
9 // Permission is hereby granted, free of charge, to any person obtaining a
10 // copy of this software and associated documentation files (the "Software"),
11 // to deal in the Software without restriction, including without limitation
12 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 // and/or sell copies of the Software, and to permit persons to whom the
14 // Software is furnished to do so, subject to the following conditions:
16 // The above copyright notice and this permission notice shall be included in
17 // all copies or substantial portions of the Software.
19 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 // DEALINGS IN THE SOFTWARE.
29 using System;
30 using System.Collections;
31 using System.IO;
32 using System.Text;
33 using SW=System.Web;
35 using Beagle.Daemon;
36 using Beagle.Util;
38 using HtmlAgilityPack;
40 namespace Beagle.Filters {
42 public class FilterHtml : Beagle.Daemon.Filter {
// Callback signatures through which extracted content is reported.
public delegate int AppendTextCallback (string s);
public delegate void AddPropertyCallback (Beagle.Property p);
public delegate void AppendSpaceCallback ();
public delegate void HotCallback ();

// Callback instances. They are declared with "new" because they hide the
// base-class members of the same name; the constructor binds them to those
// base members, while ExtractText () replaces them with caller-supplied
// callbacks.
private new AppendTextCallback AppendText;
private new AddPropertyCallback AddProperty;
private new AppendSpaceCallback AppendWhiteSpace;
private new AppendSpaceCallback AppendStructuralBreak;
private new HotCallback HotUp;
private new HotCallback HotDown;

// Names of the "hot" elements that are currently open: an opening tag such
// as <b> pushes "b" and the closing </b> pops one entry.
// For strict error checking the popped name should be compared with the
// element being closed. Currently the entry is popped unconditionally, so
// unmatched elements may pass through undetected.
private Stack hot_stack;

// Names of the content-free elements that are currently open; text nodes
// are discarded while this stack is non-empty.
private Stack ignore_stack;

// While building_text is true, text nodes are collected in builder
// (used for the document title) instead of being emitted.
private bool building_text;
private StringBuilder builder;

// Encoding chosen in DoOpen (); also used when URL-decoding href values.
protected Encoding enc;
// Creates the filter.
// register_filter: when true, the instance sets its version, registers its
// supported types, enables snippet mode and routes all output callbacks to
// the base-class implementations. When false, none of that happens and the
// callbacks stay unset until ExtractText () supplies them.
public FilterHtml (bool register_filter)
{
	if (register_filter) {
		// 1: Add meta keyword fields as meta:key
		SetVersion (1);
		RegisterSupportedTypes ();
		SnippetMode = true;

		// Route every output callback to the corresponding base member.
		AppendText = base.AppendText;
		AddProperty = base.AddProperty;
		AppendWhiteSpace = base.AppendWhiteSpace;
		AppendStructuralBreak = base.AppendStructuralBreak;
		HotUp = base.HotUp;
		HotDown = base.HotDown;
	}

	// Parser state starts out empty in both modes.
	hot_stack = new Stack ();
	ignore_stack = new Stack ();
	building_text = false;
	builder = new StringBuilder ();
}

// Default constructor: equivalent to FilterHtml (true).
public FilterHtml ()
	: this (true)
{
}
// Pops one entry from st, doing nothing when st is null or already empty.
// This guards against spurious pops caused by mismatched tags in bad
// html files.
// FIXME: If matching elements is not required and HtmlAgilityPack matches
// elements itself, a plain counter (hot_stack_depth) could replace
// hot_stack.
private void SafePop (Stack st)
{
	if (st == null || st.Count == 0)
		return;

	st.Pop ();
}
// Returns true when nodeName is one of the element names treated as "hot":
// b, u, em, strong, big, h1-h6, i and th. The comparison is case-sensitive;
// any other value (including null) yields false.
protected bool NodeIsHot (String nodeName)
{
	switch (nodeName) {
	case "b":
	case "u":
	case "em":
	case "strong":
	case "big":
	case "h1":
	case "h2":
	case "h3":
	case "h4":
	case "h5":
	case "h6":
	case "i":
	case "th":
		return true;
	default:
		return false;
	}
}
// Returns true for the elements around which HandleNodeEvent () emits
// white space: td, a, div and option. The comparison is case-sensitive;
// any other value (including null) yields false.
protected static bool NodeBreaksText (String nodeName)
{
	switch (nodeName) {
	case "td":
	case "a":
	case "div":
	case "option":
		return true;
	default:
		return false;
	}
}
// Returns true for the elements whose closing tag makes HandleNodeEvent ()
// emit a structural break: p, br and h1-h6. The comparison is
// case-sensitive; any other value (including null) yields false.
protected static bool NodeBreaksStructure (string nodeName)
{
	switch (nodeName) {
	case "p":
	case "br":
	case "h1":
	case "h2":
	case "h3":
	case "h4":
	case "h5":
	case "h6":
		return true;
	default:
		return false;
	}
}
// Returns true for the elements whose text content is ignored entirely:
// script, map and style. The comparison is case-sensitive; any other
// value (including null) yields false.
protected static bool NodeIsContentFree (String nodeName)
{
	switch (nodeName) {
	case "script":
	case "map":
	case "style":
		return true;
	default:
		return false;
	}
}
// Node callback attached to HtmlDocument.ReportNode. Document and element
// nodes go to HandleElement (), text nodes to HandleText (); every other
// node type is ignored.
// Returns the result of AllowMoreWords ().
// NOTE(review): presumably a false return makes the parser stop reporting
// nodes - confirm against the HtmlAgilityPack copy used by this project.
protected bool HandleNodeEvent (HtmlNode node)
{
	switch (node.NodeType) {
	case HtmlNodeType.Document:
	case HtmlNodeType.Element:
		HandleElement (node);
		break;

	case HtmlNodeType.Text:
		HandleText (node);
		break;
	}

	return AllowMoreWords ();
}

// Processes the opening or closing tag of an element: collects the title,
// turns meta tags into properties, tracks content-free and hot elements,
// and emits white space, structural breaks and img/a attribute text.
private void HandleElement (HtmlNode node)
{
	string tag = node.Name;

	if (tag == "title") {
		if (node.StartTag) {
			// Start collecting the title text in builder.
			builder.Length = 0;
			building_text = true;
		} else {
			String title = HtmlEntity.DeEntitize (builder.ToString ().Trim ());
			AddProperty (Beagle.Property.New ("dc:title", title));
			builder.Length = 0;
			building_text = false;
		}
		return;
	}

	if (tag == "meta") {
		string name = node.GetAttributeValue ("name", "");
		string content = node.GetAttributeValue ("content", "");
		if (name != String.Empty && content != String.Empty)
			AddProperty (Beagle.Property.New ("meta:" + name, content));
		return;
	}

	if (NodeIsContentFree (tag)) {
		// Content-free node: remember that we are inside it so that
		// HandleText () drops its contents.
		if (node.StartTag)
			ignore_stack.Push (tag);
		else
			SafePop (ignore_stack);
		return;
	}

	bool isHot = NodeIsHot (tag);
	bool breaksText = NodeBreaksText (tag);
	bool breaksStructure = NodeBreaksStructure (tag);

	if (breaksText)
		AppendWhiteSpace ();

	if (node.StartTag) {
		if (isHot) {
			// Only the outermost hot element switches hot mode on.
			if (hot_stack.Count == 0)
				HotUp ();
			hot_stack.Push (tag);
		}

		if (tag == "img") {
			string alt = node.GetAttributeValue ("alt", "");
			if (alt != String.Empty) {
				AppendText (HtmlEntity.DeEntitize (alt));
				AppendWhiteSpace ();
			}
		} else if (tag == "a") {
			string href = node.GetAttributeValue ("href", "");
			if (href != String.Empty) {
				// enc is null when ExtractText () is used or when no
				// encoding could be determined in DoOpen (); fall back to
				// UTF-8 rather than passing a null encoding to UrlDecode.
				Encoding url_enc = (enc != null) ? enc : Encoding.UTF8;
				AppendText (HtmlEntity.DeEntitize (
						    SW.HttpUtility.UrlDecode (href, url_enc)));
				AppendWhiteSpace ();
			}
		}
	} else {
		// Closing tag. Hot mode is switched off only when this tag really
		// closed the last open hot element; a stray closing tag on an
		// empty stack must not trigger a HotDown () that has no HotUp ().
		if (isHot && hot_stack.Count != 0) {
			hot_stack.Pop ();
			if (hot_stack.Count == 0)
				HotDown ();
		}

		if (breaksStructure)
			AppendStructuralBreak ();
	}

	if (breaksText)
		AppendWhiteSpace ();
}

// Processes a text node: dropped inside content-free elements, collected
// in builder while the title is being built, emitted de-entitized otherwise.
private void HandleText (HtmlNode node)
{
	// FIXME Do we need to trim the text ?
	String text = ((HtmlTextNode) node).Text;

	if (ignore_stack.Count != 0)
		return; // still ignoring ...

	if (building_text)
		builder.Append (text);
	else
		AppendText (HtmlEntity.DeEntitize (text));
}
// Parses the HTML in Stream, reporting every node to HandleNodeEvent ().
// The encoding is taken from the indexable property
// StringFu.UnindexedNamespace + "encoding" when that names a usable
// encoding; otherwise the parser is asked to detect it from the stream.
// Parsing failures are logged, not propagated, and Finished () is always
// called at the end.
// info: the file being filtered; only used in the log message.
override protected void DoOpen (FileInfo info)
{
	enc = null;

	foreach (Property prop in IndexableProperties) {
		if (prop.Key != StringFu.UnindexedNamespace + "encoding")
			continue;

		try {
			enc = Encoding.GetEncoding ((string) prop.Value);
		} catch (NotSupportedException) {
			// Encoding passed in isn't supported. Maybe
			// we'll get lucky detecting it from the
			// document instead.
		} catch (ArgumentException) {
			// Encoding.GetEncoding () reports an unknown or missing
			// encoding name with ArgumentException; treat that the same
			// way and fall back to detection.
		}

		// Only the first encoding property is considered.
		break;
	}

	if (enc == null) {
		// we need to tell the parser to detect encoding,
		HtmlDocument temp_doc = new HtmlDocument ();
		enc = temp_doc.DetectEncoding (Stream);
		temp_doc = null;
		// Detection consumed part of the stream; rewind for the real parse.
		Stream.Seek (0, SeekOrigin.Begin);
	}

	HtmlDocument doc = new HtmlDocument ();
	doc.ReportNode += HandleNodeEvent;
	doc.StreamMode = true;
	// we already determined encoding
	doc.OptionReadEncoding = false;

	try {
		try {
			if (enc == null)
				doc.Load (Stream);
			else
				doc.Load (Stream, enc);
		} catch (NotSupportedException) {
			// Retry with plain ASCII. The retry runs inside the outer try
			// so that its own failures are logged instead of escaping.
			// NOTE(review): the stream is not rewound before the retry
			// (unchanged from the previous behaviour) - confirm whether
			// it should be.
			enc = Encoding.ASCII;
			doc.Load (Stream, enc);
		}
	} catch (Exception e) {
		Log.Debug (e, "Exception while filtering HTML file " + info.FullName);
	}

	Finished ();
}
// Parses html_string and reports its content through the supplied
// callbacks instead of the base-class members.
// html_string:     the HTML markup to parse.
// append_text_cb:  receives extracted text.
// add_prop_cb:     receives the dc:title and meta:* properties.
// append_white_cb: called where white space should be emitted.
// append_break_cb: called where a structural break should be emitted.
// hot_up_cb:       called when the first hot element is opened.
// hot_down_cb:     called when the last open hot element is closed.
// The callbacks stay installed on this instance after the call returns.
// Exceptions raised while parsing are logged and not propagated.
public void ExtractText (string html_string,
			 AppendTextCallback append_text_cb,
			 AddPropertyCallback add_prop_cb,
			 AppendSpaceCallback append_white_cb,
			 AppendSpaceCallback append_break_cb,
			 HotCallback hot_up_cb,
			 HotCallback hot_down_cb)
{
	AppendText = append_text_cb;
	AddProperty = add_prop_cb;
	AppendWhiteSpace = append_white_cb;
	AppendStructuralBreak = append_break_cb;
	HotUp = hot_up_cb;
	HotDown = hot_down_cb;

	// Start every string from a clean parser state. Without this, an
	// unclosed <script>, <style>, <map>, <title> or hot element in a
	// previously parsed string would keep suppressing or misrouting the
	// text of all later strings.
	hot_stack.Clear ();
	ignore_stack.Clear ();
	builder.Length = 0;
	building_text = false;

	HtmlDocument doc = new HtmlDocument ();
	doc.ReportNode += HandleNodeEvent;
	doc.StreamMode = true;

	try {
		doc.LoadHtml (html_string);
	} catch (Exception e) {
		Log.Debug (e, "Exception while filtering html string [{0}]", html_string);
	}
}
// Registers the flavor this filter supports: the "text/html" mime type.
// Virtual, so that subclasses can register different types.
virtual protected void RegisterSupportedTypes ()
{
	FilterFlavor html_flavor = FilterFlavor.NewFromMimeType ("text/html");
	AddSupportedFlavor (html_flavor);
}