NoiseFilter: Dont drop last word of apparent hostnames. Too many non-hostnames can...
[beagle.git] / Filters / FilterHtml.cs
blobd3658044d8fd036e365050b7a349a8095ade7b40
1 //
2 // FilterHtml.cs
3 //
4 // Copyright (C) 2005 Debajyoti Bera <dbera.web@gmail.com>
5 // Copyright (C) 2004 Novell, Inc.
6 //
8 //
9 // Permission is hereby granted, free of charge, to any person obtaining a
10 // copy of this software and associated documentation files (the "Software"),
11 // to deal in the Software without restriction, including without limitation
12 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 // and/or sell copies of the Software, and to permit persons to whom the
14 // Software is furnished to do so, subject to the following conditions:
16 // The above copyright notice and this permission notice shall be included in
17 // all copies or substantial portions of the Software.
19 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 // DEALINGS IN THE SOFTWARE.
29 using System;
30 using System.Collections;
31 using System.IO;
32 using System.Text;
33 using SW=System.Web;
35 using Beagle.Daemon;
36 using Beagle.Util;
38 using HtmlAgilityPack;
40 namespace Beagle.Filters {
42 public class FilterHtml : Beagle.Daemon.Filter {
// Callback signatures through which extracted content is reported.
public delegate int AppendTextCallback (string s);
public delegate void AddPropertyCallback (Beagle.Property p);
public delegate void AppendSpaceCallback ();
public delegate void HotCallback ();

// Output sinks. These deliberately hide the base class members of the
// same name: the constructor points them at the base implementations
// when the filter is registered, and ExtractText replaces them with
// caller-supplied callbacks.
private new AppendTextCallback AppendText;
private new AddPropertyCallback AddProperty;
private new AppendSpaceCallback AppendWhiteSpace;
private new AppendSpaceCallback AppendStructuralBreak;
private new HotCallback HotUp;
private new HotCallback HotDown;

// Names of the "hot" elements that are currently open: the name is
// pushed when a start tag such as <b> is seen and popped again on the
// end tag </b>.
// For strict error checking the popped name should be compared with the
// element being closed; at the moment it is popped blindly, so
// mismatched elements can slip through.
private Stack hot_stack;

// Names of the currently open content-free elements whose text is
// skipped.
private Stack ignore_stack;

// True while the text of a <title> element is being collected in
// "builder" instead of being appended to the output.
private bool building_text;
private StringBuilder builder;
// Creates the filter.
//
// register_filter: when true the filter is set up as a regular indexing
// filter (version, supported types, snippet mode) and its output
// callbacks are bound to the base class implementations. When false only
// the parser state is initialised; the callbacks then have to be provided
// through ExtractText.
public FilterHtml (bool register_filter)
{
	// Parser state, needed in both modes.
	builder = new StringBuilder ();
	building_text = false;
	ignore_stack = new Stack ();
	hot_stack = new Stack ();

	if (! register_filter)
		return;

	// 1: Add meta keyword fields as meta:key
	SetVersion (1);
	RegisterSupportedTypes ();
	SnippetMode = true;

	// Send all output to the base filter.
	AppendText = new AppendTextCallback (base.AppendText);
	AddProperty = new AddPropertyCallback (base.AddProperty);
	AppendWhiteSpace = new AppendSpaceCallback (base.AppendWhiteSpace);
	AppendStructuralBreak = new AppendSpaceCallback (base.AppendStructuralBreak);
	HotUp = new HotCallback (base.HotUp);
	HotDown = new HotCallback (base.HotDown);
}

// Default construction yields a registered filter.
public FilterHtml () : this (true) {}
// Pops the top entry of "st", silently doing nothing when the stack is
// null or already empty. This protects against spurious pops caused by
// mismatched tags in bad html files.
// FIXME: If matching elements is not required and HtmlAgilityPack
// matches elements itself, a plain counter (hot_stack_depth) could
// replace hot_stack.
private void SafePop (Stack st)
{
	if (st == null || st.Count == 0)
		return;

	st.Pop ();
}
// Returns true when text inside an element with this name is treated as
// "hot" (emphasis, headings and table headers). The comparison is
// case-sensitive; any other name, including null, yields false.
protected bool NodeIsHot (String nodeName)
{
	switch (nodeName) {
	case "b":
	case "u":
	case "em":
	case "strong":
	case "big":
	case "h1":
	case "h2":
	case "h3":
	case "h4":
	case "h5":
	case "h6":
	case "i":
	case "th":
		return true;
	default:
		return false;
	}
}
// Returns true for elements around which whitespace is inserted so that
// the surrounding words do not run together. The comparison is
// case-sensitive; any other name, including null, yields false.
protected static bool NodeBreaksText (String nodeName)
{
	switch (nodeName) {
	case "td":
	case "a":
	case "div":
	case "option":
		return true;
	default:
		return false;
	}
}
// Returns true for elements whose end is followed by a structural break
// (paragraphs, line breaks and headings). The comparison is
// case-sensitive; any other name, including null, yields false.
protected static bool NodeBreaksStructure (string nodeName)
{
	switch (nodeName) {
	case "p":
	case "br":
	case "h1":
	case "h2":
	case "h3":
	case "h4":
	case "h5":
	case "h6":
		return true;
	default:
		return false;
	}
}
// Returns true for elements whose contents carry no indexable text and
// are therefore skipped entirely. The comparison is case-sensitive; any
// other name, including null, yields false.
protected static bool NodeIsContentFree (String nodeName)
{
	switch (nodeName) {
	case "script":
	case "map":
	case "style":
		return true;
	default:
		return false;
	}
}
// Node callback attached to HtmlDocument.ReportNode (see DoOpen and
// ExtractText). Start and end tags of an element are told apart through
// node.StartTag.
//
//  - <title>: the text between start and end tag is collected in
//    "builder" and published as the "dc:title" property.
//  - <meta>: a non-empty name/content pair is published as the property
//    "meta:<name>".
//  - content-free elements (see NodeIsContentFree) are tracked on
//    ignore_stack so that their text is dropped.
//  - all other elements produce whitespace, structural breaks and hot
//    regions as appropriate; the "alt" text of <img> and the decoded
//    "href" of <a> are appended as text.
//  - text nodes go to the title builder or to the output, unless an
//    ignored element is open.
//
// Returns the result of AllowMoreWords ().
// NOTE(review): presumably a false return tells the parser to stop
// reporting nodes - confirm against the ReportNode contract.
protected bool HandleNodeEvent (HtmlNode node)
{
	switch (node.NodeType) {

	case HtmlNodeType.Document:
	case HtmlNodeType.Element:
		if (node.Name == "title") {
			if (node.StartTag) {
				builder.Length = 0;
				building_text = true;
			} else {
				String title = HtmlEntity.DeEntitize (builder.ToString ().Trim ());
				AddProperty (Beagle.Property.New ("dc:title", title));
				builder.Length = 0;
				building_text = false;
			}
		} else if (node.Name == "meta") {
			string name = node.GetAttributeValue ("name", "");
			string content = node.GetAttributeValue ("content", "");
			if (name != "" && content != "")
				AddProperty (Beagle.Property.New ("meta:" + name, content));
		} else if (! NodeIsContentFree (node.Name)) {
			bool isHot = NodeIsHot (node.Name);
			bool breaksText = NodeBreaksText (node.Name);
			bool breaksStructure = NodeBreaksStructure (node.Name);

			// Text-breaking elements get whitespace on both sides
			// of each tag.
			if (breaksText)
				AppendWhiteSpace ();

			if (node.StartTag) {
				if (isHot) {
					// Only the outermost hot element opens a
					// hot region.
					if (hot_stack.Count == 0)
						HotUp ();
					hot_stack.Push (node.Name);
				}

				if (node.Name == "img") {
					string attr = node.GetAttributeValue ("alt", "");
					if (attr != "") {
						AppendText (HtmlEntity.DeEntitize (attr));
						AppendWhiteSpace ();
					}
				} else if (node.Name == "a") {
					string attr = node.GetAttributeValue ("href", "");
					if (attr != "") {
						AppendText (HtmlEntity.DeEntitize (SW.HttpUtility.UrlDecode (attr)));
						AppendWhiteSpace ();
					}
				}
			} else { // (! node.StartTag)
				// A stray end tag in bad html (e.g. </b> without
				// <b>) finds the stack empty; it must not emit a
				// HotDown that was never matched by a HotUp.
				if (isHot && hot_stack.Count != 0) {
					hot_stack.Pop ();
					if (hot_stack.Count == 0)
						HotDown ();
				}

				if (breaksStructure)
					AppendStructuralBreak ();
			}

			if (breaksText)
				AppendWhiteSpace ();
		} else {
			// so node is a content-free node
			// ignore contents of such node
			if (node.StartTag)
				ignore_stack.Push (node.Name);
			else
				SafePop (ignore_stack);
		}
		break;

	case HtmlNodeType.Text:
		// FIXME Do we need to trim the text ?
		String text = ((HtmlTextNode)node).Text;
		if (ignore_stack.Count != 0)
			break; // still ignoring ...
		if (building_text)
			builder.Append (text);
		else
			AppendText (HtmlEntity.DeEntitize (text));
		break;
	}

	return AllowMoreWords ();
}
// Parses the HTML available on Stream and feeds every reported node to
// HandleNodeEvent.
//
// The encoding is taken from the unindexed "encoding" property when one
// is present and usable; otherwise the parser is asked to detect it from
// the document. Failures while parsing are logged rather than
// propagated, so Finished () is always reached.
//
// info: the file being opened; not used by this method.
override protected void DoOpen (FileInfo info)
{
	Encoding enc = null;

	foreach (Property prop in IndexableProperties) {
		if (prop.Key != StringFu.UnindexedNamespace + "encoding")
			continue;

		try {
			enc = Encoding.GetEncoding ((string) prop.Value);
		} catch (NotSupportedException) {
			// Encoding passed in isn't supported. Maybe
			// we'll get lucky detecting it from the
			// document instead.
		} catch (ArgumentException) {
			// Encoding name is missing or not a valid name;
			// fall back to detection as well.
		}

		break;
	}

	if (enc == null) {
		// we need to tell the parser to detect encoding,
		HtmlDocument temp_doc = new HtmlDocument ();
		enc = temp_doc.DetectEncoding (Stream);
		// Detection consumed part of the stream; rewind for the
		// real parse.
		Stream.Seek (0, SeekOrigin.Begin);
	}

	HtmlDocument doc = new HtmlDocument ();
	doc.ReportNode += HandleNodeEvent;
	doc.StreamMode = true;
	// we already determined encoding
	doc.OptionReadEncoding = false;

	try {
		if (enc == null)
			doc.Load (Stream);
		else
			doc.Load (Stream, enc);
	} catch (NotSupportedException) {
		// The chosen encoding turned out to be unusable; retry as
		// ASCII. The retry is guarded as well so that a second
		// failure cannot skip Finished ().
		// NOTE(review): the stream is not rewound here, same as
		// before - confirm whether the failure can occur after some
		// input was already consumed.
		try {
			doc.Load (Stream, Encoding.ASCII);
		} catch (Exception e) {
			Log.Debug (e, "Exception while filtering HTML file");
		}
	} catch (Exception e) {
		Log.Debug (e, "Exception while filtering HTML file");
	}

	Finished ();
}
// Extracts text and properties from an HTML string, reporting everything
// through the supplied callbacks instead of the base filter.
//
// html_string:     the HTML markup to parse.
// append_text_cb:  receives extracted text.
// add_prop_cb:     receives properties (e.g. title and meta values).
// append_white_cb: called where whitespace separates text.
// append_break_cb: called where a structural break occurs.
// hot_up_cb:       called when a hot region starts.
// hot_down_cb:     called when a hot region ends.
//
// Any exception raised while parsing is logged and swallowed.
public void ExtractText (string html_string,
			 AppendTextCallback append_text_cb,
			 AddPropertyCallback add_prop_cb,
			 AppendSpaceCallback append_white_cb,
			 AppendSpaceCallback append_break_cb,
			 HotCallback hot_up_cb,
			 HotCallback hot_down_cb)
{
	// Hot region callbacks.
	HotUp = hot_up_cb;
	HotDown = hot_down_cb;

	// Spacing callbacks.
	AppendWhiteSpace = append_white_cb;
	AppendStructuralBreak = append_break_cb;

	// Content callbacks.
	AppendText = append_text_cb;
	AddProperty = add_prop_cb;

	HtmlDocument parser = new HtmlDocument ();
	parser.ReportNode += HandleNodeEvent;
	parser.StreamMode = true;

	try {
		parser.LoadHtml (html_string);
	} catch (Exception ex) {
		Log.Debug (ex, "Exception while filtering html string [{0}]", html_string);
	}
}
// Registers the "text/html" MIME type as a flavor supported by this
// filter. Invoked from the constructor when register_filter is true;
// the method is virtual, so a derived class may override it.
312 virtual protected void RegisterSupportedTypes ()
314 AddSupportedFlavor (FilterFlavor.NewFromMimeType ("text/html"));