2006-09-10 Francisco Javier F. Serrador <serrador@openshine.com>
[beagle.git] / Filters / FilterHtml.cs
blob0272c4fb22c9bec86e7ac3a911a7b21b94c90ccd
1 //
2 // FilterHtml.cs
3 //
4 // Copyright (C) 2005 Debajyoti Bera <dbera.web@gmail.com>
5 // Copyright (C) 2004 Novell, Inc.
6 //
8 //
9 // Permission is hereby granted, free of charge, to any person obtaining a
10 // copy of this software and associated documentation files (the "Software"),
11 // to deal in the Software without restriction, including without limitation
12 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 // and/or sell copies of the Software, and to permit persons to whom the
14 // Software is furnished to do so, subject to the following conditions:
16 // The above copyright notice and this permission notice shall be included in
17 // all copies or substantial portions of the Software.
19 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 // DEALINGS IN THE SOFTWARE.
29 using System;
30 using System.Collections;
31 using System.IO;
32 using System.Text;
33 using SW=System.Web;
35 using Beagle.Daemon;
36 using Beagle.Util;
38 using HtmlAgilityPack;
40 namespace Beagle.Filters {
42 public class FilterHtml : Beagle.Daemon.Filter {
// When we see <b>, push "b" onto the stack.
// When we see </b>, pop from the stack.
// For good error checking we should compare the
// current element with what was popped.
// Currently we just pop, which might allow
// unmatched elements to pass through.
49 private Stack hot_stack;
50 private Stack ignore_stack;
51 private bool building_text;
52 private StringBuilder builder;
// Creates the filter: records its version, registers the MIME types it
// handles, switches SnippetMode on and allocates the state used while
// walking the document (hot-tag stack, ignored-tag stack, title buffer).
public FilterHtml ()
{
	// Version history:
	// 1: Add meta keyword fields as meta:key
	SetVersion (1);

	// Kept ahead of the field setup, exactly as before: this call is
	// virtual, so its position in the constructor is observable.
	RegisterSupportedTypes ();
	SnippetMode = true;

	// Tag bookkeeping.
	this.hot_stack = new Stack ();
	this.ignore_stack = new Stack ();

	// <title> accumulation state.
	this.building_text = false;
	this.builder = new StringBuilder ();
}
// Pops the top entry off st, but only when there is something to pop:
// a null or empty stack is left untouched. This protects against the
// spurious pops that mismatched tags in bad HTML files would cause.
// FIXME: If matching elements is not required
// and if HtmlAgilityPack matches elements itself,
// then we can just use a counter hot_stack_depth
// instead of the hot_stack
private void SafePop (Stack st)
{
	if (st == null || st.Count == 0)
		return;

	st.Pop ();
}
// Returns true when nodeName is one of the tags treated as "hot"
// (bold, underline, emphasis, headings, italics and table headers).
// HandleNodeEvent wraps the content of such tags in HotUp ()/HotDown ().
// The comparison is exact and case-sensitive; null yields false.
protected bool NodeIsHot (String nodeName)
{
	switch (nodeName) {
	case "b":
	case "u":
	case "em":
	case "strong":
	case "big":
	case "h1":
	case "h2":
	case "h3":
	case "h4":
	case "h5":
	case "h6":
	case "i":
	case "th":
		return true;
	default:
		return false;
	}
}
// Returns true when nodeName is a tag around which HandleNodeEvent
// inserts white space (table cells, anchors, divs and options).
// The comparison is exact and case-sensitive; null yields false.
protected static bool NodeBreaksText (String nodeName)
{
	switch (nodeName) {
	case "td":
	case "a":
	case "div":
	case "option":
		return true;
	default:
		return false;
	}
}
// Returns true when nodeName is a tag whose closing makes
// HandleNodeEvent emit a structural break (paragraphs, line breaks
// and the six heading levels).
// The comparison is exact and case-sensitive; null yields false.
protected static bool NodeBreaksStructure (string nodeName)
{
	switch (nodeName) {
	case "p":
	case "br":
	case "h1":
	case "h2":
	case "h3":
	case "h4":
	case "h5":
	case "h6":
		return true;
	default:
		return false;
	}
}
// Returns true when nodeName is a tag whose text content is skipped
// by HandleNodeEvent (scripts, image maps and style sheets).
// The comparison is exact and case-sensitive; null yields false.
protected static bool NodeIsContentFree (String nodeName)
{
	switch (nodeName) {
	case "script":
	case "map":
	case "style":
		return true;
	default:
		return false;
	}
}
// Callback for every node the HTML parser reports (DoOpen attaches it to
// HtmlDocument.ReportNode). Document and element nodes are handed to
// HandleElementNode, text nodes to HandleTextNode; other node types are
// ignored.
//
// Returns the value of AllowMoreWords ().
// NOTE(review): presumably a false return tells the parser to stop
// reporting nodes -- confirm against the ReportNode delegate contract.
protected bool HandleNodeEvent (HtmlNode node)
{
	switch (node.NodeType) {
	case HtmlNodeType.Document:
	case HtmlNodeType.Element:
		HandleElementNode (node);
		break;

	case HtmlNodeType.Text:
		HandleTextNode ((HtmlTextNode) node);
		break;
	}

	return AllowMoreWords ();
}

// Processes one start or end tag: collects the <title> text into the
// dc:title property, turns <meta name content> pairs into meta:* properties,
// tracks content-free tags on ignore_stack, and for everything else emits
// white space, structural breaks, img alt text, link targets and hot regions.
private void HandleElementNode (HtmlNode node)
{
	string tag = node.Name;

	if (tag == "title") {
		if (node.StartTag) {
			// Start collecting the title text from scratch.
			builder.Length = 0;
			building_text = true;
		} else {
			String title = HtmlEntity.DeEntitize (builder.ToString ().Trim ());
			AddProperty (Beagle.Property.New ("dc:title", title));
			builder.Length = 0;
			building_text = false;
		}
		return;
	}

	if (tag == "meta") {
		string name = node.GetAttributeValue ("name", "");
		string content = node.GetAttributeValue ("content", "");
		if (name != "" && content != "")
			AddProperty (Beagle.Property.New ("meta:" + name, content));
		return;
	}

	if (NodeIsContentFree (tag)) {
		// Text inside such a node is skipped by HandleTextNode for as
		// long as ignore_stack is non-empty.
		if (node.StartTag)
			ignore_stack.Push (tag);
		else
			SafePop (ignore_stack);
		return;
	}

	bool isHot = NodeIsHot (tag);
	bool breaksText = NodeBreaksText (tag);
	bool breaksStructure = NodeBreaksStructure (tag);

	if (isHot && node.StartTag) {
		// Only the outermost hot tag opens a hot region.
		if (hot_stack.Count == 0)
			HotUp ();
		hot_stack.Push (tag);
	}

	if (breaksText)
		AppendWhiteSpace ();

	if (tag == "img" && node.StartTag) {
		string alt = node.GetAttributeValue ("alt", "");
		if (alt != "") {
			AppendText (HtmlEntity.DeEntitize (alt));
			AppendWhiteSpace ();
		}
	}

	if (tag == "a" && node.StartTag) {
		string href = node.GetAttributeValue ("href", "");
		if (href != "") {
			AppendText (HtmlEntity.DeEntitize (SW.HttpUtility.UrlDecode (href)));
			AppendWhiteSpace ();
		}
	}

	// White space is emitted on both sides of the element-specific
	// text above, as in the original code.
	if (breaksText)
		AppendWhiteSpace ();

	if (breaksStructure && !node.StartTag)
		AppendStructuralBreak ();

	// Close the hot region only when this end tag actually popped the
	// last open hot tag. A stray closing tag arriving with an empty
	// stack used to call HotDown () without a matching HotUp ().
	if (isHot && !node.StartTag && hot_stack.Count != 0) {
		SafePop (hot_stack);
		if (hot_stack.Count == 0)
			HotDown ();
	}
}

// Processes one text node: dropped while inside a content-free element,
// appended verbatim to the title buffer while inside <title>, otherwise
// de-entitized and appended to the extracted text.
private void HandleTextNode (HtmlTextNode node)
{
	// FIXME Do we need to trim the text ?
	String text = node.Text;

	if (ignore_stack.Count != 0)
		return; // still ignoring ...

	if (building_text)
		builder.Append (text);
	else
		AppendText (HtmlEntity.DeEntitize (text));
}
// Parses the HTML on Stream and feeds every reported node to
// HandleNodeEvent, then calls Finished ().
//
// The text encoding is taken from the indexable property whose key is
// StringFu.UnindexedNamespace + "encoding" when one is present and usable;
// otherwise HtmlDocument.DetectEncoding is asked to determine it. Load
// failures are written to the console rather than propagated, so
// Finished () is always reached.
//
// info is not used by this implementation.
override protected void DoOpen (FileInfo info)
{
	Encoding enc = null;

	foreach (Property prop in IndexableProperties) {
		if (prop.Key != StringFu.UnindexedNamespace + "encoding")
			continue;

		try {
			enc = Encoding.GetEncoding ((string) prop.Value);
		} catch (NotSupportedException) {
			// Encoding passed in isn't supported. Maybe
			// we'll get lucky detecting it from the
			// document instead.
		} catch (ArgumentException) {
			// Encoding.GetEncoding reports an unknown or null
			// name with ArgumentException; treat it the same
			// way and fall back to detection.
		}

		break;
	}

	if (enc == null) {
		// we need to tell the parser to detect encoding,
		HtmlDocument temp_doc = new HtmlDocument ();
		enc = temp_doc.DetectEncoding (Stream);
		//Console.WriteLine ("Detected encoding:" + (enc == null ? "null" : enc.EncodingName));
		// Detection consumed part of the stream; rewind for the real parse.
		Stream.Seek (0, SeekOrigin.Begin);
	}

	HtmlDocument doc = new HtmlDocument ();
	doc.ReportNode += HandleNodeEvent;
	doc.StreamMode = true;
	// we already determined encoding
	doc.OptionReadEncoding = false;

	try {
		if (enc == null)
			doc.Load (Stream);
		else
			doc.Load (Stream, enc);
	} catch (NotSupportedException) {
		// Retry as plain ASCII. The failed attempt may have moved the
		// stream position, so start again from the beginning, and
		// keep a failure of the retry from escaping past Finished ().
		try {
			Stream.Seek (0, SeekOrigin.Begin);
			doc.Load (Stream, Encoding.ASCII);
		} catch (Exception e) {
			Console.WriteLine (e.Message);
			Console.WriteLine (e.StackTrace);
		}
	} catch (Exception e) {
		Console.WriteLine (e.Message);
		Console.WriteLine (e.StackTrace);
	}

	Finished ();
}
// Declares the flavors this filter handles: the "text/html" MIME type.
// Virtual so that derived filters can register a different set; it is
// invoked from the constructor.
protected virtual void RegisterSupportedTypes ()
{
	const string html_mime_type = "text/html";

	AddSupportedFlavor (FilterFlavor.NewFromMimeType (html_mime_type));
}