Don't throw EncodingFoundException unless asked to. Should remove the occasional...
[beagle.git] / Filters / FilterHtml.cs
bloba795a3c425735d40f5617a00ecbab0c68c3443e4
1 //
2 // FilterHtml.cs
3 //
4 // Copyright (C) 2005 Debajyoti Bera <dbera.web@gmail.com>
5 // Copyright (C) 2004 Novell, Inc.
6 //
8 //
9 // Permission is hereby granted, free of charge, to any person obtaining a
10 // copy of this software and associated documentation files (the "Software"),
11 // to deal in the Software without restriction, including without limitation
12 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 // and/or sell copies of the Software, and to permit persons to whom the
14 // Software is furnished to do so, subject to the following conditions:
16 // The above copyright notice and this permission notice shall be included in
17 // all copies or substantial portions of the Software.
19 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 // DEALINGS IN THE SOFTWARE.
29 using System;
30 using System.Collections;
31 using System.IO;
32 using System.Text;
33 using SW=System.Web;
35 using Beagle.Daemon;
36 using Beagle.Util;
38 using HtmlAgilityPack;
40 namespace Beagle.Filters {
42 public class FilterHtml : Beagle.Daemon.Filter {
43 // When we see <b>, push "b" onto the stack.
44 // When we see </b>, pop from the stack.
45 // For good error checking, we should compare the
46 // current element with what was popped.
47 // Currently we just pop, which might allow
48 // unmatched elements to pass through.
49 private Stack hot_stack; // names of the currently open "hot" elements
50 private Stack ignore_stack; // names of the currently open content-free elements
51 private bool building_text; // true while text nodes are collected for the <title>
52 private StringBuilder builder; // accumulates the <title> text
// Creates the filter: sets the filter version, calls
// RegisterSupportedTypes (), turns on snippet mode and
// resets the parser state used while handling nodes.
public FilterHtml ()
{
    // 1: Add meta keyword fields as meta:key
    SetVersion (1);

    // Note: RegisterSupportedTypes is virtual, so it is deliberately
    // called at the same point as before, ahead of the field setup.
    RegisterSupportedTypes ();
    SnippetMode = true;

    this.hot_stack = new Stack ();
    this.ignore_stack = new Stack ();
    this.builder = new StringBuilder ();
    this.building_text = false;
}
// Pops the top entry of the given stack, silently doing nothing
// when the stack is null or already empty. This protects against
// spurious pops caused by mismatched tags in bad html files.
// FIXME: If matching elements is not required
// and if HtmlAgilityPack matches elements itself,
// then we can just use a counter hot_stack_depth
// instead of the hot_stack
private void SafePop (Stack st)
{
    if (st == null || st.Count == 0)
        return;

    st.Pop ();
}
// Returns true when nodeName is one of the elements treated as
// "hot" (emphasis, headings and table headers); false otherwise,
// including for null. The comparison is case-sensitive.
protected bool NodeIsHot (String nodeName)
{
    switch (nodeName) {
    case "b":
    case "u":
    case "em":
    case "strong":
    case "big":
    case "h1":
    case "h2":
    case "h3":
    case "h4":
    case "h5":
    case "h6":
    case "i":
    case "th":
        return true;
    default:
        return false;
    }
}
// Returns true when nodeName is an element around which
// HandleNodeEvent appends whitespace (td, a, div, option);
// false otherwise, including for null. Case-sensitive.
protected static bool NodeBreaksText (String nodeName)
{
    switch (nodeName) {
    case "td":
    case "a":
    case "div":
    case "option":
        return true;
    default:
        return false;
    }
}
// Returns true when nodeName is an element whose closing tag makes
// HandleNodeEvent append a structural break (p, br and h1-h6);
// false otherwise, including for null. Case-sensitive.
protected static bool NodeBreaksStructure (string nodeName)
{
    switch (nodeName) {
    case "p":
    case "br":
    case "h1":
    case "h2":
    case "h3":
    case "h4":
    case "h5":
    case "h6":
        return true;
    default:
        return false;
    }
}
// Returns true when nodeName is an element whose contents are
// ignored by HandleNodeEvent (script, map, style); false
// otherwise, including for null. Case-sensitive.
protected static bool NodeIsContentFree (String nodeName)
{
    if (nodeName == "script")
        return true;
    if (nodeName == "map")
        return true;
    if (nodeName == "style")
        return true;

    return false;
}
// Handles one node reported by the HTML parser (this method is
// attached to HtmlDocument.ReportNode in DoOpen).
//
// Document/element nodes:
//   - <title> ... </title>: the text in between is collected in
//     "builder" and stored as the dc:title property.
//   - <meta name=... content=...>: stored as a "meta:<name>" property
//     when both attributes are non-empty.
//   - content-free elements (see NodeIsContentFree): tracked on
//     ignore_stack so that their text is skipped.
//   - everything else: drives hot text, whitespace and structural
//     breaks, and indexes img alt text and link targets.
// Text nodes are either added to the title being built or appended
// as text, unless they are inside a content-free element.
//
// Returns the value of AllowMoreWords (); presumably a false return
// tells the parser to stop reporting nodes (TODO confirm against
// HtmlAgilityPack's ReportNode contract).
protected bool HandleNodeEvent (HtmlNode node)
{
    switch (node.NodeType) {

    case HtmlNodeType.Document:
    case HtmlNodeType.Element:
        if (node.Name == "title") {
            if (node.StartTag) {
                builder.Length = 0;
                building_text = true;
            } else {
                String title = HtmlEntity.DeEntitize (builder.ToString ().Trim ());
                AddProperty (Beagle.Property.New ("dc:title", title));
                builder.Length = 0;
                building_text = false;
            }
        } else if (node.Name == "meta") {
            string name = node.GetAttributeValue ("name", "");
            string content = node.GetAttributeValue ("content", "");
            if (name != "" && content != "")
                AddProperty (Beagle.Property.New ("meta:" + name, content));
        } else if (! NodeIsContentFree (node.Name)) {
            bool isHot = NodeIsHot (node.Name);
            bool breaksText = NodeBreaksText (node.Name);
            bool breaksStructure = NodeBreaksStructure (node.Name);

            if (isHot && node.StartTag) {
                // Only the outermost hot element switches hot mode on.
                if (hot_stack.Count == 0)
                    HotUp ();
                hot_stack.Push (node.Name);
            }

            if (breaksText && node.StartTag)
                AppendWhiteSpace ();

            if (node.Name == "img" && node.StartTag) {
                string attr = node.GetAttributeValue ("alt", "");
                if (attr != "") {
                    // "img" is not a text-breaking element, so the alt
                    // text has to be separated explicitly on both sides;
                    // otherwise it fuses with the neighbouring words.
                    AppendWhiteSpace ();
                    AppendText (HtmlEntity.DeEntitize (attr));
                    AppendWhiteSpace ();
                }
            }

            if (node.Name == "a" && node.StartTag) {
                string attr = node.GetAttributeValue ("href", "");
                if (attr != "") {
                    AppendText (HtmlEntity.DeEntitize (SW.HttpUtility.UrlDecode (attr)));
                    // Keep the link target apart from the link text
                    // that follows the opening tag.
                    AppendWhiteSpace ();
                }
            }

            if (breaksText && !node.StartTag)
                AppendWhiteSpace ();

            if (breaksStructure && !node.StartTag)
                AppendStructuralBreak ();

            if (isHot && !node.StartTag) {
                // SafePop tolerates an empty stack (unmatched closing tag).
                SafePop (hot_stack);
                if (hot_stack.Count == 0)
                    HotDown ();
            }
        } else {
            // so node is a content-free node
            // ignore contents of such node
            if (node.StartTag)
                ignore_stack.Push (node.Name);
            else
                SafePop (ignore_stack);
        }
        break;

    case HtmlNodeType.Text:
        // FIXME Do we need to trim the text ?
        String text = ((HtmlTextNode)node).Text;
        if (ignore_stack.Count != 0)
            break; // still ignoring ...
        if (building_text)
            builder.Append (text);
        else
            AppendText (HtmlEntity.DeEntitize (text));
        break;
    }

    return AllowMoreWords ();
}
// Parses the HTML stream and feeds every node to HandleNodeEvent.
//
// The encoding is taken from the "encoding" property in the
// unindexed namespace when present and usable; otherwise the parser
// is asked to detect it from the document. If the chosen encoding
// turns out to be unsupported while loading, the document is
// re-read as ASCII. Load failures are reported on the console and
// never propagate; Finished () is always called.
override protected void DoOpen (FileInfo info)
{
    Encoding enc = null;

    foreach (Property prop in IndexableProperties) {
        if (prop.Key != StringFu.UnindexedNamespace + "encoding")
            continue;

        try {
            enc = Encoding.GetEncoding ((string) prop.Value);
        } catch (NotSupportedException) {
            // Encoding passed in isn't supported. Maybe
            // we'll get lucky detecting it from the
            // document instead.
        } catch (ArgumentException) {
            // Unknown or missing encoding name: GetEncoding reports
            // this with ArgumentException on some runtimes. Fall
            // back to detection, exactly as for unsupported ones.
        }

        break;
    }

    if (enc == null) {
        // we need to tell the parser to detect encoding,
        HtmlDocument temp_doc = new HtmlDocument ();
        enc = temp_doc.DetectEncoding (Stream);
        //Console.WriteLine ("Detected encoding:" + (enc == null ? "null" : enc.EncodingName));
        // Detection consumed part of the stream; start over for the real parse.
        Stream.Seek (0, SeekOrigin.Begin);
    }

    HtmlDocument doc = new HtmlDocument ();
    doc.ReportNode += HandleNodeEvent;
    doc.StreamMode = true;
    // we already determined encoding
    doc.OptionReadEncoding = false;

    try {
        if (enc == null)
            doc.Load (Stream);
        else
            doc.Load (Stream, enc);
    } catch (NotSupportedException) {
        // Last resort: re-read the document from the start as ASCII.
        // A failure here must not escape, otherwise Finished () below
        // would be skipped.
        try {
            Stream.Seek (0, SeekOrigin.Begin);
            doc.Load (Stream, Encoding.ASCII);
        } catch (Exception e) {
            Console.WriteLine (e.Message);
            Console.WriteLine (e.StackTrace);
        }
    } catch (Exception e) {
        Console.WriteLine (e.Message);
        Console.WriteLine (e.StackTrace);
    }

    Finished ();
}
// Declares the flavors this filter handles: the "text/html" MIME
// type. Virtual so that derived filters can register their own
// types instead; it is called from the FilterHtml constructor.
protected virtual void RegisterSupportedTypes ()
{
    AddSupportedFlavor (
        FilterFlavor.NewFromMimeType ("text/html"));
}