4 // Copyright (C) 2005 Debajyoti Bera <dbera.web@gmail.com>
5 // Copyright (C) 2004 Novell, Inc.
9 // Permission is hereby granted, free of charge, to any person obtaining a
10 // copy of this software and associated documentation files (the "Software"),
11 // to deal in the Software without restriction, including without limitation
12 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 // and/or sell copies of the Software, and to permit persons to whom the
14 // Software is furnished to do so, subject to the following conditions:
16 // The above copyright notice and this permission notice shall be included in
17 // all copies or substantial portions of the Software.
19 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 // DEALINGS IN THE SOFTWARE.
30 using System
.Collections
;
38 using HtmlAgilityPack
;
40 namespace Beagle
.Filters
{
42 public class FilterHtml
: Beagle
.Daemon
.Filter
{
43 // When we see <b>, push "b" onto the stack.
44 // When we see </b>, pop from the stack.
45 // For good error checking, we should compare the
46 // current element with what was popped.
47 // Currently we just pop, which might allow
48 // unmatched elements to pass through.
// hot_stack: HandleNodeEvent pushes node.Name onto it and tests
// whether its Count is zero.
49 private Stack hot_stack
;
// ignore_stack: HandleNodeEvent pushes the name of a content-free node
// onto it; text nodes are skipped while its Count is non-zero.
50 private Stack ignore_stack
;
// building_text: set to true and later back to false inside the "title"
// branch of HandleNodeEvent.
51 private bool building_text
;
// builder: text-node content is appended to it, and its trimmed content
// is used as the dc:title property value.
52 private StringBuilder builder
;
// enc: encoding chosen in DoOpen; passed to HtmlDocument.Load and to
// UrlDecode when handling href attributes.
53 protected Encoding enc
;
// Delegate types for the filter's output operations. The constructor
// wraps the base-class methods in these delegates, and ExtractText
// accepts caller-supplied instances of them.
// AppendTextCallback: takes the text to append and returns an int
// (meaning of the return value is not visible here -- TODO confirm).
56 public delegate int AppendTextCallback (string s
);
// AddPropertyCallback: receives a Beagle.Property to record.
57 public delegate void AddPropertyCallback (Beagle
.Property p
);
// AppendSpaceCallback: parameterless; used for both the AppendWhiteSpace
// and AppendStructuralBreak callbacks.
58 public delegate void AppendSpaceCallback ();
// HotCallback: parameterless; used for both the HotUp and HotDown callbacks.
59 public delegate void HotCallback ();
// Callback fields through which this class emits its output.
// Each is declared with the "new" modifier, so within this class the
// name refers to the delegate field rather than the inherited member of
// the same name. The constructor points them at the base-class methods;
// ExtractText overwrites them with caller-supplied callbacks.
62 private new AppendTextCallback AppendText
;
63 private new AddPropertyCallback AddProperty
;
64 private new AppendSpaceCallback AppendWhiteSpace
;
65 private new AppendSpaceCallback AppendStructuralBreak
;
66 private new HotCallback HotUp
;
67 private new HotCallback HotDown
;
// Creates the filter.
// register_filter: when true, RegisterSupportedTypes () is called.
// The output callbacks are bound to the base-class
// implementations, and the element stacks, the building_text flag and
// the text builder are initialised.
// NOTE(review): RegisterSupportedTypes is declared virtual, so an override
// in a subclass runs before that subclass's own constructor body.
// NOTE(review): this excerpt omits some lines of the constructor (for
// example the closing brace of the if block), so where the conditional
// part ends cannot be confirmed from here.
69 public FilterHtml (bool register_filter
)
71 if (register_filter
) {
72 // 1: Add meta keyword fields as meta:key
74 RegisterSupportedTypes ();
// Default wiring: route all output to the base Filter methods.
77 AppendText
= new AppendTextCallback (base.AppendText
);
78 AddProperty
= new AddPropertyCallback (base.AddProperty
);
79 AppendWhiteSpace
= new AppendSpaceCallback (base.AppendWhiteSpace
);
80 AppendStructuralBreak
= new AppendSpaceCallback (base.AppendStructuralBreak
);
81 HotUp
= new HotCallback (base.HotUp
);
82 HotDown
= new HotCallback (base.HotDown
);
// Parser state used by HandleNodeEvent.
85 hot_stack
= new Stack ();
86 ignore_stack
= new Stack ();
87 building_text
= false;
88 builder
= new StringBuilder ();
// Default constructor: delegates to FilterHtml (true), i.e. the overload
// is asked to register the supported types.
91 public FilterHtml () : this (true) {}
93 // Safeguard against spurious stack pops
94 // caused by mismatched tags in bad HTML files.
95 // FIXME: If matching elements is not required
96 // and if HtmlAgilityPack matches elements itself,
97 // then we can just use a counter hot_stack_depth
98 // instead of the hot_stack.
// st: the stack to operate on; a null or empty stack fails the guard below.
// NOTE(review): the statement guarded by the condition is not visible in
// this excerpt -- presumably st.Pop (); confirm against the full file.
99 private void SafePop (Stack st
)
101 if (st
!= null && st
.Count
!= 0)
// Returns whether nodeName names a "hot" element. The visible
// operands are exact string comparisons against "b" and "strong".
// NOTE(review): the expression is cut off in this excerpt; further
// element names are probably compared in lines not shown -- confirm.
105 protected bool NodeIsHot (String nodeName
)
107 return nodeName
== "b"
110 || nodeName
== "strong"
// Returns whether nodeName names an element that breaks the text flow.
// The visible operands are exact string comparisons against "td" and
// "option".
// NOTE(review): lines between the two visible operands are missing from
// this excerpt, so the full list of names cannot be confirmed here.
122 protected static bool NodeBreaksText (String nodeName
)
124 return nodeName
== "td"
127 || nodeName
== "option";
// Returns whether nodeName names an element that breaks document
// structure. The only visible operand is an exact string comparison
// against "p".
// NOTE(review): the expression is cut off in this excerpt; the remaining
// operands are not shown -- confirm against the full file.
130 protected static bool NodeBreaksStructure (string nodeName
)
132 return nodeName
== "p"
// Returns whether nodeName names an element whose contents carry no
// indexable text. The visible operands are exact string comparisons
// against "script" and "style". HandleNodeEvent uses this to decide
// whether an element's contents are ignored.
// NOTE(review): a line between the two visible operands is missing from
// this excerpt, so there may be a further name -- confirm.
142 protected static bool NodeIsContentFree (String nodeName
)
144 return nodeName
== "script"
146 || nodeName
== "style";
// Node callback subscribed to HtmlDocument.ReportNode (see DoOpen and
// ExtractText). Dispatches on node.NodeType:
//  - Document and Element share one case body:
//      "title": building_text is switched on, and later the trimmed,
//               de-entitized content of builder is emitted as the
//               "dc:title" property and building_text is switched off.
//      "meta":  when both the "name" and "content" attributes are
//               non-empty, emits the property "meta:" + name.
//      any other node that is not content-free: computes the hot /
//               breaks-text / breaks-structure flags, works with
//               hot_stack, appends the "alt" text of "img" and the
//               URL-decoded "href" of "a" as text, and may emit a
//               structural break.
//      content-free nodes: the node name is pushed on ignore_stack and
//               popped again through SafePop.
//  - Text: skipped while ignore_stack is non-empty; otherwise the text
//    goes to builder and/or, de-entitized, to AppendText.
// NOTE(review): this excerpt omits many lines (start-tag/end-tag tests,
// the building_text test around the text handling, break and return
// statements), so the exact branch structure and the returned value
// cannot be confirmed from here. The last visible statement tests
// AllowMoreWords ().
149 protected bool HandleNodeEvent (HtmlNode node
)
151 switch (node
.NodeType
) {
153 case HtmlNodeType
.Document
:
154 case HtmlNodeType
.Element
:
// <title>: collect the text and publish it as dc:title.
155 if (node
.Name
== "title") {
158 building_text
= true;
160 String title
= HtmlEntity
.DeEntitize (builder
.ToString ().Trim ());
161 AddProperty (Beagle
.Property
.New ("dc:title", title
));
163 building_text
= false;
// <meta name=... content=...>: publish as a "meta:<name>" property.
165 } else if (node
.Name
== "meta") {
166 string name
= node
.GetAttributeValue ("name", "");
167 string content
= node
.GetAttributeValue ("content", "");
168 if (name
!= String
.Empty
&& content
!= String
.Empty
)
169 AddProperty (Beagle
.Property
.New ("meta:" + name
, content
));
// Ordinary element whose contents are indexed.
170 } else if (! NodeIsContentFree (node
.Name
)) {
171 bool isHot
= NodeIsHot (node
.Name
);
172 bool breaksText
= NodeBreaksText (node
.Name
);
173 bool breaksStructure
= NodeBreaksStructure (node
.Name
);
180 if (hot_stack
.Count
== 0)
182 hot_stack
.Push (node
.Name
);
// Image alt text is indexed as ordinary text.
184 if (node
.Name
== "img") {
185 string attr
= node
.GetAttributeValue ("alt", "");
186 if (attr
!= String
.Empty
) {
187 AppendText (HtmlEntity
.DeEntitize (attr
));
// Link targets are URL-decoded with enc and indexed as text.
190 } else if (node
.Name
== "a") {
191 string attr
= node
.GetAttributeValue ("href", "");
192 if (attr
!= String
.Empty
) {
193 AppendText (HtmlEntity
.DeEntitize (
194 SW
.HttpUtility
.UrlDecode (attr
, enc
)));
198 } else { // (! node.StartTag)
201 if (hot_stack
.Count
== 0)
205 AppendStructuralBreak ();
211 // so node is a content-free node
212 // ignore contents of such node
214 ignore_stack
.Push (node
.Name
);
216 SafePop (ignore_stack
);
// Text node: dropped while inside a content-free element.
220 case HtmlNodeType
.Text
:
221 // FIXME Do we need to trim the text ?
222 String text
= ((HtmlTextNode
)node
).Text
;
223 if (ignore_stack
.Count
!= 0)
224 break; // still ignoring ...
226 builder
.Append (text
);
228 AppendText (HtmlEntity
.DeEntitize (text
));
229 //if (hot_stack.Count != 0)
230 //Console.WriteLine (" TEXT:" + text + " ignore=" + ignore_stack.Count);
234 if (! AllowMoreWords ())
// Opens the HTML input for filtering. Steps visible in this excerpt:
//  1. Scan IndexableProperties for the key
//     StringFu.UnindexedNamespace + "encoding" and resolve its value with
//     Encoding.GetEncoding; NotSupportedException is caught.
//  2. Let a temporary HtmlDocument detect the encoding from Stream, then
//     rewind Stream to the beginning.
//  3. Parse Stream with a fresh HtmlDocument in stream mode, with
//     HandleNodeEvent subscribed to ReportNode and OptionReadEncoding
//     turned off because the encoding is already known.
//  4. If Load throws NotSupportedException, retry with Encoding.ASCII;
//     any other exception is logged through Log.Debug together with
//     info.FullName.
// info: the file being filtered; only FullName is read in the visible code.
// NOTE(review): the try openers, the body of the property test and the
// condition under which step 2 runs are missing from this excerpt.
// NOTE(review): Encoding.GetEncoding (string) can also throw
// ArgumentException for an unrecognised name; only NotSupportedException
// is caught in the visible lines -- confirm this is intended.
239 override protected void DoOpen (FileInfo info
)
243 foreach (Property prop
in IndexableProperties
) {
244 if (prop
.Key
!= StringFu
.UnindexedNamespace
+ "encoding")
248 enc
= Encoding
.GetEncoding ((string) prop
.Value
);
249 } catch (NotSupportedException
) {
250 // Encoding passed in isn't supported. Maybe
251 // we'll get lucky detecting it from the
259 // we need to tell the parser to detect encoding,
260 HtmlDocument temp_doc
= new HtmlDocument ();
261 enc
= temp_doc
.DetectEncoding (Stream
);
262 //Console.WriteLine ("Detected encoding:" + (enc == null ? "null" : enc.EncodingName));
// Rewind so the real parse below starts from the beginning of the stream.
264 Stream
.Seek (0, SeekOrigin
.Begin
);
267 HtmlDocument doc
= new HtmlDocument ();
268 doc
.ReportNode
+= HandleNodeEvent
;
269 doc
.StreamMode
= true;
270 // we already determined encoding
271 doc
.OptionReadEncoding
= false;
277 doc
.Load (Stream
, enc
);
// Fallback: the chosen encoding is unusable, so retry as ASCII.
278 } catch (NotSupportedException
) {
279 enc
= Encoding
.ASCII
;
280 doc
.Load (Stream
, enc
);
281 } catch (Exception e
) {
282 Log
.Debug (e
, "Exception while filtering HTML file " + info
.FullName
);
// Filters an in-memory HTML string, sending the output to caller-supplied
// callbacks instead of the base-class methods bound by the constructor.
// html_string:     the HTML markup to parse.
// append_text_cb:  replaces the AppendText callback.
// add_prop_cb:     replaces the AddProperty callback.
// append_white_cb: replaces the AppendWhiteSpace callback.
// append_break_cb: replaces the AppendStructuralBreak callback.
// hot_up_cb:       callback intended for HotUp (see the note below).
// hot_down_cb:     replaces the HotDown callback.
// The string is parsed by an HtmlDocument in stream mode with
// HandleNodeEvent subscribed to ReportNode; an exception caught here is
// logged through Log.Debug together with the input string.
// NOTE(review): no assignment of hot_up_cb to HotUp is visible in this
// excerpt (a line appears to be missing between the AppendStructuralBreak
// and HotDown assignments) -- confirm it exists in the full file.
288 public void ExtractText (string html_string
,
289 AppendTextCallback append_text_cb
,
290 AddPropertyCallback add_prop_cb
,
291 AppendSpaceCallback append_white_cb
,
292 AppendSpaceCallback append_break_cb
,
293 HotCallback hot_up_cb
,
294 HotCallback hot_down_cb
)
296 AppendText
= append_text_cb
;
297 AddProperty
= add_prop_cb
;
298 AppendWhiteSpace
= append_white_cb
;
299 AppendStructuralBreak
= append_break_cb
;
301 HotDown
= hot_down_cb
;
303 HtmlDocument doc
= new HtmlDocument ();
304 doc
.ReportNode
+= HandleNodeEvent
;
305 doc
.StreamMode
= true;
308 doc
.LoadHtml (html_string
);
309 } catch (Exception e
) {
310 Log
.Debug (e
, "Exception while filtering html string [{0}]", html_string
);
// Registers the content this filter handles: a flavor built from the
// MIME type "text/html". Declared virtual, so a subclass can override it;
// it is called from the FilterHtml (bool) constructor when registration
// is requested.
315 virtual protected void RegisterSupportedTypes ()
317 AddSupportedFlavor (FilterFlavor
.NewFromMimeType ("text/html"));