4 // Copyright (C) 2005 Debajyoti Bera <dbera.web@gmail.com>
5 // Copyright (C) 2004 Novell, Inc.
9 // Permission is hereby granted, free of charge, to any person obtaining a
10 // copy of this software and associated documentation files (the "Software"),
11 // to deal in the Software without restriction, including without limitation
12 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 // and/or sell copies of the Software, and to permit persons to whom the
14 // Software is furnished to do so, subject to the following conditions:
16 // The above copyright notice and this permission notice shall be included in
17 // all copies or substantial portions of the Software.
19 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 // DEALINGS IN THE SOFTWARE.
30 using System
.Collections
;
38 using HtmlAgilityPack
;
40 namespace Beagle
.Filters
{
42 public class FilterHtml
: Beagle
.Daemon
.Filter
{
43 // When we see an opening tag such as <b>, push "b" onto the stack.
44 // When we see the closing tag </b>, pop from the stack.
45 // For good error checking, we should compare the
46 // current element with what was popped.
47 // Currently we just pop, which might allow
48 // unmatched elements to pass through.
49 private Stack hot_stack
;
// Names of content-free nodes pushed by HandleNodeEvent; text nodes are
// skipped while this stack is non-empty.
50 private Stack ignore_stack
;
// Toggled by HandleNodeEvent while it processes a "title" element.
51 private bool building_text
;
// Buffer that text nodes are appended to; its trimmed, de-entitized
// contents are published as the "dc:title" property.
52 private StringBuilder builder
;
// Callback signatures through which the filter reports its output.
// They are bound to the base-class methods in the constructor, or
// supplied by the caller of ExtractText.
// Receives a chunk of extracted text.
55 public delegate int AppendTextCallback (string s
);
// Receives a property extracted from the document (e.g. dc:title, meta:*).
56 public delegate void AddPropertyCallback (Beagle
.Property p
);
// Shared signature for the white-space and structural-break callbacks.
57 public delegate void AppendSpaceCallback ();
// Shared signature for the HotUp / HotDown callbacks.
58 public delegate void HotCallback ();
// Delegate-typed fields declared with the `new` modifier, i.e. they hide
// inherited members of the same names. Inside this class, calls such as
// AppendText (...) therefore go through these delegates. The constructor
// binds them to the base-class implementations; ExtractText reassigns
// them from its callback arguments.
61 private new AppendTextCallback AppendText
;
62 private new AddPropertyCallback AddProperty
;
63 private new AppendSpaceCallback AppendWhiteSpace
;
64 private new AppendSpaceCallback AppendStructuralBreak
;
65 private new HotCallback HotUp
;
66 private new HotCallback HotDown
;
// Creates the filter.
// register_filter: when true, the supported types are registered and the
// callback fields are bound to the base-class implementations.
68 public FilterHtml (bool register_filter
)
70 if (register_filter
) {
71 // 1: Add meta keyword fields as meta:key
// NOTE(review): RegisterSupportedTypes is declared virtual, so an override
// in a subclass runs here before that subclass's constructor body.
73 RegisterSupportedTypes ();
// Route all output through the inherited methods of the base class.
76 AppendText
= new AppendTextCallback (base.AppendText
);
77 AddProperty
= new AddPropertyCallback (base.AddProperty
);
78 AppendWhiteSpace
= new AppendSpaceCallback (base.AppendWhiteSpace
);
79 AppendStructuralBreak
= new AppendSpaceCallback (base.AppendStructuralBreak
);
80 HotUp
= new HotCallback (base.HotUp
);
81 HotDown
= new HotCallback (base.HotDown
);
// Initial parser state: empty stacks, no title being built, empty buffer.
84 hot_stack
= new Stack ();
85 ignore_stack
= new Stack ();
86 building_text
= false;
87 builder
= new StringBuilder ();
// Default constructor: delegates to FilterHtml (true).
90 public FilterHtml () : this (true) {}
92 // Safeguard against spurious stack pop ups...
93 // caused by mismatched tags in bad html files
94 // FIXME: If matching elements is not required
95 // and if HtmlAgilityPack matches elements itself,
96 // then we can just use a counter hot_stack_depth
97 // instead of the hot_stack
// st: the stack to operate on; a null or empty stack is tolerated.
98 private void SafePop (Stack st
)
// Guard: only act when the stack exists and holds at least one entry.
100 if (st
!= null && st
.Count
!= 0)
// Reports whether nodeName names an element whose text is treated as
// "hot"; the tag names checked include "b" and "strong".
// The comparison uses ==, so it is case-sensitive.
104 protected bool NodeIsHot (String nodeName
)
106 return nodeName
== "b"
109 || nodeName
== "strong"
// Reports whether nodeName names an element that breaks the text flow;
// the tag names checked include "td" and "option".
// The comparison uses ==, so it is case-sensitive.
121 protected static bool NodeBreaksText (String nodeName
)
123 return nodeName
== "td"
126 || nodeName
== "option";
// Reports whether nodeName names an element that breaks the document
// structure; the tag names checked include "p".
// The comparison uses ==, so it is case-sensitive.
129 protected static bool NodeBreaksStructure (string nodeName
)
131 return nodeName
== "p"
// Reports whether nodeName names an element whose contents carry no
// indexable text; the tag names checked include "script" and "style".
// HandleNodeEvent ignores the contents of such nodes.
141 protected static bool NodeIsContentFree (String nodeName
)
143 return nodeName
== "script"
145 || nodeName
== "style";
// Node callback attached to HtmlDocument.ReportNode by DoOpen and
// ExtractText. Dispatches on node.NodeType: element nodes update the
// title/meta properties and the hot/ignore stacks, text nodes are
// forwarded to AppendText.
// node: the node reported by the parser.
148 protected bool HandleNodeEvent (HtmlNode node
)
150 switch (node
.NodeType
) {
152 case HtmlNodeType
.Document
:
153 case HtmlNodeType
.Element
:
// <title>: the buffered text is published as the "dc:title" property.
154 if (node
.Name
== "title") {
157 building_text
= true;
// The buffer is trimmed first, then HTML entities are decoded.
159 String title
= HtmlEntity
.DeEntitize (builder
.ToString ().Trim ());
160 AddProperty (Beagle
.Property
.New ("dc:title", title
));
162 building_text
= false;
// <meta>: published as a "meta:<name>" property, but only when both the
// "name" and "content" attribute values are non-empty.
164 } else if (node
.Name
== "meta") {
165 string name
= node
.GetAttributeValue ("name", "");
166 string content
= node
.GetAttributeValue ("content", "");
167 if (name
!= "" && content
!= "")
168 AddProperty (Beagle
.Property
.New ("meta:" + name
, content
));
// Any other element that is not content-free (see NodeIsContentFree).
169 } else if (! NodeIsContentFree (node
.Name
)) {
// Classify the element once; the three flags come from the predicates.
170 bool isHot
= NodeIsHot (node
.Name
);
171 bool breaksText
= NodeBreaksText (node
.Name
);
172 bool breaksStructure
= NodeBreaksStructure (node
.Name
);
179 if (hot_stack
.Count
== 0)
// The element name is recorded on hot_stack.
181 hot_stack
.Push (node
.Name
);
// <img>: the "alt" attribute text is de-entitized and emitted as text.
183 if (node
.Name
== "img") {
184 string attr
= node
.GetAttributeValue ("alt", "");
186 AppendText (HtmlEntity
.DeEntitize (attr
));
// <a>: the "href" value is URL-decoded, de-entitized and emitted as text.
189 } else if (node
.Name
== "a") {
190 string attr
= node
.GetAttributeValue ("href", "");
192 AppendText (HtmlEntity
.DeEntitize (SW
.HttpUtility
.UrlDecode (attr
)));
196 } else { // (! node.StartTag)
199 if (hot_stack
.Count
== 0)
203 AppendStructuralBreak ();
209 // so node is a content-free node
210 // ignore contents of such node
// The name is pushed so that following text nodes are skipped ...
212 ignore_stack
.Push (node
.Name
);
// ... and popped again (tolerating an already-empty stack) via SafePop.
214 SafePop (ignore_stack
);
218 case HtmlNodeType
.Text
:
219 // FIXME Do we need to trim the text ?
220 String text
= ((HtmlTextNode
)node
).Text
;
// Text is dropped while any content-free node is still open.
221 if (ignore_stack
.Count
!= 0)
222 break; // still ignoring ...
// Raw text goes into the buffer; the de-entitized text goes to AppendText.
224 builder
.Append (text
);
226 AppendText (HtmlEntity
.DeEntitize (text
));
227 //if (hot_stack.Count != 0)
228 //Console.WriteLine (" TEXT:" + text + " ignore=" + ignore_stack.Count);
// NOTE(review): presumably a false result from AllowMoreWords () is used to
// stop further parsing - confirm against the return statements.
232 if (! AllowMoreWords ())
// Opens the HTML input: determines an encoding, then loads the stream
// through an HtmlDocument in stream mode with HandleNodeEvent attached
// as the node callback.
// info: the file being filtered (not referenced in the visible statements).
237 override protected void DoOpen (FileInfo info
)
// Look through the indexable properties for the key
// StringFu.UnindexedNamespace + "encoding".
// NOTE(review): presumably properties with any other key are skipped and
// the matching value is the encoding name - confirm.
241 foreach (Property prop
in IndexableProperties
) {
242 if (prop
.Key
!= StringFu
.UnindexedNamespace
+ "encoding")
246 enc
= Encoding
.GetEncoding ((string) prop
.Value
);
247 } catch (NotSupportedException
) {
248 // Encoding passed in isn't supported. Maybe
249 // we'll get lucky detecting it from the
257 // we need to tell the parser to detect encoding,
// A throw-away document is used only to sniff the encoding of the stream.
258 HtmlDocument temp_doc
= new HtmlDocument ();
259 enc
= temp_doc
.DetectEncoding (Stream
);
260 //Console.WriteLine ("Detected encoding:" + (enc == null ? "null" : enc.EncodingName));
// Rewind so that the real parse starts at the beginning of the stream.
262 Stream
.Seek (0, SeekOrigin
.Begin
);
265 HtmlDocument doc
= new HtmlDocument ();
266 doc
.ReportNode
+= HandleNodeEvent
;
267 doc
.StreamMode
= true;
268 // we already determined encoding
269 doc
.OptionReadEncoding
= false;
275 doc
.Load (Stream
, enc
);
// Fallback: if loading with enc raises NotSupportedException, retry as ASCII.
// NOTE(review): no Seek is visible before this retry - confirm the stream
// position is still valid at this point.
276 } catch (NotSupportedException e
) {
277 doc
.Load (Stream
, Encoding
.ASCII
);
// Any other failure is logged at debug level.
278 } catch (Exception e
) {
279 Log
.Debug (e
, "Exception while filtering HTML file");
// Parses an in-memory HTML string and reports the results through the
// supplied callbacks.
// html_string:     the HTML markup to parse.
// append_text_cb:  stored in AppendText.
// add_prop_cb:     stored in AddProperty.
// append_white_cb: stored in AppendWhiteSpace.
// append_break_cb: stored in AppendStructuralBreak.
// hot_up_cb:       callback of type HotCallback.
//                  NOTE(review): confirm it is stored in HotUp.
// hot_down_cb:     stored in HotDown.
// NOTE(review): the callback fields are overwritten here and no restore is
// visible, so they appear to stay bound to these callbacks - confirm.
285 public void ExtractText (string html_string
,
286 AppendTextCallback append_text_cb
,
287 AddPropertyCallback add_prop_cb
,
288 AppendSpaceCallback append_white_cb
,
289 AppendSpaceCallback append_break_cb
,
290 HotCallback hot_up_cb
,
291 HotCallback hot_down_cb
)
293 AppendText
= append_text_cb
;
294 AddProperty
= add_prop_cb
;
295 AppendWhiteSpace
= append_white_cb
;
296 AppendStructuralBreak
= append_break_cb
;
298 HotDown
= hot_down_cb
;
// Same parser setup as DoOpen: HandleNodeEvent receives each reported node.
300 HtmlDocument doc
= new HtmlDocument ();
301 doc
.ReportNode
+= HandleNodeEvent
;
302 doc
.StreamMode
= true;
305 doc
.LoadHtml (html_string
);
// Failures are logged at debug level together with the offending input.
306 } catch (Exception e
) {
307 Log
.Debug (e
, "Exception while filtering html string [{0}]", html_string
);
// Registers the "text/html" MIME type as a flavor this filter supports.
// Declared virtual; it is invoked from the FilterHtml (bool) constructor.
312 virtual protected void RegisterSupportedTypes ()
314 AddSupportedFlavor (FilterFlavor
.NewFromMimeType ("text/html"));