4 // Copyright (C) 2005 Debajyoti Bera <dbera.web@gmail.com>
5 // Copyright (C) 2004 Novell, Inc.
9 // Permission is hereby granted, free of charge, to any person obtaining a
10 // copy of this software and associated documentation files (the "Software"),
11 // to deal in the Software without restriction, including without limitation
12 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 // and/or sell copies of the Software, and to permit persons to whom the
14 // Software is furnished to do so, subject to the following conditions:
16 // The above copyright notice and this permission notice shall be included in
17 // all copies or substantial portions of the Software.
19 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 // DEALINGS IN THE SOFTWARE.
30 using System
.Collections
;
38 using HtmlAgilityPack
;
40 namespace Beagle
.Filters
{
42 public class FilterHtml
: Beagle
.Daemon
.Filter
{
43 // When we see <b>, push "b" onto the stack.
44 // When we see </b>, pop from the stack.
45 // For good error checking, we should compare the
46 // current element with what was popped.
47 // Currently we just pop, which might allow
48 // unmatched elements to pass through.
// hot_stack: names of the "hot" elements (see NodeIsHot) that are currently open.
49 private Stack hot_stack
;
// ignore_stack: names of content-free elements (see NodeIsContentFree) that are
// currently open; text nodes are skipped while this stack is non-empty.
50 private Stack ignore_stack
;
// building_text: switched on and off while a <title> element is handled.
51 private bool building_text
;
// builder: accumulates text; its trimmed content becomes the dc:title property.
52 private StringBuilder builder
;
// NOTE(review): the header of the enclosing member is not visible in this
// excerpt; these statements look like constructor initialization - confirm.
56 // 1: Add meta keyword fields as meta:key
// Register the MIME types this filter accepts (virtual call, see RegisterSupportedTypes).
59 RegisterSupportedTypes ();
// Start with empty element stacks, title capture off and an empty text buffer.
61 hot_stack
= new Stack ();
62 ignore_stack
= new Stack ();
63 building_text
= false;
64 builder
= new StringBuilder ();
67 // Safeguard against spurious stack pops
68 // caused by mismatched tags in bad HTML files.
69 // FIXME: If matching elements is not required
70 // and if HtmlAgilityPack matches elements itself,
71 // then we can just use a counter hot_stack_depth
72 // instead of the hot_stack.
// st: the stack to pop from; may be null or empty, in which case the guard below fails.
73 private void SafePop (Stack st
)
// Guard: act only on a non-null, non-empty stack.
// NOTE(review): the guarded statement is not visible in this excerpt;
// presumably st.Pop () - confirm.
75 if (st
!= null && st
.Count
!= 0)
// Returns true when nodeName names a "hot" element. HandleNodeEvent pushes
// such element names onto hot_stack when their start tag is reported.
// NOTE(review): only the "b" and "strong" comparisons are visible here; the
// rest of the || chain is missing from this excerpt.
// The comparison uses ==, so it is case-sensitive.
79 protected bool NodeIsHot (String nodeName
)
81 return nodeName
== "b"
84 || nodeName
== "strong"
// Returns true when nodeName names an element that breaks the text flow.
// HandleNodeEvent tests the result on both start and end tags.
// NOTE(review): only the "td" and "option" comparisons are visible here; the
// rest of the || chain is missing from this excerpt.
96 protected static bool NodeBreaksText (String nodeName
)
98 return nodeName
== "td"
101 || nodeName
== "option";
// Returns true when nodeName names an element that ends a structural unit;
// HandleNodeEvent calls AppendStructuralBreak () on the end tag of such elements.
// NOTE(review): only the "p" comparison is visible here; the remainder of the
// expression is missing from this excerpt.
104 protected static bool NodeBreaksStructure (string nodeName
)
106 return nodeName
== "p"
// Returns true when nodeName names an element whose contents carry no
// indexable text ("script" and "style" are the visible cases). HandleNodeEvent
// tracks such elements on ignore_stack and skips text while any is open.
// NOTE(review): the || chain may have further alternatives missing from this excerpt.
116 protected static bool NodeIsContentFree (String nodeName
)
118 return nodeName
== "script"
120 || nodeName
== "style";
// Node callback subscribed to HtmlDocument.ReportNode in DoOpen. Dispatches on
// the node type: element nodes update properties and the hot/ignore stacks,
// text nodes are appended to the extracted text.
// NOTE(review): many lines of this method (braces, several guarded statements,
// the return statements) are missing from this excerpt; the comments below
// describe only the statements that are visible.
123 protected bool HandleNodeEvent (HtmlNode node
)
125 switch (node
.NodeType
) {
127 case HtmlNodeType
.Document
:
128 case HtmlNodeType
.Element
:
// <title>: building_text is switched on, and the text collected in builder is
// trimmed, de-entitized and stored as the dc:title property.
// NOTE(review): the start-tag/end-tag test separating these steps is not visible.
129 if (node
.Name
== "title") {
132 building_text
= true;
134 String title
= HtmlEntity
.DeEntitize (builder
.ToString ().Trim ());
135 AddProperty (Beagle
.Property
.New ("dc:title", title
));
137 building_text
= false;
// <meta name=... content=...>: exported as a "meta:<name>" property, but only
// when both attributes are non-empty (missing attributes default to "").
139 } else if (node
.Name
== "meta") {
140 string name
= node
.GetAttributeValue ("name", "");
141 string content
= node
.GetAttributeValue ("content", "");
142 if (name
!= "" && content
!= "")
143 AddProperty (Beagle
.Property
.New ("meta:" + name
, content
));
// Any other element that may carry text: classify it once, then react to its
// start and end tags below.
144 } else if (! NodeIsContentFree (node
.Name
)) {
145 bool isHot
= NodeIsHot (node
.Name
);
146 bool breaksText
= NodeBreaksText (node
.Name
);
147 bool breaksStructure
= NodeBreaksStructure (node
.Name
);
// Start tag of a hot element: remember its name on hot_stack.
// NOTE(review): the statement guarded by "Count == 0" is not visible.
148 if (isHot
&& node
.StartTag
) {
149 if (hot_stack
.Count
== 0)
151 hot_stack
.Push (node
.Name
);
// NOTE(review): the statement guarded by this test is not visible.
153 if (breaksText
&& node
.StartTag
)
// <img>: index the de-entitized alt text.
155 if (node
.Name
== "img" && node
.StartTag
) {
156 string attr
= node
.GetAttributeValue ("alt", "");
158 AppendText (HtmlEntity
.DeEntitize (attr
));
// <a>: index the link target after URL-decoding and de-entitizing it.
161 if (node
.Name
== "a" && node
.StartTag
) {
162 string attr
= node
.GetAttributeValue ("href", "");
164 AppendText (HtmlEntity
.DeEntitize (SW
.HttpUtility
.UrlDecode (attr
)));
// NOTE(review): the statement guarded by this test is not visible.
167 if (breaksText
&& !node
.StartTag
)
// End tag of a structure-breaking element: emit a structural break.
169 if (breaksStructure
&& !node
.StartTag
)
170 AppendStructuralBreak ();
// End tag of a hot element.
// NOTE(review): the statements guarded by the two Count tests are not visible;
// presumably the stack is popped in between - confirm.
171 if (isHot
&& !node
.StartTag
) {
172 if (hot_stack
.Count
!= 0)
174 if (hot_stack
.Count
== 0)
178 // so node is a content-free node
179 // ignore contents of such node
// NOTE(review): the test choosing between the push and the pop below is not
// visible; presumably push on the start tag and pop on the end tag - confirm.
181 ignore_stack
.Push (node
.Name
);
183 SafePop (ignore_stack
);
187 case HtmlNodeType
.Text
:
188 // FIXME Do we need to trim the text ?
189 String text
= ((HtmlTextNode
)node
).Text
;
// Text inside an open content-free element is dropped.
190 if (ignore_stack
.Count
!= 0)
191 break; // still ignoring ...
// NOTE(review): the guard for this append is not visible; presumably it only
// runs while building_text is set - confirm.
193 builder
.Append (text
);
// Hand the de-entitized text to the filter output.
195 AppendText (HtmlEntity
.DeEntitize (text
));
196 //if (hot_stack.Count != 0)
197 //Console.WriteLine (" TEXT:" + text + " ignore=" + ignore_stack.Count);
// NOTE(review): the action taken when AllowMoreWords () is false is not visible.
201 if (! AllowMoreWords ())
// Chooses a text encoding for the document, then parses Stream with
// HtmlAgilityPack, reporting each node to HandleNodeEvent.
// NOTE(review): several lines (the declaration of enc, the try headers, the
// guard around encoding detection) are missing from this excerpt; the comments
// below describe only the visible statements.
206 override protected void DoOpen (FileInfo info
)
// Scan the indexable properties for an encoding hint, stored under the key
// StringFu.UnindexedNamespace + "encoding".
// NOTE(review): the statement guarded by the key test is not visible;
// presumably it skips non-matching properties - confirm.
210 foreach (Property prop
in IndexableProperties
) {
211 if (prop
.Key
!= StringFu
.UnindexedNamespace
+ "encoding")
// Encoding.GetEncoding throws when the hinted name is not supported; that case
// is caught just below.
215 enc
= Encoding
.GetEncoding ((string) prop
.Value
);
216 } catch (NotSupportedException
) {
217 // Encoding passed in isn't supported. Maybe
218 // we'll get lucky detecting it from the
226 // we need to tell the parser to detect encoding,
// Detect the encoding with a throw-away HtmlDocument, then rewind Stream so the
// real parse below starts from the beginning.
// NOTE(review): the condition under which detection runs is not visible.
227 HtmlDocument temp_doc
= new HtmlDocument ();
228 enc
= temp_doc
.DetectEncoding (Stream
);
229 //Console.WriteLine ("Detected encoding:" + (enc == null ? "null" : enc.EncodingName));
231 Stream
.Seek (0, SeekOrigin
.Begin
);
// Set up the parsing document: node reports go to HandleNodeEvent.
234 HtmlDocument doc
= new HtmlDocument ();
235 doc
.ReportNode
+= HandleNodeEvent
;
236 doc
.StreamMode
= true;
237 // we already determined encoding
238 doc
.OptionReadEncoding
= false;
// Parse with the chosen encoding; on NotSupportedException retry as ASCII.
244 doc
.Load (Stream
, enc
);
245 } catch (NotSupportedException e
) {
246 doc
.Load (Stream
, Encoding
.ASCII
);
// Any other failure is written to the console (message and stack trace).
247 } catch (Exception e
) {
248 Console
.WriteLine (e
.Message
);
249 Console
.WriteLine (e
.StackTrace
);
// Registers the flavors this filter handles; the visible registration is the
// "text/html" MIME type. Declared virtual, so a derived filter can override it.
256 virtual protected void RegisterSupportedTypes ()
258 AddSupportedFlavor (FilterFlavor
.NewFromMimeType ("text/html"));