4 // Copyright (C) 2004 Novell, Inc.
8 // Permission is hereby granted, free of charge, to any person obtaining a
9 // copy of this software and associated documentation files (the "Software"),
10 // to deal in the Software without restriction, including without limitation
11 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 // and/or sell copies of the Software, and to permit persons to whom the
13 // Software is furnished to do so, subject to the following conditions:
15 // The above copyright notice and this permission notice shall be included in
16 // all copies or substantial portions of the Software.
18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 // DEALINGS IN THE SOFTWARE.
29 using System
.Collections
;
35 using HtmlAgilityPack
;
37 namespace Beagle
.Filters
{
// HTML filter in the Beagle.Filters namespace. Derives from
// Beagle.Daemon.Filter and walks HtmlAgilityPack nodes (HtmlNode) to pull
// properties and text out of a parsed document.
39 public class FilterHtml
: Beagle
.Daemon
.Filter
{
// Calls RegisterSupportedTypes, which in this class adds the "text/html"
// flavor. NOTE(review): the declaration that encloses this call is not
// visible in this excerpt -- presumably the constructor; confirm.
43 RegisterSupportedTypes ();
// Returns true when nodeName matches one of the compared element names.
// WalkBodyNodes stores the result in its isHot flag.
// Only the "b" and "strong" comparisons are visible in this excerpt; the
// lines between and after them are missing, so the full list may be longer.
47 protected bool NodeIsHot (String nodeName
)
49 return nodeName
== "b"
52 || nodeName
== "strong"
// Returns true when nodeName matches one of the compared element names.
// WalkBodyNodes stores the result in its breaksText flag.
// Only the "td" and "option" comparisons are visible in this excerpt; the
// lines between them are missing, so other names may also match.
64 protected static bool NodeBreaksText (String nodeName
)
66 return nodeName
== "td"
69 || nodeName
== "option";
// Returns true when nodeName matches one of the compared element names.
// WalkBodyNodes stores the result in its breaksStructure flag.
// Only the "p" comparison is visible in this excerpt; the rest of the
// expression is missing, so other names may also match.
72 protected static bool NodeBreaksStructure (string nodeName
)
74 return nodeName
== "p"
// Returns true for element names whose content the walkers skip:
// WalkChildNodesForText and WalkBodyNodes only descend into an element
// when this returns false.
// "script" and "style" are the visible comparisons; one original line
// between them is missing from this excerpt, so another name may also match.
84 protected static bool NodeIsContentFree (String nodeName
)
86 return nodeName
== "script"
88 || nodeName
== "style";
// Collects the text found beneath node into a single string.
// Child elements are descended into recursively unless NodeIsContentFree
// rejects them; child text nodes are de-entitized and appended.
// Returns the accumulated text with leading/trailing whitespace trimmed.
// Note: closing braces and some statements of this method are missing
// from this excerpt.
91 protected String
WalkChildNodesForText (HtmlNode node
)
93 StringBuilder builder
= new StringBuilder ("");
94 foreach (HtmlNode subnode
in node
.ChildNodes
) {
95 switch (subnode
.NodeType
) {
// Element child: recurse and append whatever text the subtree yields.
96 case HtmlNodeType
.Element
:
97 if (! NodeIsContentFree (subnode
.Name
)) {
98 String subtext
= WalkChildNodesForText (subnode
);
99 builder
.Append (subtext
);
// Text child: decode HTML entities before appending.
103 case HtmlNodeType
.Text
:
104 String text
= ((HtmlTextNode
)subnode
).Text
;
105 text
= HtmlEntity
.DeEntitize (text
);
106 builder
.Append (text
);
// Trim applies to the whole result, including recursive results that are
// then appended by the caller without a separator.
110 return builder
.ToString ().Trim ();
// Scans the direct children of node and records document properties:
//   - a "title" element adds a "dc:title" property holding its text;
//   - a "meta" element adds a property named by its "name" attribute with
//     the value of its "content" attribute, when both are non-empty.
// Note: closing braces of this method are missing from this excerpt.
113 protected void WalkHeadNodes (HtmlNode node
)
115 foreach (HtmlNode subnode
in node
.ChildNodes
) {
116 if (subnode
.NodeType
== HtmlNodeType
.Element
117 && subnode
.Name
== "title") {
118 String title
= WalkChildNodesForText (subnode
);
// NOTE(review): WalkChildNodesForText already de-entitizes each text node,
// so this is a second decoding pass over the same text -- confirm that
// double decoding (e.g. of "&amp;lt;") is intended.
119 title
= HtmlEntity
.DeEntitize (title
);
120 AddProperty (Beagle
.Property
.New ("dc:title", title
));
122 if (subnode
.NodeType
== HtmlNodeType
.Element
123 && subnode
.Name
== "meta") {
// Missing attributes fall back to "" and are then filtered out below.
124 string name
= subnode
.GetAttributeValue ("name", "");
125 string content
= subnode
.GetAttributeValue ("content", "");
126 if (name
!= "" && content
!= "")
127 AddProperty (Beagle
.Property
.New (name
, content
));
// Recursively walks node and its descendants, dispatching on node type.
// Document and Element nodes (unless NodeIsContentFree rejects the name)
// are classified with NodeIsHot / NodeBreaksText / NodeBreaksStructure,
// have selected attributes read ("alt" on img, "href" on a), and then
// have every child walked. Text nodes have their text de-entitized.
// NOTE(review): many lines of this method are missing from this excerpt --
// how isHot, breaksText, breaksStructure, attr and the decoded text are
// consumed, and the condition guarding AppendStructuralBreak, cannot be
// seen here.
132 protected void WalkBodyNodes (HtmlNode node
)
134 switch (node
.NodeType
) {
136 case HtmlNodeType
.Document
:
137 case HtmlNodeType
.Element
:
138 if (! NodeIsContentFree (node
.Name
)) {
139 bool isHot
= NodeIsHot (node
.Name
);
140 bool breaksText
= NodeBreaksText (node
.Name
);
141 bool breaksStructure
= NodeBreaksStructure (node
.Name
);
// Images: read the alternate text ("" when the attribute is absent).
146 if (node
.Name
== "img") {
147 string attr
= node
.GetAttributeValue ("alt", "");
// Anchors: read the link target ("" when the attribute is absent).
152 if (node
.Name
== "a") {
153 string attr
= node
.GetAttributeValue ("href", "");
// Recurse into every child node.
158 foreach (HtmlNode subnode
in node
.ChildNodes
)
159 WalkBodyNodes (subnode
);
163 AppendStructuralBreak ();
// Text node: decode HTML entities in the raw text.
170 case HtmlNodeType
.Text
:
171 String text
= ((HtmlTextNode
)node
).Text
;
172 text
= HtmlEntity
.DeEntitize (text
);
// Iterates the direct children of node and, for element children, switches
// on the element name to hand the child to WalkHeadNodes or WalkBodyNodes.
// NOTE(review): the case labels are missing from this excerpt, so which
// element names route to which walker cannot be confirmed here.
179 protected void WalkNodes (HtmlNode node
)
181 foreach (HtmlNode subnode
in node
.ChildNodes
) {
182 if (subnode
.NodeType
== HtmlNodeType
.Element
) {
183 switch (subnode
.Name
) {
188 WalkHeadNodes (subnode
);
192 WalkBodyNodes (subnode
);
// Filter entry point override: creates an HtmlDocument, loads it, and walks
// it from the document node via WalkNodes.
// NOTE(review): the try body is missing from this excerpt; only the
// NotSupportedException handler is visible, which loads from Stream using
// ASCII encoding. The info parameter is not used in the visible lines.
199 override protected void DoOpen (FileInfo info
)
201 HtmlDocument doc
= new HtmlDocument ();
// Fallback path: load from Stream as ASCII when the (unseen) first attempt
// throws NotSupportedException.
205 } catch (NotSupportedException e
) {
206 doc
.Load (Stream
, Encoding
.ASCII
);
210 WalkNodes (doc
.DocumentNode
);
// Declares the flavors this filter supports: adds a flavor created from the
// "text/html" MIME type. Virtual, so a derived filter can override it.
215 virtual protected void RegisterSupportedTypes ()
217 AddSupportedFlavor (FilterFlavor
.NewFromMimeType ("text/html"));