Compute lucene-style scores for our hits.
[beagle.git] / Filters / FilterHtml.cs
blobcad36025942f28da562e51f781be4f77ae47f780
1 //
2 // FilterHtml.cs
3 //
4 // Copyright (C) 2004 Novell, Inc.
5 //
7 //
8 // Permission is hereby granted, free of charge, to any person obtaining a
9 // copy of this software and associated documentation files (the "Software"),
10 // to deal in the Software without restriction, including without limitation
11 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 // and/or sell copies of the Software, and to permit persons to whom the
13 // Software is furnished to do so, subject to the following conditions:
15 // The above copyright notice and this permission notice shall be included in
16 // all copies or substantial portions of the Software.
18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 // DEALINGS IN THE SOFTWARE.
28 using System;
29 using System.Collections;
30 using System.IO;
31 using System.Text;
33 using Beagle.Daemon;
35 using HtmlAgilityPack;
37 namespace Beagle.Filters {
39 public class FilterHtml : Beagle.Daemon.Filter {
/// <summary>
/// Creates the HTML filter: registers the flavors it supports and
/// switches snippet mode on.
/// </summary>
public FilterHtml ()
{
	// RegisterSupportedTypes is virtual, so an override in a subclass
	// runs here, before that subclass's own constructor body executes.
	this.RegisterSupportedTypes ();

	SnippetMode = true;
}
/// <summary>
/// Tells whether an element name is one of the emphasis-style tags
/// (bold, underline, italics, headings, table headers and similar).
/// WalkBodyNodes brackets such elements with HotUp / HotDown.
/// </summary>
/// <param name="nodeName">Element name; compared case-sensitively against lower-case tag names.</param>
/// <returns>true when the name is in the list below, false otherwise (including for null).</returns>
protected bool NodeIsHot (String nodeName)
{
	switch (nodeName) {
	case "b":
	case "u":
	case "em":
	case "strong":
	case "big":
	case "h1":
	case "h2":
	case "h3":
	case "h4":
	case "h5":
	case "h6":
	case "i":
	case "th":
		return true;

	default:
		return false;
	}
}
/// <summary>
/// Tells whether an element name is one that WalkBodyNodes surrounds
/// with AppendWhiteSpace calls: table cells, links, divs and options.
/// </summary>
/// <param name="nodeName">Element name; compared case-sensitively against lower-case tag names.</param>
/// <returns>true for "td", "a", "div" and "option"; false otherwise (including for null).</returns>
protected static bool NodeBreaksText (String nodeName)
{
	switch (nodeName) {
	case "td":
	case "a":
	case "div":
	case "option":
		return true;

	default:
		return false;
	}
}
/// <summary>
/// Tells whether an element name is one after which WalkBodyNodes emits
/// a structural break: paragraphs, line breaks and the six heading levels.
/// </summary>
/// <param name="nodeName">Element name; compared case-sensitively against lower-case tag names.</param>
/// <returns>true for "p", "br" and "h1" through "h6"; false otherwise (including for null).</returns>
protected static bool NodeBreaksStructure (string nodeName)
{
	switch (nodeName) {
	case "p":
	case "br":
	case "h1":
	case "h2":
	case "h3":
	case "h4":
	case "h5":
	case "h6":
		return true;

	default:
		return false;
	}
}
/// <summary>
/// Tells whether an element name is one whose subtree the walkers in
/// this class skip entirely: scripts, image maps and style sheets.
/// </summary>
/// <param name="nodeName">Element name; compared case-sensitively against lower-case tag names.</param>
/// <returns>true for "script", "map" and "style"; false otherwise (including for null).</returns>
protected static bool NodeIsContentFree (String nodeName)
{
	switch (nodeName) {
	case "script":
	case "map":
	case "style":
		return true;

	default:
		return false;
	}
}
/// <summary>
/// Collects the text found beneath a node, depth first, and returns it
/// as a single trimmed string.
/// </summary>
/// <remarks>
/// Text nodes are entity-decoded before being appended. Element children
/// are descended into unless NodeIsContentFree rejects their name; every
/// other node type is ignored. Each recursive result is itself trimmed
/// before it is appended, and no separator is inserted between pieces.
/// </remarks>
/// <param name="node">Node whose children are scanned.</param>
/// <returns>The concatenated, entity-decoded text with leading and trailing white space removed.</returns>
protected String WalkChildNodesForText (HtmlNode node)
{
	StringBuilder collected = new StringBuilder ();

	foreach (HtmlNode child in node.ChildNodes) {
		HtmlNodeType kind = child.NodeType;

		if (kind == HtmlNodeType.Text) {
			HtmlTextNode textNode = (HtmlTextNode) child;
			collected.Append (HtmlEntity.DeEntitize (textNode.Text));
		} else if (kind == HtmlNodeType.Element
			   && ! NodeIsContentFree (child.Name)) {
			collected.Append (WalkChildNodesForText (child));
		}
	}

	return collected.ToString ().Trim ();
}
/// <summary>
/// Scans the direct children of a head node and turns the title and the
/// meta tags into properties via AddProperty.
/// </summary>
/// <remarks>
/// A "title" element becomes the "dc:title" property. A "meta" element
/// becomes a property named after its "name" attribute with the "content"
/// attribute as value, provided both are non-empty. Other children are
/// ignored.
/// </remarks>
/// <param name="node">The head element whose children are inspected.</param>
protected void WalkHeadNodes (HtmlNode node)
{
	foreach (HtmlNode subnode in node.ChildNodes) {
		if (subnode.NodeType != HtmlNodeType.Element)
			continue;

		if (subnode.Name == "title") {
			// WalkChildNodesForText already entity-decodes every text
			// node it collects.  Decoding the result a second time would
			// corrupt titles that contain escaped entity text: the markup
			// "&amp;lt;" must end up as "&lt;", not as "<".
			String title = WalkChildNodesForText (subnode);
			AddProperty (Beagle.Property.New ("dc:title", title));
		} else if (subnode.Name == "meta") {
			string name = subnode.GetAttributeValue ("name", "");
			string content = subnode.GetAttributeValue ("content", "");
			if (name != "" && content != "")
				AddProperty (Beagle.Property.New (name, content));
		}
	}
}
/// <summary>
/// Recursively walks a node of the document body and hands its text to
/// the filter through AppendText.
/// </summary>
/// <remarks>
/// Document and element nodes are skipped entirely when NodeIsContentFree
/// matches their name. Otherwise the node is bracketed with HotUp / HotDown
/// when NodeIsHot matches, surrounded by AppendWhiteSpace when
/// NodeBreaksText matches, and followed by AppendStructuralBreak when
/// NodeBreaksStructure matches. The "alt" text of images and the "href"
/// of links are emitted as text too. Text nodes are entity-decoded and
/// appended; all other node types are ignored.
/// NOTE(review): HotUp, HotDown, AppendText, AppendWhiteSpace and
/// AppendStructuralBreak are inherited and not visible in this file;
/// their meaning is assumed from their names and from how they are used here.
/// </remarks>
/// <param name="node">Node to process, together with its whole subtree.</param>
protected void WalkBodyNodes (HtmlNode node)
{
	switch (node.NodeType) {

	case HtmlNodeType.Document:
	case HtmlNodeType.Element:
		if (! NodeIsContentFree (node.Name)) {
			bool isHot = NodeIsHot (node.Name);
			bool breaksText = NodeBreaksText (node.Name);
			bool breaksStructure = NodeBreaksStructure (node.Name);

			if (isHot)
				HotUp ();
			if (breaksText)
				AppendWhiteSpace ();

			// Images contribute their alternative text, links their target.
			string attrName = null;
			if (node.Name == "img")
				attrName = "alt";
			else if (node.Name == "a")
				attrName = "href";

			if (attrName != null) {
				string attr = node.GetAttributeValue (attrName, "");
				if (attr != "") {
					AppendText (attr);
					// Keep the attribute text from running into whatever
					// follows: the link's own text for "a", or the text
					// after the image for "img".
					AppendWhiteSpace ();
				}
			}

			foreach (HtmlNode subnode in node.ChildNodes)
				WalkBodyNodes (subnode);

			if (breaksText)
				AppendWhiteSpace ();
			if (breaksStructure)
				AppendStructuralBreak ();
			if (isHot)
				HotDown ();
		}
		break;

	case HtmlNodeType.Text:
		String text = ((HtmlTextNode)node).Text;
		text = HtmlEntity.DeEntitize (text);
		AppendText (text);
		break;
	}
}
/// <summary>
/// Dispatches the element children of a node to the matching walker.
/// </summary>
/// <remarks>
/// "html" children are descended into with this same method, "head"
/// children go to WalkHeadNodes, and every other element ("body"
/// included) goes to WalkBodyNodes. Children that are not elements are
/// ignored at this level.
/// </remarks>
/// <param name="node">Node whose direct children are dispatched.</param>
protected void WalkNodes (HtmlNode node)
{
	foreach (HtmlNode child in node.ChildNodes) {
		if (child.NodeType != HtmlNodeType.Element)
			continue;

		string tag = child.Name;

		if (tag == "html")
			WalkNodes (child);
		else if (tag == "head")
			WalkHeadNodes (child);
		else
			WalkBodyNodes (child);
	}
}
/// <summary>
/// Parses the filter's input stream as HTML, walks the resulting tree to
/// emit properties and text, and then calls Finished.
/// </summary>
/// <remarks>
/// The document is first loaded with the parser's default handling. If
/// that raises NotSupportedException the load is retried as ASCII.
/// NOTE(review): the exception is presumably raised for an encoding the
/// runtime does not know - confirm against the parser.
/// </remarks>
/// <param name="info">File being filtered; not used here, the content is read from Stream.</param>
override protected void DoOpen (FileInfo info)
{
	// Remember where the content starts: a failed first attempt has
	// already consumed part of the stream, and retrying from that point
	// would silently drop the beginning of the document.
	bool canRewind = Stream.CanSeek;
	long start = canRewind ? Stream.Position : 0;

	HtmlDocument doc = new HtmlDocument ();

	try {
		doc.Load (Stream);
	} catch (NotSupportedException) {
		if (canRewind)
			Stream.Seek (start, SeekOrigin.Begin);

		// Start from a clean document so nothing from the failed
		// attempt can leak into the retry.
		doc = new HtmlDocument ();
		doc.Load (Stream, Encoding.ASCII);
	}

	WalkNodes (doc.DocumentNode);
	Finished ();
}
/// <summary>
/// Registers the flavors this filter handles: the "text/html" MIME type.
/// Virtual so that subclasses can register a different set; note that it
/// is invoked from the FilterHtml constructor.
/// </summary>
protected virtual void RegisterSupportedTypes ()
{
	const string mimeType = "text/html";

	AddSupportedFlavor (FilterFlavor.NewFromMimeType (mimeType));
}