Compute lucene-style scores for our hits.
[beagle.git] / Filters / FilterAbiword.cs
blob a59b66f66b09892e0f0ebe4a32acb34bb922d6d3
1 //
2 // FilterAbiword.cs : Trivial implementation of an AbiWord document filter.
3 //
4 // Author: Veerapuram Varadhan <vvaradhan@novell.com>
5 //
6 // Copyright (C) 2004 Novell, Inc.
7 //
9 //
10 // Permission is hereby granted, free of charge, to any person obtaining a
11 // copy of this software and associated documentation files (the "Software"),
12 // to deal in the Software without restriction, including without limitation
13 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
14 // and/or sell copies of the Software, and to permit persons to whom the
15 // Software is furnished to do so, subject to the following conditions:
17 // The above copyright notice and this permission notice shall be included in
18 // all copies or substantial portions of the Software.
20 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
25 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
26 // DEALINGS IN THE SOFTWARE.
29 using System;
30 using System.Collections;
31 using System.IO;
32 using System.Text;
33 using System.Xml;
35 using Beagle.Util;
36 using Beagle.Daemon;
38 using ICSharpCode.SharpZipLib.GZip;
40 namespace Beagle.Filters {
42 public class FilterAbiWord : Beagle.Daemon.Filter {
44 Hashtable hotStyles;
// Declares "application/x-abiword" as a supported flavor of this filter
// and switches SnippetMode on.
public FilterAbiWord ()
{
	const string abiword_mime_type = "application/x-abiword";

	AddSupportedFlavor (FilterFlavor.NewFromMimeType (abiword_mime_type));
	SnippetMode = true;
}
// Scan the children of a <styles> ... </styles> node.
//
// Every <s name="..."> element whose lower-cased name contains "head"
// or "note" is recorded in hotStyles under its original name.
// The reader is expected to sit on the <styles> start tag; it is
// advanced until its depth drops back to the depth of that tag.
void StudyStyleNode (XmlTextReader reader)
{
	// A self-closing <styles/> has no children to look at.
	if (reader.IsEmptyElement)
		return;

	int stylesDepth = reader.Depth;

	for (reader.Read (); reader.Depth > stylesDepth; reader.Read ()) {
		if (reader.NodeType != XmlNodeType.Element || reader.Name != "s")
			continue;

		string name = reader.GetAttribute ("name");
		if (name == null)
			continue;

		string lowered = name.ToLower ();
		if (lowered.IndexOf ("head") > -1 || lowered.IndexOf ("note") > -1)
			hotStyles [name] = true;
	}
}
// Process the props="name:value; name:value;" attribute values.
//
// props: raw value of a props attribute; may be null.
//
// Returns true when the list contains font-weight:bold,
// font-style:italic, text-decoration:underline, or any bgcolor entry;
// false otherwise (including for null or empty input).
static bool StudyPropsAttribute (string props)
{
	if (props == null)
		return false;

	foreach (string token in props.Split (';')) {
		string[] propAndValue = token.Split (':');
		string name = propAndValue [0].Trim ();

		// Any bgcolor entry counts, whatever its value.
		if (name == "bgcolor")
			return true;

		// A token without ':' (for instance the empty string after a
		// trailing ';') has no value to inspect.
		if (propAndValue.Length < 2)
			continue;

		// Trim the value as well as the name, so that
		// "font-weight: bold" is recognised.
		string value = propAndValue [1].Trim ();

		switch (name) {
		case "font-weight":
			if (value == "bold")
				return true;
			break;

		case "font-style":
			if (value == "italic")
				return true;
			break;

		case "text-decoration":
			if (value == "underline")
				return true;
			break;
		}
	}

	return false;
}
// True only for the element name "text:footnote-citation".
static bool NodeIsFreezing (String nodeName)
{
	return String.Equals (nodeName, "text:footnote-citation");
}
// True only for the element name "p".
static bool NodeBreaksTextAfter (String nodeName)
{
	return String.Equals (nodeName, "p");
}
// One boolean per currently open element inside a section: whether
// HotUp () was called for it, so the matching end tag can undo it.
private Stack hot_nodes = new Stack ();

// True while the reader is between <section> and </section>.
private bool inSection = false;

// Walk through the <section> ... </section> nodes
// and extract the texts.
//
// At most total_elements nodes are handled per call; the walk resumes
// from the reader's current position on the next call.
//
// Returns true once the reader has no more nodes, false when the
// per-call budget was used up first.
bool WalkContentNodes (XmlTextReader reader)
{
	// total number of elements to read per-pull
	const int total_elements = 10;
	int num_elements = 0;

	while (reader.Read ()) {
		if (reader.Name == "styles" &&
		    reader.NodeType == XmlNodeType.Element) {
			StudyStyleNode (reader);
			continue;
		}

		// Outside a section only the section tags themselves matter.
		if (!inSection && reader.Name != "section")
			continue;

		switch (reader.NodeType) {
		case XmlNodeType.Element:
			// Self-closing elements do not count against the budget.
			if (!HandleContentStartElement (reader))
				continue;
			break;

		case XmlNodeType.Text:
			AppendText (reader.Value);
			break;

		case XmlNodeType.EndElement:
			HandleContentEndElement (reader);
			break;
		}

		num_elements++;
		if (num_elements >= total_elements)
			return false;
	}

	return true;
}

// Handle a start tag met by WalkContentNodes.
//
// A node/text is hot if:
// (1) It is flagged with a hot style (header, footer and
//     other styles)
// (2) It contains "hot" styled attributes.
//
// Returns false for a self-closing element (nothing is pushed on
// hot_nodes), true otherwise.
bool HandleContentStartElement (XmlTextReader reader)
{
	if (reader.IsEmptyElement) {
		// No end tag will follow, so nothing may be pushed on
		// hot_nodes, and a self-closing <section/> must not open a
		// section that would never be closed again.
		if (NodeBreaksTextAfter (reader.Name)) {
			AppendWhiteSpace ();
			AppendStructuralBreak ();
		}
		return false;
	}

	bool isHot = false;

	if (reader.Name == "section") {
		string type = reader.GetAttribute ("type");
		if (type == "header" || type == "footer")
			isHot = true;
		inSection = true;
	}

	// <c ....> text blah blah </c> overrides the
	// formatting at the paragraph level.
	if (reader.Name == "c") {
		isHot = StudyPropsAttribute (reader.GetAttribute ("props"));
	} else {
		string style = reader.GetAttribute ("style");
		if (style != null && hotStyles.Contains (style))
			isHot = true;
	}

	hot_nodes.Push (isHot);

	if (isHot)
		HotUp ();

	if (NodeIsFreezing (reader.Name))
		FreezeUp ();

	return true;
}

// Handle an end tag met by WalkContentNodes: emit the paragraph
// break if needed and undo whatever the matching start tag did.
void HandleContentEndElement (XmlTextReader reader)
{
	if (NodeBreaksTextAfter (reader.Name)) {
		AppendWhiteSpace ();
		AppendStructuralBreak ();
	}

	bool wasHot = (bool) hot_nodes.Pop ();
	if (wasHot)
		HotDown ();

	if (NodeIsFreezing (reader.Name))
		FreezeDown ();

	if (reader.Name == "section")
		inSection = false;
}
// Map the key of an AbiWord <m key="..."> metadata entry to the name
// of the property it is stored under, or null when the key is not one
// that is recorded.
private static string MetadataKeyToPropertyName (string key)
{
	if (key == null)
		return null;

	switch (key) {
	case "abiword.generator":
		return "fixme:appname";

	case "abiword.keywords":
		return "fixme:keywords";

	case "dc.description":
	case "dc.relation":
	case "dc.rights":
	case "dc.source":
	case "dc.contributor":
	case "dc.subject":
	case "dc.creator":
	case "dc.coverage":
	case "dc.type":
	case "dc.language":
	case "dc.title":
	case "dc.publisher":
		// "dc.xyz" is stored as "dc:xyz".
		return "dc:" + key.Substring (3);

	default:
		return null;
	}
}

// Read forward to the first <metadata> element and add one property
// for every recognised <m key="...">text</m> entry below it. Reading
// stops at </metadata>, or at the end of the input if no such tag is
// met.
private void ExtractMetadata (XmlTextReader reader)
{
	bool found = false;
	int depth = -1;

	while (reader.Read ()) {
		if (!found) {
			if (reader.Name == "metadata" && reader.NodeType == XmlNodeType.Element) {
				// A self-closing <metadata/> has no entries and
				// no end tag to wait for.
				if (reader.IsEmptyElement)
					break;
				found = true;
				depth = reader.Depth;
			}
			continue;
		}

		if (reader.Name == "metadata" && reader.NodeType == XmlNodeType.EndElement)
			break;

		// Only <m> start tags carry a key; </m> end tags share the name.
		if (reader.NodeType != XmlNodeType.Element ||
		    reader.Name != "m" ||
		    reader.Depth <= depth)
			continue;

		string propertyName = MetadataKeyToPropertyName (reader.GetAttribute ("key"));
		if (propertyName == null)
			continue;

		// <m key="..."/> has no value; reading on would consume the
		// next entry instead.
		if (reader.IsEmptyElement)
			continue;

		reader.Read ();

		// <m key="..."></m> puts the reader on the end tag here; only
		// real character data is recorded.
		if (reader.NodeType == XmlNodeType.Text ||
		    reader.NodeType == XmlNodeType.CDATA)
			AddProperty (Beagle.Property.New (propertyName, reader.Value));
	}
}
// Reader over the document content; created in DoOpen and consumed
// by DoPull.
XmlTextReader reader = null;

// Set up the per-file state: a fresh hotStyles table and an
// XmlTextReader over info.FullName.
override protected void DoOpen (FileInfo info)
{
	hotStyles = new Hashtable ();

	string path = info.FullName;
	reader = new XmlTextReader (path);
}
// Extract the document metadata with a separate reader over
// FileInfo.FullName, so the content reader used by DoPull is left
// untouched. On any exception the filter is marked Finished () and
// the failure is logged.
override protected void DoPullProperties ()
{
	XmlTextReader metaReader = new XmlTextReader (FileInfo.FullName);
	try {
		ExtractMetadata (metaReader);
	} catch (Exception e) {
		Finished ();
		Logger.Log.Error ("Exception occurred while reading meta-data from {0}",
				  FileInfo.FullName);
		Logger.Log.Debug (e);
	} finally {
		// Single release point for both the success and failure paths.
		metaReader.Close ();
	}
}
// Pull the next batch of content through WalkContentNodes.
// The reader is closed and Finished () is called when the walk
// reports that the input is exhausted or when it throws; with no
// reader at all, Finished () is called straight away.
override protected void DoPull ()
{
	if (reader != null) {
		try {
			bool reachedEnd = WalkContentNodes (reader);
			if (reachedEnd) {
				reader.Close ();
				Finished ();
			}
		} catch (Exception e) {
			reader.Close ();
			Finished ();
			Logger.Log.Error ("Exception occurred while reading contents from {0}",
					  FileInfo.FullName);
			Logger.Log.Debug (e);
		}
	} else {
		Finished ();
	}
}