NoiseFilter: Dont drop last word of apparent hostnames. Too many non-hostnames can...
[beagle.git] / Filters / FilterAbiword.cs
blob07b4676a27ae127455f33bbdf11bd3a57b507729
1 //
2 // FilterAbiword.cs : Trivial implementation of an Abiword-document filter.
3 //
4 // Author: Veerapuram Varadhan <vvaradhan@novell.com>
5 //
6 // Copyright (C) 2004 Novell, Inc.
7 //
9 //
10 // Permission is hereby granted, free of charge, to any person obtaining a
11 // copy of this software and associated documentation files (the "Software"),
12 // to deal in the Software without restriction, including without limitation
13 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
14 // and/or sell copies of the Software, and to permit persons to whom the
15 // Software is furnished to do so, subject to the following conditions:
17 // The above copyright notice and this permission notice shall be included in
18 // all copies or substantial portions of the Software.
20 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
25 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
26 // DEALINGS IN THE SOFTWARE.
29 using System;
30 using System.Collections;
31 using System.IO;
32 using System.Text;
33 using System.Xml;
35 using Beagle.Util;
36 using Beagle.Daemon;
38 using ICSharpCode.SharpZipLib.GZip;
40 namespace Beagle.Filters {
42 public class FilterAbiWord : Beagle.Daemon.Filter {
44 Hashtable hotStyles;
45 bool is_gzipped;
// Declares "application/x-abiword" as a flavor this filter supports and
// sets SnippetMode to true.
public FilterAbiWord ()
{
	const string abiword_mime_type = "application/x-abiword";

	AddSupportedFlavor (FilterFlavor.NewFromMimeType (abiword_mime_type));
	SnippetMode = true;
}
// Process the <styles> ... </styles> nodes.
//
// Expects the reader to be positioned on the <styles> start tag.  Every
// descendant <s> element whose "name" attribute contains "head" or "note"
// (after lower-casing) is recorded as a key in hotStyles.  Reading stops at
// the first node whose depth is not greater than that of the <styles>
// element; an empty <styles/> element is left untouched.
void StudyStyleNode (XmlTextReader reader)
{
	int styles_depth = reader.Depth;

	if (reader.IsEmptyElement)
		return;

	for (reader.Read (); reader.Depth > styles_depth; reader.Read ()) {
		if (reader.NodeType != XmlNodeType.Element || reader.Name != "s")
			continue;

		string name = reader.GetAttribute ("name");
		if (name == null)
			continue;

		// Lower-case once and test both markers against the same copy.
		string lowered = name.ToLower ();
		if (lowered.IndexOf ("head") > -1 || lowered.IndexOf ("note") > -1)
			hotStyles [name] = true;
	}
}
// Process the props="blah:blah; blah:blah;" values.
//
// Splits props on ';' into "name:value" tokens and returns true when any
// token is one of
//   font-weight:bold, font-style:italic, text-decoration:underline
// or when a "bgcolor" property is present at all (its value is not
// inspected).  Returns false for a null props string.
//
// Both the property name and its value are trimmed before comparison, so
// "font-style: italic" is recognised.  Tokens that carry no ':' separator
// (for example the empty token produced by a trailing ';', or a malformed
// entry such as "font-weight") are skipped instead of raising an
// IndexOutOfRangeException.
bool StudyPropsAttribute (string props)
{
	if (props == null)
		return false;

	bool retVal = false;

	foreach (string token in props.Split (';')) {
		string[] propAndValue = token.Split (':');
		string name = propAndValue [0].Trim ();

		// Any background colour marks the text as hot, whatever the value.
		if (name == "bgcolor")
			return true;

		// No value part: nothing to compare against.
		if (propAndValue.Length < 2)
			continue;

		string value = propAndValue [1].Trim ();

		switch (name) {
		case "font-weight":
			if (value == "bold")
				retVal = true;
			break;

		case "font-style":
			if (value == "italic")
				retVal = true;
			break;

		case "text-decoration":
			if (value == "underline")
				retVal = true;
			break;
		}
	}

	return retVal;
}
// True only when nodeName is exactly "text:footnote-citation"; false for
// every other name, including null.
// NOTE(review): this element name looks like it comes from another document
// format -- confirm it can actually occur in AbiWord files.
static bool NodeIsFreezing (String nodeName)
{
	return "text:footnote-citation".Equals (nodeName);
}
// True only when nodeName is exactly "p" (a paragraph element); false for
// every other name, including null.
static bool NodeBreaksTextAfter (String nodeName)
{
	return "p".Equals (nodeName);
}
// One bool per element opened by WalkContentNodes, recording whether
// HotUp () was called for it so the matching end tag knows whether to call
// HotDown ().
private Stack hot_nodes = new Stack ();

// True between a <section> start tag and its end tag.  Held in a field so
// the state survives from one WalkContentNodes call to the next.
private bool inSection = false;

// Walk through the <section> ... </section> nodes
// and extract the texts.
//
// Processes a bounded number of nodes per call.  Returns true when
// reader.Read () reports that no nodes are left, and false when the
// per-call budget was used up first (the caller is expected to call again
// with the same reader).  A <styles> element met on the way is handed to
// StudyStyleNode; everything else outside a <section> is ignored.
bool WalkContentNodes (XmlTextReader reader)
{
	// total number of elements to read per-pull
	const int total_elements = 10;
	int num_elements = 0;

	while (reader.Read ()) {
		if (reader.Name == "styles" &&
		    reader.NodeType == XmlNodeType.Element) {
			StudyStyleNode (reader);
			continue;
		} else if (!inSection && reader.Name != "section")
			continue;

		switch (reader.NodeType) {
		case XmlNodeType.Element:
			// A node/text is hot if:
			// (1) It is flagged with a hot style (header, footer and
			//     other styles)
			// (2) It contains "hot" styled attributes.
			bool isHot = false;
			if (reader.Name == "section") {
				// An empty <section/> never produces an EndElement
				// node, so entering it would leave inSection set and
				// an unmatched entry on hot_nodes (and possibly an
				// unmatched HotUp).  It has no content; skip it.
				if (reader.IsEmptyElement)
					continue;

				string type = reader.GetAttribute ("type");
				if (type == "header" ||
				    type == "footer")
					isHot = true;
				inSection = true;
			} else if (reader.IsEmptyElement) {
				// Empty elements get no EndElement either, so the
				// paragraph break has to be emitted right here.
				if (NodeBreaksTextAfter (reader.Name)) {
					AppendWhiteSpace ();
					AppendStructuralBreak ();
				}
				continue;
			}

			// <c ....> text blah blah </c> overrides the
			// formatting at the paragraph level.
			if (reader.Name == "c") {
				string val = reader.GetAttribute ("props");
				isHot = StudyPropsAttribute (val);
			} else {
				// Look for a style="..." attribute naming a hot style.
				bool has_attr = reader.MoveToFirstAttribute ();
				while (has_attr) {
					if (reader.Name == "style") {
						if (hotStyles.Contains (reader.Value))
							isHot = true;
						break;
					}
					has_attr = reader.MoveToNextAttribute ();
				}
				// Back onto the element so reader.Name below is the
				// element name again, not an attribute name.
				reader.MoveToElement ();
			}

			hot_nodes.Push (isHot);

			if (isHot)
				HotUp ();

			if (NodeIsFreezing (reader.Name))
				FreezeUp ();

			break;

		case XmlNodeType.Text:
			string text = reader.Value;
			AppendText (text);
			break;

		case XmlNodeType.EndElement:
			if (NodeBreaksTextAfter (reader.Name)) {
				AppendWhiteSpace ();
				AppendStructuralBreak ();
			}

			// Undo whatever the matching start tag did.
			bool is_hot = (bool) hot_nodes.Pop ();
			if (is_hot)
				HotDown ();

			if (NodeIsFreezing (reader.Name))
				FreezeDown ();
			if (reader.Name == "section")
				inSection = false;
			break;
		}

		num_elements++;
		if (num_elements >= total_elements) {
			return false;
		}
	}

	return true;
}
// Maps the "key" attribute of an AbiWord <m> metadata element to the name
// of the property it is stored under.  Returns null for a null key or for
// any key that is not extracted.
private static string MetadataPropertyName (string key)
{
	switch (key) {
	case "abiword.generator":	return "fixme:appname";
	case "abiword.keywords":	return "fixme:keywords";
	case "dc.description":		return "dc:description";
	case "dc.relation":		return "dc:relation";
	case "dc.rights":		return "dc:rights";
	case "dc.source":		return "dc:source";
	case "dc.contributor":		return "dc:contributor";
	case "dc.subject":		return "dc:subject";
	case "dc.creator":		return "dc:creator";
	case "dc.coverage":		return "dc:coverage";
	case "dc.type":			return "dc:type";
	case "dc.language":		return "dc:language";
	case "dc.title":		return "dc:title";
	case "dc.publisher":		return "dc:publisher";
	default:			return null;
	}
}

// Scans forward to the first <metadata> element and, for every nested
// <m key="..."> start tag whose key is known to MetadataPropertyName, reads
// the node that follows the start tag and adds its value as a property.
// Stops at </metadata>, or at the end of the input if no such end tag is
// seen.
//
// Empty elements are handled explicitly because they have no following
// content node and no end tag: an empty <m key="..."/> is skipped (reading
// past it would swallow the next sibling), and an empty <metadata/> ends
// the scan immediately.
private void ExtractMetadata (XmlTextReader reader)
{
	bool found = false;
	int depth = -1;

	while (reader.Read ()) {
		if (!found) {
			if (reader.Name == "metadata" && reader.NodeType == XmlNodeType.Element) {
				// Nothing inside, and no </metadata> will ever arrive.
				if (reader.IsEmptyElement)
					break;
				found = true;
				depth = reader.Depth;
			}
			continue;
		}

		if (reader.Name == "metadata" && reader.NodeType == XmlNodeType.EndElement)
			break;

		// Only <m> start tags nested inside <metadata> are of interest.
		if (reader.NodeType != XmlNodeType.Element
		    || reader.Name != "m"
		    || reader.Depth <= depth)
			continue;

		string property_name = MetadataPropertyName (reader.GetAttribute ("key"));
		if (property_name == null || reader.IsEmptyElement)
			continue;

		// Advance from the start tag to the node that follows it and
		// store that node's value.
		reader.Read ();
		AddProperty (Beagle.Property.New (property_name, reader.Value));
	}
}
// Opens the file at path for shared reading and returns an XmlTextReader
// over it.  When is_gzipped is set the file stream is first wrapped in a
// GZipInputStream.
private XmlTextReader BuildReader (string path)
{
	Stream raw = new FileStream (path, FileMode.Open, FileAccess.Read, FileShare.Read);

	if (!is_gzipped)
		return new XmlTextReader (raw);

	Stream inflated = new GZipInputStream (raw);
	return new XmlTextReader (inflated);
}
// Reader over the document content; assigned by DoOpen and consumed by
// DoPull.
XmlTextReader reader = null;

// Decides whether the file is gzip-compressed, resets hotStyles and opens
// the content reader.
//
// Any exception raised while probing (including failure to open the file)
// only clears is_gzipped; an exception from BuildReader itself is not
// caught here.
override protected void DoOpen (FileInfo info)
{
	// Try to open the file as if it is gzip.
	// If that fails, we conclude that it must
	// just be a regular text file full of xml.
	is_gzipped = true;
	Stream s = null;
	try {
		s = new FileStream (info.FullName, FileMode.Open, FileAccess.Read, FileShare.Read);
		Stream z = new GZipInputStream (s);
		z.ReadByte ();
	} catch (Exception) {
		is_gzipped = false;
	} finally {
		// Release the probe's file handle on both paths; previously it
		// stayed open whenever the probe threw, which is the normal
		// case for uncompressed documents.
		if (s != null)
			s.Close ();
	}

	hotStyles = new Hashtable ();
	reader = BuildReader (info.FullName);
}
// Extracts the document's metadata through a separate reader built by
// BuildReader, so the content reader held in the reader field is not
// advanced.  The metadata reader is closed on both the success and the
// failure path; on failure the filter is also marked finished and the
// error is logged.
override protected void DoPullProperties ()
{
	string path = FileInfo.FullName;
	XmlTextReader propsReader = BuildReader (path);

	try {
		ExtractMetadata (propsReader);
		propsReader.Close ();
	} catch (Exception ex) {
		propsReader.Close ();
		Finished ();
		Logger.Log.Error ("Exception occurred while reading meta-data from {0}", path);
		Logger.Log.Debug (ex);
	}
}
// Pulls the next batch of content through WalkContentNodes.  When that
// reports the input is exhausted, or when it throws, the reader is closed
// and the filter is marked finished; a failure is also logged.  With no
// reader at all the filter is finished immediately.
override protected void DoPull ()
{
	if (reader == null) {
		Finished ();
		return;
	}

	try {
		bool exhausted = WalkContentNodes (reader);
		if (!exhausted)
			return;

		reader.Close ();
		Finished ();
	} catch (Exception ex) {
		reader.Close ();
		Finished ();
		Logger.Log.Error ("Exception occurred while reading contents from {0}",
				  FileInfo.FullName);
		Logger.Log.Debug (ex);
	}
}