NoiseFilter: Don't drop last word of apparent hostnames. Too many non-hostnames can...
[beagle.git] / Filters / FilterOpenOffice.cs
blob67943e12bda63363b1151f57517ef791aadc5605
1 //
2 // FilterOpenOffice.cs
3 //
4 // Copyright (C) 2004 Novell, Inc.
5 //
7 //
8 // Permission is hereby granted, free of charge, to any person obtaining a
9 // copy of this software and associated documentation files (the "Software"),
10 // to deal in the Software without restriction, including without limitation
11 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 // and/or sell copies of the Software, and to permit persons to whom the
13 // Software is furnished to do so, subject to the following conditions:
15 // The above copyright notice and this permission notice shall be included in
16 // all copies or substantial portions of the Software.
18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 // DEALINGS IN THE SOFTWARE.
28 using System;
29 using System.Collections;
30 using System.IO;
31 using System.Text;
32 using System.Xml;
34 using Beagle.Util;
35 using Beagle.Daemon;
37 using ICSharpCode.SharpZipLib.Zip;
39 namespace Beagle.Filters {
41 public class FilterOpenOffice : Beagle.Daemon.Filter {
// Names of styles that StudyStyleNode has classified as hot.
private Hashtable hotStyles;
// True when the document's mime type is an OpenDocument (OO 2.0) one.
private bool odtFormat;

// Backing store for InterestingAttribute; built on first access.
private Hashtable attr_to_index = null;

// Maps an element name to the one attribute of that element whose
// value should be indexed as text:
//     table [node_name] = attribute_name
private Hashtable InterestingAttribute {
	get {
		if (attr_to_index == null) {
			Hashtable table = new Hashtable (1 /* Number of attributes */);
			table ["table:table"] = "table:name";
			attr_to_index = table;
		}

		return attr_to_index;
	}
}
// Registers every OpenOffice 1.0 and OpenDocument (OO 2.0) mime type this
// filter handles, then turns snippet mode on.
public FilterOpenOffice ()
{
	string[] supported_mime_types = {
		// OO 1.0 mime types
		"application/vnd.sun.xml.writer",
		"application/vnd.sun.xml.writer.template",
		"application/vnd.sun.xml.calc",
		"application/vnd.sun.xml.calc.template",
		"application/vnd.sun.xml.impress",
		"application/vnd.sun.xml.impress.template",
		"application/vnd.sun.xml.draw",
		"application/vnd.sun.xml.draw.template",

		// OO 2.0 mime types
		"application/vnd.oasis.opendocument.text",
		"application/vnd.oasis.opendocument.text-template",
		"application/vnd.oasis.opendocument.spreadsheet",
		"application/vnd.oasis.opendocument.spreadsheet-template",
		"application/vnd.oasis.opendocument.presentation",
		"application/vnd.oasis.opendocument.presentation-template",
		"application/vnd.oasis.opendocument.graphics",
		"application/vnd.oasis.opendocument.graphics-template"
	};

	// Registration order is the same as the order of the table above.
	foreach (string mime_type in supported_mime_types)
		AddSupportedFlavor (FilterFlavor.NewFromMimeType (mime_type));

	SnippetMode = true;
	odtFormat = false;
}
87 // Parse the "style" nodes and mark appropriate styles as *HOT*
88 // FIXME: Identify and ADD more *HOT* styles. ;)
// Examines one style definition element (the reader must be positioned on
// its start tag) and records the style in hotStyles when it looks emphasised:
// derived from a "Heading..." parent, named Footnote/Endnote/Header/Footer,
// or bold, italic or underlined.
//
// On return the reader is positioned on the element's end tag (or still on
// the start tag when the element is empty), exactly as before.
void StudyStyleNode (XmlReader reader)
{
	string style_name = reader.GetAttribute ("style:name");
	string style_parent = reader.GetAttribute ("style:parent-style-name");

	string weight = null;
	string underline = null;
	string italic = null;
	int original_depth = reader.Depth;

	// The children are always consumed, even for a style we end up
	// ignoring, so that the caller's reader position does not depend on
	// the outcome below.
	if (!reader.IsEmptyElement) {
		reader.Read ();
		while (reader.Depth > original_depth) {
			if (reader.NodeType == XmlNodeType.Element
			    && (reader.Name == "style:properties" ||
				reader.Name == "style:text-properties")) { /* ODT changes */
				weight = reader.GetAttribute ("fo:font-weight");
				italic = reader.GetAttribute ("fo:font-style");
				underline = reader.GetAttribute ("style:text-underline");
				// OpenDocument spells the underline attribute
				// differently from OOo 1.x; accept either.
				if (underline == null)
					underline = reader.GetAttribute ("style:text-underline-style");
			}
			reader.Read ();
		}
	}

	// A style without a name cannot be looked up by the content walker,
	// and Hashtable throws ArgumentNullException for a null key.
	if (style_name == null)
		return;

	bool derives_from_heading = style_parent != null && style_parent.StartsWith ("Heading");
	// Fixed identifiers: plain ordinal equality is what we want here.
	bool is_note_or_margin = style_name == "Footnote"
		|| style_name == "Endnote"
		|| style_name == "Header"
		|| style_name == "Footer";
	bool is_emphasised = weight == "bold"
		|| italic == "italic"
		|| (underline != null && underline != "none");

	if (derives_from_heading || is_note_or_margin || is_emphasised)
		hotStyles [style_name] = true;
}
// True for the hyperlink element, whose target URL gets indexed.
static bool NodeIsLink (String nodeName)
{
	switch (nodeName) {
	case "text:a":
		return true;
	default:
		return false;
	}
}
// True for elements that are hot by name alone (headings).
static bool NodeIsHot (String nodeName)
{
	switch (nodeName) {
	case "text:h":
		return true;
	default:
		return false;
	}
}
134 // These container tags allows multiple-lines of texts and
135 // all of them should be marked *HOT* and hence called Container ;-)
// True for elements whose whole (possibly multi-paragraph) content is hot.
static bool NodeIsHotContainer (String nodeName)
{
	switch (nodeName) {
	case "office:annotation":
	case "text:footnote":
	case "text:endnote":
	case "text:note": // "ODT format"
		return true;
	default:
		return false;
	}
}
// True for note citation marks; the walker freezes while inside them and
// does not index their text.
static bool NodeIsFreezing (String nodeName)
{
	switch (nodeName) {
	case "text:footnote-citation":
	case "text:endnote-citation":
	case "text:note-citation": // "ODT format"
		return true;
	default:
		return false;
	}
}
// True for elements that need white space emitted before their content.
static bool NodeBreaksTextBefore (String nodeName)
{
	switch (nodeName) {
	case "text:footnote":
	case "text:endnote":
	case "office:annotation":
	case "table:table":
	case "text:note": // "ODT format"
		return true;
	default:
		return false;
	}
}
// True for elements that need white space emitted after them.
static bool NodeBreaksTextAfter (String nodeName)
{
	switch (nodeName) {
	case "text:line-break":
	case "text:s":
	case "text:tab-stop":
	case "table:table-cell":
		return true;
	default:
		return false;
	}
}
// True for elements that are followed by a structural break.
static bool NodeBreaksStructureAfter (String nodeName)
{
	switch (nodeName) {
	case "text:p":
	case "text:h":
	case "text:footnote":
	case "text:endnote":
	case "office:annotation":
	case "text:note": // "ODT format"
		return true;
	default:
		return false;
	}
}
// Stack of bool flags. WalkContentNodes pushes one for each start element it
// does not skip (true when that element was judged hot) and pops one on each
// end element.
179 private Stack hot_nodes = new Stack ();
// Text from the last space of the most recent chunk onwards. It is held back
// here until AddTextForIndexing or FlushPartialText passes it to AppendText.
180 private string strPartText = "";
// Set to true while text is added with a hot element on top of hot_nodes;
// cleared again once the pending text has been written out.
181 private bool bPartHotStyle = false;
// Names of hot container elements, pushed by WalkContentNodes on their start tag.
182 private Stack hot_container_nodes = new Stack ();
// Feeds one chunk of text to the index while keeping together a word that
// was split across two chunks: everything after the last space of a chunk
// is kept in strPartText and only appended once the rest of the word (or a
// flush) arrives.
// NOTE(review): brace-only lines are missing from this listing, so the exact
// nesting of the branches below should be confirmed against the original file.
184 void AddTextForIndexing (string paramStr)
186 int sindex = 0;
187 string strTemp;
// Becomes true when the filter was already hot before the carried-over word
// is appended, so that the HotDown below is skipped in that case.
188 bool wasHot = false;
// Position of the last space in the chunk; -1 when the chunk has no space.
189 int index = paramStr.LastIndexOf (' ');
191 if (index > -1) {
192 // During the previous parse a word was cut off part-way: find the
193 // remaining part of the word, concatenate it, add it to the
194 // respective pools and reset the HOT status, if required.
195 if (strPartText.Length > 0) {
// The carried-over word ends at the first space of this chunk.
196 sindex = paramStr.IndexOf (' ');
197 strTemp = strPartText + paramStr.Substring (0, sindex);
198 if (!IsHot) {
199 if (bPartHotStyle)
200 HotUp ();
202 else
203 wasHot = true;
205 AppendText (strTemp);
// Only undo the HotUp issued just above.
206 if (!wasHot && bPartHotStyle)
207 HotDown ();
208 bPartHotStyle = false;
// Drop the part already appended; the remainder starts at the space itself.
210 paramStr = paramStr.Substring (sindex);
211 index = paramStr.LastIndexOf (' ');
212 sindex = 0;
215 if (index > -1) {
// Hold back everything from the last space on as the new partial text.
216 strPartText = paramStr.Substring (index);
// sindex is 0 on every path through this method that reaches this line.
217 paramStr = paramStr.Substring (sindex, index);
218 } else {
// No space at all: the whole chunk extends the pending partial text.
219 strTemp = strPartText + paramStr;
220 strPartText = strTemp;
221 paramStr = "";
222 strTemp = "";
225 // Enable *HOT* just before appending the text
226 // because, there can be some *Partial Texts* without
227 // *HOT* styles that needs to be appended.
228 if (hot_nodes.Count > 0 && (bool) hot_nodes.Peek() == true) {
229 if (!IsHot)
230 HotUp ();
231 bPartHotStyle = true;
232 } else
// OR-ing with false leaves the flag unchanged.
233 bPartHotStyle |= false;
235 if (paramStr.Length > 0)
236 AppendText (paramStr);
// Nothing but white space is pending, so there is no hot state to remember.
238 if (strPartText.Trim().Length < 1)
239 bPartHotStyle = false;
// Indexes attr_value as text when attr_name is the attribute registered
// for node_name in InterestingAttribute. Null and empty values are ignored.
void IndexAttribute (string node_name, string attr_name, string attr_value)
{
	bool has_value = attr_value != null && attr_value.Length != 0;
	if (has_value) {
		string wanted_attribute = (string) InterestingAttribute [node_name];
		if (wanted_attribute == attr_name)
			AppendText (attr_value);
	}
}
// Writes out any text still held back in strPartText and clears the pending
// state. WalkContentNodes calls this at element boundaries that end a word.
// NOTE(review): brace-only lines are missing from this listing, so it is not
// visible whether the final bPartHotStyle reset sits inside or after the
// "if" block - confirm against the original file.
251 void FlushPartialText ()
253 if (strPartText.Length > 0) {
// Re-enter hot mode if the pending text was collected under a hot node.
254 if (bPartHotStyle && !IsHot)
255 HotUp ();
256 AppendText (strPartText);
// NOTE(review): this also calls HotDown when the filter was already hot
// before the flush, not only after the HotUp above - confirm that is intended.
257 if (IsHot)
258 HotDown ();
259 strPartText = "";
261 bPartHotStyle = false;
// Reads a batch of nodes from reader and feeds them to the indexer: text goes
// through AddTextForIndexing, while start and end tags drive the hot/frozen
// state and the white space and structural breaks.
// Returns false once total_elements nodes have been counted in this call (the
// reader has not reported its end yet), and true when reader.Read () returns
// false, i.e. the document is exhausted.
// NOTE(review): brace-only lines are missing from this listing; confirm the
// nesting assumed by these comments against the original file.
265 bool WalkContentNodes (XmlReader reader)
267 // total number of elements to read per-pull
268 const int total_elements = 10;
269 int num_elements = 0;
271 while (reader.Read ()) {
272 switch (reader.NodeType) {
273 case XmlNodeType.Element:
// Empty elements have no end tag, so their breaks are emitted right here.
// Nothing is pushed on hot_nodes for them.
274 if (reader.IsEmptyElement) {
275 FlushPartialText ();
276 if (NodeBreaksStructureAfter (reader.Name))
277 AppendStructuralBreak ();
278 else {
279 if (NodeBreaksTextBefore (reader.Name))
280 AppendWhiteSpace ();
282 if (NodeBreaksTextAfter (reader.Name))
283 AppendWhiteSpace ();
285 continue;
288 // FIXME: Allow adding more style nodes
// Style definitions are consumed by StudyStyleNode and never indexed as content.
289 if (reader.Name == "style:style"
290 || reader.Name == "number:date-style"
291 || reader.Name == "style:font-decl") {
292 StudyStyleNode (reader);
293 continue;
296 // A node is hot if:
297 // (1) Its name is hot
298 // (2) It is flagged with a hot style
299 // (3) annotations are always hot.
301 bool isHot = false;
303 if (NodeIsHot (reader.Name)) {
304 isHot = true;
305 } else if (NodeIsHotContainer (reader.Name)) {
306 hot_container_nodes.Push (reader.Name);
307 } else if (NodeIsLink (reader.Name)) {
// The link target itself is indexed as text.
// NOTE(review): attr is null when the element has no xlink:href attribute;
// confirm that AppendText accepts null.
308 string attr = reader.GetAttribute ("xlink:href");
309 AppendText (attr);
310 AppendWhiteSpace ();
311 isHot = false;
312 } else {
// Walk the attributes: stop at the first "...:style-name" attribute (hot when
// its value is a known hot style) and offer every attribute seen before it to
// IndexAttribute.
313 string node_name = reader.Name;
314 bool has_attr = reader.MoveToFirstAttribute ();
315 while (has_attr) {
316 if (reader.Name.EndsWith(":style-name")) {
317 if (hotStyles.Contains (reader.Value))
318 isHot = true;
319 break;
320 } else
321 IndexAttribute (node_name, reader.Name, reader.Value);
323 has_attr = reader.MoveToNextAttribute ();
// Return to the element so that reader.Name below is the element name again.
325 reader.MoveToElement();
// One flag per opened element; the EndElement case pops it.
328 hot_nodes.Push (isHot);
330 // Call *HotUp* iff
331 // i) It is not already in *HOT* mode and
332 // ii) there is a hot style/hot container tag
333 if (!IsHot && (isHot || hot_container_nodes.Count > 0))
334 HotUp ();
336 if (NodeIsFreezing (reader.Name)) {
337 FreezeUp ();
340 if (NodeBreaksTextBefore (reader.Name)) {
341 FlushPartialText ();
342 AppendWhiteSpace ();
344 break;
346 case XmlNodeType.Text:
347 string text = reader.Value;
348 if (text.Length < 1)
349 continue;
// Text seen while frozen is dropped.
350 if (!IsFrozen)
351 AddTextForIndexing (text);
352 break;
354 case XmlNodeType.EndElement:
355 if (NodeBreaksStructureAfter (reader.Name)) {
356 FlushPartialText ();
357 AppendStructuralBreak ();
359 else if (NodeBreaksTextAfter (reader.Name))
360 AppendWhiteSpace ();
// Retrieve the flag pushed when this element was opened.
362 bool is_hot = false;
363 if (hot_nodes.Count > 0)
364 is_hot = (bool) hot_nodes.Pop ();
365 else
366 Logger.Log.Debug ("FilterOpenOffice: hot_nodes underflow in {0}",
367 reader.Name);
368 if (hot_container_nodes.Count > 0) {
369 string hot_container_tag = (string) hot_container_nodes.Peek ();
370 if (hot_container_tag == reader.Name) {
// NOTE(review): Clear () drops every open container, not just the innermost
// one - confirm that nested hot containers cannot occur.
371 hot_container_nodes.Clear ();
372 HotDown ();
376 if (is_hot)
377 HotDown ();
379 if (NodeIsFreezing (reader.Name)) {
380 FreezeDown ();
382 break;
// Nodes that left the switch through "continue" above are not counted.
384 num_elements++;
385 if (num_elements >= total_elements) {
386 return false;
389 return true;
392 // SlideCount is not stored in meta.xml rather we need to
393 // parse the whole of content.xml to find out the count of
394 // slides present in an .sxi.
// Scans content.xml for draw:page elements and, when at least one carries a
// draw:id attribute, publishes the id of the last such page as the
// "fixme:slide-count" property.
//
// Returns without adding anything when the root element is not flagged with
// office:class="presentation", or when the document ends before any node at
// depth 2 is reached.
// NOTE(review): only documents carrying office:class on the root element are
// handled; confirm whether OpenDocument presentations need separate support.
private void ExtractSlideCount (XmlReader reader)
{
	string slideCount = null;

	// Advance to the first node at depth 2. The result of Read () has to be
	// checked: at the end of the document Depth stays at 0, so looping on
	// the depth alone would never terminate for a truncated or trivially
	// small document.
	while (true) {
		if (!reader.Read ())
			return;

		// Do not parse the whole file if it is not a
		// presentation (impress document)
		if (reader.NodeType == XmlNodeType.Element
		    && reader.Name == "office:document-content") {
			string docClass = reader.GetAttribute ("office:class");
			if (docClass != "presentation")
				return;
		}

		if (reader.Depth >= 2)
			break;
	}

	// Depth drops to 0 once the root is closed or the document ends, which
	// terminates this loop.
	while (reader.Depth >= 1) {
		bool is_depth2_element = reader.Depth == 2
			&& reader.NodeType == XmlNodeType.Element;

		// Keep the id of the most recent page that has one.
		if (is_depth2_element && reader.Name == "draw:page")
			slideCount = reader.GetAttribute ("draw:id");

		reader.Read ();
	}

	if (slideCount != null)
		AddProperty (Beagle.Property.NewUnsearched ("fixme:slide-count", slideCount));
}
// Reads meta.xml and turns the elements found at depth 2 into properties:
// title, description, subject, keywords, user-defined fields and the
// page/word/table statistics.
//
// Returns without adding anything when the document ends before any node at
// depth 2 is reached.
private void ExtractMetadata (XmlReader reader)
{
	// Advance to the first node at depth 2. The result of Read () has to be
	// checked: at the end of the document Depth stays at 0, so looping on
	// the depth alone would never terminate.
	do {
		if (!reader.Read ())
			return;
	} while (reader.Depth < 2);

	// Depth drops below 2 when the metadata container is closed or the
	// document ends, which terminates this loop.
	while (reader.Depth >= 2) {
		if (reader.Depth != 2 || reader.NodeType != XmlNodeType.Element) {
			reader.Read ();
			continue;
		}

		switch (reader.Name) {
		case "dc:title":
			AddProperty (Beagle.Property.New ("dc:title",
							  ReadElementText (reader)));
			break;

		case "dc:description":
			AddProperty (Beagle.Property.New ("dc:description",
							  ReadElementText (reader)));
			break;

		case "dc:subject":
			AddProperty (Beagle.Property.New ("dc:subject",
							  ReadElementText (reader)));
			break;

		case "meta:document-statistic":
			string attr = reader.GetAttribute ("meta:page-count");
			if (attr != null)
				AddProperty (Beagle.Property.NewUnsearched ("fixme:page-count", attr));
			attr = reader.GetAttribute ("meta:word-count");
			if (attr != null)
				AddProperty (Beagle.Property.NewUnsearched ("fixme:word-count", attr));

			// Both writer and calc uses this attribute. writer stores the
			// count of tables in a sxw whereas calc stores the count of
			// spreadsheets in a sxc.
			attr = reader.GetAttribute ("meta:table-count");
			if (attr != null && MimeType == "application/vnd.sun.xml.calc") {
				int table_count = 0;
				try {
					table_count = Convert.ToInt32 (attr);
				} catch (FormatException) {
					// Malformed count: treat it as "no spreadsheets"
					// instead of aborting the whole extraction.
				} catch (OverflowException) {
					// Same for a value that does not fit in an int.
				}
				if (table_count > 0)
					AddProperty (Beagle.Property.NewUnsearched ("fixme:spreadsheet-count", attr));
			}
			break;

		case "meta:user-defined":
			string name = reader.GetAttribute ("meta:name");
			string user_value = ReadElementText (reader);
			if (user_value != "") {
				AddProperty (Beagle.Property.New ("fixme:UserDefined-" + name,
								  user_value));
			}
			break;

		case "meta:keyword":
			AddProperty (Beagle.Property.New ("fixme:keywords",
							  ReadElementText (reader)));
			break;
		}

		reader.Read ();
	}
}

// Returns the value of the node that follows the start tag the reader is
// positioned on, moving the reader onto that node. For a self-closing
// element the reader is left where it is and "" is returned, so that the
// next sibling is neither mistaken for the value nor skipped by the caller.
private static string ReadElementText (XmlReader reader)
{
	if (reader.IsEmptyElement)
		return "";

	if (!reader.Read ())
		return "";

	return reader.Value;
}
// The opened document archive; stays null when DoOpen could not open it.
ZipFile zip = null;

// Opens the document as a zip archive, resets the hot style table and notes
// whether the mime type is an OpenDocument one. When anything in that
// sequence throws, the failure is logged and the filter is marked finished.
protected override void DoOpen (FileInfo info)
{
	hotStyles = new Hashtable ();

	try {
		zip = new ZipFile (info.FullName);

		bool is_opendocument = MimeType.StartsWith ("application/vnd.oasis.opendocument.");
		if (is_opendocument)
			odtFormat = true;
	} catch (Exception) {
		Logger.Log.Error ("Unable to open {0}. Probably an invalid OpenOffice document.",
				  info.FullName);
		Finished ();
	}
}
// Extracts document properties: the metadata from meta.xml and the slide
// count from content.xml. A missing entry is logged and skipped. Does
// nothing when no archive is open.
// NOTE(review): the readers created here are never closed; confirm whether
// closing an entry stream is safe with the bundled SharpZipLib before
// changing that.
protected override void DoPullProperties ()
{
	if (zip == null)
		return;

	ZipEntry entry = zip.GetEntry ("meta.xml");
	if (entry == null) {
		Logger.Log.Error ("No meta.xml!");
	} else {
		Stream meta_stream = zip.GetInputStream (entry);
		XmlReader meta_reader = new XmlTextReader (meta_stream);
		ExtractMetadata (meta_reader);
	}

	entry = zip.GetEntry ("content.xml");
	if (entry == null) {
		Logger.Log.Error ("No content.xml!");
	} else {
		Stream contents_stream = zip.GetInputStream (entry);
		XmlReader contents_reader = new XmlTextReader (contents_stream);
		ExtractSlideCount (contents_reader);
	}
}
// Readers over content.xml and styles.xml; both are created together on the
// first DoPull call and then reused by later calls.
XmlReader content_reader = null;
XmlReader style_reader = null;

// Indexes the next batch of document text. Marks the filter finished when
// the readers could not be created, or once both styles.xml and content.xml
// have been walked to their end.
protected override void DoPull ()
{
	bool readers_missing = (content_reader == null) && (style_reader == null);

	// We need both styles.xml and content.xml as
	// "Header", "Footer" are stored in styles.xml and
	// "[Foot/End]Notes are stored in content.xml
	if (zip != null && readers_missing) {
		ZipEntry content_entry = zip.GetEntry ("content.xml");
		ZipEntry style_entry = zip.GetEntry ("styles.xml");

		if (content_entry != null && style_entry != null) {
			Stream content_stream = zip.GetInputStream (content_entry);
			Stream style_stream = zip.GetInputStream (style_entry);
			content_reader = new XmlTextReader (content_stream);
			style_reader = new XmlTextReader (style_stream);
		}
	}

	if ((content_reader == null) && (style_reader == null)) {
		Finished ();
		return;
	}

	// Note: Do not change the order.
	// The hotStyles table has to be populated with all possible hot styles
	// first: styles.xml is walked to its end before content.xml is touched,
	// because the content walk looks styles up in that table.
	bool styles_done = WalkContentNodes (style_reader);
	if (styles_done && WalkContentNodes (content_reader))
		Finished ();
}
// Closes the document archive, if one was opened.
protected override void DoClose ()
{
	if (zip == null)
		return;

	zip.Close ();
}