Compute lucene-style scores for our hits.
[beagle.git] / Filters / FilterOpenOffice.cs
blobf1e136238774336c1b6d7a1c56d432c7e20ddf68
1 //
2 // FilterOpenOffice.cs
3 //
4 // Copyright (C) 2004 Novell, Inc.
5 //
7 //
8 // Permission is hereby granted, free of charge, to any person obtaining a
9 // copy of this software and associated documentation files (the "Software"),
10 // to deal in the Software without restriction, including without limitation
11 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 // and/or sell copies of the Software, and to permit persons to whom the
13 // Software is furnished to do so, subject to the following conditions:
15 // The above copyright notice and this permission notice shall be included in
16 // all copies or substantial portions of the Software.
18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 // DEALINGS IN THE SOFTWARE.
28 using System;
29 using System.Collections;
30 using System.IO;
31 using System.Text;
32 using System.Xml;
34 using Beagle.Util;
35 using Beagle.Daemon;
37 using ICSharpCode.SharpZipLib.Zip;
39 namespace Beagle.Filters {
41 public class FilterOpenOffice : Beagle.Daemon.Filter {
43 Hashtable hotStyles;
44 bool odtFormat;
// Registers every OpenOffice.org 1.0 and OpenDocument (OO 2.0) mime type
// this filter handles, then sets the filter's initial state.
public FilterOpenOffice ()
{
	string[] supported_mime_types = new string[] {
		// OO 1.0 mime types
		"application/vnd.sun.xml.writer",
		"application/vnd.sun.xml.writer.template",
		"application/vnd.sun.xml.calc",
		"application/vnd.sun.xml.calc.template",
		"application/vnd.sun.xml.impress",
		"application/vnd.sun.xml.impress.template",

		// OO 2.0 mime types
		"application/vnd.oasis.opendocument.text",
		"application/vnd.oasis.opendocument.text-template",
		"application/vnd.oasis.opendocument.spreadsheet",
		"application/vnd.oasis.opendocument.spreadsheet-template",
		"application/vnd.oasis.opendocument.presentation",
		"application/vnd.oasis.opendocument.presentation-template"
	};

	// Flavors are registered in the order listed above.
	foreach (string mime_type in supported_mime_types)
		AddSupportedFlavor (FilterFlavor.NewFromMimeType (mime_type));

	SnippetMode = true;
	odtFormat = false;
}
// Returns the value of attribute `attributeName` on the first direct child
// of `node` whose name equals `nodeName`.  Returns null when there is no
// such child, or when that first matching child lacks the attribute
// (later siblings with the same name are not consulted).
static String FindChildAttribute (XmlNode node,
				  String nodeName,
				  String attributeName)
{
	foreach (XmlNode child in node.ChildNodes) {
		if (child.Name != nodeName)
			continue;

		XmlAttribute match = child.Attributes [attributeName];
		return (match != null) ? match.Value : null;
	}

	return null;
}
// Parse a "style:style" node and mark appropriate styles as *HOT*.
// FIXME: Identify and ADD more *HOT* styles. ;)
//
// The reader must be positioned on the "style:style" start tag.  For a
// non-empty element the reader is advanced until its depth drops back to
// the depth of that start tag; for an empty element the reader is not
// moved.
//
// A style is recorded in hotStyles when any of these holds:
//   - its parent style name starts with "Heading",
//   - it is named "Footnote", "Endnote", "Header" or "Footer",
//   - its text properties are bold, italic or underlined.
// Styles without a "style:name" attribute are skipped: there is no key to
// record them under, and a null Hashtable key would throw.
void StudyStyleNode (XmlReader reader)
{
	string style_name = reader.GetAttribute ("style:name");
	string style_parent = reader.GetAttribute ("style:parent-style-name");

	string weight = null;
	string underline = null;
	string italic = null;
	int original_depth = reader.Depth;

	if (!reader.IsEmptyElement) {
		reader.Read ();
		while (reader.Depth > original_depth) {
			if (reader.NodeType == XmlNodeType.Element
			    && (reader.Name == "style:properties" ||
				reader.Name == "style:text-properties")) { /* ODT changes */
				weight = reader.GetAttribute ("fo:font-weight");
				italic = reader.GetAttribute ("fo:font-style");
				underline = reader.GetAttribute ("style:text-underline");
			}
			reader.Read ();
		}
	}

	// Checked only after the children were consumed, so the reader ends
	// up in the same position whether or not the style is recorded.
	if (style_name == null)
		return;

	bool derives_from_heading = style_parent != null
		&& style_parent.StartsWith ("Heading");

	// Ordinal comparison: these are fixed identifiers, not user text.
	bool is_note_or_page_furniture = style_name == "Footnote"
		|| style_name == "Endnote"
		|| style_name == "Header"
		|| style_name == "Footer";

	bool is_emphasized = weight == "bold"
		|| italic == "italic"
		|| (underline != null && underline != "none");

	if (derives_from_heading || is_note_or_page_furniture || is_emphasized)
		hotStyles [style_name] = true;
}
// True when the element name denotes a hyperlink ("text:a").
static bool NodeIsLink (String nodeName)
{
	return String.Equals (nodeName, "text:a");
}
// True when the element name itself marks hot text ("text:h").
static bool NodeIsHot (String nodeName)
{
	return String.Equals (nodeName, "text:h");
}
// These container tags allow multiple lines of text, all of which
// should be marked *HOT* -- hence "container" ;-)
static bool NodeIsHotContainer (String nodeName)
{
	switch (nodeName) {
	case "office:annotation":
	case "text:footnote":
	case "text:endnote":
	case "text:note": // "ODT format"
		return true;
	default:
		return false;
	}
}
// True for note-citation elements; WalkContentNodes calls FreezeUp on
// their start tag and FreezeDown on their end tag.
static bool NodeIsFreezing (String nodeName)
{
	switch (nodeName) {
	case "text:footnote-citation":
	case "text:endnote-citation":
	case "text:note-citation": // "ODT format"
		return true;
	default:
		return false;
	}
}
// True for elements that get whitespace emitted before their content.
static bool NodeBreaksTextBefore (String nodeName)
{
	switch (nodeName) {
	case "text:footnote":
	case "text:endnote":
	case "office:annotation":
	case "text:note": // "ODT format"
		return true;
	default:
		return false;
	}
}
// True for elements that get whitespace emitted after them.
static bool NodeBreaksTextAfter (String nodeName)
{
	switch (nodeName) {
	case "text:s":
	case "text:tab-stop":
	case "table:table-cell":
		return true;
	default:
		return false;
	}
}
// True for elements that are followed by a structural break.
static bool NodeBreaksStructureAfter (String nodeName)
{
	switch (nodeName) {
	case "text:p":
	case "text:h":
	case "text:footnote":
	case "text:endnote":
	case "office:annotation":
	case "text:note": // "ODT format"
		return true;
	default:
		return false;
	}
}
173 private Stack hot_nodes = new Stack ();
174 private string strPartText = "";
175 private bool bPartHotStyle = false;
176 private Stack hot_container_nodes = new Stack ();
// Feeds one XML text node to the indexer while keeping words that are
// split across adjacent text nodes together.
//
// Everything from the last space of `paramStr` onward is held back in
// strPartText (the word may continue in the next text node); the part
// before it is appended right away.  bPartHotStyle remembers whether the
// held-back fragment was seen under a hot node, so that it can be emitted
// hot later on.
void AddTextForIndexing (string paramStr)
{
	int last_space = paramStr.LastIndexOf (' ');

	if (last_space > -1) {
		int head_end = 0;

		// A word was cut short by the previous text node: complete it
		// with everything up to the first space of this node, emit it
		// (hot if the fragment was hot) and restore the hot state.
		if (strPartText.Length > 0) {
			head_end = paramStr.IndexOf (' ');
			string completed_word = strPartText + paramStr.Substring (0, head_end);

			bool already_hot = IsHot;
			if (!already_hot && bPartHotStyle)
				HotUp ();

			AppendText (completed_word);

			if (!already_hot && bPartHotStyle)
				HotDown ();
			bPartHotStyle = false;
		}

		// Continue with the remainder, which starts at the first space
		// when a fragment was completed above.
		paramStr = paramStr.Substring (head_end);
		last_space = paramStr.LastIndexOf (' ');
	}

	if (last_space > -1) {
		// Hold back the trailing (possibly incomplete) word.
		strPartText = paramStr.Substring (last_space);
		paramStr = paramStr.Substring (0, last_space);
	} else {
		// No space at all: the whole node extends the pending fragment.
		strPartText = strPartText + paramStr;
		paramStr = "";
	}

	// Enable *HOT* just before appending the text, because there can be
	// some partial texts without *HOT* styles that need to be appended
	// first.
	if (hot_nodes.Count > 0 && (bool) hot_nodes.Peek ()) {
		if (!IsHot)
			HotUp ();
		bPartHotStyle = true;
	}

	if (paramStr.Length > 0)
		AppendText (paramStr);

	// A whitespace-only fragment carries no hot state forward.
	if (strPartText.Trim ().Length < 1)
		bPartHotStyle = false;
}
// Emits the word fragment held back by AddTextForIndexing, if any, and
// clears the pending-fragment state.
void FlushPartialText ()
{
	if (strPartText.Length == 0) {
		bPartHotStyle = false;
		return;
	}

	// Emit the fragment hot when it was collected under a hot node.
	if (bPartHotStyle && !IsHot)
		HotUp ();

	AppendText (strPartText);

	if (IsHot)
		HotDown ();

	strPartText = "";
	bPartHotStyle = false;
}
// Walks the next batch of nodes from `reader`, feeding text to the indexer
// and tracking which text is hot or frozen.
//
// Returns false once `total_elements` nodes have reached the end of the
// switch (more input may remain; call again), and true when the reader
// has no more nodes.  Nodes handled through `continue` (empty elements,
// "style:style" elements, empty text) do not count towards the batch.
bool WalkContentNodes (XmlReader reader)
{
	// total number of elements to read per-pull
	const int total_elements = 10;
	int num_elements = 0;

	while (reader.Read ()) {
		switch (reader.NodeType) {
		case XmlNodeType.Element:
			if (reader.IsEmptyElement) {
				FlushPartialText ();

				// An empty <style:style .../> can still be hot
				// through its name or its parent style, so study
				// it too.  StudyStyleNode does not move the
				// reader for empty elements.
				if (reader.Name == "style:style")
					StudyStyleNode (reader);

				if (NodeBreaksStructureAfter (reader.Name))
					AppendStructuralBreak ();
				else {
					if (NodeBreaksTextBefore (reader.Name))
						AppendWhiteSpace ();

					if (NodeBreaksTextAfter (reader.Name))
						AppendWhiteSpace ();
				}
				continue;
			}

			if (reader.Name == "style:style") {
				// Consumes the style's children; the reader is
				// left at the depth of the style element, so no
				// entry is pushed on hot_nodes for it.
				StudyStyleNode (reader);
				continue;
			}

			// A node is hot if:
			// (1) Its name is hot
			// (2) It is flagged with a hot style
			// (3) annotations are always hot.

			bool isHot = false;

			if (NodeIsHot (reader.Name)) {
				isHot = true;
			} else if (NodeIsHotContainer (reader.Name)) {
				hot_container_nodes.Push (reader.Name);
			} else if (NodeIsLink (reader.Name)) {
				// Index the link target; a link without an
				// xlink:href attribute has no target to add.
				string attr = reader.GetAttribute ("xlink:href");
				if (attr != null)
					AppendText (attr);
				AppendWhiteSpace ();
			} else {
				// Look for the first "*:style-name" attribute
				// and check it against the known hot styles.
				bool has_attr = reader.MoveToFirstAttribute ();
				while (has_attr) {
					if (reader.Name.EndsWith (":style-name")) {
						if (hotStyles.Contains (reader.Value))
							isHot = true;
						break;
					}
					has_attr = reader.MoveToNextAttribute ();
				}
				// Return to the element so reader.Name below is
				// the element name again.
				reader.MoveToElement ();
			}

			// Popped again by the matching EndElement.
			hot_nodes.Push (isHot);

			// Call *HotUp* iff
			// i) we are not already in *HOT* mode and
			// ii) there is a hot style/hot container tag
			if (!IsHot && (isHot || hot_container_nodes.Count > 0))
				HotUp ();

			if (NodeIsFreezing (reader.Name)) {
				FreezeUp ();
			}

			if (NodeBreaksTextBefore (reader.Name)) {
				FlushPartialText ();
				AppendWhiteSpace ();
			}
			break;

		case XmlNodeType.Text:
			string text = reader.Value;
			if (text.Length < 1)
				continue;
			if (!IsFrozen)
				AddTextForIndexing (text);
			break;

		case XmlNodeType.EndElement:
			if (NodeBreaksStructureAfter (reader.Name)) {
				FlushPartialText ();
				AppendStructuralBreak ();
			}
			else if (NodeBreaksTextAfter (reader.Name))
				AppendWhiteSpace ();

			bool is_hot = false;
			if (hot_nodes.Count > 0)
				is_hot = (bool) hot_nodes.Pop ();
			else
				Logger.Log.Debug ("FilterOpenOffice: hot_nodes underflow in {0}",
						  reader.Name);

			// Leaving the innermost hot container ends the
			// container-driven hot mode.
			if (hot_container_nodes.Count > 0) {
				string hot_container_tag = (string) hot_container_nodes.Peek ();
				if (hot_container_tag == reader.Name) {
					hot_container_nodes.Clear ();
					HotDown ();
				}
			}

			if (is_hot)
				HotDown ();

			if (NodeIsFreezing (reader.Name)) {
				FreezeDown ();
			}
			break;
		}

		num_elements++;
		if (num_elements >= total_elements) {
			return false;
		}
	}

	return true;
}
// SlideCount is not stored in meta.xml; rather we need to parse the whole
// of content.xml to find out the count of slides present in an .sxi.
//
// Returns without adding a property when the document is not marked as a
// presentation, or when the input ends before any depth-2 node is
// reached.  Otherwise the "draw:id" attribute of the last depth-2
// "draw:page" element is stored as "fixme:slide-count".
private void ExtractSlideCount (XmlReader reader)
{
	string slideCount = null;
	reader.Read ();
	do {
		// Read () returns false at end of input and Depth then stays
		// at 0, so without this check the loop would never terminate
		// on a document that has no depth-2 nodes.
		if (!reader.Read ())
			return;

		// Do not parse the whole file if it is not a
		// presentation (impress document)
		if (reader.Name == "office:document-content"
		    && reader.NodeType == XmlNodeType.Element) {
			string docClass = reader.GetAttribute ("office:class");
			if (docClass != "presentation")
				return;
		}
	} while (reader.Depth < 2);

	while (reader.Depth >= 1) {
		if (reader.Depth != 2 || reader.NodeType != XmlNodeType.Element) {
			reader.Read ();
			continue;
		}

		switch (reader.Name) {
		case "draw:page":
			// Each page overwrites the previous value, so the last
			// page's id is what gets reported.
			slideCount = reader.GetAttribute ("draw:id");
			break;
		}
		reader.Read ();
	}

	if (slideCount != null)
		AddProperty (Beagle.Property.NewKeyword ("fixme:slide-count", slideCount));
}
// Reads meta.xml and turns the depth-2 elements it knows about into
// properties: dc:title, dc:description, dc:subject, the page/word/table
// counts from meta:document-statistic, and meta:user-defined entries.
// Returns without adding anything when the input ends before a depth-2
// node is reached.
private void ExtractMetadata (XmlReader reader)
{
	do {
		// Read () returns false at end of input and Depth then stays
		// at 0, so without this check the loop would never terminate
		// on a document that has no depth-2 nodes.
		if (!reader.Read ())
			return;
	} while (reader.Depth < 2);

	while (reader.Depth >= 2) {
		if (reader.Depth != 2 || reader.NodeType != XmlNodeType.Element) {
			reader.Read ();
			continue;
		}

		switch (reader.Name) {
		case "dc:title":
			AddProperty (Beagle.Property.New ("dc:title",
							  ReadMetaElementText (reader)));
			break;

		case "dc:description":
			AddProperty (Beagle.Property.New ("dc:description",
							  ReadMetaElementText (reader)));
			break;

		case "dc:subject":
			AddProperty (Beagle.Property.New ("dc:subject",
							  ReadMetaElementText (reader)));
			break;

		case "meta:document-statistic":
			string attr = reader.GetAttribute ("meta:page-count");
			if (attr != null)
				AddProperty (Beagle.Property.NewKeyword ("fixme:page-count", attr));
			attr = reader.GetAttribute ("meta:word-count");
			if (attr != null)
				AddProperty (Beagle.Property.NewKeyword ("fixme:word-count", attr));

			// Both writer and calc use this attribute.  writer stores the
			// count of tables in a sxw whereas calc stores the count of
			// spreadsheets in a sxc.
			attr = reader.GetAttribute ("meta:table-count");
			if (attr != null && IsPositiveCount (attr)
			    && MimeType == "application/vnd.sun.xml.calc")
				AddProperty (Beagle.Property.NewKeyword ("fixme:spreadsheet-count", attr));
			break;

		case "meta:user-defined":
			string name = reader.GetAttribute ("meta:name");
			string user_value = ReadMetaElementText (reader);

			if (user_value != "") {
				AddProperty (Beagle.Property.New ("fixme:UserDefined-" + name,
								  user_value));
			}
			break;
		}

		reader.Read ();
	}
}

// Returns the value of the node that follows the current start tag, or ""
// for an empty element (<x/>).  An empty element is deliberately not
// advanced past: the next node is its sibling, and the caller's own
// Read () would then skip that sibling.
private static string ReadMetaElementText (XmlReader reader)
{
	if (reader.IsEmptyElement)
		return "";

	reader.Read ();
	return reader.Value;
}

// True when `text` parses as an integer greater than zero; malformed or
// out-of-range values count as "not positive" instead of throwing.
private static bool IsPositiveCount (string text)
{
	try {
		return Convert.ToInt32 (text) > 0;
	} catch (FormatException) {
		return false;
	} catch (OverflowException) {
		return false;
	}
}
471 ZipFile zip = null;
// Opens the document as a zip archive and resets the table of hot styles.
// On any failure the error is logged, including the underlying exception
// message, and Finished () is called.
override protected void DoOpen (FileInfo info)
{
	hotStyles = new Hashtable ();
	try {
		zip = new ZipFile (info.FullName);

		if (MimeType.StartsWith ("application/vnd.oasis.opendocument."))
			odtFormat = true;

	} catch (Exception e) {
		// Keep the cause in the log; "invalid document" alone does not
		// distinguish a corrupt archive from e.g. a permission error.
		Logger.Log.Error ("Unable to open {0}. Probably an invalid OpenOffice document. ({1})",
				  info.FullName, e.Message);
		Finished ();
	}
}
// Extracts document properties: metadata from meta.xml and the slide
// count from content.xml.  Does nothing when no archive is open.  Each
// XML reader is closed once its extraction is done, even if the
// extraction throws, so the entry streams do not stay open.
override protected void DoPullProperties ()
{
	if (zip == null)
		return;

	ZipEntry entry = zip.GetEntry ("meta.xml");
	if (entry != null) {
		Stream meta_stream = zip.GetInputStream (entry);
		XmlReader reader = new XmlTextReader (meta_stream);
		try {
			ExtractMetadata (reader);
		} finally {
			reader.Close ();
		}
	} else {
		Logger.Log.Error ("No meta.xml!");
	}

	entry = zip.GetEntry ("content.xml");
	if (entry != null) {
		Stream contents_stream = zip.GetInputStream (entry);
		XmlReader reader = new XmlTextReader (contents_stream);
		try {
			ExtractSlideCount (reader);
		} finally {
			reader.Close ();
		}
	} else {
		Logger.Log.Error ("No content.xml!");
	}
}
511 XmlReader content_reader = null;
512 XmlReader style_reader = null;
// Pulls the next batch of text.  On the first call the readers for
// content.xml and styles.xml are created (only when both entries exist);
// every call then walks styles.xml first and content.xml second, and
// calls Finished () once both walks report that they are done.
override protected void DoPull ()
{
	bool readers_unset = (content_reader == null) && (style_reader == null);

	// We need both styles.xml and content.xml as
	// "Header", "Footer" are stored in styles.xml and
	// "[Foot/End]Notes are stored in content.xml
	if (zip != null && readers_unset) {
		ZipEntry content_entry = zip.GetEntry ("content.xml");
		ZipEntry style_entry = zip.GetEntry ("styles.xml");

		if (content_entry != null && style_entry != null) {
			Stream content_stream = zip.GetInputStream (content_entry);
			Stream style_stream = zip.GetInputStream (style_entry);
			content_reader = new XmlTextReader (content_stream);
			style_reader = new XmlTextReader (style_stream);
		}
	}

	// Nothing to read: no archive, or one of the two entries is missing.
	if ((content_reader == null) && (style_reader == null)) {
		Finished ();
		return;
	}

	// Note: Do not change the order.
	// The hotStyles table has to be populated with all possible hot
	// styles before the contents are walked, so styles.xml goes first
	// and content.xml is only touched once the styles walk is done.
	bool styles_done = WalkContentNodes (style_reader);
	if (!styles_done)
		return;

	if (WalkContentNodes (content_reader))
		Finished ();
}