2005-04-19 Gabor Kelemen <kelemeng@gnome.hu>
[beagle.git] / Filters / FilterOpenOffice.cs
blob40227dfe2810335c6d32f7d844c5df3b52d88749
1 //
2 // FilterOpenOffice.cs
3 //
4 // Copyright (C) 2004 Novell, Inc.
5 //
7 //
8 // Permission is hereby granted, free of charge, to any person obtaining a
9 // copy of this software and associated documentation files (the "Software"),
10 // to deal in the Software without restriction, including without limitation
11 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 // and/or sell copies of the Software, and to permit persons to whom the
13 // Software is furnished to do so, subject to the following conditions:
15 // The above copyright notice and this permission notice shall be included in
16 // all copies or substantial portions of the Software.
18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 // DEALINGS IN THE SOFTWARE.
28 using System;
29 using System.Collections;
30 using System.IO;
31 using System.Text;
32 using System.Xml;
33 using Beagle.Util;
35 using ICSharpCode.SharpZipLib.Zip;
37 namespace Beagle.Filters {
39 public class FilterOpenOffice : Beagle.Daemon.Filter {
// Style names that have been marked *HOT*; filled in by StudyStyleNode.
Hashtable hotStyles;
// True once DoOpen has seen an OASIS OpenDocument mime type.
bool odtFormat;

// Registers every mime type this filter handles and enables snippet mode.
public FilterOpenOffice ()
{
	string [] supported_types = new string [] {
		// OO 1.0 mime types
		"application/vnd.sun.xml.writer",
		"application/vnd.sun.xml.writer.template",
		"application/vnd.sun.xml.calc",
		"application/vnd.sun.xml.calc.template",
		"application/vnd.sun.xml.impress",
		"application/vnd.sun.xml.impress.template",

		// OO 2.0 mime types
		"application/vnd.oasis.opendocument.text",
		"application/vnd.oasis.opendocument.text-template",
		"application/vnd.oasis.opendocument.spreadsheet",
		"application/vnd.oasis.opendocument.spreadsheet-template",
		"application/vnd.oasis.opendocument.presentation",
		"application/vnd.oasis.opendocument.presentation-template"
	};

	// Registration order is the same as the order of the table above.
	foreach (string mime_type in supported_types)
		AddSupportedMimeType (mime_type);

	SnippetMode = true;
	odtFormat = false;
}
// Looks at the direct children of `node` and, for the first child whose
// name equals `nodeName`, returns the value of its `attributeName`
// attribute.  Returns null when there is no such child or when that
// first matching child does not carry the attribute.
static String FindChildAttribute (XmlNode node,
				  String nodeName,
				  String attributeName)
{
	foreach (XmlNode child in node.ChildNodes) {
		if (child.Name != nodeName)
			continue;

		// Only the first matching child is ever consulted.
		XmlAttribute found = child.Attributes [attributeName];
		return (found != null) ? found.Value : null;
	}

	return null;
}
// Parse a "style:style" node and mark the style as *HOT* when appropriate.
//
// The reader must be positioned on the "style:style" start element.  On
// return it has been advanced to the end of that element (or is left in
// place when the element is empty).
//
// A style is recorded in hotStyles when any of these holds:
//   - its parent style name starts with "Heading"
//   - it is named "Footnote", "Endnote", "Header" or "Footer"
//   - its text properties make it bold, italic or underlined
//
// FIXME: Identify and ADD more *HOT* styles. ;)
void StudyStyleNode (XmlReader reader)
{
	string style_name = reader.GetAttribute ("style:name");
	string style_parent = reader.GetAttribute ("style:parent-style-name");

	string weight = null;
	string underline = null;
	string italic = null;
	int original_depth = reader.Depth;

	if (!reader.IsEmptyElement) {
		reader.Read ();
		// Everything deeper than the style element belongs to it.
		while (reader.Depth > original_depth) {
			if (reader.NodeType == XmlNodeType.Element
			    && (reader.Name == "style:properties"
				|| reader.Name == "style:text-properties")) { /* ODT changes */
				weight = reader.GetAttribute ("fo:font-weight");
				italic = reader.GetAttribute ("fo:font-style");
				underline = reader.GetAttribute ("style:text-underline");
				// OpenDocument renamed the underline attribute; fall
				// back to it so underlined ODT styles are detected too.
				if (underline == null)
					underline = reader.GetAttribute ("style:text-underline-style");
			}
			reader.Read ();
		}
	}

	// A Hashtable rejects null keys, so a style without a name can
	// never be recorded (nor referenced from the content).
	if (style_name == null)
		return;

	bool has_hot_parent = style_parent != null && style_parent.StartsWith ("Heading");
	bool has_hot_name = style_name == "Footnote"
		|| style_name == "Endnote"
		|| style_name == "Header"
		|| style_name == "Footer";
	bool has_hot_format = weight == "bold"
		|| italic == "italic"
		|| (underline != null && underline != "none");

	if (has_hot_parent || has_hot_name || has_hot_format)
		hotStyles [style_name] = true;
}
// True for the hyperlink element, "text:a".
static bool NodeIsLink (String nodeName)
{
	return String.Equals (nodeName, "text:a");
}
// True for the one element whose text is always *HOT*: "text:h".
static bool NodeIsHot (String nodeName)
{
	return String.Equals (nodeName, "text:h");
}
// Container tags may hold several lines of text, all of which must be
// marked *HOT* -- hence the name "hot container" ;-)
static bool NodeIsHotContainer (String nodeName)
{
	switch (nodeName) {
	case "office:annotation":
	case "text:footnote":
	case "text:endnote":
	case "text:note": // "ODT format"
		return true;
	default:
		return false;
	}
}
// True for the citation elements; WalkContentNodes freezes the filter
// while inside one of them.
static bool NodeIsFreezing (String nodeName)
{
	switch (nodeName) {
	case "text:footnote-citation":
	case "text:endnote-citation":
	case "text:note-citation": // "ODT format"
		return true;
	default:
		return false;
	}
}
// True for elements that need white space emitted before their content.
static bool NodeBreaksTextBefore (String nodeName)
{
	switch (nodeName) {
	case "text:footnote":
	case "text:endnote":
	case "office:annotation":
	case "text:note": // "ODT format"
		return true;
	default:
		return false;
	}
}
// True for elements that need white space emitted after them.
static bool NodeBreaksTextAfter (String nodeName)
{
	switch (nodeName) {
	case "text:s":
	case "text:tab-stop":
	case "table:table-cell":
		return true;
	default:
		return false;
	}
}
// True for elements that are followed by a structural break.
static bool NodeBreaksStructureAfter (String nodeName)
{
	switch (nodeName) {
	case "text:p":
	case "text:h":
	case "text:footnote":
	case "text:endnote":
	case "office:annotation":
	case "text:note": // "ODT format"
		return true;
	default:
		return false;
	}
}
// One bool per open element pushed by WalkContentNodes: was it hot?
private Stack hot_nodes = new Stack ();
// Tail of the most recent text (from its last space onwards) that has
// not been handed to AppendText yet, because the word may continue in
// the next text node.
private string strPartText = "";
// Whether the pending partial text was seen under a hot node.
private bool bPartHotStyle = false;
// Names of the hot container elements that are currently open.
private Stack hot_container_nodes = new Stack ();

// Feeds a run of text to the indexer, word-wise.
//
// Text without any space is only accumulated in strPartText.  Otherwise
// the pending partial word is completed with the text up to the first
// space and appended (made hot if it was pending as hot), everything up
// to the last space is appended, and the remainder becomes the new
// pending partial text.
void AddTextForIndexing (string paramStr)
{
	if (paramStr.LastIndexOf (' ') < 0) {
		// Still in the middle of a word: keep collecting.
		strPartText = strPartText + paramStr;
		paramStr = "";
	} else {
		if (strPartText.Length > 0) {
			// During the previous parsing a word got terminated
			// partially: glue the rest of it on and emit it with
			// the hotness it was collected under.
			int first_space = paramStr.IndexOf (' ');
			string completed = strPartText + paramStr.Substring (0, first_space);

			// Only raise (and afterwards lower) the hot level here
			// when it is not raised already.
			bool raise_hot = !IsHot && bPartHotStyle;
			if (raise_hot)
				HotUp ();
			AppendText (completed);
			if (raise_hot)
				HotDown ();
			bPartHotStyle = false;

			paramStr = paramStr.Substring (first_space);
		}

		// paramStr is known to contain a space at this point.
		int last_space = paramStr.LastIndexOf (' ');
		strPartText = paramStr.Substring (last_space);
		paramStr = paramStr.Substring (0, last_space);
	}

	// Enable *HOT* just before appending the text
	// because there can be some *Partial Texts* without
	// *HOT* styles that need to be appended.
	if (hot_nodes.Count > 0 && (bool) hot_nodes.Peek ()) {
		if (!IsHot)
			HotUp ();
		bPartHotStyle = true;
	}

	if (paramStr.Length > 0)
		AppendText (paramStr);

	// Nothing but white space pending: no hotness to remember.
	if (strPartText.Trim ().Length < 1)
		bPartHotStyle = false;
}
// Hands the pending partial text (if any) to AppendText, raising the hot
// level first when the text was collected as hot, and lowering it again
// afterwards whenever the filter is hot.  Always clears the pending
// hot flag.
void FlushPartialText ()
{
	bool has_pending = strPartText.Length > 0;

	if (has_pending) {
		bool need_hot = bPartHotStyle && !IsHot;
		if (need_hot)
			HotUp ();

		AppendText (strPartText);

		if (IsHot)
			HotDown ();

		strPartText = "";
	}

	bPartHotStyle = false;
}
// Walks the next slice of an OpenOffice XML stream, feeding text, white
// space and structural breaks to the filter and tracking which text is
// *HOT* or frozen.
//
// Returns false after total_elements nodes have been counted in this
// call (more may remain); returns true once the reader has no more
// nodes.  Nodes that leave the loop through "continue" (empty elements,
// "style:style" subtrees, empty text) are not counted.
bool WalkContentNodes (XmlReader reader)
{
	// total number of elements to read per-pull
	const int total_elements = 10;
	int num_elements = 0;

	while (reader.Read ()) {
		switch (reader.NodeType) {
		case XmlNodeType.Element:
			if (reader.IsEmptyElement) {
				// Empty elements get no EndElement node, so they
				// are handled completely here and never pushed
				// on hot_nodes.
				FlushPartialText ();
				if (NodeBreaksStructureAfter (reader.Name))
					AppendStructuralBreak ();
				else {
					if (NodeBreaksTextBefore (reader.Name))
						AppendWhiteSpace ();

					if (NodeBreaksTextAfter (reader.Name))
						AppendWhiteSpace ();
				}
				continue;
			}

			if (reader.Name == "style:style") {
				// StudyStyleNode consumes the whole style subtree.
				StudyStyleNode (reader);
				continue;
			}

			// A node is hot if:
			// (1) Its name is hot
			// (2) It is flagged with a hot style
			// (3) annotations are always hot.

			bool isHot = false;

			if (NodeIsHot (reader.Name)) {
				isHot = true;
			} else if (NodeIsHotContainer (reader.Name)) {
				hot_container_nodes.Push (reader.Name);
			} else if (NodeIsLink (reader.Name)) {
				// Index the link target as well; a link without
				// xlink:href has no target to append.
				string attr = reader.GetAttribute ("xlink:href");
				if (attr != null)
					AppendText (attr);
				AppendWhiteSpace ();
			} else {
				// Only the first "*:style-name" attribute decides.
				bool has_attr = reader.MoveToFirstAttribute ();
				while (has_attr) {
					if (reader.Name.EndsWith (":style-name")) {
						if (hotStyles.Contains (reader.Value))
							isHot = true;
						break;
					}
					has_attr = reader.MoveToNextAttribute ();
				}
				// Back to the element so reader.Name is the tag again.
				reader.MoveToElement ();
			}

			hot_nodes.Push (isHot);

			// Call *HotUp* iff
			//   i) we are not already in *HOT* mode and
			//   ii) there is a hot style/hot container tag
			if (!IsHot && (isHot || hot_container_nodes.Count > 0))
				HotUp ();

			if (NodeIsFreezing (reader.Name)) {
				FreezeUp ();
			}

			if (NodeBreaksTextBefore (reader.Name)) {
				FlushPartialText ();
				AppendWhiteSpace ();
			}
			break;

		case XmlNodeType.Text:
			string text = reader.Value;
			if (text.Length < 1)
				continue;
			if (!IsFrozen)
				AddTextForIndexing (text);
			break;

		case XmlNodeType.EndElement:
			if (NodeBreaksStructureAfter (reader.Name)) {
				FlushPartialText ();
				AppendStructuralBreak ();
			}
			else if (NodeBreaksTextAfter (reader.Name))
				AppendWhiteSpace ();

			bool is_hot = false;
			if (hot_nodes.Count > 0)
				is_hot = (bool) hot_nodes.Pop ();
			else
				Logger.Log.Debug ("FilterOpenOffice: hot_nodes underflow in {0}",
						  reader.Name);

			// Closing the innermost hot container drops every
			// container that is still recorded.
			if (hot_container_nodes.Count > 0) {
				string hot_container_tag = (string) hot_container_nodes.Peek ();
				if (hot_container_tag == reader.Name) {
					hot_container_nodes.Clear ();
					HotDown ();
				}
			}

			if (is_hot)
				HotDown ();

			if (NodeIsFreezing (reader.Name)) {
				FreezeDown ();
			}
			break;
		}

		num_elements++;
		if (num_elements >= total_elements) {
			return false;
		}
	}
	return true;
}
// SlideCount is not stored in meta.xml; rather we need to
// parse the whole of content.xml to find out the count of
// slides present in an .sxi.
//
// The "draw:id" attribute of the last depth-2 "draw:page" element is
// published as the "fixme:slide-count" keyword property.  Nothing is
// published when the document is not a presentation or has no such
// element.
private void ExtractSlideCount (XmlReader reader)
{
	string slideCount = null;
	bool reached_body = false;

	// Advance to the first node at depth 2.  Stop as soon as the
	// reader runs dry instead of spinning on a finished reader.
	while (reader.Read ()) {
		// Do not parse the whole file if it is not a
		// presentation (impress document)
		if (reader.Name == "office:document-content"
		    && reader.NodeType == XmlNodeType.Element) {
			string docClass = reader.GetAttribute ("office:class");
			if (docClass != "presentation")
				return;
		}

		if (reader.Depth >= 2) {
			reached_body = true;
			break;
		}
	}

	if (!reached_body)
		return;

	// A finished reader reports depth 0, which ends this loop.
	while (reader.Depth >= 1) {
		if (reader.Depth == 2
		    && reader.NodeType == XmlNodeType.Element
		    && reader.Name == "draw:page")
			slideCount = reader.GetAttribute ("draw:id");

		reader.Read ();
	}

	if (slideCount != null)
		AddProperty (Beagle.Property.NewKeyword ("fixme:slide-count", slideCount));
}
// Reads meta.xml and publishes the title, description, subject, document
// statistics and user-defined fields found at depth 2 as properties.
private void ExtractMetadata (XmlReader reader)
{
	// Advance to the first node at depth 2.  Give up when the reader
	// runs dry first, instead of spinning on a finished reader.
	do {
		if (!reader.Read ())
			return;
	} while (reader.Depth < 2);

	// A finished reader reports depth 0, which ends this loop.
	while (reader.Depth >= 2) {
		if (reader.Depth != 2 || reader.NodeType != XmlNodeType.Element) {
			reader.Read ();
			continue;
		}

		switch (reader.Name) {
		case "dc:title":
			AddProperty (Beagle.Property.New ("dc:title",
							  ReadElementValue (reader)));
			break;

		case "dc:description":
			AddProperty (Beagle.Property.New ("dc:description",
							  ReadElementValue (reader)));
			break;

		case "dc:subject":
			AddProperty (Beagle.Property.New ("dc:subject",
							  ReadElementValue (reader)));
			break;

		case "meta:document-statistic":
			string attr = reader.GetAttribute ("meta:page-count");
			if (attr != null)
				AddProperty (Beagle.Property.NewKeyword ("fixme:page-count", attr));
			attr = reader.GetAttribute ("meta:word-count");
			if (attr != null)
				AddProperty (Beagle.Property.NewKeyword ("fixme:word-count", attr));

			// Both writer and calc use this attribute. writer stores the
			// count of tables in a sxw whereas calc stores the count of
			// spreadsheets in a sxc.
			attr = reader.GetAttribute ("meta:table-count");
			if (attr != null
			    && ParseCount (attr) > 0
			    && (MimeType == "application/vnd.sun.xml.calc"
				|| MimeType == "application/vnd.oasis.opendocument.spreadsheet"))
				AddProperty (Beagle.Property.NewKeyword ("fixme:spreadsheet-count", attr));
			break;

		case "meta:user-defined":
			// Fetch the attribute before moving off the element.
			string name = reader.GetAttribute ("meta:name");
			string user_value = ReadElementValue (reader);

			if (user_value != "") {
				AddProperty (Beagle.Property.New ("fixme:UserDefined-" + name,
								  user_value));
			}
			break;
		}

		reader.Read ();
	}
}

// Returns the value of the node that follows the current start element.
// An empty element ("<x/>") yields "" without moving the reader, so the
// caller's own Read () cannot skip over the next sibling.
private static string ReadElementValue (XmlReader reader)
{
	if (reader.IsEmptyElement)
		return "";

	reader.Read ();
	return reader.Value;
}

// Parses a count attribute; malformed or out-of-range text counts as 0.
private static int ParseCount (string text)
{
	try {
		return Convert.ToInt32 (text, System.Globalization.CultureInfo.InvariantCulture);
	} catch (FormatException) {
		return 0;
	} catch (OverflowException) {
		return 0;
	}
}
// The document's zip archive; null when no document is open.
ZipFile zip = null;

// Opens the document as a zip archive and notes whether it is in an
// OpenDocument format.  On any failure the error is logged, the archive
// handle is released and the filter is marked as finished.
override protected void DoOpen (FileInfo info)
{
	hotStyles = new Hashtable ();
	try {
		zip = new ZipFile (info.FullName);

		if (MimeType.StartsWith ("application/vnd.oasis.opendocument."))
			odtFormat = true;

	} catch (Exception e) {
		Logger.Log.Error ("Unable to open {0}. Probably an invalid OpenOffice document.",
				  info.FullName);
		// Keep the underlying reason available for debugging.
		Logger.Log.Debug ("FilterOpenOffice: {0}", e.Message);

		// Never leave an archive behind for the pull methods to use
		// after a failed open.
		if (zip != null) {
			zip.Close ();
			zip = null;
		}
		Finished ();
	}
}
// Extracts the document properties: the metadata from meta.xml and the
// slide count from content.xml.  A missing entry is logged and skipped.
// Each reader is closed once its extraction is over, including when the
// extraction returns early or throws.
override protected void DoPullProperties ()
{
	if (zip == null)
		return;

	ZipEntry entry = zip.GetEntry ("meta.xml");
	if (entry != null) {
		Stream meta_stream = zip.GetInputStream (entry);
		XmlReader reader = new XmlTextReader (meta_stream);
		try {
			ExtractMetadata (reader);
		} finally {
			reader.Close ();
		}
	} else {
		Logger.Log.Error ("No meta.xml!");
	}

	entry = zip.GetEntry ("content.xml");
	if (entry != null) {
		Stream contents_stream = zip.GetInputStream (entry);
		XmlReader reader = new XmlTextReader (contents_stream);
		try {
			ExtractSlideCount (reader);
		} finally {
			reader.Close ();
		}
	} else {
		Logger.Log.Error ("No content.xml!");
	}
}
// Readers over content.xml and styles.xml; created by the first DoPull
// call and kept between calls so that the walk can resume.
XmlReader content_reader = null;
XmlReader style_reader = null;

// Pulls the next slice of text.  styles.xml is walked to its end first,
// then content.xml.  When both are exhausted the readers and the
// archive are released and the filter is marked as finished.
override protected void DoPull ()
{
	if (zip != null) {
		// We need both styles.xml and content.xml as
		// "Header", "Footer" are stored in styles.xml and
		// "[Foot/End]Notes are stored in content.xml
		if ((content_reader == null) && (style_reader == null)) {

			ZipEntry entry = zip.GetEntry ("content.xml");
			ZipEntry entry1 = zip.GetEntry ("styles.xml");

			if ((entry != null) && (entry1 != null)) {
				Stream content_stream = zip.GetInputStream (entry);
				Stream style_stream = zip.GetInputStream (entry1);
				content_reader = new XmlTextReader (content_stream);
				style_reader = new XmlTextReader (style_stream);
			}
		}
	}

	// No archive, or one of the two entries is missing.
	if ((content_reader == null) && (style_reader == null)) {
		Finished ();
		return;
	}

	// Note: Do not change the order.
	// We need to populate our hotStyles table with all possible hot styles.
	// Since "footnotes" and "endnotes" get stored in content.xml and these
	// styles need to be marked as *HOT*, they need to be processed before contents.
	if ((WalkContentNodes (style_reader)) && (WalkContentNodes (content_reader))) {
		// Everything has been read: release the readers and the
		// archive.  With zip reset, a further call ends up in the
		// "nothing to read" branch above.
		style_reader.Close ();
		content_reader.Close ();
		style_reader = null;
		content_reader = null;
		if (zip != null) {
			zip.Close ();
			zip = null;
		}
		Finished ();
	}
}