Added FilterDeb from Kevin Kubasik.
[beagle.git] / Filters / FilterOpenOffice.cs
blob7e0eeb0aa5c01a059750b3970acab375b865010e
1 //
2 // FilterOpenOffice.cs
3 //
4 // Copyright (C) 2004 Novell, Inc.
5 //
7 //
8 // Permission is hereby granted, free of charge, to any person obtaining a
9 // copy of this software and associated documentation files (the "Software"),
10 // to deal in the Software without restriction, including without limitation
11 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 // and/or sell copies of the Software, and to permit persons to whom the
13 // Software is furnished to do so, subject to the following conditions:
15 // The above copyright notice and this permission notice shall be included in
16 // all copies or substantial portions of the Software.
18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 // DEALINGS IN THE SOFTWARE.
28 using System;
29 using System.Collections;
30 using System.IO;
31 using System.Text;
32 using System.Xml;
34 using Beagle.Util;
35 using Beagle.Daemon;
37 using ICSharpCode.SharpZipLib.Zip;
39 namespace Beagle.Filters {
41 public class FilterOpenOffice : Beagle.Daemon.Filter {
// Style names that StudyStyleNode has flagged as *HOT*. The keys are the
// "style:name" values and the stored value is always true; the table is
// re-created by DoOpen and consulted by WalkContentNodes.
43 Hashtable hotStyles;
// Set to false by the constructor and to true by DoOpen when the MIME type
// starts with "application/vnd.oasis.opendocument.".
44 bool odtFormat;
// Registers every MIME type this filter handles and switches on snippet mode.
public FilterOpenOffice ()
{
	string [] mime_types = new string [] {
		// OO 1.0 mime types
		"application/vnd.sun.xml.writer",
		"application/vnd.sun.xml.writer.template",
		"application/vnd.sun.xml.calc",
		"application/vnd.sun.xml.calc.template",
		"application/vnd.sun.xml.impress",
		"application/vnd.sun.xml.impress.template",
		"application/vnd.sun.xml.draw",
		"application/vnd.sun.xml.draw.template",

		// OO 2.0 mime types
		"application/vnd.oasis.opendocument.text",
		"application/vnd.oasis.opendocument.text-template",
		"application/vnd.oasis.opendocument.spreadsheet",
		"application/vnd.oasis.opendocument.spreadsheet-template",
		"application/vnd.oasis.opendocument.presentation",
		"application/vnd.oasis.opendocument.presentation-template",
		"application/vnd.oasis.opendocument.graphics",
		"application/vnd.oasis.opendocument.graphics-template"
	};

	// Flavors are registered in the order listed above.
	foreach (string mime_type in mime_types)
		AddSupportedFlavor (FilterFlavor.NewFromMimeType (mime_type));

	SnippetMode = true;
	odtFormat = false;
}
// Parse a "style:style" node and mark the style as *HOT* when appropriate.
//
// The reader must be positioned on the "style:style" start tag.  All of the
// node's descendants are consumed, so on return the reader sits on the
// node's end tag (or still on the start tag for an empty element).
//
// A style is recorded in hotStyles when any of these holds:
//   - its parent style name starts with "Heading",
//   - it is named "Footnote", "Endnote", "Header" or "Footer",
//   - its text properties ask for bold, italic or any underline.
//
// FIXME: Identify and ADD more *HOT* styles. ;)
void StudyStyleNode (XmlReader reader)
{
	string style_name = reader.GetAttribute ("style:name");
	string style_parent = reader.GetAttribute ("style:parent-style-name");

	string weight = null;
	string underline = null;
	string italic = null;
	int original_depth = reader.Depth;

	if (!reader.IsEmptyElement) {
		// Visit every descendant; stop on the matching end tag, or
		// if the document ends early.
		while (reader.Read () && reader.Depth > original_depth) {
			if (reader.NodeType != XmlNodeType.Element)
				continue;

			// "style:properties" is the OO 1.0 element,
			// "style:text-properties" the ODT one.
			if (reader.Name != "style:properties"
			    && reader.Name != "style:text-properties")
				continue;

			weight = reader.GetAttribute ("fo:font-weight");
			italic = reader.GetAttribute ("fo:font-style");
			underline = reader.GetAttribute ("style:text-underline");
			// OpenDocument spells the underline attribute differently.
			if (underline == null)
				underline = reader.GetAttribute ("style:text-underline-style");
		}
	}

	// A style without a name can never be referenced by content, and a
	// null key would make the Hashtable throw.
	if (style_name == null)
		return;

	bool hot = (style_parent != null && style_parent.StartsWith ("Heading"))
		|| style_name == "Footnote"
		|| style_name == "Endnote"
		|| style_name == "Header"
		|| style_name == "Footer"
		|| weight == "bold"
		|| italic == "italic"
		|| (underline != null && underline != "none");

	if (hot)
		hotStyles [style_name] = true;
}
// True only for the "text:a" element, whose "xlink:href" attribute
// WalkContentNodes indexes.
static bool NodeIsLink (String nodeName)
{
	return String.Equals (nodeName, "text:a");
}
// True only for the "text:h" element, which is always treated as *HOT*.
static bool NodeIsHot (String nodeName)
{
	switch (nodeName) {
	case "text:h":
		return true;
	default:
		return false;
	}
}
// Container tags may hold several lines of text, all of which should be
// marked *HOT* -- hence "hot container". ;-)
static bool NodeIsHotContainer (String nodeName)
{
	switch (nodeName) {
	case "office:annotation":
	case "text:footnote":
	case "text:endnote":
	case "text:note": // "ODT format"
		return true;
	default:
		return false;
	}
}
// True for the citation elements; WalkContentNodes calls FreezeUp when one
// opens and FreezeDown when it closes.
static bool NodeIsFreezing (String nodeName)
{
	switch (nodeName) {
	case "text:footnote-citation":
	case "text:endnote-citation":
	case "text:note-citation": // "ODT format"
		return true;
	default:
		return false;
	}
}
// True for elements that get white space inserted before their content.
static bool NodeBreaksTextBefore (String nodeName)
{
	switch (nodeName) {
	case "text:footnote":
	case "text:endnote":
	case "office:annotation":
	case "text:note": // "ODT format"
		return true;
	default:
		return false;
	}
}
// True for elements that get white space inserted after them.
static bool NodeBreaksTextAfter (String nodeName)
{
	switch (nodeName) {
	case "text:s":
	case "text:tab-stop":
	case "table:table-cell":
		return true;
	default:
		return false;
	}
}
// True for elements that are followed by a structural break.
static bool NodeBreaksStructureAfter (String nodeName)
{
	switch (nodeName) {
	case "text:p":
	case "text:h":
	case "text:footnote":
	case "text:endnote":
	case "office:annotation":
	case "text:note": // "ODT format"
		return true;
	default:
		return false;
	}
}
// One boxed bool per currently open (non-empty) element: pushed by
// WalkContentNodes when the element starts, popped when it ends.
162 private Stack hot_nodes = new Stack ();
// Trailing piece of text that AddTextForIndexing has held back because it may
// be the first half of a word; emitted by FlushPartialText.
163 private string strPartText = "";
// Whether the held-back strPartText was collected under a *HOT* node.
164 private bool bPartHotStyle = false;
// Names of the hot container elements that are currently open.
165 private Stack hot_container_nodes = new Stack ();
// Feeds a run of text to the index while keeping words that are split
// across several text nodes together.
//
// Everything up to the last space of paramStr is appended right away; the
// remainder (from the last space on) is parked in strPartText until more
// text arrives or FlushPartialText is called.  If paramStr contains no
// space at all, it is simply added to the parked text.
void AddTextForIndexing (string paramStr)
{
	int last_space = paramStr.LastIndexOf (' ');

	if (last_space > -1) {
		int first_space = 0;

		// During the previous parsing a word got terminated partially:
		// find the remaining part of the word, concatenate it, add it
		// to the respective pool and reset the HOT status if required.
		if (strPartText.Length > 0) {
			first_space = paramStr.IndexOf (' ');
			string completed_word = strPartText + paramStr.Substring (0, first_space);

			// Only raise (and later lower) the hot state ourselves
			// when it is not already raised.
			bool temporarily_hot = !IsHot && bPartHotStyle;
			if (temporarily_hot)
				HotUp ();

			AppendText (completed_word);

			if (temporarily_hot)
				HotDown ();
			bPartHotStyle = false;
		}

		// What is left starts at the first space, so it still contains
		// at least one space.
		paramStr = paramStr.Substring (first_space);
		last_space = paramStr.LastIndexOf (' ');

		strPartText = paramStr.Substring (last_space);
		paramStr = paramStr.Substring (0, last_space);
	} else {
		// No word boundary seen yet: keep accumulating.
		strPartText = strPartText + paramStr;
		paramStr = "";
	}

	// Enable *HOT* just before appending the text because there can be
	// some *partial texts* without *HOT* styles that need to be appended.
	if (hot_nodes.Count > 0 && (bool) hot_nodes.Peek ()) {
		if (!IsHot)
			HotUp ();
		bPartHotStyle = true;
	}

	if (paramStr.Length > 0)
		AppendText (paramStr);

	// Parked text that is only white space carries no hot style.
	if (strPartText.Trim ().Length < 1)
		bPartHotStyle = false;
}
// Emits whatever AddTextForIndexing has parked in strPartText and resets
// the partial-text state.
void FlushPartialText ()
{
	if (strPartText.Length == 0) {
		bPartHotStyle = false;
		return;
	}

	// The parked text was collected under a hot node: emit it as hot.
	if (bPartHotStyle && !IsHot)
		HotUp ();

	AppendText (strPartText);

	if (IsHot)
		HotDown ();

	strPartText = "";
	bPartHotStyle = false;
}
// Reads the next batch of nodes from reader and feeds their text to the
// index, tracking which text is *HOT* and which is frozen.
//
// Returns false when the per-call budget (total_elements) was used up and
// more nodes may follow, true once reader.Read () reports the end of the
// document.  Empty elements, "style:style" nodes and empty text nodes are
// not counted against the budget.
bool WalkContentNodes (XmlReader reader)
{
	// total number of elements to read per-pull
	const int total_elements = 10;
	int num_elements = 0;

	while (reader.Read ()) {
		switch (reader.NodeType) {
		case XmlNodeType.Element:
			if (reader.IsEmptyElement) {
				// No end tag will follow, so apply the
				// "before" and "after" breaks right here.
				FlushPartialText ();
				if (NodeBreaksStructureAfter (reader.Name))
					AppendStructuralBreak ();
				else {
					if (NodeBreaksTextBefore (reader.Name))
						AppendWhiteSpace ();

					if (NodeBreaksTextAfter (reader.Name))
						AppendWhiteSpace ();
				}
				continue;
			}

			if (reader.Name == "style:style") {
				StudyStyleNode (reader);
				continue;
			}

			// A node is hot if:
			// (1) Its name is hot
			// (2) It is flagged with a hot style
			// (3) annotations are always hot.

			bool isHot = false;

			if (NodeIsHot (reader.Name)) {
				isHot = true;
			} else if (NodeIsHotContainer (reader.Name)) {
				hot_container_nodes.Push (reader.Name);
			} else if (NodeIsLink (reader.Name)) {
				// Index the link target.  A link without an
				// "xlink:href" attribute has nothing to add.
				string attr = reader.GetAttribute ("xlink:href");
				if (attr != null)
					AppendText (attr);
				AppendWhiteSpace ();
			} else {
				isHot = ElementHasHotStyle (reader);
			}

			hot_nodes.Push (isHot);

			// Call *HotUp* iff
			//   i) we are NOT already in *HOT* mode and
			//  ii) there is a hot style/hot container tag
			if (!IsHot && (isHot || hot_container_nodes.Count > 0))
				HotUp ();

			if (NodeIsFreezing (reader.Name)) {
				FreezeUp ();
			}

			if (NodeBreaksTextBefore (reader.Name)) {
				FlushPartialText ();
				AppendWhiteSpace ();
			}
			break;

		case XmlNodeType.Text:
			string text = reader.Value;
			if (text.Length < 1)
				continue;
			if (!IsFrozen)
				AddTextForIndexing (text);
			break;

		case XmlNodeType.EndElement:
			if (NodeBreaksStructureAfter (reader.Name)) {
				FlushPartialText ();
				AppendStructuralBreak ();
			}
			else if (NodeBreaksTextAfter (reader.Name))
				AppendWhiteSpace ();

			bool is_hot = false;
			if (hot_nodes.Count > 0)
				is_hot = (bool) hot_nodes.Pop ();
			else
				Logger.Log.Debug ("FilterOpenOffice: hot_nodes underflow in {0}",
						  reader.Name);

			// NOTE(review): closing the innermost hot container
			// clears the whole stack, so an enclosing container
			// stops counting as open too -- confirm this is
			// intended for nested containers.
			if (hot_container_nodes.Count > 0) {
				string hot_container_tag = (string) hot_container_nodes.Peek ();
				if (hot_container_tag == reader.Name) {
					hot_container_nodes.Clear ();
					HotDown ();
				}
			}

			if (is_hot)
				HotDown ();

			if (NodeIsFreezing (reader.Name)) {
				FreezeDown ();
			}
			break;
		}

		num_elements++;
		if (num_elements >= total_elements) {
			return false;
		}
	}
	return true;
}

// Looks at the first attribute of the current element whose name ends in
// ":style-name" and reports whether its value is a known hot style.  The
// reader is moved back onto the element before returning.
bool ElementHasHotStyle (XmlReader reader)
{
	bool hot = false;
	bool has_attr = reader.MoveToFirstAttribute ();
	while (has_attr) {
		if (reader.Name.EndsWith (":style-name")) {
			hot = hotStyles.Contains (reader.Value);
			break;
		}
		has_attr = reader.MoveToNextAttribute ();
	}
	reader.MoveToElement ();
	return hot;
}
// SlideCount is not stored in meta.xml; rather we need to parse the whole
// of content.xml to find out the count of slides present in an .sxi.
//
// Adds the unsearched property "fixme:slide-count" when at least one
// "draw:page" element with a "draw:id" attribute is found at depth 2.  The
// value is the "draw:id" of the last such page.
// NOTE(review): presumably the ids are sequential, so the last id equals
// the slide count -- confirm.
private void ExtractSlideCount (XmlReader reader)
{
	// Move to the first node at depth 2, checking the document class on
	// the way.  Stop if the document ends before that.
	bool reached_pages = false;
	while (reader.Read ()) {
		// Do not parse the whole file if it is not a
		// presentation (impress document)
		if (reader.NodeType == XmlNodeType.Element
		    && reader.Name == "office:document-content") {
			string docClass = reader.GetAttribute ("office:class");
			if (docClass != "presentation")
				return;
		}

		if (reader.Depth >= 2) {
			reached_pages = true;
			break;
		}
	}

	if (!reached_pages)
		return;

	string slideCount = null;
	do {
		if (reader.Depth == 2
		    && reader.NodeType == XmlNodeType.Element
		    && reader.Name == "draw:page")
			slideCount = reader.GetAttribute ("draw:id");
	} while (reader.Read () && reader.Depth >= 1);

	if (slideCount != null)
		AddProperty (Beagle.Property.NewUnsearched ("fixme:slide-count", slideCount));
}
// Walks the depth-2 elements of meta.xml and turns the ones we know about
// into properties:
//   dc:title, dc:description, dc:subject -> property of the same name
//   meta:document-statistic              -> fixme:page-count, fixme:word-count
//                                           and, for calc documents,
//                                           fixme:spreadsheet-count
//   meta:user-defined                    -> fixme:UserDefined-<meta:name>
//   meta:keyword                         -> fixme:keywords
// Reading stops at the first node whose depth drops below 2.
private void ExtractMetadata (XmlReader reader)
{
	// Skip ahead to the first node at depth 2; give up if the document
	// ends before we get there.
	do {
		if (!reader.Read ())
			return;
	} while (reader.Depth < 2);

	do {
		if (reader.Depth != 2 || reader.NodeType != XmlNodeType.Element)
			continue;

		switch (reader.Name) {
		case "dc:title":
		case "dc:description":
		case "dc:subject":
			// Remember the name first: reading the value moves
			// the reader off the element.
			string property_name = reader.Name;
			AddProperty (Beagle.Property.New (property_name,
							  ReadElementValue (reader)));
			break;

		case "meta:document-statistic":
			string attr = reader.GetAttribute ("meta:page-count");
			if (attr != null)
				AddProperty (Beagle.Property.NewUnsearched ("fixme:page-count", attr));
			attr = reader.GetAttribute ("meta:word-count");
			if (attr != null)
				AddProperty (Beagle.Property.NewUnsearched ("fixme:word-count", attr));

			// Both writer and calc use this attribute: writer stores
			// the count of tables in a sxw whereas calc stores the
			// count of spreadsheets in a sxc.
			attr = reader.GetAttribute ("meta:table-count");
			if (attr != null && IsPositiveInteger (attr)
			    && MimeType == "application/vnd.sun.xml.calc")
				AddProperty (Beagle.Property.NewUnsearched ("fixme:spreadsheet-count", attr));
			break;

		case "meta:user-defined":
			string name = reader.GetAttribute ("meta:name");
			string user_value = ReadElementValue (reader);

			if (user_value != "") {
				AddProperty (Beagle.Property.New ("fixme:UserDefined-" + name,
								  user_value));
			}
			break;

		case "meta:keyword":
			AddProperty (Beagle.Property.New ("fixme:keywords",
							  ReadElementValue (reader)));
			break;
		}
	} while (reader.Read () && reader.Depth >= 2);
}

// Returns the Value of the node that follows the current start tag.  An
// empty element (<tag/>) yields "" and leaves the reader where it is, so
// the element's next sibling is not skipped.
static string ReadElementValue (XmlReader reader)
{
	if (reader.IsEmptyElement)
		return "";

	reader.Read ();
	return reader.Value;
}

// True when value parses as an integer greater than zero; malformed or
// out-of-range input counts as "not positive" instead of throwing.
static bool IsPositiveInteger (string value)
{
	try {
		return Convert.ToInt32 (value) > 0;
	} catch (FormatException) {
		return false;
	} catch (OverflowException) {
		return false;
	}
}
// The document's zip package; stays null when the file could not be opened.
ZipFile zip = null;

// Opens the document as a zip archive and resets the per-document state.
// On failure the error is logged and Finished () is called; zip stays null.
override protected void DoOpen (FileInfo info)
{
	hotStyles = new Hashtable ();

	// Decided before touching the file so that a missing MIME type is
	// not misreported as an unreadable archive.
	if (MimeType != null && MimeType.StartsWith ("application/vnd.oasis.opendocument."))
		odtFormat = true;

	try {
		zip = new ZipFile (info.FullName);
	} catch (Exception e) {
		Logger.Log.Error ("Unable to open {0}. Probably an invalid OpenOffice document.",
				  info.FullName);
		// Keep the underlying cause available for debugging.
		Logger.Log.Debug ("FilterOpenOffice: {0}", e.Message);
		Finished ();
	}
}
// Extracts the document properties: the metadata from meta.xml and the
// slide count from content.xml.  A missing entry is logged as an error.
// Does nothing when no archive is open.
override protected void DoPullProperties ()
{
	if (zip == null)
		return;

	ZipEntry meta_entry = zip.GetEntry ("meta.xml");
	if (meta_entry == null)
		Logger.Log.Error ("No meta.xml!");
	else
		ExtractMetadata (new XmlTextReader (zip.GetInputStream (meta_entry)));

	ZipEntry content_entry = zip.GetEntry ("content.xml");
	if (content_entry == null)
		Logger.Log.Error ("No content.xml!");
	else
		ExtractSlideCount (new XmlTextReader (zip.GetInputStream (content_entry)));
}
// Readers over content.xml and styles.xml; created on the first DoPull.
// style_reader stays null when the package has no styles.xml.
XmlReader content_reader = null;
XmlReader style_reader = null;

// Indexes the next batch of text.  Calls Finished () once both styles.xml
// and content.xml have been walked to the end, or right away when there is
// no content.xml to read.
override protected void DoPull ()
{
	if (zip != null && content_reader == null && style_reader == null) {
		// We want both styles.xml and content.xml, as "Header" and
		// "Footer" are stored in styles.xml while "[Foot/End]Notes"
		// are stored in content.xml.
		ZipEntry content_entry = zip.GetEntry ("content.xml");
		ZipEntry style_entry = zip.GetEntry ("styles.xml");

		if (content_entry != null) {
			content_reader = new XmlTextReader (zip.GetInputStream (content_entry));

			// styles.xml is optional: without it we simply have
			// no header/footer text and no shared styles.
			if (style_entry != null)
				style_reader = new XmlTextReader (zip.GetInputStream (style_entry));
		}
	}

	if (content_reader == null) {
		Finished ();
		return;
	}

	// Note: Do not change the order.
	// We need to populate our hotStyles table with all possible hot
	// styles before the contents that refer to them are processed, so
	// styles.xml is walked to its end before content.xml is started.
	bool styles_done = (style_reader == null) || WalkContentNodes (style_reader);
	if (styles_done && WalkContentNodes (content_reader))
		Finished ();
}