4 // Copyright (C) 2004 Novell, Inc.
8 // Permission is hereby granted, free of charge, to any person obtaining a
9 // copy of this software and associated documentation files (the "Software"),
10 // to deal in the Software without restriction, including without limitation
11 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 // and/or sell copies of the Software, and to permit persons to whom the
13 // Software is furnished to do so, subject to the following conditions:
15 // The above copyright notice and this permission notice shall be included in
16 // all copies or substantial portions of the Software.
18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 // DEALINGS IN THE SOFTWARE.
29 using System
.Collections
;
37 using ICSharpCode
.SharpZipLib
.Zip
;
39 namespace Beagle
.Filters
{
41 public class FilterOpenOffice
: Beagle
.Daemon
.Filter
{
// Set of style names considered *HOT*. StudyStyleNode stores `true` under the
// style name; WalkContentNodes tests membership with Contains. DoOpen replaces
// it with a fresh table.
43 private Hashtable hotStyles
;
// NOTE(review): no read or write of odtFormat is visible in this view; presumably
// it is set in DoOpen from the "application/vnd.oasis.opendocument." MIME check — confirm.
44 private bool odtFormat
;
// Backing store for InterestingAttribute; null until the property populates it.
46 private Hashtable attr_to_index
= null;
// Table mapping an element name to the attribute name whose value should be
// indexed for that element. IndexAttribute looks elements up in this table.
// NOTE(review): the accessor braces and return statements are not visible in
// this view; the table appears to be built on first access and then reused.
47 private Hashtable InterestingAttribute
{
// NOTE(review): the statement guarded by this check is not visible here;
// presumably it returns the already-built table — confirm.
49 if (attr_to_index
!= null)
// Initial capacity of 1 matches the single entry added below; raise it when
// more pairs are added.
52 attr_to_index
= new Hashtable (1 /* Number of attributes */);
53 // Add interesting node-attribute pairs to this hashtable:
54 // attr_to_index [node_name] = attribute_name
55 attr_to_index
["table:table"] = "table:name";
// Constructor: registers every MIME type this filter accepts, one
// AddSupportedFlavor call per type.
// NOTE(review): the constructor's braces are not visible in this view.
61 public FilterOpenOffice ()
// "application/vnd.sun.xml.*" types: writer, calc, impress and draw, each
// followed by its ".template" variant.
64 AddSupportedFlavor (FilterFlavor
.NewFromMimeType ("application/vnd.sun.xml.writer"));
65 AddSupportedFlavor (FilterFlavor
.NewFromMimeType ("application/vnd.sun.xml.writer.template"));
66 AddSupportedFlavor (FilterFlavor
.NewFromMimeType ("application/vnd.sun.xml.calc"));
67 AddSupportedFlavor (FilterFlavor
.NewFromMimeType ("application/vnd.sun.xml.calc.template"));
68 AddSupportedFlavor (FilterFlavor
.NewFromMimeType ("application/vnd.sun.xml.impress"));
69 AddSupportedFlavor (FilterFlavor
.NewFromMimeType ("application/vnd.sun.xml.impress.template"));
70 AddSupportedFlavor (FilterFlavor
.NewFromMimeType ("application/vnd.sun.xml.draw"));
71 AddSupportedFlavor (FilterFlavor
.NewFromMimeType ("application/vnd.sun.xml.draw.template"));
// "application/vnd.oasis.opendocument.*" types: text, spreadsheet,
// presentation and graphics, each followed by its "-template" variant.
74 AddSupportedFlavor (FilterFlavor
.NewFromMimeType ("application/vnd.oasis.opendocument.text"));
75 AddSupportedFlavor (FilterFlavor
.NewFromMimeType ("application/vnd.oasis.opendocument.text-template"));
76 AddSupportedFlavor (FilterFlavor
.NewFromMimeType ("application/vnd.oasis.opendocument.spreadsheet"));
77 AddSupportedFlavor (FilterFlavor
.NewFromMimeType ("application/vnd.oasis.opendocument.spreadsheet-template"));
78 AddSupportedFlavor (FilterFlavor
.NewFromMimeType ("application/vnd.oasis.opendocument.presentation"));
79 AddSupportedFlavor (FilterFlavor
.NewFromMimeType ("application/vnd.oasis.opendocument.presentation-template"));
80 AddSupportedFlavor (FilterFlavor
.NewFromMimeType ("application/vnd.oasis.opendocument.graphics"));
81 AddSupportedFlavor (FilterFlavor
.NewFromMimeType ("application/vnd.oasis.opendocument.graphics-template"));
87 // Parse one "style" node and record its name in hotStyles when the style is *HOT*.
88 // FIXME: Identify and ADD more *HOT* styles. ;)
//
// reader: positioned on the style element; its "style:name" and
// "style:parent-style-name" attributes are read first, then its descendants
// are scanned for font properties.
89 void StudyStyleNode (XmlReader reader
)
91 string style_name
= reader
.GetAttribute ("style:name");
92 string style_parent
= reader
.GetAttribute ("style:parent-style-name");
// NOTE(review): the declarations of `weight` and `italic` used below are not
// visible in this view.
95 string underline
= null;
// Depth of the style element itself; the scan below runs while the reader is
// deeper than this, i.e. still inside the element.
97 int original_depth
= reader
.Depth
;
// An empty element has no children, so there are no property nodes to scan.
99 if (!reader
.IsEmptyElement
) {
// NOTE(review): the call that advances the reader inside this loop is not
// visible in this view.
101 while (reader
.Depth
> original_depth
) {
// Font attributes are read from either property element name; the second
// name is marked as the ODT variant by the inline comment.
102 if (reader
.NodeType
== XmlNodeType
.Element
103 && (reader
.Name
== "style:properties" ||
104 reader
.Name
== "style:text-properties")) { /* ODT changes */
105 weight
= reader
.GetAttribute ("fo:font-weight");
106 italic
= reader
.GetAttribute ("fo:font-style");
107 underline
= reader
.GetAttribute ("style:text-underline");
// A style is hot when any of these holds:
//   - its parent style name starts with "Heading";
//   - its own name is "Footnote", "Endnote", "Header" or "Footer";
//   - it is bold, italic, or has an underline value other than "none".
// NOTE(review): the parent/weight/italic/underline branches can be true while
// style_name is null, and Hashtable rejects a null key — confirm every node
// passed in carries "style:name", or guard the assignment.
113 if ((style_parent
!= null && style_parent
.StartsWith("Heading"))
114 || (style_name
!= null && ((String
.Compare (style_name
, "Footnote") == 0)
115 || (String
.Compare (style_name
, "Endnote") == 0)
116 || (String
.Compare (style_name
, "Header") == 0)
117 || (String
.Compare (style_name
, "Footer") == 0)))
118 || (weight
!= null && weight
== "bold")
119 || (italic
!= null && italic
== "italic")
120 || (underline
!= null && underline
!= "none"))
121 hotStyles
[style_name
] = true;
// Returns true when nodeName is the hyperlink element "text:a".
// WalkContentNodes reads the "xlink:href" attribute of such elements.
124 static bool NodeIsLink (String nodeName
)
126 return nodeName
== "text:a";
// Returns true when nodeName is "text:h", the one element name treated as
// hot by name alone.
129 static bool NodeIsHot (String nodeName
)
131 return nodeName
== "text:h";
134 // These container tags can hold multiple lines of text, all of which
135 // should be marked *HOT*; hence the name Container ;-)
//
// Returns true for "office:annotation", "text:footnote", "text:endnote" and
// "text:note". WalkContentNodes pushes matching element names onto
// hot_container_nodes.
136 static bool NodeIsHotContainer (String nodeName
)
138 return nodeName
== "office:annotation" ||
139 nodeName
== "text:footnote" ||
140 nodeName
== "text:endnote" ||
141 nodeName
== "text:note"; // "ODT format"
// Returns true for the citation elements "text:footnote-citation",
// "text:endnote-citation" and "text:note-citation".
// NOTE(review): WalkContentNodes tests this on both element start and element
// end, but the action it takes is not visible in this view.
144 static bool NodeIsFreezing (String nodeName
)
146 return nodeName
== "text:footnote-citation"
147 || nodeName
== "text:endnote-citation"
148 || nodeName
== "text:note-citation"; // "ODT format"
// Returns true for elements that need a text break emitted before their
// content: footnotes, endnotes, annotations, tables and ODT notes.
// NOTE(review): the break call made by the caller is not visible in this view.
152 static bool NodeBreaksTextBefore (String nodeName
)
154 return nodeName
== "text:footnote"
155 || nodeName
== "text:endnote"
156 || nodeName
== "office:annotation"
157 || nodeName
== "table:table"
158 || nodeName
== "text:note"; // "ODT format"
// Returns true for elements that need a text break emitted after them:
// "text:line-break", "text:s", "text:tab-stop" and "table:table-cell".
// NOTE(review): the break call made by the caller is not visible in this view.
161 static bool NodeBreaksTextAfter (String nodeName
)
163 return nodeName
== "text:line-break"
164 || nodeName
== "text:s"
165 || nodeName
== "text:tab-stop"
166 || nodeName
== "table:table-cell";
// Returns true for elements after which WalkContentNodes calls
// AppendStructuralBreak: paragraphs, headings, footnotes, endnotes,
// annotations and ODT notes.
169 static bool NodeBreaksStructureAfter (String nodeName
)
171 return nodeName
== "text:p"
172 || nodeName
== "text:h"
173 || nodeName
== "text:footnote"
174 || nodeName
== "text:endnote"
175 || nodeName
== "office:annotation"
176 || nodeName
== "text:note"; // "ODT format"
// One boxed bool per open element: WalkContentNodes pushes the element's hot
// flag on element start and pops it on element end.
179 private Stack hot_nodes
= new Stack ();
// Trailing fragment of text that AddTextForIndexing has not appended yet;
// FlushPartialText appends whatever is left.
180 private string strPartText
= "";
// Whether the pending fragment in strPartText was collected under a hot style.
181 private bool bPartHotStyle
= false;
// Names of the hot-container elements currently open (see NodeIsHotContainer).
182 private Stack hot_container_nodes
= new Stack ();
// Appends the text of one text node for indexing. The part after the last
// space is held back in strPartText so that a word split across text nodes
// can be joined with the start of the next node.
//
// paramStr: raw text of the current text node.
// NOTE(review): the declarations of `sindex`, `strTemp` and `wasHot`, and
// several braces and branch headers, are not visible in this view; the notes
// below describe only the statements that are.
184 void AddTextForIndexing (string paramStr
)
// Position of the last space; text after it is the candidate partial word.
189 int index
= paramStr
.LastIndexOf (' ');
192 // During the previous parse a word was cut off part-way through:
193 // find the remaining part of the word, concatenate it and add it to
194 // the respective pools and reset the HOT status, if required.
195 if (strPartText
.Length
> 0) {
// Everything up to the first space completes the pending word.
// NOTE(review): IndexOf returns -1 when there is no space; confirm that a
// guard outside this view rules that out before Substring (0, sindex).
196 sindex
= paramStr
.IndexOf (' ');
197 strTemp
= strPartText
+ paramStr
.Substring (0, sindex
);
205 AppendText (strTemp
);
206 if (!wasHot
&& bPartHotStyle
)
208 bPartHotStyle
= false;
// Drop the consumed prefix and look for the last space again.
210 paramStr
= paramStr
.Substring (sindex
);
211 index
= paramStr
.LastIndexOf (' ');
// Hold back the tail from the last space on; keep the rest for appending now.
// NOTE(review): sindex is the start offset here; no reset of sindex is
// visible in this view — confirm it is 0 at this point.
216 strPartText
= paramStr
.Substring (index
);
217 paramStr
= paramStr
.Substring (sindex
, index
);
// Otherwise the whole text is added to the pending fragment.
219 strTemp
= strPartText
+ paramStr
;
220 strPartText
= strTemp
;
225 // Enable *HOT* just before appending the text
226 // because there can be some *Partial Texts* without
227 // *HOT* styles that need to be appended.
228 if (hot_nodes
.Count
> 0 && (bool) hot_nodes
.Peek() == true) {
231 bPartHotStyle
= true;
// `|= false` leaves the flag unchanged.
233 bPartHotStyle
|= false;
235 if (paramStr
.Length
> 0)
236 AppendText (paramStr
);
// A pending fragment that is empty or whitespace carries no hot state forward.
238 if (strPartText
.Trim().Length
< 1)
239 bPartHotStyle
= false;
// Appends an attribute value to the indexed text when InterestingAttribute
// lists attr_name as the attribute to index for node_name.
//
// node_name:  name of the element that owns the attribute.
// attr_name:  name of the attribute being visited.
// attr_value: value of that attribute.
242 void IndexAttribute (string node_name
, string attr_name
, string attr_value
)
// NOTE(review): the statement guarded by this null/empty check is not visible
// here; presumably an early return — confirm.
244 if (attr_value
== null || attr_value
.Length
== 0)
// The cast to string makes == compare string contents rather than object
// references; an element missing from the table yields null and no match.
247 if ((string)InterestingAttribute
[node_name
] == attr_name
)
248 AppendText (attr_value
);
// Appends any fragment still pending in strPartText and clears the pending
// hot flag.
// NOTE(review): the statements guarded by the hot-state check, and any reset
// of strPartText, are not visible in this view.
251 void FlushPartialText ()
253 if (strPartText
.Length
> 0) {
// The fragment was collected under a hot style but the filter is not
// currently hot.
254 if (bPartHotStyle
&& !IsHot
)
256 AppendText (strPartText
);
261 bPartHotStyle
= false;
// Pulls nodes from `reader` and dispatches on node type: element starts
// update the hot/structure state and index interesting attributes, text
// nodes go to AddTextForIndexing, element ends undo the state pushed at the
// start.
//
// reader: XML reader over styles.xml or content.xml (DoPull passes both).
// Returns a bool that DoPull combines with &&.
// NOTE(review): the return statements, the num_elements increment, the
// declarations of `isHot` / `is_hot` and many braces are not visible in this
// view.
265 bool WalkContentNodes (XmlReader reader
)
267 // total number of elements to read per-pull
268 const int total_elements
= 10;
269 int num_elements
= 0;
271 while (reader
.Read ()) {
272 switch (reader
.NodeType
) {
273 case XmlNodeType
.Element
:
// An empty element produces no EndElement node, so its "after" breaks are
// handled here.
274 if (reader
.IsEmptyElement
) {
276 if (NodeBreaksStructureAfter (reader
.Name
))
277 AppendStructuralBreak ();
// NOTE(review): the actions guarded by the next two checks are not visible.
279 if (NodeBreaksTextBefore (reader
.Name
))
282 if (NodeBreaksTextAfter (reader
.Name
))
288 // FIXME: Allow adding more style nodes
289 if (reader
.Name
== "style:style"
290 || reader
.Name
== "number:date-style"
291 || reader
.Name
== "style:font-decl") {
292 StudyStyleNode (reader
);
// A node is hot if any of these holds:
297 // (1) Its name is hot
298 // (2) It is flagged with a hot style
299 // (3) annotations are always hot.
303 if (NodeIsHot (reader
.Name
)) {
305 } else if (NodeIsHotContainer (reader
.Name
)) {
306 hot_container_nodes
.Push (reader
.Name
);
307 } else if (NodeIsLink (reader
.Name
)) {
308 string attr
= reader
.GetAttribute ("xlink:href");
// Walk every attribute. The element name is saved first because reader.Name
// reports the attribute name while positioned on an attribute.
313 string node_name
= reader
.Name
;
314 bool has_attr
= reader
.MoveToFirstAttribute ();
// Any "*:style-name" attribute naming a known hot style marks the element.
316 if (reader
.Name
.EndsWith(":style-name")) {
317 if (hotStyles
.Contains (reader
.Value
))
321 IndexAttribute (node_name
, reader
.Name
, reader
.Value
);
323 has_attr
= reader
.MoveToNextAttribute ();
// Return to the owning element so later checks on reader.Name see the
// element name again.
325 reader
.MoveToElement();
328 hot_nodes
.Push (isHot
);
// The check below passes only if
331 // i) the filter is NOT already in *HOT* mode (the code tests !IsHot) and
332 // ii) there is a hot style/hot container tag
333 if (!IsHot
&& (isHot
|| hot_container_nodes
.Count
> 0))
336 if (NodeIsFreezing (reader
.Name
)) {
340 if (NodeBreaksTextBefore (reader
.Name
)) {
346 case XmlNodeType
.Text
:
347 string text
= reader
.Value
;
351 AddTextForIndexing (text
);
354 case XmlNodeType
.EndElement
:
355 if (NodeBreaksStructureAfter (reader
.Name
)) {
357 AppendStructuralBreak ();
359 else if (NodeBreaksTextAfter (reader
.Name
))
// Undo the push made at element start; an empty stack is logged, not fatal.
363 if (hot_nodes
.Count
> 0)
364 is_hot
= (bool) hot_nodes
.Pop ();
366 Logger
.Log
.Debug ("FilterOpenOffice: hot_nodes underflow in {0}",
// Closing the innermost hot container empties the whole container stack.
// NOTE(review): Clear () also drops any outer containers still open; confirm
// this is intended rather than a single Pop ().
368 if (hot_container_nodes
.Count
> 0) {
369 string hot_container_tag
= (string) hot_container_nodes
.Peek ();
370 if (hot_container_tag
== reader
.Name
) {
371 hot_container_nodes
.Clear ();
379 if (NodeIsFreezing (reader
.Name
)) {
// Per-call budget: stop once total_elements have been handled.
// NOTE(review): the body of this check is not visible in this view.
385 if (num_elements
>= total_elements
) {
392 // The slide count is not stored in meta.xml; instead we need to
393 // parse the whole of content.xml to find the number of
394 // slides present in an .sxi.
//
// reader: XML reader over content.xml (created in DoPullProperties).
// Adds the unsearched property "fixme:slide-count" when a value was found.
// NOTE(review): the loop headers, the reader.Read () calls and the case
// labels of the switch are not visible in this view.
395 private void ExtractSlideCount (XmlReader reader
)
397 string slideCount
= null;
402 // Do not parse the whole file if it is not a
403 // presentation (impress document)
404 if (reader
.Name
== "office:document-content"
405 && reader
.NodeType
== XmlNodeType
.Element
) {
406 string docClass
= reader
.GetAttribute ("office:class");
// NOTE(review): the statement guarded by this check is not visible here;
// presumably an early return — confirm.
407 if (docClass
!= "presentation")
410 } while (reader
.Depth
< 2);
412 while (reader
.Depth
>= 1) {
// Only elements at depth 2 are examined.
413 if (reader
.Depth
!= 2 || reader
.NodeType
!= XmlNodeType
.Element
) {
417 switch (reader
.Name
) {
// The value kept is the "draw:id" attribute of the matched element; each
// match overwrites the previous one.
// NOTE(review): this treats the last draw:id as the slide count — confirm
// that ids are sequential numbers in the documents this targets.
419 slideCount
= reader
.GetAttribute ("draw:id");
424 if (slideCount
!= null)
425 AddProperty (Beagle
.Property
.NewUnsearched ("fixme:slide-count", slideCount
));
// Reads meta.xml and turns selected depth-2 elements into properties: title,
// description, subject, document statistics, user-defined fields and
// keywords.
//
// reader: XML reader over meta.xml (created in DoPullProperties).
// NOTE(review): the reader.Read () calls, several case labels and the value
// arguments of most AddProperty calls are not visible in this view.
428 private void ExtractMetadata (XmlReader reader
)
432 } while (reader
.Depth
< 2);
434 while (reader
.Depth
>= 2) {
// Only elements at depth 2 are examined.
435 if (reader
.Depth
!= 2 || reader
.NodeType
!= XmlNodeType
.Element
) {
439 switch (reader
.Name
) {
442 AddProperty (Beagle
.Property
.New ("dc:title",
446 case "dc:description":
449 AddProperty (Beagle
.Property
.New ("dc:description",
456 AddProperty (Beagle
.Property
.New ("dc:subject",
// Statistics are attributes of a single element; each becomes an unsearched
// property.
460 case "meta:document-statistic":
461 string attr
= reader
.GetAttribute ("meta:page-count");
463 AddProperty (Beagle
.Property
.NewUnsearched ("fixme:page-count", attr
));
464 attr
= reader
.GetAttribute ("meta:word-count");
466 AddProperty (Beagle
.Property
.NewUnsearched ("fixme:word-count", attr
));
468 // Both writer and calc use this attribute: writer stores the
469 // count of tables in an sxw, whereas calc stores the count of
470 // spreadsheets in an sxc.
471 attr
= reader
.GetAttribute ("meta:table-count");
// NOTE(review): Convert.ToInt32 throws on a non-numeric attribute value, and
// the MIME check matches only the sun.xml calc type, not the
// "application/vnd.oasis.opendocument.spreadsheet" type the constructor also
// registers — confirm both are intended.
472 if (attr
!= null && Convert
.ToInt32 (attr
) > 0
473 && MimeType
== "application/vnd.sun.xml.calc")
474 AddProperty (Beagle
.Property
.NewUnsearched ("fixme:spreadsheet-count", attr
));
// User-defined fields become "fixme:UserDefined-<name>" properties when the
// reader reports a non-empty value.
477 case "meta:user-defined":
478 string name
= reader
.GetAttribute ("meta:name");
481 if (reader
.Value
!= "") {
482 AddProperty (Beagle
.Property
.New ("fixme:UserDefined-" + name
,
489 AddProperty (Beagle
.Property
.New ("fixme:keywords",
// Opens the document as a zip archive and resets per-document state.
//
// info: file to open; its FullName is passed to ZipFile.
// A failure to open is caught and logged as an error.
// NOTE(review): the `try` keyword, the declaration of `zip`, the statement
// guarded by the MIME check and the rest of the catch body are not visible
// in this view.
500 override protected void DoOpen (FileInfo info
)
// Start every document with an empty hot-style table.
502 hotStyles
= new Hashtable ();
504 zip
= new ZipFile (info
.FullName
);
// True for the OpenDocument MIME types registered in the constructor.
506 if (MimeType
.StartsWith ("application/vnd.oasis.opendocument."))
// Every exception type is caught here, not only zip format errors.
509 } catch (Exception
) {
510 Logger
.Log
.Error ("Unable to open {0}. Probably an invalid OpenOffice document.",
// Extracts document properties: metadata from meta.xml, then the slide count
// from content.xml. A missing entry is logged as an error.
// NOTE(review): the null checks on `entry` and the braces that scope the two
// `reader` locals are not visible in this view; neither the streams nor the
// readers are visibly closed here — confirm they are released elsewhere.
516 override protected void DoPullProperties ()
519 ZipEntry entry
= zip
.GetEntry ("meta.xml");
521 Stream meta_stream
= zip
.GetInputStream (entry
);
522 XmlReader reader
= new XmlTextReader (meta_stream
);
523 ExtractMetadata (reader
);
525 Logger
.Log
.Error ("No meta.xml!");
// Same pattern for content.xml, which supplies the slide count.
527 entry
= zip
.GetEntry ("content.xml");
529 Stream contents_stream
= zip
.GetInputStream (entry
);
530 XmlReader reader
= new XmlTextReader (contents_stream
);
531 ExtractSlideCount (reader
);
533 Logger
.Log
.Error ("No content.xml!");
// Readers over content.xml and styles.xml. Both start as null and are created
// by DoPull only while both are still null, so later calls reuse them.
538 XmlReader content_reader
= null;
539 XmlReader style_reader
= null;
// Pulls text from styles.xml and then content.xml through WalkContentNodes.
// NOTE(review): the statements guarded by the second null check and by the
// final WalkContentNodes check are not visible in this view.
540 override protected void DoPull ()
543 // We need both styles.xml and content.xml because
544 // "Header" and "Footer" are stored in styles.xml and
545 // "[Foot/End]Notes" are stored in content.xml.
546 if ((content_reader
== null) && (style_reader
== null)) {
548 ZipEntry entry
= zip
.GetEntry ("content.xml");
549 ZipEntry entry1
= zip
.GetEntry ("styles.xml");
// Readers are created only when both entries exist in the archive.
551 if ((entry
!= null) && (entry1
!= null)) {
552 Stream content_stream
= zip
.GetInputStream (entry
);
553 Stream style_stream
= zip
.GetInputStream (entry1
);
554 content_reader
= new XmlTextReader (content_stream
);
555 style_reader
= new XmlTextReader (style_stream
);
// Still null here means at least one of the two entries was missing.
559 if ((content_reader
== null) && (style_reader
== null)) {
564 // Note: Do not change the order.
565 // We need to populate our hotStyles table with all possible hot styles.
566 // Since "footnotes" and "endnotes" are stored in content.xml and their
567 // styles need to be marked *HOT*, the styles must be processed before the contents.
// && short-circuits: content_reader is walked only when the style walk
// returns true.
568 if ((WalkContentNodes (style_reader
)) && (WalkContentNodes (content_reader
)))
572 override protected void DoClose ()