4 // Copyright (C) 2004 Novell, Inc.
8 // Permission is hereby granted, free of charge, to any person obtaining a
9 // copy of this software and associated documentation files (the "Software"),
10 // to deal in the Software without restriction, including without limitation
11 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 // and/or sell copies of the Software, and to permit persons to whom the
13 // Software is furnished to do so, subject to the following conditions:
15 // The above copyright notice and this permission notice shall be included in
16 // all copies or substantial portions of the Software.
18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 // DEALINGS IN THE SOFTWARE.
29 using System
.Collections
;
37 using ICSharpCode
.SharpZipLib
.Zip
;
39 namespace Beagle
.Filters
{
41 public class FilterOpenOffice
: Beagle
.Daemon
.Filter
{
// Style names flagged as *HOT*. StudyStyleNode stores entries here and
// WalkContentNodes looks up ":style-name" attribute values in it.
43 private Hashtable hotStyles
;
// NOTE(review): presumably set when the document uses an OpenDocument
// MIME type (see the prefix check in DoOpen) -- confirm where it is assigned.
44 private bool odtFormat
;
// Backing store for InterestingAttribute; stays null until the table is built.
46 private Hashtable attr_to_index
= null;
// Lookup of node name -> attribute name whose value should be indexed.
// The table is created on first access and cached in attr_to_index.
private Hashtable InterestingAttribute {
    get {
        if (attr_to_index == null) {
            attr_to_index = new Hashtable (1 /* Number of attributes */);
            // Add interesting node-attribute pairs to this hashtable
            // attr_to_index [node_name] = attribute_name
            attr_to_index ["table:table"] = "table:name";
        }

        return attr_to_index;
    }
}
// Registers every MIME type this filter accepts: the "vnd.sun.xml.*"
// family and the "vnd.oasis.opendocument.*" family, each covering
// writer/text, calc/spreadsheet, impress/presentation and draw/graphics
// documents together with their template variants.
61 public FilterOpenOffice ()
// Sun XML ("vnd.sun.xml.*") MIME types.
64 AddSupportedFlavor (FilterFlavor
.NewFromMimeType ("application/vnd.sun.xml.writer"));
65 AddSupportedFlavor (FilterFlavor
.NewFromMimeType ("application/vnd.sun.xml.writer.template"));
66 AddSupportedFlavor (FilterFlavor
.NewFromMimeType ("application/vnd.sun.xml.calc"));
67 AddSupportedFlavor (FilterFlavor
.NewFromMimeType ("application/vnd.sun.xml.calc.template"));
68 AddSupportedFlavor (FilterFlavor
.NewFromMimeType ("application/vnd.sun.xml.impress"));
69 AddSupportedFlavor (FilterFlavor
.NewFromMimeType ("application/vnd.sun.xml.impress.template"));
70 AddSupportedFlavor (FilterFlavor
.NewFromMimeType ("application/vnd.sun.xml.draw"));
71 AddSupportedFlavor (FilterFlavor
.NewFromMimeType ("application/vnd.sun.xml.draw.template"));
// OASIS OpenDocument ("vnd.oasis.opendocument.*") MIME types.
74 AddSupportedFlavor (FilterFlavor
.NewFromMimeType ("application/vnd.oasis.opendocument.text"));
75 AddSupportedFlavor (FilterFlavor
.NewFromMimeType ("application/vnd.oasis.opendocument.text-template"));
76 AddSupportedFlavor (FilterFlavor
.NewFromMimeType ("application/vnd.oasis.opendocument.spreadsheet"));
77 AddSupportedFlavor (FilterFlavor
.NewFromMimeType ("application/vnd.oasis.opendocument.spreadsheet-template"));
78 AddSupportedFlavor (FilterFlavor
.NewFromMimeType ("application/vnd.oasis.opendocument.presentation"));
79 AddSupportedFlavor (FilterFlavor
.NewFromMimeType ("application/vnd.oasis.opendocument.presentation-template"));
80 AddSupportedFlavor (FilterFlavor
.NewFromMimeType ("application/vnd.oasis.opendocument.graphics"));
81 AddSupportedFlavor (FilterFlavor
.NewFromMimeType ("application/vnd.oasis.opendocument.graphics-template"));
87 // Parse the "style" nodes and mark appropriate styles as *HOT*
88 // FIXME: Identify and ADD more *HOT* styles. ;)
// The reader is expected to be positioned on a style element. The style's
// name is recorded in hotStyles when the checks at the end of the method
// succeed.
89 void StudyStyleNode (XmlReader reader
)
// Name of the style being studied and of the style it derives from.
91 string style_name
= reader
.GetAttribute ("style:name");
92 string style_parent
= reader
.GetAttribute ("style:parent-style-name");
95 string underline
= null;
// Depth of the style element itself; its children are deeper than this.
97 int original_depth
= reader
.Depth
;
// Only a non-empty element can have child property nodes.
99 if (!reader
.IsEmptyElement
) {
101 while (reader
.Depth
> original_depth
) {
// Formatting attributes are read from "style:properties" or, per the
// inline note, "style:text-properties" for ODT.
102 if (reader
.NodeType
== XmlNodeType
.Element
103 && (reader
.Name
== "style:properties" ||
104 reader
.Name
== "style:text-properties")) { /* ODT changes */
105 weight
= reader
.GetAttribute ("fo:font-weight");
106 italic
= reader
.GetAttribute ("fo:font-style");
107 underline
= reader
.GetAttribute ("style:text-underline");
// A style is *HOT* when its parent name starts with "Heading", when it is
// named Footnote/Endnote/Header/Footer, or when it is bold, italic or has
// an underline value other than "none".
113 if ((style_parent
!= null && style_parent
.StartsWith("Heading"))
114 || (style_name
!= null && ((String
.Compare (style_name
, "Footnote") == 0)
115 || (String
.Compare (style_name
, "Endnote") == 0)
116 || (String
.Compare (style_name
, "Header") == 0)
117 || (String
.Compare (style_name
, "Footer") == 0)))
118 || (weight
!= null && weight
== "bold")
119 || (italic
!= null && italic
== "italic")
120 || (underline
!= null && underline
!= "none"))
121 hotStyles
[style_name
] = true;
// True when nodeName is the link element "text:a".
static bool NodeIsLink (String nodeName)
{
    return String.Equals (nodeName, "text:a");
}
// True when nodeName is "text:h", the one element that is hot by name alone.
static bool NodeIsHot (String nodeName)
{
    return String.Equals (nodeName, "text:h");
}
// Container tags may hold several lines of text, and all of that text
// should be marked *HOT* -- hence the name "Container" ;-)
static bool NodeIsHotContainer (String nodeName)
{
    switch (nodeName) {
    case "office:annotation":
    case "text:footnote":
    case "text:endnote":
    case "text:note": // "ODT format"
        return true;
    default:
        return false;
    }
}
// True for the citation elements of footnotes, endnotes and (ODT) notes.
static bool NodeIsFreezing (String nodeName)
{
    switch (nodeName) {
    case "text:footnote-citation":
    case "text:endnote-citation":
    case "text:note-citation": // "ODT format"
        return true;
    default:
        return false;
    }
}
// True for element names that call for a text break before their content.
static bool NodeBreaksTextBefore (String nodeName)
{
    switch (nodeName) {
    case "text:footnote":
    case "text:endnote":
    case "office:annotation":
    case "table:table":
    case "text:note": // "ODT format"
        return true;
    default:
        return false;
    }
}
// True for element names that call for a text break after them.
static bool NodeBreaksTextAfter (String nodeName)
{
    switch (nodeName) {
    case "text:line-break":
    case "text:s":
    case "text:tab-stop":
    case "table:table-cell":
        return true;
    default:
        return false;
    }
}
// True for element names that call for a structural break after them.
static bool NodeBreaksStructureAfter (String nodeName)
{
    switch (nodeName) {
    case "text:p":
    case "text:h":
    case "text:footnote":
    case "text:endnote":
    case "office:annotation":
    case "text:note": // "ODT format"
        return true;
    default:
        return false;
    }
}
// Stack of boolean hot flags: WalkContentNodes pushes one per opened
// element and pops one at each end tag.
179 private Stack hot_nodes
= new Stack ();
// Trailing fragment of the last text chunk, held back by
// AddTextForIndexing until it is completed or flushed.
180 private string strPartText
= String
.Empty
;
// Set when the held-back fragment was seen while the top of hot_nodes
// was true.
181 private bool bPartHotStyle
= false;
// Names of the hot container elements (see NodeIsHotContainer) that are
// currently open.
182 private Stack hot_container_nodes
= new Stack ();
// Passes a chunk of document text to AppendText while keeping the part
// after the last space in strPartText, so that a word split across
// several text nodes can be joined with its continuation on a later call.
184 void AddTextForIndexing (string paramStr
)
// Position of the last space; what follows it may be an unfinished word.
189 int index
= paramStr
.LastIndexOf (' ');
192 // During the previous-parsing, a word got terminated partially,
193 // find the remaining part of the word, concatenate it and add it to
194 // the respective pools and reset the HOT status, if required.
195 if (strPartText
.Length
> 0) {
// The first space ends the word that was started by the held-back fragment.
196 sindex
= paramStr
.IndexOf (' ');
197 strTemp
= strPartText
+ paramStr
.Substring (0, sindex
);
205 AppendText (strTemp
);
206 if (!wasHot
&& bPartHotStyle
)
208 bPartHotStyle
= false;
// Continue with the remainder of the chunk, starting at that first space.
210 paramStr
= paramStr
.Substring (sindex
);
211 index
= paramStr
.LastIndexOf (' ');
// Hold back everything from the last space onwards for the next call.
216 strPartText
= paramStr
.Substring (index
);
217 paramStr
= paramStr
.Substring (sindex
, index
);
// Here the whole chunk is added to the held-back fragment and nothing
// is left to append on this call.
219 strTemp
= strPartText
+ paramStr
;
220 strPartText
= strTemp
;
221 paramStr
= String
.Empty
;
222 strTemp
= String
.Empty
;
225 // Enable *HOT* just before appending the text
226 // because, there can be some *Partial Texts* without
227 // *HOT* styles that needs to be appended.
228 if (hot_nodes
.Count
> 0 && (bool) hot_nodes
.Peek() == true) {
231 bPartHotStyle
= true;
// NOTE(review): OR-ing with false leaves bPartHotStyle unchanged.
233 bPartHotStyle
|= false;
235 AppendText (paramStr
);
// A held-back fragment consisting only of whitespace carries no hot state.
237 if (strPartText
.Trim().Length
< 1)
238 bPartHotStyle
= false;
// Appends attr_value to the indexed text when attr_name is the attribute
// registered for node_name in InterestingAttribute. Null or empty values
// are ignored.
void IndexAttribute (string node_name, string attr_name, string attr_value)
{
    bool has_value = attr_value != null && attr_value.Length != 0;

    if (has_value && (string) InterestingAttribute [node_name] == attr_name)
        AppendText (attr_value);
}
// Appends any word fragment still held in strPartText, then clears the
// fragment and its hot flag.
250 void FlushPartialText ()
252 if (strPartText
.Length
> 0) {
// NOTE(review): the fragment was hot but the filter currently is not;
// presumably hot mode is switched on around the append -- confirm.
253 if (bPartHotStyle
&& !IsHot
)
255 AppendText (strPartText
);
258 strPartText
= String
.Empty
;
260 bPartHotStyle
= false;
// Reads nodes from reader and turns them into indexed text, hot-state
// bookkeeping and text/structural breaks.
// NOTE(review): the bool result presumably tells the caller whether the
// walk stopped at the per-pull element limit or ran to the end -- confirm.
264 bool WalkContentNodes (XmlReader reader
)
266 // total number of elements to read per-pull
267 const int total_elements
= 10;
268 int num_elements
= 0;
270 while (reader
.Read ()) {
271 switch (reader
.NodeType
) {
272 case XmlNodeType
.Element
:
// An empty element has no content or end tag, so its breaks are checked
// here.
273 if (reader
.IsEmptyElement
) {
275 if (NodeBreaksStructureAfter (reader
.Name
))
276 AppendStructuralBreak ();
278 if (NodeBreaksTextBefore (reader
.Name
))
281 if (NodeBreaksTextAfter (reader
.Name
))
287 // FIXME: Allow adding more style nodes
// Style declarations are handed to StudyStyleNode, which fills hotStyles.
288 if (reader
.Name
== "style:style"
289 || reader
.Name
== "number:date-style"
290 || reader
.Name
== "style:font-decl") {
291 StudyStyleNode (reader
);
296 // (1) Its name is hot
297 // (2) It is flagged with a hot style
298 // (3) annotations are always hot.
302 if (NodeIsHot (reader
.Name
)) {
304 } else if (NodeIsHotContainer (reader
.Name
)) {
305 hot_container_nodes
.Push (reader
.Name
);
306 } else if (NodeIsLink (reader
.Name
)) {
307 string attr
= reader
.GetAttribute ("xlink:href");
// Walk the element's attributes: a ":style-name" value is looked up in
// hotStyles, and each attribute is offered to IndexAttribute.
312 string node_name
= reader
.Name
;
313 bool has_attr
= reader
.MoveToFirstAttribute ();
315 if (reader
.Name
.EndsWith(":style-name")) {
316 if (hotStyles
.Contains (reader
.Value
))
320 IndexAttribute (node_name
, reader
.Name
, reader
.Value
);
322 has_attr
= reader
.MoveToNextAttribute ();
// Step back from the attribute to the element that owns it.
324 reader
.MoveToElement();
// Remember this element's hot flag; the end-tag case pops it again.
327 hot_nodes
.Push (isHot
);
330 // i) Its already in *HOT* mode and
331 // ii) there is a hot style/hot container tag
332 if (!IsHot
&& (isHot
|| hot_container_nodes
.Count
> 0))
335 if (NodeIsFreezing (reader
.Name
)) {
339 if (NodeBreaksTextBefore (reader
.Name
)) {
345 case XmlNodeType
.Text
:
346 string text
= reader
.Value
;
350 AddTextForIndexing (text
);
353 case XmlNodeType
.EndElement
:
354 if (NodeBreaksStructureAfter (reader
.Name
)) {
356 AppendStructuralBreak ();
358 else if (NodeBreaksTextAfter (reader
.Name
))
// Pop the hot flag that was pushed for the matching start tag.
362 if (hot_nodes
.Count
> 0)
363 is_hot
= (bool) hot_nodes
.Pop ();
// The stack was already empty, so there was no flag to pop.
365 Logger
.Log
.Debug ("FilterOpenOffice: hot_nodes underflow in {0}",
// When the end tag matches the container on top of the stack, the whole
// container stack is cleared.
367 if (hot_container_nodes
.Count
> 0) {
368 string hot_container_tag
= (string) hot_container_nodes
.Peek ();
369 if (hot_container_tag
== reader
.Name
) {
370 hot_container_nodes
.Clear ();
378 if (NodeIsFreezing (reader
.Name
)) {
// The per-pull element limit has been reached.
384 if (num_elements
>= total_elements
) {
391 // SlideCount is not stored in meta.xml rather we need to
392 // parse the whole of content.xml to find out the count of
393 // slides present in an .sxi.
// The result is published as the unsearched "fixme:slide-count" property.
394 private void ExtractSlideCount (XmlReader reader
)
396 string slideCount
= null;
401 // Do not parse the whole file if it is not a
402 // presentation (impress document)
403 if (reader
.Name
== "office:document-content"
404 && reader
.NodeType
== XmlNodeType
.Element
) {
405 string docClass
= reader
.GetAttribute ("office:class");
406 if (docClass
!= "presentation")
// The preceding loop runs until the reader is at depth 2 or deeper.
409 } while (reader
.Depth
< 2);
411 while (reader
.Depth
>= 1) {
// Only element nodes at exactly depth 2 reach the switch below.
412 if (reader
.Depth
!= 2 || reader
.NodeType
!= XmlNodeType
.Element
) {
416 switch (reader
.Name
) {
// NOTE(review): the value stored as the slide count is the matched
// element's "draw:id" attribute -- confirm this is the intended source.
418 slideCount
= reader
.GetAttribute ("draw:id");
424 AddProperty (Beagle
.Property
.NewUnsearched ("fixme:slide-count", slideCount
));
// Reads document metadata from reader (DoPullProperties passes a reader
// over meta.xml) and publishes it through AddProperty: title, description,
// subject, document statistics, user-defined fields and keywords.
427 private void ExtractMetadata (XmlReader reader
)
// The preceding loop runs until the reader is at depth 2 or deeper.
431 } while (reader
.Depth
< 2);
433 while (reader
.Depth
>= 2) {
// Only element nodes at exactly depth 2 reach the switch below.
434 if (reader
.Depth
!= 2 || reader
.NodeType
!= XmlNodeType
.Element
) {
438 switch (reader
.Name
) {
441 AddProperty (Beagle
.Property
.New ("dc:title",
445 case "dc:description":
448 AddProperty (Beagle
.Property
.New ("dc:description",
455 AddProperty (Beagle
.Property
.New ("dc:subject",
// Page and word counts are stored as unsearched properties.
459 case "meta:document-statistic":
460 string attr
= reader
.GetAttribute ("meta:page-count");
461 AddProperty (Beagle
.Property
.NewUnsearched ("fixme:page-count", attr
));
462 attr
= reader
.GetAttribute ("meta:word-count");
463 AddProperty (Beagle
.Property
.NewUnsearched ("fixme:word-count", attr
));
465 // Both writer and calc uses this attribute. writer stores the
466 // count of tables in a sxw whereas calc stores the count of
467 // spreadsheets in a sxc.
// NOTE(review): Convert.ToInt32 throws on a non-numeric value, and only the
// "vnd.sun.xml.calc" MIME type is matched here; the OpenDocument
// spreadsheet type registered in the constructor is not. Confirm both.
468 attr
= reader
.GetAttribute ("meta:table-count");
469 if (attr
!= null && Convert
.ToInt32 (attr
) > 0
470 && MimeType
== "application/vnd.sun.xml.calc")
471 AddProperty (Beagle
.Property
.NewUnsearched ("fixme:spreadsheet-count", attr
));
// User-defined fields get a property key built from their meta:name.
474 case "meta:user-defined":
475 string name
= reader
.GetAttribute ("meta:name");
478 AddProperty (Beagle
.Property
.New ("fixme:UserDefined-" + name
,
484 AddProperty (Beagle
.Property
.New ("fixme:keywords",
// Prepares the filter for a new document: creates a fresh hotStyles table
// and opens the file at info.FullName as a zip archive.
495 override protected void DoOpen (FileInfo info
)
497 hotStyles
= new Hashtable ();
499 zip
= new ZipFile (info
.FullName
);
// All OpenDocument MIME types registered by the constructor share this prefix.
501 if (MimeType
.StartsWith ("application/vnd.oasis.opendocument."))
// Any exception raised while opening is reported through the error log.
504 } catch (Exception
) {
505 Logger
.Log
.Error ("Unable to open {0}. Probably an invalid OpenOffice document.",
// Extracts document properties from two archive entries: metadata from
// meta.xml via ExtractMetadata and the slide count from content.xml via
// ExtractSlideCount. An error is logged for an entry reported as missing.
511 override protected void DoPullProperties ()
// Metadata comes from meta.xml.
514 ZipEntry entry
= zip
.GetEntry ("meta.xml");
516 Stream meta_stream
= zip
.GetInputStream (entry
);
517 XmlReader reader
= new XmlTextReader (meta_stream
);
518 ExtractMetadata (reader
);
520 Logger
.Log
.Error ("No meta.xml!");
// The slide count comes from content.xml.
522 entry
= zip
.GetEntry ("content.xml");
524 Stream contents_stream
= zip
.GetInputStream (entry
);
525 XmlReader reader
= new XmlTextReader (contents_stream
);
526 ExtractSlideCount (reader
);
528 Logger
.Log
.Error ("No content.xml!");
// Reader over content.xml; null until DoPull creates it.
533 XmlReader content_reader
= null;
// Reader over styles.xml; null until DoPull creates it.
534 XmlReader style_reader
= null;
// Pulls indexable content: creates XmlTextReaders over content.xml and
// styles.xml when neither exists yet, then walks the styles before the
// content.
535 override protected void DoPull ()
538 // We need both styles.xml and content.xml as
539 // "Header", "Footer" are stored in styles.xml and
540 // "[Foot/End]Notes are stored in content.xml
541 if ((content_reader
== null) && (style_reader
== null)) {
543 ZipEntry entry
= zip
.GetEntry ("content.xml");
544 ZipEntry entry1
= zip
.GetEntry ("styles.xml");
// The readers are only created when both entries exist in the archive.
546 if ((entry
!= null) && (entry1
!= null)) {
547 Stream content_stream
= zip
.GetInputStream (entry
);
548 Stream style_stream
= zip
.GetInputStream (entry1
);
549 content_reader
= new XmlTextReader (content_stream
);
550 style_reader
= new XmlTextReader (style_stream
);
// Both readers are still null here when they could not be created above.
554 if ((content_reader
== null) && (style_reader
== null)) {
559 // Note: Do not change the order.
560 // we need to populate our hotStyles table with all possible hot styles.
561 // Since, "footnotes" and "endnotes" gets stored in content.xml and these
562 // styles needs to be marked as *HOT*, they need to be processed before contents.
// && short-circuits: the content walk only runs when the style walk
// returned true.
563 if ((WalkContentNodes (style_reader
)) && (WalkContentNodes (content_reader
)))
567 override protected void DoClose ()