5 // FilterRTF.cs : Trivial implementation of an RTF-document filter.
7 // Copyright (C) 2004 Novell, Inc.
10 // Permission is hereby granted, free of charge, to any person obtaining a
11 // copy of this software and associated documentation files (the "Software"),
12 // to deal in the Software without restriction, including without limitation
13 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
14 // and/or sell copies of the Software, and to permit persons to whom the
15 // Software is furnished to do so, subject to the following conditions:
17 // The above copyright notice and this permission notice shall be included in
18 // all copies or substantial portions of the Software.
20 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
25 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
26 // DEALINGS IN THE SOFTWARE.
29 // Currently, the filtering is based on only a few *control words*. If anyone
30 // has any samples that can break this "assumption", kindly post a copy of it,
31 // if you can, to <vvaradhan@novell.com>
33 // FIXME: Require more complex samples to test the parsing, mostly generated
34 // using Microsoft Word or wordpad. :)
37 using System
.Collections
;
// One entry of the RTF control-word lookup table: pairs a control word with
// the category (Type) that the parser switches on to decide how to handle it.
44 internal class RTFControlWordType
{
// Control word text exactly as listed in the table below, i.e. without the
// leading backslash that introduces it in the RTF stream.
59 public string ctrlWord
;
// Builds a table entry from a category and the control word text.
// NOTE(review): only the ctrlWord assignment is visible here; the category
// is presumably stored as well (callers read a `Types` member) - confirm.
61 RTFControlWordType (Type types
, string ctrlword
)
64 this.ctrlWord
= ctrlword
;
67 // FIXME: Need to add "unicode", "styles",
68 // "header", "footer" etc.
// Static table of every control word the filter recognises.
69 static RTFControlWordType
[] types
=
// Empty entry with category None.
// NOTE(review): callers dereference the result of Find without a null
// check, so Find presumably falls back to this entry when nothing
// matches - confirm.
71 new RTFControlWordType (Type
.None
, ""),
// Document-information block and the metadata tags handled inside it.
72 new RTFControlWordType (Type
.MetaDataBlock
, "info"),
73 new RTFControlWordType (Type
.MetaDataTag
, "title"),
74 new RTFControlWordType (Type
.MetaDataTag
, "author"),
75 new RTFControlWordType (Type
.MetaDataTag
, "comment"),
76 new RTFControlWordType (Type
.MetaDataTag
, "operator"),
77 new RTFControlWordType (Type
.MetaDataTag
, "nofpages"),
78 new RTFControlWordType (Type
.MetaDataTag
, "nofwords"),
79 new RTFControlWordType (Type
.MetaDataTag
, "generator"),
80 new RTFControlWordType (Type
.MetaDataTag
, "company"),
// Paragraph end / paragraph start markers.
81 new RTFControlWordType (Type
.ParaEnd
, "par"),
82 new RTFControlWordType (Type
.Paragraph
, "pard"),
// Special sections (headers, footers, footnotes).
83 new RTFControlWordType (Type
.SplSection
, "header"),
84 new RTFControlWordType (Type
.SplSection
, "footer"),
85 new RTFControlWordType (Type
.SplSection
, "headerl"),
86 new RTFControlWordType (Type
.SplSection
, "footerl"),
87 new RTFControlWordType (Type
.SplSection
, "footnote"),
// Character-property control words; HandleControlWord records the group
// depth in hotStyleCount when it meets one of these inside a paragraph.
88 new RTFControlWordType (Type
.CharProp
, "b"),
89 new RTFControlWordType (Type
.CharProp
, "i"),
90 new RTFControlWordType (Type
.CharProp
, "ul"),
91 new RTFControlWordType (Type
.CharProp
, "up"),
92 new RTFControlWordType (Type
.CharProp
, "dn"),
// Control symbols for which HandleControlWord sets skipCount.
93 new RTFControlWordType (Type
.Skip
, "'"),
94 new RTFControlWordType (Type
.Skip
, "*"),
// Escaped literal characters: \{, \} and \\.
95 new RTFControlWordType (Type
.EscSeq
, "{"),
96 new RTFControlWordType (Type
.EscSeq
, "}"),
97 new RTFControlWordType (Type
.EscSeq
, "\\"),
// Looks up strCtrlWord in the table with a linear scan and an exact
// (String.Compare == 0) match against each entry's ctrlWord.
// NOTE(review): String.Compare (string, string) is a culture-sensitive
// comparison; an ordinal comparison is the usual choice for fixed keywords.
// NOTE(review): the return statements are not visible here - confirm what is
// returned when no entry matches.
100 public static RTFControlWordType
Find (string strCtrlWord
)
102 for (int i
= 0; i
< types
.Length
; i
++) {
103 if (String
.Compare (types
[i
].ctrlWord
, strCtrlWord
) == 0)
// Declarations and constructor fragments of the RTF filter.
109 namespace Beagle
.Filters
{
// Filter for RTF documents, derived from Beagle.Daemon.Filter. The methods
// of this class parse the RTF stream, publish metadata through AddProperty
// and publish body text through AppendText.
111 public class FilterRTF
: Beagle
.Daemon
.Filter
{
// Parser state. Besides the member visible here, the code below also uses
// Position.InMetaData, Position.InPara and Position.InBody.
113 public enum Position
{
116 InMetaDataTagGenerator
,
// Status codes returned by the parsing routines. The code below also uses
// ERROR_RTF_OK and ERROR_RTF_EOF.
121 public enum ErrorCodes
{
124 ERROR_RTF_UNHANDLED_SYMBOL
// Text reader over the RTF file; created in DoOpen and consumed by
// RTFParse / ProcessControlWords.
133 StreamReader SReaderRTF
;
// NOTE(review): the statements below presumably form the constructor body;
// its header is not visible here - confirm.
141 // Make this a general rtf filter.
142 AddSupportedFlavor (FilterFlavor
.NewFromMimeType ("application/rtf"));
// No partially emitted "hot" word is pending at start-up.
148 bPartHotStyle
= false;
// MetaDataStack: property names (and, for the page/word counts, their
// values) pushed by HandleControlWord and popped by RTFParse.
153 MetaDataStack
= new Stack ();
// TextDataStack: escaped characters, each followed by an "EscSeq" marker,
// pushed by HandleControlWord and popped by AddTextForIndexing.
154 TextDataStack
= new Stack ();
// Opens the file described by `info` as a FileStream (FsRTF) and wraps it in
// a StreamReader (SReaderRTF) for the parser. Failures are reported through
// Logger.Log.Error together with the file's full path.
159 override protected void DoOpen (FileInfo info
)
// Open the underlying stream, then layer the text reader on top of it.
162 FsRTF
= new FileStream (info
.FullName
, FileMode
.Open
,
165 SReaderRTF
= new StreamReader (FsRTF
);
// NOTE(review): this first log call presumably sits in a more specific
// catch clause that is not visible here - confirm.
167 Logger
.Log
.Error ("Unable to open {0}.", info
.FullName
);
// Catch-all: any other exception is logged with the same message.
170 } catch (Exception
) {
171 Logger
.Log
.Error ("Unable to open {0}.", info
.FullName
);
177 // Identifies the type of RTF control word and handles accordingly
// Parameters:
//   strCtrlWord - control word text without the leading backslash.
//   paramVal    - numeric parameter parsed by ProcessControlWords; used here
//                 for the "nofpages" / "nofwords" values.
//   bMeta       - flag forwarded from RTFParse; no use of it is visible in
//                 this method.
// Updates the parser state (pos, hotStyleCount, skipCount) and the
// MetaDataStack / TextDataStack. The only return visible is ERROR_RTF_OK.
178 private ErrorCodes
HandleControlWord (string strCtrlWord
, int paramVal
, bool bMeta
)
// Classify the control word via the static lookup table.
180 RTFControlWordType ctrlWrdType
= RTFControlWordType
.Find (strCtrlWord
);
182 switch (ctrlWrdType
.Types
) {
// "info": switch the parser into the metadata state.
183 case RTFControlWordType
.Type
.MetaDataBlock
: /* process meta-data */
184 pos
= Position
.InMetaData
;
// Metadata tags: while in the metadata state, push the property name that
// RTFParse will pop when the enclosing group closes.
186 case RTFControlWordType
.Type
.MetaDataTag
:
187 if (pos
== Position
.InMetaData
) {
188 if (String
.Compare (strCtrlWord
, "title") == 0)
189 MetaDataStack
.Push ("dc:title");
190 else if (String
.Compare (strCtrlWord
, "author") == 0)
191 MetaDataStack
.Push ("dc:author");
192 else if (String
.Compare (strCtrlWord
, "comment") == 0)
193 MetaDataStack
.Push ("fixme:comment");
194 else if (String
.Compare (strCtrlWord
, "operator") == 0)
195 MetaDataStack
.Push ("fixme:operator");
// For the two counts the value is the control word's numeric parameter:
// push the value first, then the name, so the name is popped first.
196 else if (String
.Compare (strCtrlWord
, "nofpages") == 0) {
197 MetaDataStack
.Push (Convert
.ToString (paramVal
));
198 MetaDataStack
.Push ("fixme:page-count");
200 else if (String
.Compare (strCtrlWord
, "nofwords") == 0) {
201 MetaDataStack
.Push (Convert
.ToString (paramVal
));
202 MetaDataStack
.Push ("fixme:word-count");
204 else if (String
.Compare (strCtrlWord
, "company") == 0)
205 MetaDataStack
.Push ("fixme:company");
// NOTE(review): this `else` appears to pair with the InMetaData test above,
// i.e. "generator" is handled when the parser is NOT in the metadata state;
// it gets its own state - confirm against the full source.
206 } else if (String
.Compare (strCtrlWord
, "generator") == 0) {
207 pos
= Position
.InMetaDataTagGenerator
;
208 MetaDataStack
.Push ("fixme:generator");
// "pard": a paragraph starts.
212 case RTFControlWordType
.Type
.Paragraph
:
214 pos
= Position
.InPara
;
// "par": the paragraph ended; back to plain body state.
217 case RTFControlWordType
.Type
.ParaEnd
:
219 pos
= Position
.InBody
;
222 // FIXME: "Hot" styles are not *properly reset to normal*
223 // on some *weird* conditions.
224 // To avoid such stuff, we need to maintain a stack of
225 // groupCounts for set/reset Hot styles.
// Special sections: remember the depth of the enclosing group.
226 case RTFControlWordType
.Type
.SplSection
:
227 hotStyleCount
= groupCount
- 1;
// Character properties only matter inside a paragraph; remember the depth
// of the enclosing group so RTFParse can detect when that group closes.
230 case RTFControlWordType
.Type
.CharProp
:
231 if (pos
== Position
.InPara
) {
233 //Console.WriteLine ("HotUp: \\{0}{1}", strCtrlWord, paramVal);
234 hotStyleCount
= groupCount
- 1;
// Escaped literal: queue the character followed by an "EscSeq" marker;
// AddTextForIndexing pops both.
240 case RTFControlWordType
.Type
.EscSeq
:
241 if (pos
== Position
.InPara
) {
242 TextDataStack
.Push (strCtrlWord
);
243 TextDataStack
.Push ("EscSeq");
// Skippable symbol: remember the depth of the enclosing group in skipCount.
246 case RTFControlWordType
.Type
.Skip
:
247 skipCount
= groupCount
- 1;
248 //SkipDataStack.Push (groupCount-1);
251 return ErrorCodes
.ERROR_RTF_OK
;
254 // FIXME: Probably need a little cleanup ;-)
// Parses one control word from SReaderRTF: the word itself, an optional '-'
// sign and an optional run of digits, then hands the result to
// HandleControlWord. RTFParse calls this after reading a backslash.
// Returns ERROR_RTF_EOF at end of stream, ERROR_RTF_UNHANDLED_SYMBOL for an
// unrecognised non-letter symbol, otherwise HandleControlWord's result.
256 private ErrorCodes
ProcessControlWords (bool bMeta
)
// NOTE(review): presumably set to true when a '-' precedes the digits; the
// assignment is not visible here - confirm.
261 bool negParamVal
= false;
262 StringBuilder strCtrlWord
= new StringBuilder ();
263 StringBuilder strParameter
= new StringBuilder ();
// Consume the first character of the control word.
265 aByte
= SReaderRTF
.Read ();
267 return ErrorCodes
.ERROR_RTF_EOF
;
// Look the single character up so that control symbols from the table
// (Skip / EscSeq entries) are accepted even though they are not letters.
270 RTFControlWordType ctrlWrdType
= RTFControlWordType
.Find (new String (ch
, 1));
272 if (!Char
.IsLetter (ch
) &&
273 ctrlWrdType
.Types
!= RTFControlWordType
.Type
.Skip
&&
274 ctrlWrdType
.Types
!= RTFControlWordType
.Type
.EscSeq
) {
275 Logger
.Log
.Error ("Unhandled symbol: {0}, {1}", ch
, ctrlWrdType
.Types
);
276 return ErrorCodes
.ERROR_RTF_UNHANDLED_SYMBOL
;
// Accumulate the word: append the current character, Peek at the next one
// and only Read (consume) it while it is a letter.
278 while (aByte
!= -1) {
279 strCtrlWord
.Append (ch
);
280 aByte
= SReaderRTF
.Peek ();
282 if (Char
.IsLetter (ch
)) {
283 aByte
= SReaderRTF
.Read ();
// Optional sign of the numeric parameter.
289 aByte
= SReaderRTF
.Peek ();
291 if (aByte
!= -1 && ch
== '-') {
293 aByte
= SReaderRTF
.Read (); // move the fp
294 aByte
= SReaderRTF
.Peek ();
// Optional digits of the numeric parameter, gathered with the same
// Peek-then-Read pattern as the word.
297 if (Char
.IsDigit (ch
)) {
298 aByte
= SReaderRTF
.Read ();
300 while (aByte
!= -1) {
301 strParameter
.Append (ch
);
302 aByte
= SReaderRTF
.Peek ();
304 if (Char
.IsDigit (ch
)) {
305 aByte
= SReaderRTF
.Read ();
// Convert the collected digits, if there were any.
311 if (strParameter
.Length
> 0)
312 paramVal
= Convert
.ToInt32 (strParameter
.ToString());
314 //Console.WriteLine ("{0}\t{1}", strCtrlWord, strParameter);
// NOTE(review): the guarded statement is not visible here; presumably it
// negates paramVal - confirm.
315 if (negParamVal
&& paramVal
> -1)
317 return (HandleControlWord (strCtrlWord
.ToString(), paramVal
, bMeta
));
// Main parse loop. Reads SReaderRTF one character at a time and dispatches
// on backslash (control word), '{' / '}' (group delimiters) and line breaks;
// the remaining branch deals with ordinary text. Metadata is published via
// AddProperty and text via AddTextForIndexing / AppendText.
// bMeta is forwarded to ProcessControlWords; DoPullProperties passes true
// and DoPull passes false.
// Returns ERROR_RTF_OK on normal completion.
320 private ErrorCodes
RTFParse (bool bMeta
)
// `str` accumulates the text of the current run; strTemp holds a popped
// metadata property name.
324 StringBuilder str
= new StringBuilder ();
325 string strTemp
= null;
328 while ((aByte
= SReaderRTF
.Read ()) != -1) {
331 case '\\': /* process keywords */
// NOTE(review): the statement guarded by this depth test is not visible.
333 if (groupCount
> skipCount
)
338 ec
= ProcessControlWords (bMeta
);
// NOTE(review): presumably returns ec on failure - the body is not visible.
339 if (ec
!= ErrorCodes
.ERROR_RTF_OK
)
// A control word ends the current text run: flush it when inside a
// paragraph, then clear the buffer either way.
341 if (pos
== Position
.InPara
)
342 AddTextForIndexing (str
);
343 str
.Remove (0, str
.Length
);
// Group start: same flush-and-clear as above.
345 case '{': /* process groups */
346 if (pos
== Position
.InPara
)
347 AddTextForIndexing (str
);
348 str
.Remove (0, str
.Length
);
// Group end: what happens depends on the parser state.
351 case '}': /* process groups */
353 if (pos
== Position
.InMetaData
||
354 pos
== Position
.InMetaDataTagGenerator
) {
355 // groupCount will at least be 1 for
356 // the outermost "{" block
// NOTE(review): the visible code returns ERROR_RTF_OK here; any additional
// condition (e.g. on bMeta) is not visible - confirm.
357 if (pos
== Position
.InMetaData
&& groupCount
== 1) {
359 return ErrorCodes
.ERROR_RTF_OK
;
// Pop the pending property name pushed by HandleControlWord.
361 if (MetaDataStack
.Count
> 0) {
362 strTemp
= (string) MetaDataStack
.Pop ();
// The two counts carry their value on the stack (pushed just before the
// name) and are added with Property.NewUnsearched; every other property
// goes through Property.New.
363 if ((String
.Compare (strTemp
, "fixme:word-count") == 0) ||
364 (String
.Compare (strTemp
, "fixme:page-count") == 0)) {
365 str
.Append ((string) MetaDataStack
.Pop ());
366 AddProperty (Beagle
.Property
.NewUnsearched (strTemp
,
370 AddProperty (Beagle
.Property
.New (strTemp
,
// Inside a paragraph: flush the text gathered so far.
375 } else if (pos
== Position
.InPara
) {
376 AddTextForIndexing (str
);
// After a paragraph end: flush the text and emit a structural break.
378 } else if (pos
== Position
.InBody
) {
379 //Console.WriteLine ("\\par : {0}", str);
382 AddTextForIndexing (str
);
383 AppendStructuralBreak ();
// The group that switched the hot style on has been closed.
// NOTE(review): the reset statements themselves are not visible - confirm.
385 if (hotStyleCount
> 0
386 && groupCount
<= hotStyleCount
) {
387 //Console.WriteLine ("Group count: {0}, stack: {1}",
388 //groupCount, hotStyleCount);
394 case '\r': /* ignore \r */
395 case '\n': /* ignore \n */
// Ordinary character: only considered when not inside a skipped group and
// the parser is in a paragraph or in the body.
// NOTE(review): presumably appends the character to `str` - the statement
// is not visible.
398 if ((skipCount
== 0 || groupCount
<= skipCount
)
399 && (pos
== Position
.InPara
|| pos
== Position
.InBody
))
// End of input: emit any partial word still held back in partText.
404 if (partText
.Length
> 0) {
405 if (bPartHotStyle
&& !IsHot
)
407 AppendText (partText
);
411 return ErrorCodes
.ERROR_RTF_OK
;
// Flushes the text accumulated in `str`. Pending escaped characters from
// TextDataStack are appended first; the text is then emitted with
// AppendText, using the first and last space positions to split off a
// possibly incomplete trailing word into partText so that a later call can
// join it with its continuation. `str` is emptied once it has been copied.
414 private void AddTextForIndexing (StringBuilder str
)
417 string paramStr
= null;
// Drain the escape-sequence stack. HandleControlWord pushes the character
// and then an "EscSeq" marker, so two pops are needed per character.
421 while (TextDataStack
.Count
> 0) {
422 strTemp
= (string) TextDataStack
.Pop ();
425 strTemp
= (string) TextDataStack
.Pop ();
426 str
.Append (strTemp
);
432 if (str
.Length
> 0) {
433 //Console.WriteLine ("Text: [{0}]", str);
// Take a snapshot of the text and clear the caller's buffer.
435 paramStr
= str
.ToString ();
436 str
.Remove (0, str
.Length
);
// Position of the last space: what follows it may be an unfinished word.
438 int index
= paramStr
.LastIndexOf (' ');
442 // During the previous-parsing, a word got terminated partially,
443 // find the remaining part of the word, concatenate it and add it to
444 // the respective pools and reset the HOT status, if required.
445 if (partText
.Length
> 0) {
// NOTE(review): IndexOf returns -1 when there is no space, which would make
// Substring (0, sindex) throw; presumably an enclosing test on `index`
// (not visible here) rules that out - confirm.
446 sindex
= paramStr
.IndexOf (' ');
447 strTemp
= partText
+ paramStr
.Substring (0, sindex
);
448 //Console.WriteLine ("PartHotStyle: {0}, HotStyleCount: {1}, partText: {2}",
450 // hotStyleCount, strTemp);
// Emit the completed word.
458 AppendText (strTemp
);
459 if (!wasHot
&& bPartHotStyle
)
461 bPartHotStyle
= false;
// Continue with the remainder after the completed word.
463 paramStr
= paramStr
.Substring (sindex
);
464 index
= paramStr
.LastIndexOf (' ');
// Hold back everything from the last space onward as the new partial word.
468 partText
= paramStr
.Substring (index
);
// NOTE(review): Substring (start, length) takes a length as its second
// argument, not an end index - confirm `index` is the intended length here.
469 paramStr
= paramStr
.Substring (sindex
, index
);
471 strTemp
= partText
+ paramStr
;
477 // Enable *HOT* just before appending the text
478 // because, there can be some *Partial Texts* without
479 // *HOT* styles that needs to be appended.
480 if (hotStyleCount
> 0) {
483 bPartHotStyle
= true;
// `|= false` leaves bPartHotStyle unchanged.
485 bPartHotStyle
|= false;
487 if (paramStr
.Length
> 0)
488 AppendText (paramStr
);
// Nothing is being held back, so no partial hot word is pending.
490 if (partText
.Length
< 1)
491 bPartHotStyle
= false;
// Text-extraction pass: rewinds the reader to the start of the file and runs
// RTFParse with bMeta = false. A non-OK result is logged, not thrown.
495 override protected void DoPull ()
498 ec
= ErrorCodes
.ERROR_RTF_OK
;
501 // Discard the buffered data, if not,
502 // the buffered data can change the
503 // state "pos" variable that results
505 // Fixes: http://bugzilla.gnome.org/show_bug.cgi?id=172294
506 SReaderRTF
.DiscardBufferedData ();
508 // Rewind the file pointer to start from beginning.
509 SReaderRTF
.BaseStream
.Seek (0, SeekOrigin
.Begin
);
// Parse the whole document in non-metadata mode.
511 ec
= RTFParse (false);
512 if (ec
!= ErrorCodes
.ERROR_RTF_OK
)
513 Logger
.Log
.Error ("{0}", ec
);
// Metadata pass: runs RTFParse with bMeta = true. A non-OK result is logged,
// not thrown.
517 override protected void DoPullProperties ()
// The initial value is overwritten immediately by the RTFParse result.
520 ec
= ErrorCodes
.ERROR_RTF_OK
;
521 ec
= RTFParse (true);
522 if (ec
!= ErrorCodes
.ERROR_RTF_OK
)
523 Logger
.Log
.Error ("{0}", ec
);