4 // FilterRTF.cs : Trivial implementation of a RTF-document filter.
6 // Copyright (C) 2004 Novell, Inc.
9 // Permission is hereby granted, free of charge, to any person obtaining a
10 // copy of this software and associated documentation files (the "Software"),
11 // to deal in the Software without restriction, including without limitation
12 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 // and/or sell copies of the Software, and to permit persons to whom the
14 // Software is furnished to do so, subject to the following conditions:
16 // The above copyright notice and this permission notice shall be included in
17 // all copies or substantial portions of the Software.
19 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 // DEALINGS IN THE SOFTWARE.
28 // Currently, the filtering is based on only a few *control words*. If anyone
29 // has any samples that can break this "assumption", kindly post a copy of them,
30 // if you can, to <vvaradhan@novell.com>
32 // FIXME: More complex samples are required to test the parsing, mostly ones
33 // generated using Microsoft Word or WordPad. :)
36 using System
.Collections
;
// Pairs an RTF control-word string with a Type classification.
// NOTE(review): the Type declaration and the member that stores the
// classification are not visible in this view - confirm against the full file.
40 internal class RTFControlWordType
{
// Text of the control word (or control symbol) this entry describes.
53 public string ctrlWord
;
// Builds an entry from a classification and its control-word text.
// Declared without an access modifier, so it is private to this class.
55 RTFControlWordType (Type types
, string ctrlword
)
// Remember the control-word text; how 'types' is stored is not shown here.
58 this.ctrlWord
= ctrlword
;
// Static table of every control word/symbol this filter recognizes, each
// tagged with a Type classification. Find() scans this table by ctrlWord.
61 // FIXME: Need to add "unicode", "styles",
62 // "header", "footer" etc.
63 static RTFControlWordType
[] types
=
// Empty control word, classified as Type.None.
// NOTE(review): presumably the fallback entry for unknown words - confirm
// against the return statements of Find(), which are not visible here.
65 new RTFControlWordType (Type
.None
, ""),
// "info" is the only entry classified as a meta-data block.
66 new RTFControlWordType (Type
.MetaDataBlock
, "info"),
// Entries classified as individual meta-data tags.
67 new RTFControlWordType (Type
.MetaDataTag
, "title"),
68 new RTFControlWordType (Type
.MetaDataTag
, "author"),
69 new RTFControlWordType (Type
.MetaDataTag
, "comment"),
70 new RTFControlWordType (Type
.MetaDataTag
, "operator"),
71 new RTFControlWordType (Type
.MetaDataTag
, "nofpages"),
72 new RTFControlWordType (Type
.MetaDataTag
, "nofwords"),
73 new RTFControlWordType (Type
.MetaDataTag
, "generator"),
74 new RTFControlWordType (Type
.MetaDataTag
, "company"),
// Entries classified as paragraph control words.
75 new RTFControlWordType (Type
.Paragraph
, "par"),
76 new RTFControlWordType (Type
.Paragraph
, "pard"),
// Entries classified as character properties.
77 new RTFControlWordType (Type
.CharProp
, "b"),
78 new RTFControlWordType (Type
.CharProp
, "i"),
79 new RTFControlWordType (Type
.CharProp
, "ul"),
80 new RTFControlWordType (Type
.CharProp
, "up"),
81 new RTFControlWordType (Type
.CharProp
, "dn"),
// Single-character symbols classified as Skip.
82 new RTFControlWordType (Type
.Skip
, "'"),
83 new RTFControlWordType (Type
.Skip
, "*"),
// Single-character symbols classified as escape sequences: {, } and backslash.
84 new RTFControlWordType (Type
.EscSeq
, "{"),
85 new RTFControlWordType (Type
.EscSeq
, "}"),
86 new RTFControlWordType (Type
.EscSeq
, "\\"),
// Looks up the table entry whose ctrlWord matches strCtrlWord.
// NOTE(review): the return statements (for both the match and the no-match
// case) are not visible in this view - confirm against the full file.
89 public static RTFControlWordType
Find (string strCtrlWord
)
// Walk the 'types' table from the first entry to the last.
91 for (int i
= 0; i
< types
.Length
; i
++) {
// A String.Compare result of 0 means the two strings compare as equal.
92 if (String
.Compare (types
[i
].ctrlWord
, strCtrlWord
) == 0)
98 namespace Beagle
.Filters
{
// RTF document filter, derived from Beagle.Daemon.Filter.
100 public class FilterRTF
: Beagle
.Daemon
.Filter
{
// Parser position within the document. Other code in this file also uses
// Position.InMetaData and Position.InBody, which are not visible in this view.
102 public enum Position
{
105 InMetaDataTagGenerator
,
// Result codes returned by the parsing methods. Other code in this file also
// uses ERROR_RTF_OK and ERROR_RTF_EOF, which are not visible in this view.
109 public enum ErrorCodes
{
112 ERROR_RTF_UNHANDLED_SYMBOL
// Reader over the RTF file; assigned in DoOpen and read by the parsing methods.
121 StreamReader SReaderRTF
;
// NOTE(review): the statements below appear to be the constructor body; the
// constructor header is not visible in this view.
125 // Make this a general rtf filter.
126 AddSupportedMimeType ("application/rtf");
// Stacks filled by HandleControlWord: MetaDataStack is popped in RTFParse,
// TextDataStack is popped in AddTextForIndexing.
128 MetaDataStack
= new Stack();
129 TextDataStack
= new Stack();
// Opens the file described by 'info' and prepares the reader used by the parser.
136 override protected void DoOpen (FileInfo info
)
// Open the existing file at info.FullName with read-only access.
138 FsRTF
= new FileStream (info
.FullName
, FileMode
.Open
, FileAccess
.Read
);
// Wrap the file stream in the StreamReader that the parsing methods read from.
140 SReaderRTF
= new StreamReader (FsRTF
);
// Classifies strCtrlWord via RTFControlWordType.Find and updates the parser
// state (pos, MetaDataStack, TextDataStack) according to its classification.
//   strCtrlWord - the control word or symbol, without the leading backslash
//   paramVal    - numeric parameter parsed by ProcessControlWords (-1 if none)
//   bMeta       - not used in the lines visible here
// The visible return path yields ErrorCodes.ERROR_RTF_OK.
// NOTE(review): the 'break' statements and some closing braces are not
// visible in this view - confirm case boundaries against the full file.
144 // Identifies the type of RTF control word and handles accordingly
145 private ErrorCodes
HandleControlWord (string strCtrlWord
, int paramVal
, bool bMeta
)
147 RTFControlWordType ctrlWrdType
= RTFControlWordType
.Find (strCtrlWord
);
149 switch (ctrlWrdType
.Types
) {
150 case RTFControlWordType
.Type
.MetaDataBlock
: /* process meta-data */
151 pos
= Position
.InMetaData
;
153 case RTFControlWordType
.Type
.MetaDataTag
:
// Inside the meta-data block: push the property name for this tag so that
// RTFParse can pop it when the group closes.
154 if (pos
== Position
.InMetaData
) {
155 if (String
.Compare (strCtrlWord
, "title") == 0)
156 MetaDataStack
.Push ("dc:title");
157 else if (String
.Compare (strCtrlWord
, "author") == 0)
158 MetaDataStack
.Push ("dc:author");
159 else if (String
.Compare (strCtrlWord
, "comment") == 0)
160 MetaDataStack
.Push ("fixme:comment");
161 else if (String
.Compare (strCtrlWord
, "operator") == 0)
162 MetaDataStack
.Push ("fixme:operator");
// Numeric tags: the value is pushed first and the name second, so the name
// is popped first and the value right after it.
163 else if (String
.Compare (strCtrlWord
, "nofpages") == 0) {
164 MetaDataStack
.Push (Convert
.ToString (paramVal
));
165 MetaDataStack
.Push ("fixme:page-count");
167 else if (String
.Compare (strCtrlWord
, "nofwords") == 0) {
168 MetaDataStack
.Push (Convert
.ToString (paramVal
));
169 MetaDataStack
.Push ("fixme:word-count");
171 else if (String
.Compare (strCtrlWord
, "company") == 0)
172 MetaDataStack
.Push ("fixme:company");
// NOTE(review): this appears to be the else-branch of the
// 'pos == Position.InMetaData' check above, i.e. "generator" seen outside
// the meta-data block - confirm brace pairing against the full file.
173 } else if (String
.Compare (strCtrlWord
, "generator") == 0) {
174 pos
= Position
.InMetaDataTagGenerator
;
175 MetaDataStack
.Push ("fixme:generator");
// A paragraph control word switches the parser into the document body.
179 case RTFControlWordType
.Type
.Paragraph
:
181 pos
= Position
.InBody
;
184 // FIXME: "Hot" styles are not *properly reset to normal*
185 // on some *weird* conditions.
186 case RTFControlWordType
.Type
.CharProp
:
// The body of this branch is not visible in this view.
187 if (pos
== Position
.InBody
) {
193 case RTFControlWordType
.Type
.EscSeq
:
// In the body, queue the escaped character followed by an "EscSeq" marker;
// AddTextForIndexing pops these two entries in reverse order.
194 if (pos
== Position
.InBody
) {
195 TextDataStack
.Push (strCtrlWord
);
196 TextDataStack
.Push ("EscSeq");
200 return ErrorCodes
.ERROR_RTF_OK
;
// Reads one control word (letters) and its optional numeric parameter
// (digits) from SReaderRTF, then hands both to HandleControlWord.
// RTFParse calls this after it has consumed a backslash.
//   bMeta - passed through unchanged to HandleControlWord
// Returns ERROR_RTF_EOF or ERROR_RTF_UNHANDLED_SYMBOL on the early-exit
// paths, otherwise whatever HandleControlWord returns.
// NOTE(review): the assignments to 'ch', the condition guarding the EOF
// return and the loop exits are not visible in this view - confirm
// against the full file.
203 // FIXME: Probably need a little cleanup ;-)
205 private ErrorCodes
ProcessControlWords (bool bMeta
)
// paramVal stays -1 when the control word carries no numeric parameter.
209 int paramVal
= -1, i
;
210 StringBuilder strCtrlWord
= new StringBuilder ();
211 StringBuilder strParameter
= new StringBuilder ();
// Consume the first character after the backslash.
213 aByte
= SReaderRTF
.Read ();
215 return ErrorCodes
.ERROR_RTF_EOF
;
// Look the single character up on its own to see whether it is a known symbol.
218 RTFControlWordType ctrlWrdType
= RTFControlWordType
.Find (new String (ch
, 1));
// A non-letter that is neither a Skip nor an EscSeq symbol is not supported.
220 if (!Char
.IsLetter (ch
) &&
221 ctrlWrdType
.Types
!= RTFControlWordType
.Type
.Skip
&&
222 ctrlWrdType
.Types
!= RTFControlWordType
.Type
.EscSeq
) {
223 Console
.WriteLine ("Unhandled symbol: {0}, {1}", ch
, ctrlWrdType
.Types
);
224 return ErrorCodes
.ERROR_RTF_UNHANDLED_SYMBOL
;
// Accumulate the control word: Peek at the next character and only
// consume it with Read when it is a letter.
226 while (aByte
!= -1) {
227 strCtrlWord
.Append (ch
);
228 aByte
= SReaderRTF
.Peek ();
230 if (Char
.IsLetter (ch
)) {
231 aByte
= SReaderRTF
.Read ();
// Accumulate the optional numeric parameter with the same Peek/Read scheme.
237 aByte
= SReaderRTF
.Peek ();
239 if (Char
.IsDigit (ch
)) {
240 aByte
= SReaderRTF
.Read ();
242 while (aByte
!= -1) {
243 strParameter
.Append (ch
);
244 aByte
= SReaderRTF
.Peek ();
246 if (Char
.IsDigit (ch
)) {
247 aByte
= SReaderRTF
.Read ();
// Convert the collected digits, if any, to the integer parameter value.
253 if (strParameter
.Length
> 0)
254 paramVal
= Convert
.ToInt32 (strParameter
.ToString());
256 //Console.WriteLine ("{0}\t{1}", strCtrlWord, strParameter);
257 return (HandleControlWord (strCtrlWord
.ToString(), paramVal
, bMeta
));
// Main parse loop: reads SReaderRTF one character at a time and dispatches
// on backslash (control words), '{' and '}' (groups) and line breaks.
//   bMeta - true when called from DoPullProperties, false when called from DoPull
// The visible return paths yield ErrorCodes.ERROR_RTF_OK.
// NOTE(review): the switch header, the default case, the updates to
// groupCount and several closing braces are not visible in this view -
// confirm against the full file.
260 private ErrorCodes
RTFParse (bool bMeta
)
264 StringBuilder str
= new StringBuilder ();
265 string strTemp
= null;
268 // If we are not extracting meta-data, set the
269 // file pointer to the saved position
271 SReaderRTF
.BaseStream
.Seek (offset
, SeekOrigin
.Begin
);
// Read returns -1 once the end of the stream is reached.
273 while ((aByte
= SReaderRTF
.Read ()) != -1) {
276 case '\\': /* process keywords */
277 ec
= ProcessControlWords (bMeta
);
// The action taken on a non-OK result is not visible in this view.
278 if (ec
!= ErrorCodes
.ERROR_RTF_OK
)
// In the body, hand the text collected so far to AddTextForIndexing.
280 if (pos
== Position
.InBody
) {
281 AddTextForIndexing (str
);
282 //AppendText (str.ToString());
283 //AppendWhiteSpace ();
// Empty the buffer.
285 str
.Remove (0, str
.Length
);
287 case '{': /* process groups */
// Only the AddTextForIndexing call is conditional; the buffer is emptied
// whether or not we are in the body.
288 if (pos
== Position
.InBody
)
289 AddTextForIndexing (str
);
290 str
.Remove (0, str
.Length
);
293 case '}': /* process groups */
295 if (pos
== Position
.InMetaData
||
296 pos
== Position
.InMetaDataTagGenerator
) {
297 // groupCount will at least be 1 for
298 // the outermost "{" block
299 if (pos
== Position
.InMetaData
&& groupCount
== 1) {
// Save the current stream position in 'offset' (the value passed to Seek
// near the top of this method) and return.
301 offset
= SReaderRTF
.BaseStream
.Position
;
302 return ErrorCodes
.ERROR_RTF_OK
;
// Pop the property name that HandleControlWord pushed for this tag.
306 if (MetaDataStack
.Count
> 0) {
307 strTemp
= (string) MetaDataStack
.Pop ();
// Count properties have their numeric value stacked beneath the name;
// pop it and append it to the buffer.
308 if ((String
.Compare (strTemp
, "fixme:word-count") == 0) ||
309 (String
.Compare (strTemp
, "fixme:page-count") == 0)) {
310 str
.Append ((string) MetaDataStack
.Pop ());
// The remaining arguments of both AddProperty calls are not visible here.
311 AddProperty (Beagle
.Property
.NewKeyword (strTemp
,
315 AddProperty (Beagle
.Property
.New (strTemp
,
// A group closing inside the body hands the collected text to AddTextForIndexing.
320 } else if (pos
== Position
.InBody
) {
323 AddTextForIndexing (str
);
329 case '\r': /* ignore \r */
330 case '\n': /* ignore \n */
337 return ErrorCodes
.ERROR_RTF_OK
;
// Appends the entries queued on TextDataStack to 'str', then passes the
// resulting text to AppendText and empties the buffer.
//   str - text buffer owned by the caller; it is empty on return if it
//         held any text
// NOTE(review): the statement between the two Pop calls is not visible
// in this view - confirm against the full file.
340 private void AddTextForIndexing (StringBuilder str
)
346 while (TextDataStack
.Count
> 0) {
// HandleControlWord pushes the escaped character and then an "EscSeq"
// marker, so the first pop yields the marker and the second the character.
347 strTemp
= (string) TextDataStack
.Pop ();
350 strTemp
= (string) TextDataStack
.Pop ();
351 str
.Append (strTemp
);
// Nothing is passed to AppendText when the buffer is empty.
355 if (str
.Length
> 0) {
356 AppendText (str
.ToString());
357 str
.Remove (0, str
.Length
);
// Runs the parser with bMeta = false and logs any result other than
// ERROR_RTF_OK through Logger.Log.Error.
361 override protected void DoPull ()
364 ec
= ErrorCodes
.ERROR_RTF_OK
;
366 ec
= RTFParse (false);
367 if (ec
!= ErrorCodes
.ERROR_RTF_OK
)
368 Logger
.Log
.Error ("{0}", ec
);
// Runs the parser with bMeta = true and logs any result other than
// ERROR_RTF_OK through Logger.Log.Error.
372 override protected void DoPullProperties ()
375 ec
= ErrorCodes
.ERROR_RTF_OK
;
376 ec
= RTFParse (true);
377 if (ec
!= ErrorCodes
.ERROR_RTF_OK
)
378 Logger
.Log
.Error ("{0}", ec
);