The RTF filter complies with the MS RTF 1.5 specification (and works well with 1.8 too).
[beagle.git] / Filters / FilterRTF.cs
blobf88d089ec0e2517a42374e6923a04f254c8cc6d8
1 //
2 // Beagle
3 //
4 // FilterRTF.cs : Trivial implementation of a RTF-document filter.
5 //
6 // Copyright (C) 2004 Novell, Inc.
7 //
8 //
9 // Permission is hereby granted, free of charge, to any person obtaining a
10 // copy of this software and associated documentation files (the "Software"),
11 // to deal in the Software without restriction, including without limitation
12 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 // and/or sell copies of the Software, and to permit persons to whom the
14 // Software is furnished to do so, subject to the following conditions:
16 // The above copyright notice and this permission notice shall be included in
17 // all copies or substantial portions of the Software.
19 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 // DEALINGS IN THE SOFTWARE.
28 // Currently, the filtering is based on only a few *control words*. If anyone
29 // has any samples that break this "assumption", kindly send a copy of them,
30 // if you can, to <vvaradhan@novell.com>
32 // FIXME: Require more complex samples to test the parsing, mostly generated
33 // using Microsoft Word or wordpad. :)
35 using System;
36 using System.Collections;
37 using System.IO;
38 using System.Text;
39 using Beagle.Util;
/// <summary>
/// One entry of the table that tells the RTF filter how to treat a control
/// word (e.g. "par") or a control symbol (e.g. "{").
/// </summary>
internal class RTFControlWordType {

    /// <summary>Category the filter assigns to a control word.</summary>
    public enum Type {
        None,
        Skip,
        MetaDataBlock,
        MetaDataTag,
        Paragraph,
        EscSeq,
        CharProp
    }

    /// <summary>Category of this entry.</summary>
    public Type Types;

    /// <summary>Control word text, without the leading backslash.</summary>
    public string ctrlWord;

    RTFControlWordType (Type types, string ctrlword)
    {
        this.Types = types;
        this.ctrlWord = ctrlword;
    }

    // FIXME: Need to add "unicode", "styles",
    // "header", "footer" etc.
    //
    // The first entry (Type.None, "") doubles as the "not found" result
    // returned by Find ().
    static readonly RTFControlWordType[] types =
    {
        new RTFControlWordType (Type.None, ""),
        new RTFControlWordType (Type.MetaDataBlock, "info"),
        new RTFControlWordType (Type.MetaDataTag, "title"),
        new RTFControlWordType (Type.MetaDataTag, "author"),
        new RTFControlWordType (Type.MetaDataTag, "comment"),
        new RTFControlWordType (Type.MetaDataTag, "operator"),
        new RTFControlWordType (Type.MetaDataTag, "nofpages"),
        new RTFControlWordType (Type.MetaDataTag, "nofwords"),
        new RTFControlWordType (Type.MetaDataTag, "generator"),
        new RTFControlWordType (Type.MetaDataTag, "company"),
        new RTFControlWordType (Type.Paragraph, "par"),
        new RTFControlWordType (Type.Paragraph, "pard"),
        new RTFControlWordType (Type.CharProp, "b"),
        new RTFControlWordType (Type.CharProp, "i"),
        new RTFControlWordType (Type.CharProp, "ul"),
        new RTFControlWordType (Type.CharProp, "up"),
        new RTFControlWordType (Type.CharProp, "dn"),
        new RTFControlWordType (Type.Skip, "'"),
        new RTFControlWordType (Type.Skip, "*"),
        new RTFControlWordType (Type.EscSeq, "{"),
        new RTFControlWordType (Type.EscSeq, "}"),
        new RTFControlWordType (Type.EscSeq, "\\"),
    };

    /// <summary>
    /// Looks up the table entry for a control word.
    /// </summary>
    /// <param name="strCtrlWord">Control word without the leading backslash; may be null.</param>
    /// <returns>
    /// The matching entry, or the first table entry (Type.None) when the
    /// word is not in the table.
    /// </returns>
    public static RTFControlWordType Find (string strCtrlWord)
    {
        // RTF keywords are protocol tokens, not linguistic text: compare
        // ordinally so that culture rules (ignorable characters, locale
        // specific collation) can never make two different words match.
        foreach (RTFControlWordType entry in types) {
            if (String.CompareOrdinal (entry.ctrlWord, strCtrlWord) == 0)
                return entry;
        }
        return types[0];
    }
}
98 namespace Beagle.Filters {
100 public class FilterRTF : Beagle.Daemon.Filter {
/// <summary>Part of the RTF document the parser is currently inside.</summary>
public enum Position {
    /// <summary>Initial state; no known section entered yet.</summary>
    None = 0,
    /// <summary>Set when the "info" control word is handled.</summary>
    InMetaData = 1,
    /// <summary>Set when "generator" is handled outside the info block.</summary>
    InMetaDataTagGenerator = 2,
    /// <summary>Set when a paragraph control word is handled during the body pass.</summary>
    InBody = 3
}
/// <summary>Result codes returned by the RTF parsing routines.</summary>
public enum ErrorCodes {
    /// <summary>Parsing step completed normally.</summary>
    ERROR_RTF_OK = 0,
    /// <summary>End of input was hit right after a backslash.</summary>
    ERROR_RTF_EOF = 1,
    /// <summary>A backslash was followed by a symbol the filter does not know.</summary>
    ERROR_RTF_UNHANDLED_SYMBOL = 2
}
// Section of the document the parser is currently in.
private Position pos;

// Pending metadata keys (and, for the page/word counts, their values)
// pushed by HandleControlWord and popped when a group closes.
private Stack MetaDataStack;

// Pending escaped characters, stored as (character, "EscSeq") pairs.
private Stack TextDataStack;

// Current nesting depth of "{" ... "}" groups.
private int groupCount;

// Stream position saved by the metadata pass; the body pass seeks to it.
private long offset;

// Underlying file and the reader wrapped around it; both are created
// in DoOpen and stay null until then.
private FileStream FsRTF;
private StreamReader SReaderRTF;

/// <summary>
/// Registers the filter for "application/rtf" and sets up empty parser state.
/// </summary>
public FilterRTF ()
{
    // Make this a general rtf filter.
    AddSupportedMimeType ("application/rtf");

    MetaDataStack = new Stack ();
    TextDataStack = new Stack ();
    pos = Position.None;

    // groupCount, offset, FsRTF and SReaderRTF keep their default
    // values (0, 0, null, null) until DoOpen / RTFParse touch them.
}
/// <summary>
/// Opens the RTF file read-only and wraps it in a StreamReader that the
/// parsing methods read from.
/// </summary>
/// <param name="info">File to be filtered.</param>
override protected void DoOpen (FileInfo info)
{
    // A constructor call never yields null, so the reader can be
    // created unconditionally.
    FsRTF = info.OpenRead ();
    SReaderRTF = new StreamReader (FsRTF);
}
/// <summary>
/// Identifies the type of an RTF control word and updates the parser
/// state (position, metadata stack, escaped-text stack) accordingly.
/// </summary>
/// <param name="strCtrlWord">Control word or symbol, without the leading backslash.</param>
/// <param name="paramVal">Numeric parameter of the control word; negative when none was given.</param>
/// <param name="bMeta">True during the metadata pass; paragraph words then do not switch to the body.</param>
/// <returns>Always ErrorCodes.ERROR_RTF_OK.</returns>
private ErrorCodes HandleControlWord (string strCtrlWord, int paramVal, bool bMeta)
{
    RTFControlWordType ctrlWrdType = RTFControlWordType.Find (strCtrlWord);

    switch (ctrlWrdType.Types) {
    case RTFControlWordType.Type.MetaDataBlock: /* process meta-data */
        pos = Position.InMetaData;
        break;

    case RTFControlWordType.Type.MetaDataTag:
        if (pos == Position.InMetaData) {
            // Keywords are matched ordinally (switch on string); they
            // are protocol tokens, not culture-dependent text.
            switch (strCtrlWord) {
            case "title":
                MetaDataStack.Push ("dc:title");
                break;
            case "author":
                MetaDataStack.Push ("dc:author");
                break;
            case "comment":
                MetaDataStack.Push ("fixme:comment");
                break;
            case "operator":
                MetaDataStack.Push ("fixme:operator");
                break;
            case "nofpages":
                // The value goes first so that it sits below its key.
                MetaDataStack.Push (Convert.ToString (paramVal));
                MetaDataStack.Push ("fixme:page-count");
                break;
            case "nofwords":
                MetaDataStack.Push (Convert.ToString (paramVal));
                MetaDataStack.Push ("fixme:word-count");
                break;
            case "company":
                MetaDataStack.Push ("fixme:company");
                break;
            }
        } else if (strCtrlWord == "generator") {
            // "generator" is only honoured outside the info block.
            pos = Position.InMetaDataTagGenerator;
            MetaDataStack.Push ("fixme:generator");
        }
        break;

    case RTFControlWordType.Type.Paragraph:
        if (!bMeta)
            pos = Position.InBody;
        break;

    // FIXME: "Hot" styles are not *properly reset to normal*
    // on some *weird* conditions.
    case RTFControlWordType.Type.CharProp:
        // A negative paramVal means the word carried no parameter
        // (e.g. "\b" rather than "\b0"); only then is HotUp () called.
        if (pos == Position.InBody && paramVal < 0)
            HotUp ();
        break;

    case RTFControlWordType.Type.EscSeq:
        if (pos == Position.InBody) {
            // Stored as a (character, marker) pair; consumed by
            // AddTextForIndexing.
            TextDataStack.Push (strCtrlWord);
            TextDataStack.Push ("EscSeq");
        }
        break;
    }

    return ErrorCodes.ERROR_RTF_OK;
}
/// <summary>
/// Reads one control word or control symbol from the reader (the leading
/// backslash has already been consumed by the caller) and hands it to
/// HandleControlWord.
/// </summary>
/// <remarks>
/// Follows the RTF syntax rules: a control symbol is a single
/// non-alphabetic character; "\'" is followed by two hex digits; a control
/// word is a run of letters optionally followed by a (possibly negative)
/// decimal parameter. The optional delimiter space after a control word
/// is intentionally left in the stream, as before.
/// </remarks>
/// <param name="bMeta">Passed through to HandleControlWord.</param>
/// <returns>
/// ERROR_RTF_EOF when the input ends right after the backslash,
/// ERROR_RTF_UNHANDLED_SYMBOL for an unknown control symbol, otherwise
/// the result of HandleControlWord.
/// </returns>
private ErrorCodes ProcessControlWords (bool bMeta)
{
    int aByte = SReaderRTF.Read ();
    if (aByte == -1)
        return ErrorCodes.ERROR_RTF_EOF;

    char ch = (char) aByte;

    if (!Char.IsLetter (ch)) {
        RTFControlWordType symbolType = RTFControlWordType.Find (new String (ch, 1));
        if (symbolType.Types != RTFControlWordType.Type.Skip &&
            symbolType.Types != RTFControlWordType.Type.EscSeq) {
            Console.WriteLine ("Unhandled symbol: {0}, {1}", ch, symbolType.Types);
            return ErrorCodes.ERROR_RTF_UNHANDLED_SYMBOL;
        }

        // "\'hh": swallow the two hex digits so they neither end up in
        // the indexed text nor get glued onto the symbol.
        if (ch == '\'') {
            for (int n = 0; n < 2 && IsHexDigit (SReaderRTF.Peek ()); n++)
                SReaderRTF.Read ();
        }

        // A control symbol is exactly one character; whatever follows
        // it is ordinary document content.
        return HandleControlWord (new String (ch, 1), -1, bMeta);
    }

    StringBuilder strCtrlWord = new StringBuilder ();
    strCtrlWord.Append (ch);
    while (true) {
        aByte = SReaderRTF.Peek ();
        if (aByte == -1 || !Char.IsLetter ((char) aByte))
            break;
        strCtrlWord.Append ((char) SReaderRTF.Read ());
    }

    int paramVal = ReadNumericParameter ();

    //Console.WriteLine ("{0}\t{1}", strCtrlWord, paramVal);
    return HandleControlWord (strCtrlWord.ToString (), paramVal, bMeta);
}

// Reads the optional numeric parameter that may directly follow a control
// word. Returns -1 when there is none; values that do not fit an int are
// clamped instead of throwing.
private int ReadNumericParameter ()
{
    bool negative = false;

    // A hyphen right after the control word announces a negative
    // parameter (e.g. "\li-720").
    if (SReaderRTF.Peek () == '-') {
        SReaderRTF.Read ();
        negative = true;
    }

    long value = 0;
    bool sawDigit = false;
    int next;
    // Only ASCII digits are valid here.
    while ((next = SReaderRTF.Peek ()) >= '0' && next <= '9') {
        SReaderRTF.Read ();
        sawDigit = true;
        if (value <= Int32.MaxValue)
            value = value * 10 + (next - '0');
    }

    if (!sawDigit)
        return -1;
    if (value > Int32.MaxValue)
        value = Int32.MaxValue;

    return (int) (negative ? -value : value);
}

// True when c (a value returned by Peek/Read, possibly -1) is 0-9, a-f or A-F.
private static bool IsHexDigit (int c)
{
    return (c >= '0' && c <= '9') ||
           (c >= 'a' && c <= 'f') ||
           (c >= 'A' && c <= 'F');
}
/// <summary>
/// Main parsing loop: reads the document character by character, tracks
/// group nesting, dispatches control words and emits properties / text.
/// </summary>
/// <param name="bMeta">
/// True for the metadata pass, which stops once the info group closes and
/// remembers the stream position; false for the body pass, which first
/// seeks to that remembered position.
/// </param>
/// <returns>
/// ERROR_RTF_OK on normal completion, otherwise the error reported by
/// ProcessControlWords.
/// </returns>
private ErrorCodes RTFParse (bool bMeta)
{
    StringBuilder pending = new StringBuilder ();

    // If we are not extracting meta-data, set the
    // file pointer to the saved position.
    // NOTE(review): the position saved below is that of the base stream,
    // not of the reader on top of it - confirm this is intended.
    if (!bMeta)
        SReaderRTF.BaseStream.Seek (offset, SeekOrigin.Begin);

    int next;
    while ((next = SReaderRTF.Read ()) != -1) {
        char ch = (char) next;

        /* line breaks carry no meaning */
        if (ch == '\r' || ch == '\n')
            continue;

        /* process keywords */
        if (ch == '\\') {
            ErrorCodes status = ProcessControlWords (bMeta);
            if (status != ErrorCodes.ERROR_RTF_OK)
                return status;
            if (pos == Position.InBody)
                AddTextForIndexing (pending);
            pending.Length = 0;
            continue;
        }

        /* a group opens */
        if (ch == '{') {
            if (pos == Position.InBody)
                AddTextForIndexing (pending);
            pending.Length = 0;
            groupCount++;
            continue;
        }

        /* plain text */
        if (ch != '}') {
            pending.Append (ch);
            continue;
        }

        /* a group closes */
        groupCount--;

        if (pos == Position.InBody) {
            if (pending.Length > 0)
                pending.Append (' ');
            AddTextForIndexing (pending);
            if (IsHot)
                HotDown ();
            continue;
        }

        if (pos != Position.InMetaData &&
            pos != Position.InMetaDataTagGenerator)
            continue;

        // groupCount will at least be 1 for the outermost "{" block,
        // so reaching 1 inside the meta-data means the info group ended.
        if (pos == Position.InMetaData && groupCount == 1) {
            if (bMeta) {
                offset = SReaderRTF.BaseStream.Position;
                return ErrorCodes.ERROR_RTF_OK;
            }
            continue;
        }

        if (MetaDataStack.Count == 0)
            continue;

        string key = (string) MetaDataStack.Pop ();
        bool isCount = String.Compare (key, "fixme:word-count") == 0 ||
                       String.Compare (key, "fixme:page-count") == 0;

        if (isCount) {
            // The numeric value was pushed just below its key.
            pending.Append ((string) MetaDataStack.Pop ());
            AddProperty (Beagle.Property.NewKeyword (key, pending.ToString ()));
        } else {
            AddProperty (Beagle.Property.New (key, pending.ToString ()));
        }
    }

    return ErrorCodes.ERROR_RTF_OK;
}
/// <summary>
/// Appends any pending escaped characters from TextDataStack to
/// <paramref name="str"/>, passes the result to AppendText and then
/// empties the builder.
/// </summary>
/// <param name="str">Text collected so far; cleared after it has been emitted.</param>
private void AddTextForIndexing (StringBuilder str)
{
    while (TextDataStack.Count > 0) {
        string marker = (string) TextDataStack.Pop ();

        // Entries are stored as (character, "EscSeq") pairs; guard the
        // second Pop so a dangling marker cannot throw.
        if (marker == "EscSeq" && TextDataStack.Count > 0)
            str.Append ((string) TextDataStack.Pop ());
    }

    if (str.Length > 0) {
        AppendText (str.ToString ());
        str.Length = 0;
    }
}
/// <summary>
/// Body pass: resets the parser position, parses the document text,
/// logs any parse error and then calls Finished ().
/// </summary>
override protected void DoPull ()
{
    pos = Position.None;

    ErrorCodes status = RTFParse (false);
    if (status != ErrorCodes.ERROR_RTF_OK)
        Logger.Log.Error ("{0}", status);

    Finished ();
}
/// <summary>
/// Metadata pass: parses the document in metadata mode and logs any
/// parse error.
/// </summary>
override protected void DoPullProperties ()
{
    ErrorCodes status = RTFParse (true);
    if (status != ErrorCodes.ERROR_RTF_OK)
        Logger.Log.Error ("{0}", status);
}