Compute lucene-style scores for our hits.
[beagle.git] / Filters / FilterRTF.cs
blob7a0345817bfbd07b0524d258ee285e58724176fe
2 //
3 // Beagle
4 //
5 // FilterRTF.cs : Trivial implementation of a RTF-document filter.
6 //
7 // Copyright (C) 2004 Novell, Inc.
8 //
9 //
10 // Permission is hereby granted, free of charge, to any person obtaining a
11 // copy of this software and associated documentation files (the "Software"),
12 // to deal in the Software without restriction, including without limitation
13 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
14 // and/or sell copies of the Software, and to permit persons to whom the
15 // Software is furnished to do so, subject to the following conditions:
17 // The above copyright notice and this permission notice shall be included in
18 // all copies or substantial portions of the Software.
20 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
25 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
26 // DEALINGS IN THE SOFTWARE.
29 // Currently, the filtering is based on only a few *control words*. If anyone
30 // has any samples that can break this "assumption", kindly post a copy of them,
31 // if you can, to <vvaradhan@novell.com>.
33 // FIXME: Require more complex samples to test the parsing, mostly generated
34 // using Microsoft Word or wordpad. :)
36 using System;
37 using System.Collections;
38 using System.IO;
39 using System.Text;
41 using Beagle.Util;
42 using Beagle.Daemon;
// One entry of the control-word lookup table used by the RTF filter: the
// text of a control word (or control symbol) and the category the filter
// assigns to it.
internal class RTFControlWordType {

	// Category of a control word. The comments list the table entries that
	// currently carry each category.
	public enum Type {
		None,		// the empty entry; also returned for unknown words
		Skip,		// "'" and "*"
		MetaDataBlock,	// "info"
		MetaDataTag,	// "title", "author", "comment", "operator", ...
		Paragraph,	// "pard"
		ParaEnd,	// "par"
		SplSection,	// "header", "footer", "headerl", "footerl", "footnote"
		EscSeq,		// "{", "}" and "\\"
		CharProp	// "b", "i", "ul", "up", "dn"
	}

	// Category of this entry.
	public Type Types;
	// Control word text, without the leading backslash.
	public string ctrlWord;

	RTFControlWordType (Type types, string ctrlword)
	{
		this.Types = types;
		this.ctrlWord = ctrlword;
	}

	// FIXME: Need to add "unicode", "styles" etc.
	// types[0] doubles as the "not found" result of Find ().
	static readonly RTFControlWordType[] types =
	{
		new RTFControlWordType (Type.None, ""),
		new RTFControlWordType (Type.MetaDataBlock, "info"),
		new RTFControlWordType (Type.MetaDataTag, "title"),
		new RTFControlWordType (Type.MetaDataTag, "author"),
		new RTFControlWordType (Type.MetaDataTag, "comment"),
		new RTFControlWordType (Type.MetaDataTag, "operator"),
		new RTFControlWordType (Type.MetaDataTag, "nofpages"),
		new RTFControlWordType (Type.MetaDataTag, "nofwords"),
		new RTFControlWordType (Type.MetaDataTag, "generator"),
		new RTFControlWordType (Type.MetaDataTag, "company"),
		new RTFControlWordType (Type.ParaEnd, "par"),
		new RTFControlWordType (Type.Paragraph, "pard"),
		new RTFControlWordType (Type.SplSection, "header"),
		new RTFControlWordType (Type.SplSection, "footer"),
		new RTFControlWordType (Type.SplSection, "headerl"),
		new RTFControlWordType (Type.SplSection, "footerl"),
		new RTFControlWordType (Type.SplSection, "footnote"),
		new RTFControlWordType (Type.CharProp, "b"),
		new RTFControlWordType (Type.CharProp, "i"),
		new RTFControlWordType (Type.CharProp, "ul"),
		new RTFControlWordType (Type.CharProp, "up"),
		new RTFControlWordType (Type.CharProp, "dn"),
		new RTFControlWordType (Type.Skip, "'"),
		new RTFControlWordType (Type.Skip, "*"),
		new RTFControlWordType (Type.EscSeq, "{"),
		new RTFControlWordType (Type.EscSeq, "}"),
		new RTFControlWordType (Type.EscSeq, "\\"),
	};

	// Returns the table entry whose text equals strCtrlWord, or the
	// Type.None entry when there is no such entry (including null input).
	public static RTFControlWordType Find (string strCtrlWord)
	{
		for (int i = 0; i < types.Length; i++) {
			// Control words are fixed keywords, not linguistic text:
			// compare ordinally so that characters a culture-sensitive
			// comparison ignores cannot produce a false match.
			if (String.CompareOrdinal (types[i].ctrlWord, strCtrlWord) == 0)
				return types[i];
		}

		return types[0];
	}
}
109 namespace Beagle.Filters {
111 public class FilterRTF : Beagle.Daemon.Filter {
// Where the parser currently is inside the RTF document.
public enum Position {
	// Initial state; DoPull also resets to this before parsing.
	None,
	// Entered when the "info" control word is handled.
	InMetaData,
	// Entered when "generator" is handled while not in InMetaData.
	InMetaDataTagGenerator,
	// Entered when "par" is handled outside the metadata pass.
	InBody,
	// Entered when "pard" is handled outside the metadata pass.
	InPara
}
// Result codes returned by the parsing methods.
public enum ErrorCodes {
	// Parsing succeeded.
	ERROR_RTF_OK,
	// The stream ended where a control word was expected.
	ERROR_RTF_EOF,
	// The character after a backslash was neither a letter nor a
	// known control symbol.
	ERROR_RTF_UNHANDLED_SYMBOL
}
// Current parser position within the document.
Position pos;
// Group depth: incremented on '{' and decremented on '}'.
int groupCount;
// Non-zero while a skipped group is active; set to groupCount - 1
// when a Skip control symbol is handled.
int skipCount;
// Non-zero while a "hot" style is active; set to groupCount - 1 when a
// special section or character property starts it.
int hotStyleCount;
// True when the pending partial word in partText was read under a hot style.
bool bPartHotStyle;
// Underlying file stream and the reader layered on top of it.
FileStream FsRTF;
StreamReader SReaderRTF;
// Trailing fragment of the last text chunk, held back until the rest of
// the word arrives.
string partText;

// Pending metadata property keys (and, for counts, their values).
Stack MetaDataStack;
// Pending escape sequences, pushed as a (value, "EscSeq") pair.
Stack TextDataStack;
// Creates the filter, registers it for "application/rtf" and puts the
// parser into its initial state.
public FilterRTF ()
{
	// Parser state starts out empty.
	pos = Position.None;
	groupCount = skipCount = hotStyleCount = 0;
	bPartHotStyle = false;
	partText = "";

	// Nothing is open until DoOpen () runs.
	FsRTF = null;
	SReaderRTF = null;

	MetaDataStack = new Stack ();
	TextDataStack = new Stack ();

	// Make this a general rtf filter.
	AddSupportedFlavor (FilterFlavor.NewFromMimeType ("application/rtf"));

	SnippetMode = true;
}
// Opens the file for reading and wraps it in a StreamReader.
// On failure the cause is logged, any partially opened stream is released,
// the reader is left null and the filter is marked as finished.
override protected void DoOpen (FileInfo info)
{
	try {
		// "new" never yields null, so no null check is needed here;
		// every failure surfaces as an exception.
		FsRTF = new FileStream (info.FullName, FileMode.Open,
					FileAccess.Read);
		SReaderRTF = new StreamReader (FsRTF);
	} catch (Exception e) {
		Logger.Log.Error ("Unable to open {0}: {1}", info.FullName, e.Message);

		// Do not leave a half-opened stream behind.
		if (FsRTF != null) {
			FsRTF.Close ();
			FsRTF = null;
		}
		SReaderRTF = null;

		Finished ();
	}
}
// Identifies the type of an RTF control word and updates the parser state
// accordingly.
//   strCtrlWord : control word or symbol text, without the backslash.
//   paramVal    : numeric parameter; ProcessControlWords passes -1 when the
//                 word carried none.
//   bMeta       : true during the metadata pass, in which paragraph
//                 transitions are ignored.
// Always returns ERROR_RTF_OK.
private ErrorCodes HandleControlWord (string strCtrlWord, int paramVal, bool bMeta)
{
	RTFControlWordType entry = RTFControlWordType.Find (strCtrlWord);

	// Find () has already matched strCtrlWord against the table, so the
	// entry's own spelling identifies the word from here on.
	string word = entry.ctrlWord;

	switch (entry.Types) {
	case RTFControlWordType.Type.MetaDataBlock:
		// Start of the metadata block.
		pos = Position.InMetaData;
		break;

	case RTFControlWordType.Type.MetaDataTag:
		if (pos != Position.InMetaData) {
			// Outside the metadata block only "generator" is honoured.
			if (word == "generator") {
				pos = Position.InMetaDataTagGenerator;
				MetaDataStack.Push ("fixme:generator");
			}
			break;
		}

		// Inside the metadata block: remember which property the
		// enclosing group describes. Counts also push their value,
		// underneath the key.
		switch (word) {
		case "title":
			MetaDataStack.Push ("dc:title");
			break;
		case "author":
			MetaDataStack.Push ("dc:author");
			break;
		case "comment":
			MetaDataStack.Push ("fixme:comment");
			break;
		case "operator":
			MetaDataStack.Push ("fixme:operator");
			break;
		case "nofpages":
			MetaDataStack.Push (Convert.ToString (paramVal));
			MetaDataStack.Push ("fixme:page-count");
			break;
		case "nofwords":
			MetaDataStack.Push (Convert.ToString (paramVal));
			MetaDataStack.Push ("fixme:word-count");
			break;
		case "company":
			MetaDataStack.Push ("fixme:company");
			break;
		}
		break;

	case RTFControlWordType.Type.Paragraph:
		if (!bMeta)
			pos = Position.InPara;
		break;

	case RTFControlWordType.Type.ParaEnd:
		if (!bMeta)
			pos = Position.InBody;
		break;

	// FIXME: "Hot" styles are not properly reset to normal under some
	// weird conditions. To avoid that, a stack of groupCounts would
	// have to be maintained for setting/resetting hot styles.
	case RTFControlWordType.Type.SplSection:
		hotStyleCount = groupCount - 1;
		break;

	case RTFControlWordType.Type.CharProp:
		// paramVal < 0 covers an explicit negative parameter as well
		// as the "no parameter" value of -1.
		if (pos == Position.InPara && paramVal < 0)
			hotStyleCount = groupCount - 1;
		break;

	case RTFControlWordType.Type.EscSeq:
		// Queue the escaped character; AddTextForIndexing () pops the
		// "EscSeq" marker first and then the character itself.
		if (pos == Position.InPara) {
			TextDataStack.Push (strCtrlWord);
			TextDataStack.Push ("EscSeq");
		}
		break;

	case RTFControlWordType.Type.Skip:
		skipCount = groupCount - 1;
		break;
	}

	return ErrorCodes.ERROR_RTF_OK;
}
// Reads one control word from the stream and hands it to HandleControlWord.
// The caller has already consumed the introducing backslash.
//
// The word is the first character plus any letters that follow it. It may
// be followed by an optional '-' sign and a run of ASCII digits, which form
// the numeric parameter (-1 when there are no digits).
//
// Returns ERROR_RTF_EOF when the stream ends right after the backslash,
// ERROR_RTF_UNHANDLED_SYMBOL when the first character is neither a letter
// nor a known Skip/EscSeq symbol, otherwise the result of HandleControlWord.
private ErrorCodes ProcessControlWords (bool bMeta)
{
	int aByte = SReaderRTF.Read ();
	if (aByte == -1)
		return ErrorCodes.ERROR_RTF_EOF;

	char ch = (char) aByte;
	RTFControlWordType ctrlWrdType = RTFControlWordType.Find (new String (ch, 1));

	if (!Char.IsLetter (ch) &&
	    ctrlWrdType.Types != RTFControlWordType.Type.Skip &&
	    ctrlWrdType.Types != RTFControlWordType.Type.EscSeq) {
		Logger.Log.Error ("Unhandled symbol: {0}, {1}", ch, ctrlWrdType.Types);
		return ErrorCodes.ERROR_RTF_UNHANDLED_SYMBOL;
	}

	// Collect the first character and every letter directly after it.
	// Peek () is used so that the terminating character stays in the
	// stream for the caller.
	StringBuilder strCtrlWord = new StringBuilder ();
	strCtrlWord.Append (ch);
	aByte = SReaderRTF.Peek ();
	while (aByte != -1 && Char.IsLetter ((char) aByte)) {
		strCtrlWord.Append ((char) SReaderRTF.Read ());
		aByte = SReaderRTF.Peek ();
	}

	// Optional sign of the parameter.
	bool negParamVal = false;
	if (aByte != -1 && (char) aByte == '-') {
		negParamVal = true;
		SReaderRTF.Read (); // move the fp
		aByte = SReaderRTF.Peek ();
	}

	// Optional numeric parameter. Only ASCII digits are accepted:
	// Char.IsDigit () is also true for other Unicode digits, which the
	// integer parser rejects with a FormatException. The value is
	// accumulated by hand and clamped so that an overlong digit run in
	// a malformed file cannot raise an OverflowException.
	int paramVal = -1;
	if (IsAsciiDigit (aByte)) {
		long value = 0;
		do {
			SReaderRTF.Read ();
			if (value <= Int32.MaxValue)
				value = value * 10 + (aByte - '0');
			aByte = SReaderRTF.Peek ();
		} while (IsAsciiDigit (aByte));

		paramVal = value > Int32.MaxValue ? Int32.MaxValue : (int) value;
	}

	if (negParamVal && paramVal > -1)
		paramVal *= -1;

	return HandleControlWord (strCtrlWord.ToString (), paramVal, bMeta);
}

// True when c, a Read ()/Peek () result (-1 at end of stream), is one of
// the ASCII digits '0'..'9'.
private static bool IsAsciiDigit (int c)
{
	return c >= '0' && c <= '9';
}
// Main parsing loop: reads the stream character by character, tracks group
// nesting, dispatches control words and collects text.
//   bMeta : true for the metadata pass, which returns as soon as the
//           metadata block has been closed.
// Returns the first non-OK code reported by ProcessControlWords, otherwise
// ERROR_RTF_OK.
private ErrorCodes RTFParse (bool bMeta)
{
	StringBuilder text = new StringBuilder ();
	int next;

	while ((next = SReaderRTF.Read ()) != -1) {
		char c = (char) next;

		// Line breaks in the file carry no meaning.
		if (c == '\r' || c == '\n')
			continue;

		if (c == '\\') {
			/* process keywords */
			if (skipCount > 0) {
				// Still inside a skipped group: ignore the backslash.
				if (groupCount > skipCount)
					continue;
				skipCount = 0;
			}

			ErrorCodes status = ProcessControlWords (bMeta);
			if (status != ErrorCodes.ERROR_RTF_OK)
				return status;

			if (pos == Position.InPara)
				AddTextForIndexing (text);
			text.Length = 0;
			continue;
		}

		if (c == '{') {
			/* a group opens */
			if (pos == Position.InPara)
				AddTextForIndexing (text);
			text.Length = 0;
			groupCount++;
			continue;
		}

		if (c != '}') {
			// Plain character: keep it only while in body or paragraph
			// text and not inside a skipped group.
			bool skipping = skipCount != 0 && groupCount > skipCount;
			bool inText = pos == Position.InPara || pos == Position.InBody;
			if (!skipping && inText)
				text.Append (c);
			continue;
		}

		/* a group closes */
		groupCount--;

		switch (pos) {
		case Position.InMetaData:
		case Position.InMetaDataTagGenerator:
			// groupCount will be at least 1 for the outermost "{" block.
			if (pos == Position.InMetaData && groupCount == 1) {
				if (bMeta)
					return ErrorCodes.ERROR_RTF_OK;
			} else if (MetaDataStack.Count > 0) {
				// NOTE(review): characters are only collected while
				// pos is InPara or InBody, so "text" looks to be
				// empty here unless a count value is appended below
				// -- confirm that text-valued tags are meant to be
				// stored this way.
				string key = (string) MetaDataStack.Pop ();
				bool isCount =
					String.Compare (key, "fixme:word-count") == 0 ||
					String.Compare (key, "fixme:page-count") == 0;

				if (isCount) {
					// The count value sits right under its key.
					text.Append ((string) MetaDataStack.Pop ());
					AddProperty (Beagle.Property.NewKeyword (key,
										 text.ToString ()));
				} else {
					AddProperty (Beagle.Property.New (key,
									  text.ToString ()));
				}
			}
			break;

		case Position.InPara:
			AddTextForIndexing (text);
			break;

		case Position.InBody:
			if (text.Length > 0)
				text.Append (' ');
			AddTextForIndexing (text);
			AppendStructuralBreak ();
			break;
		}

		// Leaving the group that switched the hot style on.
		if (hotStyleCount > 0 && groupCount <= hotStyleCount) {
			HotDown ();
			hotStyleCount = 0;
		}
	}

	// Flush the word fragment that was still being held back.
	if (partText.Length > 0) {
		if (bPartHotStyle && !IsHot)
			HotUp ();
		AppendText (partText);
		if (IsHot)
			HotDown ();
	}

	return ErrorCodes.ERROR_RTF_OK;
}
// Hands the collected text in str to the indexer and empties str.
//
// Text arrives in arbitrary chunks, so the piece after the last space of a
// chunk is held back in partText and emitted together with the beginning
// of the next chunk. bPartHotStyle remembers whether that held-back piece
// was read under a hot style.
private void AddTextForIndexing (StringBuilder str)
{
	// Append pending escape sequences; each one was pushed as the escaped
	// text followed by an "EscSeq" marker.
	while (TextDataStack.Count > 0) {
		string marker = (string) TextDataStack.Pop ();
		if (marker == "EscSeq")
			str.Append ((string) TextDataStack.Pop ());
	}

	if (str.Length == 0)
		return;

	string chunk = str.ToString ();
	str.Length = 0;

	int lastSpace = chunk.LastIndexOf (' ');

	if (lastSpace > -1) {
		int firstSpace = 0;

		// During the previous call a word was cut off. Join it with
		// the start of this chunk, emit it, and reset the hot status
		// if this method switched it on.
		if (partText.Length > 0) {
			firstSpace = chunk.IndexOf (' ');
			string completedWord = partText + chunk.Substring (0, firstSpace);

			bool alreadyHot = IsHot;
			if (!alreadyHot && bPartHotStyle)
				HotUp ();

			AppendText (completedWord);

			if (!alreadyHot && bPartHotStyle)
				HotDown ();
			bPartHotStyle = false;
		}

		chunk = chunk.Substring (firstSpace);
		lastSpace = chunk.LastIndexOf (' ');
	}

	if (lastSpace > -1) {
		// Hold back everything from the last space onwards.
		partText = chunk.Substring (lastSpace);
		chunk = chunk.Substring (0, lastSpace);
	} else {
		// No space at all: the whole chunk extends the pending word.
		partText = partText + chunk;
		chunk = "";
	}

	// Enable the hot style only just before appending the text, because
	// partial text without a hot style may have been appended above.
	if (hotStyleCount > 0) {
		if (!IsHot)
			HotUp ();
		bPartHotStyle = true;
	}

	if (chunk.Length > 0)
		AppendText (chunk);

	if (partText.Length < 1)
		bPartHotStyle = false;
}
// Parses the whole document for its text, then marks the filter finished.
// The reader opened by DoOpen () is closed here, because this is the last
// place in the class that reads from it.
override protected void DoPull ()
{
	// DoOpen () leaves the reader null when the file could not be opened.
	if (SReaderRTF == null) {
		Finished ();
		return;
	}

	pos = Position.None;

	try {
		// Discard the buffered data; if not, the buffered data can
		// change the state of "pos", which results in a complete mess.
		// Fixes: http://bugzilla.gnome.org/show_bug.cgi?id=172294
		SReaderRTF.DiscardBufferedData ();

		// Rewind the file pointer to start from the beginning.
		SReaderRTF.BaseStream.Seek (0, SeekOrigin.Begin);

		ErrorCodes ec = RTFParse (false);
		if (ec != ErrorCodes.ERROR_RTF_OK)
			Logger.Log.Error ("{0}", ec);
	} finally {
		// Closing the reader also closes the underlying file stream.
		SReaderRTF.Close ();
		SReaderRTF = null;
		FsRTF = null;
	}

	Finished ();
}
// Runs the metadata pass over the document; RTFParse (true) returns as
// soon as the metadata block has been closed.
override protected void DoPullProperties ()
{
	// DoOpen () leaves the reader null when the file could not be opened;
	// that failure has already been logged there.
	if (SReaderRTF == null)
		return;

	ErrorCodes ec = RTFParse (true);
	if (ec != ErrorCodes.ERROR_RTF_OK)
		Logger.Log.Error ("{0}", ec);
}