From 93dec3d547b9ddd3bc4b2df5f5dffb8c622632e6 Mon Sep 17 00:00:00 2001 From: vvaradan Date: Wed, 1 Dec 2004 22:13:51 +0000 Subject: [PATCH] RTF filter complies to MS RTF 1.5 specification. (works well with 1.8 as well). Extracts meta-data and style information. --- ChangeLog | 5 + Filters/FilterRTF.cs | 373 ++++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 331 insertions(+), 47 deletions(-) diff --git a/ChangeLog b/ChangeLog index 2677f0ec..6cdc7335 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +2004-12-01 Veerapuram Varadhan + + * Filters/FilterRTF.cs: Compatible with MS RTF specification, + extracts meta-data and style information. + 2004-11-18 Alex Graveley * Best/BestTray.cs: Make BestTray extend Gtk.Plug, and in the diff --git a/Filters/FilterRTF.cs b/Filters/FilterRTF.cs index 28408e0a..f88d089e 100644 --- a/Filters/FilterRTF.cs +++ b/Filters/FilterRTF.cs @@ -25,79 +25,358 @@ // DEALINGS IN THE SOFTWARE. // // -// Currently, the filtering is based on the Keyword : "\pard\plain". If anyone +// Currently, the filtering is based on only few *control words*. If anyone // has any samples that can break this "assumption", kindly post a copy of it, -// if you can, to vvaradhan AT novell DOT com +// if you can, to // // FIXME: Require more complex samples to test the parsing, mostly generated // using Microsoft Word or wordpad. :) using System; +using System.Collections; using System.IO; using System.Text; -using System.Text.RegularExpressions; +using Beagle.Util; +internal class RTFControlWordType { + + public enum Type { + None, + Skip, + MetaDataBlock, + MetaDataTag, + Paragraph, + EscSeq, + CharProp + } + + public Type Types; + public string ctrlWord; + + RTFControlWordType (Type types, string ctrlword) + { + this.Types = types; + this.ctrlWord = ctrlword; + } + // FIXME: Need to add "unicode", "styles", + // "header", "footer" etc. 
+ static RTFControlWordType[] types = + { + new RTFControlWordType (Type.None, ""), + new RTFControlWordType (Type.MetaDataBlock, "info"), + new RTFControlWordType (Type.MetaDataTag, "title"), + new RTFControlWordType (Type.MetaDataTag, "author"), + new RTFControlWordType (Type.MetaDataTag, "comment"), + new RTFControlWordType (Type.MetaDataTag, "operator"), + new RTFControlWordType (Type.MetaDataTag, "nofpages"), + new RTFControlWordType (Type.MetaDataTag, "nofwords"), + new RTFControlWordType (Type.MetaDataTag, "generator"), + new RTFControlWordType (Type.MetaDataTag, "company"), + new RTFControlWordType (Type.Paragraph, "par"), + new RTFControlWordType (Type.Paragraph, "pard"), + new RTFControlWordType (Type.CharProp, "b"), + new RTFControlWordType (Type.CharProp, "i"), + new RTFControlWordType (Type.CharProp, "ul"), + new RTFControlWordType (Type.CharProp, "up"), + new RTFControlWordType (Type.CharProp, "dn"), + new RTFControlWordType (Type.Skip, "'"), + new RTFControlWordType (Type.Skip, "*"), + new RTFControlWordType (Type.EscSeq, "{"), + new RTFControlWordType (Type.EscSeq, "}"), + new RTFControlWordType (Type.EscSeq, "\\"), + }; + + public static RTFControlWordType Find (string strCtrlWord) + { + for (int i = 0; i < types.Length; i++) { + if (String.Compare (types[i].ctrlWord, strCtrlWord) == 0) + return types[i]; + } + return types[0]; + } +} namespace Beagle.Filters { public class FilterRTF : Beagle.Daemon.Filter { + public enum Position { + None, + InMetaData, + InMetaDataTagGenerator, + InBody + } + + public enum ErrorCodes { + ERROR_RTF_OK, + ERROR_RTF_EOF, + ERROR_RTF_UNHANDLED_SYMBOL + }; + + Position pos; + Stack MetaDataStack; + Stack TextDataStack; + int groupCount; + long offset; + FileStream FsRTF; + StreamReader SReaderRTF; + public FilterRTF () { // Make this a general rtf filter. 
AddSupportedMimeType ("application/rtf"); + pos = Position.None; + MetaDataStack = new Stack(); + TextDataStack = new Stack(); + groupCount = 0; + offset = 0; + FsRTF = null; + SReaderRTF = null; + } + + override protected void DoOpen (FileInfo info) + { + FsRTF = new FileStream (info.FullName, FileMode.Open, FileAccess.Read); + if (FsRTF != null) + SReaderRTF = new StreamReader (FsRTF); + } - /* - FIXME: - Right now we don't handle certain control words, - that can look like text and but not. - Ex: font names - */ - protected void ParseRTFFile (TextReader reader) + + // Identifies the type of RTF control word and handles accordingly + private ErrorCodes HandleControlWord (string strCtrlWord, int paramVal, bool bMeta) { - string str; - string[] tokens; - string[] KeyWord = {"\\pard\\plain"}; - int ndxKeyword; - while ((str = reader.ReadLine ()) != null) { - ndxKeyword = str.IndexOf (KeyWord [0]); - if (ndxKeyword < 0) - continue; - - // Assuming "1" to be the index of the first character - str.Remove (1, (ndxKeyword + KeyWord [0].Length)); - tokens = str.Split (' '); - for (int i = 0; i < tokens.Length; i++) { - if (tokens[i].IndexOf (";}") > -1 || - tokens[i].IndexOf ("{\\") > -1 || - tokens[i].IndexOf ("}\\") > -1) { - /* Control word in the RTF */ - continue; + RTFControlWordType ctrlWrdType = RTFControlWordType.Find (strCtrlWord); + + switch (ctrlWrdType.Types) { + case RTFControlWordType.Type.MetaDataBlock: /* process meta-data */ + pos = Position.InMetaData; + break; + case RTFControlWordType.Type.MetaDataTag: + if (pos == Position.InMetaData) { + if (String.Compare (strCtrlWord, "title") == 0) + MetaDataStack.Push ("dc:title"); + else if (String.Compare (strCtrlWord, "author") == 0) + MetaDataStack.Push ("dc:author"); + else if (String.Compare (strCtrlWord, "comment") == 0) + MetaDataStack.Push ("fixme:comment"); + else if (String.Compare (strCtrlWord, "operator") == 0) + MetaDataStack.Push ("fixme:operator"); + else if (String.Compare (strCtrlWord, 
"nofpages") == 0) { + MetaDataStack.Push (Convert.ToString (paramVal)); + MetaDataStack.Push ("fixme:page-count"); } - else if (tokens[i].StartsWith ("\\") && - (tokens[i][1] != '{' && - tokens[i][1] != '}' && - tokens[i][1] != '\\')) { - continue; + else if (String.Compare (strCtrlWord, "nofwords") == 0) { + MetaDataStack.Push (Convert.ToString (paramVal)); + MetaDataStack.Push ("fixme:word-count"); } - tokens[i] = tokens[i].Replace ("\\", ""); - tokens[i] = tokens[i].Replace ("{", ""); - tokens[i] = tokens[i].Replace ("}", ""); - //Console.WriteLine (tokens[i]); - /* - FIXME: Why don't we filter out the punctuation marks?? - Why don't we do it in "AppendText", that would be - really generic!!! - */ - AppendText (tokens [i]); - AppendWhiteSpace (); + else if (String.Compare (strCtrlWord, "company") == 0) + MetaDataStack.Push ("fixme:company"); + } else if (String.Compare (strCtrlWord, "generator") == 0) { + pos = Position.InMetaDataTagGenerator; + MetaDataStack.Push ("fixme:generator"); } - } - Finished (); + break; + + case RTFControlWordType.Type.Paragraph: + if (!bMeta) + pos = Position.InBody; + break; + + // FIXME: "Hot" styles are not *properly reset to normal* + // on some *weird* conditions. 
+ case RTFControlWordType.Type.CharProp: + if (pos == Position.InBody) { + if (paramVal < 0) + HotUp (); + } + break; + + case RTFControlWordType.Type.EscSeq: + if (pos == Position.InBody) { + TextDataStack.Push (strCtrlWord); + TextDataStack.Push ("EscSeq"); + } + break; + } + return ErrorCodes.ERROR_RTF_OK; } + // FIXME: Probably need a little cleanup ;-) + + private ErrorCodes ProcessControlWords (bool bMeta) + { + int aByte = -1; + char ch; + int paramVal = -1, i; + StringBuilder strCtrlWord = new StringBuilder (); + StringBuilder strParameter = new StringBuilder (); + + aByte = SReaderRTF.Read (); + if (aByte == -1) + return ErrorCodes.ERROR_RTF_EOF; + + ch = (char) aByte; + RTFControlWordType ctrlWrdType = RTFControlWordType.Find (new String (ch, 1)); + + if (!Char.IsLetter (ch) && + ctrlWrdType.Types != RTFControlWordType.Type.Skip && + ctrlWrdType.Types != RTFControlWordType.Type.EscSeq) { + Console.WriteLine ("Unhandled symbol: {0}, {1}", ch, ctrlWrdType.Types); + return ErrorCodes.ERROR_RTF_UNHANDLED_SYMBOL; + } + while (aByte != -1) { + strCtrlWord.Append (ch); + aByte = SReaderRTF.Peek (); + ch = (char) aByte; + if (Char.IsLetter (ch)) { + aByte = SReaderRTF.Read (); + ch = (char) aByte; + } + else + break; + } + aByte = SReaderRTF.Peek (); + ch = (char) aByte; + if (Char.IsDigit (ch)) { + aByte = SReaderRTF.Read (); + ch = (char) aByte; + while (aByte != -1) { + strParameter.Append (ch); + aByte = SReaderRTF.Peek (); + ch = (char) aByte; + if (Char.IsDigit (ch)) { + aByte = SReaderRTF.Read (); + ch = (char) aByte; + } + else + break; + } + if (strParameter.Length > 0) + paramVal = Convert.ToInt32 (strParameter.ToString()); + } + //Console.WriteLine ("{0}\t{1}", strCtrlWord, strParameter); + return (HandleControlWord (strCtrlWord.ToString(), paramVal, bMeta)); + } + + private ErrorCodes RTFParse (bool bMeta) + { + int aByte = -1; + char ch; + StringBuilder str = new StringBuilder (); + string strTemp = null; + ErrorCodes ec; + + // If we are not 
extracting meta-data, set the + // file pointer to the saved position + if (!bMeta) + SReaderRTF.BaseStream.Seek (offset, SeekOrigin.Begin); + + while ((aByte = SReaderRTF.Read ()) != -1) { + ch = (char) aByte; + switch (ch) { + case '\\': /* process keywords */ + ec = ProcessControlWords (bMeta); + if (ec != ErrorCodes.ERROR_RTF_OK) + return ec; + if (pos == Position.InBody) { + AddTextForIndexing (str); + //AppendText (str.ToString()); + //AppendWhiteSpace (); + } + str.Remove (0, str.Length); + break; + case '{': /* process groups */ + if (pos == Position.InBody) + AddTextForIndexing (str); + str.Remove (0, str.Length); + groupCount++; + break; + case '}': /* process groups */ + groupCount--; + if (pos == Position.InMetaData || + pos == Position.InMetaDataTagGenerator) { + // groupCount will at least be 1 for + // the outermost "{" block + if (pos == Position.InMetaData && groupCount == 1) { + if (bMeta) { + offset = SReaderRTF.BaseStream.Position; + return ErrorCodes.ERROR_RTF_OK; + } + + } else { + if (MetaDataStack.Count > 0) { + strTemp = (string) MetaDataStack.Pop (); + if ((String.Compare (strTemp, "fixme:word-count") == 0) || + (String.Compare (strTemp, "fixme:page-count") == 0)) { + str.Append ((string) MetaDataStack.Pop ()); + AddProperty (Beagle.Property.NewKeyword (strTemp, + str.ToString())); + } + else + AddProperty (Beagle.Property.New (strTemp, + str.ToString())); + } + } + + } else if (pos == Position.InBody) { + if (str.Length > 0) + str.Append (' '); + AddTextForIndexing (str); + if (IsHot) + HotDown (); + } + + break; + case '\r': /* ignore \r */ + case '\n': /* ignore \n */ + break; + default: + str.Append (ch); + break; + } + } + return ErrorCodes.ERROR_RTF_OK; + } + + private void AddTextForIndexing (StringBuilder str) + { + string strTemp; + string strStyle; + int elemCount; + + while (TextDataStack.Count > 0) { + strTemp = (string) TextDataStack.Pop (); + switch (strTemp) { + case "EscSeq": + strTemp = (string) TextDataStack.Pop (); + 
str.Append (strTemp); + break; + } + } + if (str.Length > 0) { + AppendText (str.ToString()); + str.Remove (0, str.Length); + } + } + override protected void DoPull () { - ParseRTFFile (TextReader); + ErrorCodes ec; + ec = ErrorCodes.ERROR_RTF_OK; + pos = Position.None; + ec = RTFParse (false); + if (ec != ErrorCodes.ERROR_RTF_OK) + Logger.Log.Error ("{0}", ec); + Finished (); } + + override protected void DoPullProperties () + { + ErrorCodes ec; + ec = ErrorCodes.ERROR_RTF_OK; + ec = RTFParse (true); + if (ec != ErrorCodes.ERROR_RTF_OK) + Logger.Log.Error ("{0}", ec); + } + } } -- 2.11.4.GIT