2 // FilterDOC.cs : Trivial implementation of an MS Word-document filter.
3 // This filter uses wv1 library - http://wvware.sourceforge.net/
5 // Author: Veerapuram Varadhan <vvaradhan@novell.com>
7 // Copyright (C) 2004 Novell, Inc.
11 // Permission is hereby granted, free of charge, to any person obtaining a
12 // copy of this software and associated documentation files (the "Software"),
13 // to deal in the Software without restriction, including without limitation
14 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
15 // and/or sell copies of the Software, and to permit persons to whom the
16 // Software is furnished to do so, subject to the following conditions:
18 // The above copyright notice and this permission notice shall be included in
19 // all copies or substantial portions of the Software.
21 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
24 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
27 // DEALINGS IN THE SOFTWARE.
32 using System
.Runtime
.InteropServices
;
39 namespace Beagle
.Filters
{
41 public class FilterDOC
: FilterOle
{
43 //////////////////////////////////////////////////////////
/// <summary>
/// Signature of the managed callback that is handed to the native
/// <c>wv1_glue_init_doc_parsing</c> entry point. <c>IndexText</c> is the
/// method wrapped in this delegate type.
/// </summary>
/// <param name="byteArray">Pointer to a native byte buffer holding text.</param>
/// <param name="dataLen">Number of bytes to read from <paramref name="byteArray"/>.</param>
/// <param name="byteHotArray">Pointer to a second native byte buffer (the "hot" text).</param>
/// <param name="hotDataLen">Number of bytes to read from <paramref name="byteHotArray"/>.</param>
/// <param name="appendStructBrk">
/// NOTE(review): presumably tells the handler whether to emit a structural
/// break after the text -- confirm against the handler and the native glue.
/// </param>
private delegate void TextHandlerCallback (
    IntPtr byteArray,
    int dataLen,
    IntPtr byteHotArray,
    int hotDataLen,
    bool appendStructBrk);
/// <summary>
/// P/Invoke binding for <c>wv1_glue_init_doc_parsing</c> in the native
/// <c>libbeagleglue</c> library.
/// </summary>
/// <param name="fname">Name of the file to hand to the native parser.</param>
/// <param name="callback">Managed text handler passed to the native side.</param>
/// <returns>
/// An integer result from the native call.
/// NOTE(review): the meaning of individual values is defined by the glue
/// library and is not visible here -- confirm before relying on them.
/// </returns>
[DllImport ("libbeagleglue")]
private static extern int wv1_glue_init_doc_parsing (
    string fname,
    TextHandlerCallback callback);
/// <summary>
/// P/Invoke binding for the parameterless <c>wv1_init</c> function in the
/// native <c>libbeagleglue</c> library.
/// </summary>
/// <returns>
/// An integer result from the native call.
/// NOTE(review): success/failure convention is not visible here -- confirm.
/// </returns>
[DllImport ("libbeagleglue")]
private static extern int wv1_init ();
55 //////////////////////////////////////////////////////////
// Class-wide flag, initially false.
// NOTE(review): presumably records whether wv1_init () has already been
// called; the code that reads or sets it is not visible here -- confirm.
private static bool wv1_Initted = false;
// Declares the three MIME types this filter accepts by building a
// FilterFlavor from each one and passing it to AddSupportedFlavor.
// NOTE(review): the enclosing member's header and braces are not visible in
// this excerpt; presumably these statements form the constructor body.
61 AddSupportedFlavor (FilterFlavor
.NewFromMimeType ("application/msword"));
62 AddSupportedFlavor (FilterFlavor
.NewFromMimeType ("application/vnd.ms-word"));
63 AddSupportedFlavor (FilterFlavor
.NewFromMimeType ("application/x-msword"));
// Text handler wrapped in a TextHandlerCallback by DoPull and handed to the
// native parser. Visible steps: copy each native buffer into a managed
// byte[], decode it as UTF-8, pass both strings to AppendText, then call
// AppendStructuralBreak. Any exception is caught and logged at Debug level.
// NOTE(review): the rest of the parameter list, the local declarations, the
// opening of the try block and any conditions guarding these statements are
// not visible in this excerpt -- confirm against the full file.
67 private void IndexText (IntPtr byteArray
, int dataLen
,
68 IntPtr byteHotArray
, int hotDataLen
,
// Copy dataLen bytes from the native buffer into a managed array.
77 data
= new byte[dataLen
];
78 Marshal
.Copy (byteArray
, data
, 0, dataLen
);
// Decode the copied bytes as UTF-8.
82 str
= System
.Text
.Encoding
.UTF8
.GetString (data
, 0, dataLen
);
// Same copy-and-decode sequence for the second ("hot") buffer; the data
// variable is reused for the new array.
86 data
= new byte [hotDataLen
];
87 Marshal
.Copy (byteHotArray
, data
, 0, hotDataLen
);
90 strHot
= System
.Text
.Encoding
.UTF8
.GetString (data
, 0, hotDataLen
);
// Hand both decoded strings to AppendText.
92 AppendText (str
, strHot
);
95 AppendStructuralBreak ();
// Failures are logged at Debug level; nothing visible here rethrows.
96 } catch (Exception e
) {
97 Logger
.Log
.Debug ("Exception occurred in Word-Doc filter. {0}", e
);
// Override that stores the full path of the supplied file in FileName;
// DoPull later passes FileName to the native parser.
// NOTE(review): the method's braces and any further statements are not
// visible in this excerpt -- confirm against the full file.
101 override protected void OpenStorage (FileInfo info
)
103 FileName
= info
.FullName
;
// Override that, when sumMeta is non-null, looks up the "gsf:word-count" and
// "gsf:page-count" entries, casts each entry's Val to int, and records them
// as unsearched properties named "fixme:word-count" and "fixme:page-count".
// NOTE(review): how sumMeta is obtained (presumably from sumStream), the
// local declarations, and any null checks on prop are not visible in this
// excerpt -- confirm against the full file.
106 override protected void ExtractMetaData (Gsf
.Input sumStream
, Gsf
.Input docSumStream
)
111 if (sumMeta
!= null) {
// Word count.
112 prop
= sumMeta
.Lookup ("gsf:word-count");
114 count
= (int) prop
.Val
;
116 AddProperty (Beagle
.Property
.NewUnsearched ("fixme:word-count", count
));
// Page count; prop and count are reused.
119 prop
= sumMeta
.Lookup ("gsf:page-count");
121 count
= (int) prop
.Val
;
123 AddProperty (Beagle
.Property
.NewUnsearched ("fixme:page-count", count
));
// Override that runs the native Word parser on FileName: it wraps IndexText
// in a TextHandlerCallback, creates a Stopwatch, calls
// wv1_glue_init_doc_parsing and stores the result in ret. Three Error
// messages and one Info message follow.
// NOTE(review): the conditions selecting among the log messages (presumably
// based on ret), the stopwatch start/stop calls and the method's braces are
// not visible in this excerpt -- confirm against the full file.
127 override protected void DoPull ()
130 TextHandlerCallback textHandler
;
131 textHandler
= new TextHandlerCallback (IndexText
);
138 Stopwatch stopwatch
= new Stopwatch ();
// The native parser delivers text through textHandler (i.e. IndexText).
141 ret
= wv1_glue_init_doc_parsing (FileName
, textHandler
);
143 Logger
.Log
.Error ("{0} : is password protected", FileName
);
145 Logger
.Log
.Error ("{0} : Unable to read", FileName
);
147 Logger
.Log
.Error ("Unable to initiate the parser for {0}", FileName
);
149 Logger
.Log
.Info ("Word document extraction done in {0}", stopwatch
);