NoiseFilter: Dont drop last word of apparent hostnames. Too many non-hostnames can...
[beagle.git] / Filters / FilterDOC.cs
blob81a312ce634d930fc06143987d4db1088532df4a
1 //
2 // FilterDOC.cs : Trivial implementation of an MS Word-document filter.
3 // This filter uses wv1 library - http://wvware.sourceforge.net/
4 //
5 // Author: Veerapuram Varadhan <vvaradhan@novell.com>
6 //
7 // Copyright (C) 2004 Novell, Inc.
8 //
11 // Permission is hereby granted, free of charge, to any person obtaining a
12 // copy of this software and associated documentation files (the "Software"),
13 // to deal in the Software without restriction, including without limitation
14 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
15 // and/or sell copies of the Software, and to permit persons to whom the
16 // Software is furnished to do so, subject to the following conditions:
18 // The above copyright notice and this permission notice shall be included in
19 // all copies or substantial portions of the Software.
21 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
24 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
27 // DEALINGS IN THE SOFTWARE.
30 using System;
31 using System.IO;
32 using System.Runtime.InteropServices;
34 using Beagle.Util;
35 using Beagle.Daemon;
37 using Gsf;
39 namespace Beagle.Filters {
41 public class FilterDOC : FilterOle {
//////////////////////////////////////////////////////////

// Name of the native glue library that exports the wv1 entry points below.
private const string GlueLibraryName = "libbeagleglue";

// Signature of the managed callback handed to wv1_glue_init_doc_parsing.
// It receives two unmanaged byte buffers, each with its length, plus a flag.
// IndexText is the implementation this filter passes in (see DoPull).
private delegate void TextHandlerCallback (IntPtr text, int textLength,
					   IntPtr hotText, int hotTextLength,
					   bool appendStructuralBreak);

[DllImport (GlueLibraryName)]
private static extern int wv1_init ();

[DllImport (GlueLibraryName)]
private static extern int wv1_glue_init_doc_parsing (string fname, TextHandlerCallback callback);

//////////////////////////////////////////////////////////

// Set to true by DoPull after it has called wv1_init (); static, so the
// flag is shared by every FilterDOC instance. Defaults to false.
static bool wv1_Initted;
// Registers the MIME types handled by this filter and enables snippet mode.
public FilterDOC ()
{
	// Registered in this order, one flavor per MIME type.
	string[] wordMimeTypes = {
		"application/msword",
		"application/vnd.ms-word",
		"application/x-msword"
	};

	foreach (string mimeType in wordMimeTypes) {
		AddSupportedFlavor (FilterFlavor.NewFromMimeType (mimeType));
	}

	SnippetMode = true;
}
// Text callback passed to wv1_glue_init_doc_parsing (see DoPull).
//
// Each unmanaged buffer is copied into managed memory and decoded as UTF-8;
// a buffer whose length is zero or negative is forwarded as null. Both
// strings are handed to AppendText, and when appendStructBrk is true a
// structural break is appended afterwards. Any exception raised along the
// way is logged at debug level and not rethrown.
private void IndexText (IntPtr byteArray, int dataLen,
			IntPtr byteHotArray, int hotDataLen,
			bool appendStructBrk)
{
	try {
		string text = DecodeUtf8Buffer (byteArray, dataLen);
		string hotText = DecodeUtf8Buffer (byteHotArray, hotDataLen);

		AppendText (text, hotText);

		if (appendStructBrk) {
			AppendStructuralBreak ();
		}
	} catch (Exception e) {
		Logger.Log.Debug ("Exception occurred in Word-Doc filter. {0}", e);
	}
}

// Copies `length` bytes out of the unmanaged buffer and decodes them as
// UTF-8. Returns null when `length` is zero or negative.
private static string DecodeUtf8Buffer (IntPtr buffer, int length)
{
	if (length <= 0) {
		return null;
	}

	byte[] bytes = new byte [length];
	Marshal.Copy (buffer, bytes, 0, length);
	return System.Text.Encoding.UTF8.GetString (bytes, 0, length);
}
// Records the full path of the file being filtered in FileName, which
// DoPull later passes to the native parser.
override protected void OpenStorage (FileInfo info)
{
	string fullPath = info.FullName;
	FileName = fullPath;
}
// Publishes the word count and page count found in sumMeta as unsearched
// properties. A count is only added when its entry exists and its value is
// greater than zero. Neither stream parameter is read here; everything
// comes from sumMeta.
override protected void ExtractMetaData (Gsf.Input sumStream, Gsf.Input docSumStream)
{
	if (sumMeta == null) {
		return;
	}

	DocProp wordCountProp = sumMeta.Lookup ("gsf:word-count");
	int wordCount = (wordCountProp != null) ? (int) wordCountProp.Val : 0;
	if (wordCount > 0) {
		AddProperty (Beagle.Property.NewUnsearched ("fixme:word-count", wordCount));
	}

	DocProp pageCountProp = sumMeta.Lookup ("gsf:page-count");
	int pageCount = (pageCountProp != null) ? (int) pageCountProp.Val : 0;
	if (pageCount > 0) {
		AddProperty (Beagle.Property.NewUnsearched ("fixme:page-count", pageCount));
	}
}
// Runs the native wv1 parser over FileName.
//
// wv1_init () is called only while the static wv1_Initted flag is still
// false. The parser is then invoked with IndexText as its text callback.
// Return codes -1, -2 and -3 are each logged as an error; any other value
// produces no error message. The elapsed time is logged and Finished () is
// called regardless of the return code.
override protected void DoPull ()
{
	if (!wv1_Initted) {
		wv1_init ();
		wv1_Initted = true;
	}

	TextHandlerCallback onText = new TextHandlerCallback (IndexText);

	Stopwatch timer = new Stopwatch ();
	timer.Start ();

	int status = wv1_glue_init_doc_parsing (FileName, onText);
	switch (status) {
	case -2:
		Logger.Log.Error ("{0} : is password protected", FileName);
		break;
	case -1:
		Logger.Log.Error ("{0} : Unable to read", FileName);
		break;
	case -3:
		Logger.Log.Error ("Unable to initiate the parser for {0}", FileName);
		break;
	}

	timer.Stop ();
	Logger.Log.Info ("Word document extraction done in {0}", timer);

	Finished ();
}