2006-09-10 Francisco Javier F. Serrador <serrador@openshine.com>
[beagle.git] / beagled / Filter.cs
blobecd33d6234dcc036c74e608b3d7cc0de27bd8ed0
1 //
2 // Filter.cs
3 //
4 // Copyright (C) 2004 Novell, Inc.
5 //
7 //
8 // Permission is hereby granted, free of charge, to any person obtaining a
9 // copy of this software and associated documentation files (the "Software"),
10 // to deal in the Software without restriction, including without limitation
11 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 // and/or sell copies of the Software, and to permit persons to whom the
13 // Software is furnished to do so, subject to the following conditions:
15 // The above copyright notice and this permission notice shall be included in
16 // all copies or substantial portions of the Software.
18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 // DEALINGS IN THE SOFTWARE.
28 using System;
29 using System.Collections;
30 using System.IO;
31 using System.Text;
32 using System.Reflection;
34 using Beagle.Util;
36 namespace Beagle.Daemon {
38 public class Filter {
// Switches on the extra Logger.Log.Debug () output in this class.
private static bool Debug = false;

// Lucene fields allow a maximum of 10000 words.  Some of those words
// will be stop words, so as a failsafe we accept up to 40000 words
// and do not take any more than that.
private const int MAXWORDS = 40000; // Lucene.Net.Index.IndexWriter.DEFAULT_MAX_FIELD_LENGTH * 4

// Derived classes must always provide a constructor that takes
// no arguments.
public Filter ()
{
}
50 //////////////////////////
// Caller-supplied identifier for this filter instance.
private string identifier;

public string Identifier {
	get {
		return this.identifier;
	}
	set {
		this.identifier = value;
	}
}
59 //////////////////////////
// The FilterFlavor objects registered through AddSupportedFlavor ().
private ArrayList supported_flavors = new ArrayList ();

// Registers one more flavor as handled by this filter.
protected void AddSupportedFlavor (FilterFlavor flavor)
{
	this.supported_flavors.Add (flavor);
}

// Every flavor registered so far (the live list, not a copy).
public ICollection SupportedFlavors {
	get {
		return this.supported_flavors;
	}
}
72 //////////////////////////
// Filters are versioned.  This allows us to automatically re-index
// files when a newer filter is available.

// The filter's name is simply the name of its concrete type.
public string Name {
	get {
		Type concrete_type = this.GetType ();
		return concrete_type.Name;
	}
}

// -1 means "SetVersion () has not been called yet".
private int version = -1;

// The version given to SetVersion (), or 0 if none was ever set.
public int Version {
	get {
		if (version < 0)
			return 0;
		return version;
	}
}
// Sets the version of this filter.  The version may only be set once
// and must not be negative.
//
// Throws ArgumentOutOfRangeException if v is negative, and
// InvalidOperationException if a version has already been set.  Both
// derive from Exception, so code catching Exception keeps working.
protected void SetVersion (int v)
{
	if (v < 0) {
		string msg;
		msg = String.Format ("Attempt to set invalid version {0} on Filter {1}", v, Name);
		throw new ArgumentOutOfRangeException ("v", v, msg);
	}

	// -1 is the "never set" marker; anything else means a previous
	// call already fixed the version.
	if (version != -1) {
		string msg;
		msg = String.Format ("Attempt to re-set version from {0} to {1} on Filter {2}", version, v, Name);
		throw new InvalidOperationException (msg);
	}

	version = v;
}
106 //////////////////////////
// Backing fields; all three stay null until a caller assigns them.
private string this_mime_type = null;
private string this_extension = null;
private ArrayList indexable_properties = null;

public string MimeType {
	get {
		return this.this_mime_type;
	}
	set {
		this.this_mime_type = value;
	}
}

public string Extension {
	get {
		return this.this_extension;
	}
	set {
		this.this_extension = value;
	}
}

// Allows the filter to access the properties that were set by
// the indexable.
public ArrayList IndexableProperties {
	get {
		return this.indexable_properties;
	}
	set {
		this.indexable_properties = value;
	}
}
129 //////////////////////////
// Off by default; once switched on there is no way to switch it
// off again.
private bool crawl_mode = false;

public void EnableCrawlMode ()
{
	this.crawl_mode = true;
}

protected bool CrawlMode {
	get {
		return this.crawl_mode;
	}
}
142 //////////////////////////
// Filters that deal with big files and do not need to read in
// the whole file may want to set this to false, to avoid
// wasting cycles in disk wait.
private bool preload = true;

protected bool PreLoad {
	get {
		return this.preload;
	}
	set {
		this.preload = value;
	}
}
155 //////////////////////////
// Nesting counters: each *Up () call raises its counter by one and
// each *Down () call lowers it again, never going below zero.
private int hotCount = 0;
private int freezeCount = 0;

public void HotUp ()
{
	hotCount++;
}

public void HotDown ()
{
	if (hotCount <= 0)
		return;

	hotCount--;
}

// True while there are more HotUp () than HotDown () calls.
public bool IsHot {
	get {
		return hotCount > 0;
	}
}

public void FreezeUp ()
{
	freezeCount++;
}

public void FreezeDown ()
{
	if (freezeCount <= 0)
		return;

	freezeCount--;
}

// True while there are more FreezeUp () than FreezeDown () calls.
public bool IsFrozen {
	get {
		return freezeCount > 0;
	}
}
190 //////////////////////////
private bool snippetMode = false;
private bool originalIsText = false;
// Stays null unless AttachSnippetWriter () accepts a writer.
private TextWriter snippetWriter = null;

public bool SnippetMode {
	get {
		return this.snippetMode;
	}
	set {
		this.snippetMode = value;
	}
}

public bool OriginalIsText {
	get {
		return this.originalIsText;
	}
	set {
		this.originalIsText = value;
	}
}

// Remembers the writer that receives a copy of the appended text.
// The writer is ignored unless SnippetMode has been turned on first.
public void AttachSnippetWriter (TextWriter writer)
{
	if (! snippetMode)
		return;

	snippetWriter = writer;
}
212 //////////////////////////
// Output pools; Open (FileSystemInfo) creates fresh ones.
private ArrayList textPool;
private ArrayList hotPool;
private ArrayList propertyPool;

// Running totals maintained by the two-argument AppendText ().
private int word_count = 0;
private int hotword_count = 0;

// True as long as the normal-text word budget (MAXWORDS) has not
// been used up.
protected bool AllowMoreWords ()
{
	bool below_limit = word_count < MAXWORDS;
	return below_limit;
}

// Starts out true, so AppendWhiteSpace () is a no-op until some
// text has been appended.
private bool last_was_structural_break = true;
// Characters that end a line: "\n", "\r" and "\r\n" are all treated
// as line breaks.
private static readonly char [] line_separators = new char [] { '\n', '\r' };

// This two-arg AppendText () gives filters the flexibility to
// segregate the hot contents and the normal contents of a paragraph
// and to call this method with the respective contents.
//
// str    : Holds both the normal contents and the hot contents.
// strHot : Holds only the hot contents.
//
// Ex:- suppose the actual content is "one <b>two</b> three"
//      str    = "one two three"
//      strHot = "two"
//
// NOTE: HotUp () or HotDown () has NO EFFECT on this variant
//       of AppendText ().
//
// Returns the number of words counted in str, or 0 when str was not
// added (filter frozen, word limit reached, or str null/empty).
public int AppendText (string str, string strHot)
{
	int num_words = 0;

	if (!IsFrozen && word_count < MAXWORDS && str != null && str != "") {
		// Only split when there is a line break in the string; this
		// avoids an unnecessary allocation in the common case.
		if (str.IndexOfAny (line_separators) > -1) {
			// Every non-empty line goes in separately, followed by a
			// structural break.  "\r\n" yields an empty piece between
			// the two characters, which is skipped here.
			foreach (string line in str.Split (line_separators)) {
				if (line.Length > 0) {
					ReallyAppendText (line, null);
					AppendStructuralBreak ();
				}
			}
		} else
			ReallyAppendText (str, null);

		num_words = StringFu.CountWords (str, 3, -1);
		word_count += num_words;
	}

	// Hot text is only stored while the filter is not frozen and only
	// when there is any, so only count it in those cases.  This also
	// keeps a null strHot away from StringFu.CountWords ().
	if (!IsFrozen && hotword_count < MAXWORDS && strHot != null && strHot != "") {
		ReallyAppendText (null, strHot);
		hotword_count += StringFu.CountWords (strHot, 3, -1);
	}

	return num_words;
}
// Appends str to the text pool; while the filter is hot (see
// HotUp ()) the same string is passed on as hot text as well.
// Returns whatever the two-argument AppendText () returns, or 0 if
// the filter is frozen or str is null/empty.
public int AppendText (string str)
{
	if (Debug)
		Logger.Log.Debug ("AppendText (\"{0}\")", str);

	if (IsFrozen || str == null || str == "")
		return 0;

	string hot_text = null;
	if (IsHot)
		hot_text = str;

	return AppendText (str, hot_text);
}
// Does the actual adding of text to the text pool and the hot pool
// respectively.  Either argument may be null, in which case the
// corresponding pool is left alone.
private void ReallyAppendText (string str, string strHot)
{
	bool have_hot_text = strHot != null && strHot != "";
	if (!IsFrozen && have_hot_text) {
		// Hot entries are stored trimmed, with one trailing space.
		string hot_entry = strHot.Trim () + " ";
		hotPool.Add (hot_entry);
	}

	if (str == null)
		return;

	textPool.Add (str);

	// Mirror the text to the snippet writer if one is attached.
	if (snippetWriter != null)
		snippetWriter.Write (str);

	last_was_structural_break = false;
}
// Tells whether a separating blank has to be added to the pool:
// false only when the last entry already ends in a whitespace
// character, true otherwise (including for an empty pool).
private bool NeedsWhiteSpace (ArrayList array)
{
	int count = array.Count;

	if (count > 0) {
		string tail = (string) array [count - 1];
		int last_index = tail.Length - 1;

		if (last_index >= 0 && char.IsWhiteSpace (tail [last_index]))
			return false;
	}

	return true;
}
// Adds a single blank to the text pool (and to the snippet writer,
// if any) unless the last thing emitted was a structural break or
// the pool already ends in whitespace.
public void AppendWhiteSpace ()
{
	if (last_was_structural_break)
		return;

	if (Debug)
		Logger.Log.Debug ("AppendWhiteSpace ()");

	if (! NeedsWhiteSpace (textPool))
		return;

	textPool.Add (" ");

	if (snippetWriter != null)
		snippetWriter.Write (" ");

	last_was_structural_break = false;
}
// Adds prop to the property pool.  A null property, or one whose
// Value is null or empty, is ignored.
public void AddProperty (Property prop)
{
	if (prop == null)
		return;

	if (prop.Value == null || prop.Value == "")
		return;

	propertyPool.Add (prop);
}
// Marks a structural break (e.g. the end of a line or paragraph).
public void AppendStructuralBreak ()
{
	// The snippet writer gets a newline, but never two in a row.
	bool writer_wants_newline = snippetWriter != null && ! last_was_structural_break;
	if (writer_wants_newline) {
		snippetWriter.WriteLine ();
		last_was_structural_break = true;
	}

	// Where the snippet gets a newline, the text pool gets a blank,
	// unless it already ends in whitespace.
	if (! NeedsWhiteSpace (textPool))
		return;

	textPool.Add (" ");
}
347 //////////////////////////
// Set by Finished (); reset by Open (FileSystemInfo).
private bool isFinished = false;

public bool IsFinished {
	get {
		return this.isFinished;
	}
}

// Signals that the filter has nothing more to produce.
protected void Finished ()
{
	this.isFinished = true;
}

// Set by Error ().
private bool has_error = false;

public bool HasError {
	get {
		return this.has_error;
	}
}

// Signals that filtering failed.
protected void Error ()
{
	// Force the clean-up of temporary files on an error.
	Cleanup ();
	this.has_error = true;
}
372 //////////////////////////
// Dispatches to the FileInfo or the DirectoryInfo overload depending
// on what info actually is; anything else is silently ignored.
protected virtual void DoOpen (FileSystemInfo info)
{
	FileInfo file = info as FileInfo;
	if (file != null) {
		DoOpen (file);
		return;
	}

	DirectoryInfo directory = info as DirectoryInfo;
	if (directory != null)
		DoOpen (directory);
}

// The hooks below do nothing by default (DoPull () excepted) and
// are meant to be overridden by derived filters.

protected virtual void DoOpen (FileInfo info)
{
}

protected virtual void DoOpen (DirectoryInfo info)
{
}

protected virtual void DoPullProperties ()
{
}

protected virtual void DoPullSetup ()
{
}

// By default there is nothing to pull, so finish right away.
protected virtual void DoPull ()
{
	Finished ();
}

protected virtual void DoClose ()
{
}
393 //////////////////////////
// Open () calls:
//   (1) DoOpen (FileInfo info) or DoOpen (DirectoryInfo info)
//   (2) DoPullProperties ()
//   (3) DoPullSetup ()
// At this point all properties must be in place.
//
// Once someone starts reading from the TextReader,
// the following are called:
//   DoPull ()  [until Finished () is called]
//   DoClose () [when finished]
// Path of the temporary file created by Open (TextReader) or
// Open (Stream); null when there is none.  Cleanup () deletes it.
private string tempFile = null;
private FileSystemInfo currentInfo = null;
private FileStream currentStream = null;
private StreamReader currentReader = null;

// Creates a new temporary file, remembers its path in tempFile and
// returns a stream for writing to it.  debug_format is the message
// logged in Debug mode; {0} is replaced by the file's path.
// If anything fails after the file was opened, the stream is closed
// again before the exception is passed on.
private FileStream CreateTempFileForWriting (string debug_format)
{
	tempFile = Path.GetTempFileName ();
	FileStream file_stream = File.OpenWrite (tempFile);

	try {
		if (Debug)
			Logger.Log.Debug (debug_format, tempFile);

		// When we dump contents into a file, we expect to use it
		// again soon.
		FileAdvise.PreLoad (file_stream);

		// Make sure the temporary file is only readable by the owner
		// (256 is 0400 octal).
		// FIXME: There is probably a race here.  Could some malicious program
		// do something to the file between creation and the chmod?
		Mono.Unix.Native.Syscall.chmod (tempFile, (Mono.Unix.Native.FilePermissions) 256);
	} catch {
		file_stream.Close ();
		throw;
	}

	return file_stream;
}

// Copies everything reader has to offer into a temporary file and
// opens that file with Open (FileSystemInfo), whose result is
// returned.  The reader itself is not closed.
// If the copy fails, the temporary file is removed and the
// exception is passed on to the caller.
public bool Open (TextReader reader)
{
	try {
		FileStream file_stream = CreateTempFileForWriting ("Storing text in tempFile {0}");

		// Closing the writer (which "using" does even on an
		// exception) also closes the streams underneath it.
		using (StreamWriter writer = new StreamWriter (new BufferedStream (file_stream))) {
			const int BUFFER_SIZE = 8192;
			char [] buffer = new char [BUFFER_SIZE];

			int read;
			while ((read = reader.Read (buffer, 0, BUFFER_SIZE)) > 0)
				writer.Write (buffer, 0, read);
		}
	} catch {
		// Do not leave a half-written temporary file behind.
		Cleanup ();
		throw;
	}

	return Open (new FileInfo (tempFile));
}

// Copies everything stream has to offer into a temporary file and
// opens that file with Open (FileSystemInfo), whose result is
// returned.  The stream itself is not closed.
// If the copy fails, the temporary file is removed and the
// exception is passed on to the caller.
public bool Open (Stream stream)
{
	try {
		FileStream file_stream = CreateTempFileForWriting ("Storing stream in tempFile {0}");

		// Closing the buffered stream (which "using" does even on an
		// exception) also closes the file stream underneath it.
		using (BufferedStream buffered_stream = new BufferedStream (file_stream)) {
			const int BUFFER_SIZE = 8192;
			byte [] buffer = new byte [BUFFER_SIZE];

			int read;
			while ((read = stream.Read (buffer, 0, BUFFER_SIZE)) > 0)
				buffered_stream.Write (buffer, 0, read);
		}
	} catch {
		// Do not leave a half-written temporary file behind.
		Cleanup ();
		throw;
	}

	return Open (new FileInfo (tempFile));
}
// Prepares the filter for pulling text out of the given file or
// directory: resets the pools, opens a read stream (files only) and
// runs DoOpen (), DoPullProperties () and DoPullSetup () in turn.
//
// Returns true if the filter is ready to be read from or has already
// finished, and false if it reported an error or threw an exception
// (which is logged, after which temporary files are cleaned up).
public bool Open (FileSystemInfo info)
{
	isFinished = false;
	textPool = new ArrayList ();
	hotPool = new ArrayList ();
	propertyPool = new ArrayList ();

	currentInfo = info;

	if (info is FileInfo) {
		// Open a stream for this file.
		currentStream = new FileStream (info.FullName,
						FileMode.Open,
						FileAccess.Read,
						FileShare.Read);

		if (preload) {
			// Our default assumption is sequential reads.
			// FIXME: Is this the right thing to do here?
			FileAdvise.IncreaseReadAhead (currentStream);

			// Give the OS a hint that we will be reading this
			// file soon.
			FileAdvise.PreLoad (currentStream);
		}
	}

	try {
		DoOpen (info);

		if (IsFinished)
			return true;
		else if (HasError)
			return false;

		DoPullProperties ();

		if (IsFinished)
			return true;
		else if (HasError)
			return false;

		// Reset our TextReader so that the next access to the
		// TextReader property creates a fresh one.  The reader must
		// not be closed here: closing a StreamReader closes the
		// stream underneath it as well, and we still need that.
		if (currentReader != null) {
			currentReader.DiscardBufferedData ();
			currentReader = null;
		}

		// Seek back to the beginning of our stream.  There is no
		// stream to rewind when we were given a directory.
		if (currentStream != null)
			currentStream.Seek (0, SeekOrigin.Begin);

		DoPullSetup ();

		if (HasError)
			return false;
	} catch (Exception e) {
		Log.Warn (e, "Unable to filter {0}:", info.FullName);
		Cleanup (); // clean up temporary files on an exception
		return false;
	}

	return true;
}
// Opens the file or directory at path.  Returns false right away if
// path is neither an existing file nor an existing directory.
public bool Open (string path)
{
	if (File.Exists (path))
		return Open (new FileInfo (path));

	if (! Directory.Exists (path))
		return false;

	return Open (new DirectoryInfo (path));
}
// What was passed to Open (), as a file; null if it was not a file.
public FileInfo FileInfo {
	get {
		return currentInfo as FileInfo;
	}
}

// What was passed to Open (), as a directory; null if it was not a
// directory.
public DirectoryInfo DirectoryInfo {
	get {
		return currentInfo as DirectoryInfo;
	}
}

public Stream Stream {
	get {
		return currentStream;
	}
}

// A reader on top of the current stream.  It is created on first
// use; null is returned as long as there is no stream.
public TextReader TextReader {
	get {
		if (currentReader != null)
			return currentReader;

		if (currentStream != null)
			currentReader = new StreamReader (currentStream);

		return currentReader;
	}
}
// Runs one DoPull () step.  Returns false, without pulling, once the
// filter is finished or has failed (it is closed in that case), and
// also when the step itself ended in an error.
private bool Pull ()
{
	bool nothing_more_to_do = IsFinished || HasError;
	if (nothing_more_to_do) {
		Close ();
		return false;
	}

	DoPull ();

	return ! HasError;
}
// Releases everything Open () acquired: the temporary file, the
// stream together with its reader, and the snippet writer.
// Temporary files are removed on every call; the rest is skipped
// when there is no open stream.
private void Close ()
{
	Cleanup ();

	if (currentStream == null)
		return;

	try {
		DoClose ();

		// When crawling, give the OS a hint that we don't
		// need to keep this file around in the page cache.
		if (CrawlMode)
			FileAdvise.FlushCache (currentStream);
	} finally {
		// Release the stream, the reader and the snippet writer
		// even if DoClose () or the cache flush throws.
		if (currentReader != null) {
			currentReader.Close ();
			// Forget the closed reader, so that the TextReader
			// property never hands it out again.
			currentReader = null;
		}

		currentStream.Close ();
		currentStream = null;

		if (snippetWriter != null)
			snippetWriter.Close ();
	}
}
// Deletes the temporary file created by Open (TextReader) or
// Open (Stream), if there is one.  Calling this more than once is
// harmless.  A failure to delete the file is logged and never
// propagated, as this runs on error paths.
public void Cleanup ()
{
	if (tempFile == null)
		return;

	try {
		File.Delete (tempFile);
	} catch (Exception ex) {
		// Best effort only, but leave a trace of what went wrong.
		Logger.Log.Debug (ex, "Unable to delete temporary file {0}", tempFile);
	}

	tempFile = null;
}
// Moves the strings collected in array over into sb, pulling from
// the filter first for as long as array is empty and Pull ()
// succeeds.  Returns true if anything was moved.
private bool PullFromArray (ArrayList array, StringBuilder sb)
{
	while (array.Count == 0) {
		if (! Pull ())
			break;
	}

	// FIXME: Do we want to try to extract as much data as
	// possible from the filter if we get an error, or
	// should we just give up afterward entirely?

	if (array.Count == 0)
		return false;

	for (int i = 0; i < array.Count; ++i)
		sb.Append ((string) array [i]);

	array.Clear ();
	return true;
}
// Same as PullFromArray (), except that an exception thrown while
// pulling is logged and reported as "nothing pulled" (false).
private bool PullTextCarefully (ArrayList array, StringBuilder sb)
{
	try {
		return PullFromArray (array, sb);
	} catch (Exception ex) {
		Logger.Log.Debug (ex, "Caught exception while pulling text in filter '{0}'", Name);
		return false;
	}
}
// Pulls the pending normal text into sb.
private bool PullText (StringBuilder sb)
{
	bool pulled = PullTextCarefully (textPool, sb);
	return pulled;
}

// Pulls the pending hot text into sb.
private bool PullHotText (StringBuilder sb)
{
	bool pulled = PullTextCarefully (hotPool, sb);
	return pulled;
}
// Returns a reader that pulls the normal text out of this filter.
// The reader is tagged with this filter's Identifier.
public TextReader GetTextReader ()
{
	PullingReader.Pull pull_text = new PullingReader.Pull (PullText);
	PullingReader reader = new PullingReader (pull_text);
	reader.Identifier = Identifier;
	return reader;
}

// Returns a reader that pulls the hot text out of this filter.
// NOTE(review): unlike GetTextReader (), no Identifier is set on
// this reader; confirm whether that is intended.
public TextReader GetHotTextReader ()
{
	PullingReader.Pull pull_hot_text = new PullingReader.Pull (PullHotText);
	return new PullingReader (pull_hot_text);
}
// The properties collected through AddProperty () (the live pool,
// not a copy).
public IEnumerable Properties {
	get {
		return this.propertyPool;
	}
}
687 //////////////////////////////
// This is used primarily for generating the URIs of the child
// indexables that can be created as a result of the filtering
// process.
private Uri uri = null;

public Uri Uri {
	get {
		return this.uri;
	}
	set {
		this.uri = value;
	}
}
700 //////////////////////////////
// Indexables produced as a by-product of filtering.
private ArrayList child_indexables = new ArrayList ();

// Queues a single child indexable.
protected void AddChildIndexable (Indexable indexable)
{
	child_indexables.Add (indexable);
}

// Queues a whole collection of child indexables at once.
protected void AddChildIndexables (ICollection indexables)
{
	child_indexables.AddRange (indexables);
}
714 public ArrayList ChildIndexables {
715 get { return this.child_indexables; }