Merge the recent changes from HEAD onto the branch
[beagle.git] / beagled / Filter.cs
blobf51286b58ca9eeebc1801b6ab2817cbd474e411f
1 //
2 // Filter.cs
3 //
4 // Copyright (C) 2004 Novell, Inc.
5 //
7 //
8 // Permission is hereby granted, free of charge, to any person obtaining a
9 // copy of this software and associated documentation files (the "Software"),
10 // to deal in the Software without restriction, including without limitation
11 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 // and/or sell copies of the Software, and to permit persons to whom the
13 // Software is furnished to do so, subject to the following conditions:
15 // The above copyright notice and this permission notice shall be included in
16 // all copies or substantial portions of the Software.
18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 // DEALINGS IN THE SOFTWARE.
28 using System;
29 using System.Collections;
30 using System.IO;
31 using System.Text;
32 using System.Reflection;
34 using Beagle.Util;
36 namespace Beagle.Daemon {
38 public class Filter {
// Enables the verbose Logger.Log.Debug output emitted by this class.
private static bool Debug = false;

// Lucene fields allow a maximum of 10000 words.  Some of those words
// will be stop words, so use a failsafe ceiling of 40000 words and
// do not accept more words than that.
private const int MAXWORDS = 40000; // Lucene.Net.Index.IndexWriter.DEFAULT_MAX_FIELD_LENGTH * 4
// Derived classes must always provide a constructor that
// takes no arguments.
public Filter ()
{
}
50 //////////////////////////
// Caller-assigned label for this filter instance; GetTextReader ()
// copies it onto the reader it hands out.
private string identifier;

public string Identifier {
	get {
		return identifier;
	}
	set {
		identifier = value;
	}
}
59 //////////////////////////
// Flavors registered through AddSupportedFlavor ().
private ArrayList supported_flavors = new ArrayList ();

// Registers one more flavor that this filter can handle.
protected void AddSupportedFlavor (FilterFlavor flavor)
{
	supported_flavors.Add (flavor);
}

// The live list of registered flavors (not a copy).
public ICollection SupportedFlavors {
	get {
		return supported_flavors;
	}
}
72 //////////////////////////
// Filters are versioned.  This allows us to automatically re-index
// files when a newer filter is available.

// The filter's name is the short name of its runtime type.
public string Name {
	get {
		Type runtime_type = this.GetType ();
		return runtime_type.Name;
	}
}
// -1 means "never set"; see SetVersion ().
private int version = -1;

// Reports 0 for as long as no version has been set.
public int Version {
	get {
		if (version < 0)
			return 0;
		return version;
	}
}
// Sets the filter version.  The version may be set only once and
// must be non-negative.
//
// Throws ArgumentOutOfRangeException when v is negative and
// InvalidOperationException when a version has already been set.
// Both derive from Exception, so callers that catch Exception
// keep working.
protected void SetVersion (int v)
{
	if (v < 0) {
		string msg;
		msg = String.Format ("Attempt to set invalid version {0} on Filter {1}", v, Name);
		throw new ArgumentOutOfRangeException ("v", v, msg);
	}

	// -1 is the "never set" marker the field is initialized with.
	if (version != -1) {
		string msg;
		msg = String.Format ("Attempt to re-set version from {0} to {1} on Filter {2}", version, v, Name);
		throw new InvalidOperationException (msg);
	}

	version = v;
}
106 //////////////////////////
private string this_mime_type = null;
private string this_extension = null;
private ArrayList indexable_properties = null;
private DateTime timestamp = DateTime.MinValue;

// Mime type associated with this filter instance; null until assigned.
public string MimeType {
	get {
		return this_mime_type;
	}
	set {
		this_mime_type = value;
	}
}

// File extension associated with this filter instance; null until assigned.
public string Extension {
	get {
		return this_extension;
	}
	set {
		this_extension = value;
	}
}

// Allow the filter to access the properties
// set by the indexable.
public ArrayList IndexableProperties {
	get {
		return indexable_properties;
	}
	set {
		indexable_properties = value;
	}
}

// Allow the filter to access the timestamp;
// sometimes filters know better.  Defaults to DateTime.MinValue.
public DateTime Timestamp {
	get {
		return timestamp;
	}
	set {
		timestamp = value;
	}
}
137 //////////////////////////
// Off by default; once enabled there is no way to turn it off again.
private bool crawl_mode = false;

public void EnableCrawlMode ()
{
	crawl_mode = true;
}

// True once EnableCrawlMode () has been called.
protected bool CrawlMode {
	get {
		return crawl_mode;
	}
}
150 //////////////////////////
// Filters which deal with big files, and that don't need
// to read in whole files, may want to set this to false
// to avoid wasting cycles in disk wait.

private bool preload = true;

// When true, Open () issues the read-ahead/preload hints on the file.
protected bool PreLoad {
	get {
		return preload;
	}
	set {
		preload = value;
	}
}
163 //////////////////////////
// Nesting counters: each *Up () is balanced by a *Down (), and the
// state is active while the matching counter is positive.
private int hotCount = 0;
private int freezeCount = 0;

public void HotUp ()
{
	hotCount++;
}

// Never lets the counter drop below zero.
public void HotDown ()
{
	if (hotCount <= 0)
		return;
	hotCount--;
}

// While hot, the one-argument AppendText () also adds its text
// to the hot pool.
public bool IsHot {
	get {
		return hotCount > 0;
	}
}

public void FreezeUp ()
{
	freezeCount++;
}

// Never lets the counter drop below zero.
public void FreezeDown ()
{
	if (freezeCount <= 0)
		return;
	freezeCount--;
}

// While frozen, the AppendText () variants ignore incoming text.
public bool IsFrozen {
	get {
		return freezeCount > 0;
	}
}
198 //////////////////////////
private bool snippetMode = false;
private bool originalIsText = false;
private TextWriter snippetWriter = null;

public bool SnippetMode {
	get {
		return snippetMode;
	}
	set {
		snippetMode = value;
	}
}

public bool OriginalIsText {
	get {
		return originalIsText;
	}
	set {
		originalIsText = value;
	}
}

// Remembers the writer that appended text is mirrored to.  The
// writer is ignored unless SnippetMode has been switched on first.
public void AttachSnippetWriter (TextWriter writer)
{
	if (!snippetMode)
		return;

	snippetWriter = writer;
}
220 //////////////////////////
// Pools filled while filtering; (re)created by Open (FileSystemInfo).
private ArrayList textPool;
private ArrayList hotPool;
private ArrayList propertyPool;

// Running totals, compared against MAXWORDS.
private int word_count = 0;
private int hotword_count = 0;

// True while the normal-text word budget has not been used up.
protected bool AllowMoreWords ()
{
	bool below_limit = MAXWORDS > word_count;
	return below_limit;
}
private bool last_was_structural_break = true;

// Characters that end a line of incoming text; covers "\n",
// "\r\n" and a lone "\r".
private static readonly char[] line_terminators = new char[] { '\r', '\n' };

// This two-arg AppendText () gives filters the flexibility to
// segregate the hot contents and the normal contents of a
// paragraph and call this method with the respective contents.
//
// str   : Holds both the normal contents and the hot contents.
// strHot: Holds only the hot contents.
//
// Ex:- suppose the actual content is "one <b>two</b> three"
//      str    = "one two three"
//      strHot = "two"
//
// NOTE: HotUp () or HotDown () has NO EFFECT on this variant
// of AppendText ().
//
// Returns the number of words counted in str, or 0 when str was
// not accepted (frozen, empty, or word budget exhausted).
public int AppendText (string str, string strHot)
{
	int num_words = 0;

	if (!IsFrozen && word_count < MAXWORDS && str != null && str != "") {
		// Only split (and allocate) when there is a line break.
		if (str.IndexOfAny (line_terminators) > -1) {
			// Splitting on both '\r' and '\n' makes "\r\n" yield
			// an empty piece, which is skipped below, so every
			// flavor of line ending produces one break per line.
			string[] lines = str.Split (line_terminators);
			foreach (string line in lines) {
				if (line.Length > 0) {
					ReallyAppendText (line, null);
					AppendStructuralBreak ();
				}
			}
		} else {
			ReallyAppendText (str, null);
		}

		num_words = StringFu.CountWords (str, 3, -1);
		word_count += num_words;
	}

	if (hotword_count < MAXWORDS) {
		// ReallyAppendText () itself drops hot text that is null,
		// empty, or arrives while frozen.
		ReallyAppendText (null, strHot);

		// Only charge the hot-word budget for text that was
		// actually added to the hot pool.
		if (!IsFrozen && strHot != null && strHot != "")
			hotword_count += StringFu.CountWords (strHot, 3, -1);
	}

	return num_words;
}
// Appends str as normal text and, while the filter is hot, as hot
// text too.  Returns what the two-argument AppendText () returns,
// or 0 when the filter is frozen or str is null/empty.
public int AppendText (string str)
{
	if (Debug)
		Logger.Log.Debug ("AppendText (\"{0}\")", str);

	if (IsFrozen || str == null || str == "")
		return 0;

	string hot_text = null;
	if (IsHot)
		hot_text = str;

	return AppendText (str, hot_text);
}
// Does the actual adding of text to the text and hot pools
// respectively.  Either argument may be null.
private void ReallyAppendText (string str, string strHot)
{
	bool have_hot_text = (strHot != null && strHot != "");
	if (have_hot_text && !IsFrozen) {
		// Hot fragments are stored trimmed, with a single
		// trailing separator.
		hotPool.Add (strHot.Trim () + " ");
	}

	if (str != null) {
		textPool.Add (str);

		// Mirror normal text into the snippet writer, if any.
		if (snippetWriter != null)
			snippetWriter.Write (str);
	}

	// Cleared on every call, whether or not any text was added.
	last_was_structural_break = false;
}
// Reports whether a separator has to be added after the current
// contents of array: true for an empty array, and true unless the
// last string ends in a whitespace character.
private bool NeedsWhiteSpace (ArrayList array)
{
	int count = array.Count;
	if (count == 0)
		return true;

	string tail = (string) array [count - 1];
	if (tail.Length == 0)
		return true;

	char final_char = tail [tail.Length - 1];
	return !char.IsWhiteSpace (final_char);
}
// Adds a single space to the text pool (and to the snippet writer)
// unless the last thing emitted was a structural break or the pool
// already ends in whitespace.
public void AppendWhiteSpace ()
{
	if (!last_was_structural_break) {
		if (Debug)
			Logger.Log.Debug ("AppendWhiteSpace ()");

		if (NeedsWhiteSpace (textPool)) {
			textPool.Add (" ");

			if (snippetWriter != null)
				snippetWriter.Write (" ");

			last_was_structural_break = false;
		}
	}
}
// Queues a property for the indexable.  Null properties and
// properties whose value is null or empty are silently dropped.
public void AddProperty (Property prop)
{
	if (prop == null)
		return;

	if (prop.Value == null || prop.Value == "")
		return;

	propertyPool.Add (prop);
}
// Marks a structural break (e.g. the end of a paragraph): writes a
// newline to the snippet writer, at most once per run of breaks,
// and makes sure the text pool ends in whitespace.
public void AppendStructuralBreak ()
{
	bool emit_newline = (snippetWriter != null) && !last_was_structural_break;
	if (emit_newline) {
		snippetWriter.WriteLine ();
		last_was_structural_break = true;
	}

	// When adding a "newline" to the text cache, we need to
	// append a "whitespace" to the text pool.
	if (!NeedsWhiteSpace (textPool))
		return;

	textPool.Add (" ");
}
355 //////////////////////////
// Set by Finished (); reset by Open (FileSystemInfo).
private bool isFinished = false;

public bool IsFinished {
	get {
		return isFinished;
	}
}

// Called by a filter to signal that it has no more data to produce.
protected void Finished ()
{
	isFinished = true;
}
// Sticky error flag raised by Error ().
private bool has_error = false;

public bool HasError {
	get {
		return has_error;
	}
}

// Called by a filter to report a failure.
protected void Error ()
{
	// Force the clean-up of temporary files on an error.
	Cleanup ();

	has_error = true;
}
380 //////////////////////////
// Default dispatcher: forwards to the FileInfo or DirectoryInfo
// overload depending on the runtime type of info.  Anything else
// (including null) is ignored.
protected virtual void DoOpen (FileSystemInfo info)
{
	FileInfo file = info as FileInfo;
	if (file != null) {
		DoOpen (file);
		return;
	}

	DirectoryInfo directory = info as DirectoryInfo;
	if (directory != null)
		DoOpen (directory);
}
// Hooks for derived filters.  All of them default to doing
// nothing, except DoPull (), which immediately reports that the
// filter is finished.

protected virtual void DoOpen (FileInfo info)
{
}

protected virtual void DoOpen (DirectoryInfo info)
{
}

protected virtual void DoPullProperties ()
{
}

protected virtual void DoPullSetup ()
{
}

protected virtual void DoPull ()
{
	Finished ();
}

protected virtual void DoClose ()
{
}
401 //////////////////////////
// Open () calls:
// (1) DoOpen (FileInfo info) or DoOpen (DirectoryInfo info)
// (2) DoPullProperties ()
// (3) DoPullSetup ()
// At this point all properties must be in place.
//
// Once someone starts reading from the TextReader,
// the following are called:
// DoPull () [until Finished () is called]
// DoClose () [when finished]
// Path of the scratch file created by Open (TextReader) and
// Open (Stream); removed again by Cleanup ().
private string tempFile = null;

// What is currently being filtered, plus the stream and the lazily
// created reader over it (both stay null for directories).
private FileSystemInfo currentInfo = null;
private FileStream currentStream = null;
private StreamReader currentReader = null;
// Dumps everything that can be read from reader into a temporary
// file and then opens that file with Open (FileSystemInfo).
//
// Returns the result of Open (FileSystemInfo).  If anything throws
// while the temporary file is being written, the stream is closed,
// the temporary file is removed and the exception is re-thrown.
public bool Open (TextReader reader)
{
	tempFile = Path.GetTempFileName ();

	try {
		using (FileStream file_stream = File.OpenWrite (tempFile)) {
			if (Debug)
				Logger.Log.Debug ("Storing text in tempFile {0}", tempFile);

			// When we dump the contents of a reader into a file, we
			// expect to use it again soon.
			FileAdvise.PreLoad (file_stream);

			// Make sure the temporary file is only readable by the owner
			// (256 == 0400 octal).
			// FIXME: There is probably a race here.  Could some malicious program
			// do something to the file between creation and the chmod?
			Mono.Unix.Native.Syscall.chmod (tempFile, (Mono.Unix.Native.FilePermissions) 256);

			// Disposing the writer flushes it and closes the
			// buffered stream and the file stream underneath.
			using (StreamWriter writer = new StreamWriter (new BufferedStream (file_stream))) {
				const int BUFFER_SIZE = 8192;
				char [] buffer = new char [BUFFER_SIZE];

				int read;
				while ((read = reader.Read (buffer, 0, BUFFER_SIZE)) > 0)
					writer.Write (buffer, 0, read);
			}
		}
	} catch {
		// Do not leave a half-written temporary file behind.
		Cleanup ();
		throw;
	}

	return Open (new FileInfo (tempFile));
}
// Copies everything that can be read from stream into a temporary
// file and then opens that file with Open (FileSystemInfo).
//
// Returns the result of Open (FileSystemInfo).  If anything throws
// while the temporary file is being written, the stream is closed,
// the temporary file is removed and the exception is re-thrown.
public bool Open (Stream stream)
{
	tempFile = Path.GetTempFileName ();

	try {
		using (FileStream file_stream = File.OpenWrite (tempFile)) {
			if (Debug)
				Logger.Log.Debug ("Storing stream in tempFile {0}", tempFile);

			// When we dump the contents of a stream into a file, we
			// expect to use it again soon.
			FileAdvise.PreLoad (file_stream);

			// Make sure the temporary file is only readable by the owner
			// (256 == 0400 octal).
			// FIXME: There is probably a race here.  Could some malicious program
			// do something to the file between creation and the chmod?
			Mono.Unix.Native.Syscall.chmod (tempFile, (Mono.Unix.Native.FilePermissions) 256);

			// Disposing the buffered stream flushes it and closes
			// the file stream underneath.
			using (BufferedStream buffered_stream = new BufferedStream (file_stream)) {
				const int BUFFER_SIZE = 8192;
				byte [] buffer = new byte [BUFFER_SIZE];

				int read;
				while ((read = stream.Read (buffer, 0, BUFFER_SIZE)) > 0)
					buffered_stream.Write (buffer, 0, read);
			}
		}
	} catch {
		// Do not leave a half-written temporary file behind.
		Cleanup ();
		throw;
	}

	return Open (new FileInfo (tempFile));
}
// Starts filtering the given file or directory: resets the pools,
// opens a read stream (files only) and runs DoOpen (),
// DoPullProperties () and DoPullSetup () in that order.
//
// Returns true when the filter is ready (or already finished) and
// false when the filter reported an error or threw an exception.
// Exceptions thrown while opening the file stream itself are not
// caught here.
public bool Open (FileSystemInfo info)
{
	isFinished = false;
	textPool = new ArrayList ();
	hotPool = new ArrayList ();
	propertyPool = new ArrayList ();

	currentInfo = info;

	if (info is FileInfo) {
		// Open a stream for this file.
		currentStream = new FileStream (info.FullName,
						FileMode.Open,
						FileAccess.Read,
						FileShare.Read);

		if (preload) {
			// Our default assumption is sequential reads.
			// FIXME: Is this the right thing to do here?
			FileAdvise.IncreaseReadAhead (currentStream);

			// Give the OS a hint that we will be reading this
			// file soon.
			FileAdvise.PreLoad (currentStream);
		}
	}

	try {
		DoOpen (info);

		if (IsFinished)
			return true;
		else if (HasError)
			return false;

		DoPullProperties ();

		if (IsFinished)
			return true;
		else if (HasError)
			return false;

		// Reset our TextReader.
		// Don't close the StreamReader, as
		// that would also close the stream.
		if (currentReader != null) {
			currentReader.DiscardBufferedData ();
		}

		// Seek back to the beginning of our stream.  Directories
		// have no stream, so there is nothing to rewind for them.
		if (currentStream != null)
			currentStream.Seek (0, SeekOrigin.Begin);

		DoPullSetup ();

		if (HasError)
			return false;
	} catch (Exception e) {
		Log.Warn (e, "Unable to filter {0}:", info.FullName);
		Cleanup (); // clean up temporary files on an exception
		return false;
	}

	return true;
}
// Opens the file or directory found at path.  Returns false when
// path names neither an existing file nor an existing directory.
public bool Open (string path)
{
	FileSystemInfo target = null;

	if (File.Exists (path))
		target = new FileInfo (path);
	else if (Directory.Exists (path))
		target = new DirectoryInfo (path);

	if (target == null)
		return false;

	return Open (target);
}
// The item being filtered, viewed as a file; null when it is not a file.
public FileInfo FileInfo {
	get {
		return currentInfo as FileInfo;
	}
}

// The item being filtered, viewed as a directory; null when it is
// not a directory.
public DirectoryInfo DirectoryInfo {
	get {
		return currentInfo as DirectoryInfo;
	}
}

// The raw stream over the current file; null when none is open.
public Stream Stream {
	get {
		return currentStream;
	}
}
// A reader over the current stream, created on first use and then
// reused.  Null when there is neither a reader nor a stream.
public TextReader TextReader {
	get {
		bool must_create = (currentReader == null) && (currentStream != null);
		if (must_create)
			currentReader = new StreamReader (currentStream);

		return currentReader;
	}
}
// Asks the filter for one more batch of data.  Returns false (after
// closing the filter) once it is finished or in error, and false
// when this very pull raised an error; true otherwise.
private bool Pull ()
{
	bool done = IsFinished || HasError;
	if (done) {
		Close ();
		return false;
	}

	DoPull ();

	return !HasError;
}
// Releases everything held for the current item: the temporary
// file, the reader, the stream and the snippet writer.  Does
// nothing beyond removing the temporary file when no stream is open.
private void Close ()
{
	Cleanup ();

	if (currentStream == null)
		return;

	DoClose ();

	// When crawling, give the OS a hint that we don't
	// need to keep this file around in the page cache.
	if (CrawlMode)
		FileAdvise.FlushCache (currentStream);

	if (currentReader != null) {
		currentReader.Close ();
		// Forget the closed reader so that a later Open () gets a
		// fresh one from the TextReader property instead of a
		// disposed reader that is still bound to the old stream.
		currentReader = null;
	}

	currentStream.Close ();
	currentStream = null;

	if (snippetWriter != null)
		snippetWriter.Close ();
}
// Removes the temporary file created by Open (TextReader) or
// Open (Stream), if there is one.  Failing to delete it is
// deliberately ignored.
public void Cleanup ()
{
	if (tempFile == null)
		return;

	try {
		File.Delete (tempFile);
	} catch (Exception) {
		// Just in case it is gone already.
	}

	tempFile = null;
}
// Keeps pulling from the filter until array holds something or the
// filter has nothing more to give, then moves the pooled strings
// into sb and empties the pool.  Returns false when nothing was
// available.
private bool PullFromArray (ArrayList array, StringBuilder sb)
{
	while (array.Count == 0) {
		if (!Pull ())
			break;
	}

	// FIXME: Do we want to try to extract as much data as
	// possible from the filter if we get an error, or
	// should we just give up afterward entirely?

	if (array.Count == 0)
		return false;

	for (int i = 0; i < array.Count; ++i)
		sb.Append ((string) array [i]);

	array.Clear ();
	return true;
}
// Same as PullFromArray (), except that an exception thrown while
// pulling is logged and reported as "nothing pulled".
private bool PullTextCarefully (ArrayList array, StringBuilder sb)
{
	try {
		return PullFromArray (array, sb);
	} catch (Exception ex) {
		Logger.Log.Debug (ex, "Caught exception while pulling text in filter '{0}'", Name);
		return false;
	}
}
// Pull callback that drains the normal-text pool into sb.
private bool PullText (StringBuilder sb)
{
	bool got_text = PullTextCarefully (textPool, sb);
	return got_text;
}

// Pull callback that drains the hot-text pool into sb.
private bool PullHotText (StringBuilder sb)
{
	bool got_text = PullTextCarefully (hotPool, sb);
	return got_text;
}
// Returns a reader that pulls the filter's normal text on demand.
// The reader is tagged with this filter's Identifier.
public TextReader GetTextReader ()
{
	PullingReader.Pull text_source = new PullingReader.Pull (PullText);
	PullingReader text_reader = new PullingReader (text_source);
	text_reader.Identifier = Identifier;
	return text_reader;
}

// Returns a reader that pulls the filter's hot text on demand.
public TextReader GetHotTextReader ()
{
	PullingReader.Pull hot_source = new PullingReader.Pull (PullHotText);
	return new PullingReader (hot_source);
}
// The properties collected through AddProperty () (the live pool,
// not a copy).
public IEnumerable Properties {
	get {
		return propertyPool;
	}
}
696 //////////////////////////////
// This is used primarily for the generation of URIs for the
// child indexables that can be created as a result of the
// filtering process.

private Uri uri = null;

public Uri Uri {
	get {
		return uri;
	}
	set {
		uri = value;
	}
}
709 //////////////////////////////
// Indexables produced as a by-product of filtering this item.
private ArrayList child_indexables = new ArrayList ();

// Queues a single child indexable.
protected void AddChildIndexable (Indexable indexable)
{
	child_indexables.Add (indexable);
}

// Queues every element of indexables, in order.
protected void AddChildIndexables (ICollection indexables)
{
	child_indexables.AddRange (indexables);
}

// The live list of queued child indexables (not a copy).
public ArrayList ChildIndexables {
	get {
		return child_indexables;
	}
}
// Assembly-level attribute listing filter types; the list is
// handed straight to the TypeCacheAttribute base class.
[AttributeUsage (AttributeTargets.Assembly)]
public class FilterTypesAttribute : TypeCacheAttribute {

	public FilterTypesAttribute (params Type[] filter_types)
		: base (filter_types)
	{
	}
}