cvsimport
[beagle.git] / beagled / Filter.cs
blob816383b8a6ccc658bc410a0fda841a01e22c3f8d
1 //
2 // Filter.cs
3 //
4 // Copyright (C) 2004 Novell, Inc.
5 //
7 //
8 // Permission is hereby granted, free of charge, to any person obtaining a
9 // copy of this software and associated documentation files (the "Software"),
10 // to deal in the Software without restriction, including without limitation
11 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 // and/or sell copies of the Software, and to permit persons to whom the
13 // Software is furnished to do so, subject to the following conditions:
15 // The above copyright notice and this permission notice shall be included in
16 // all copies or substantial portions of the Software.
18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 // DEALINGS IN THE SOFTWARE.
28 using System;
29 using System.Collections;
30 using System.IO;
31 using System.Text;
32 using System.Reflection;
34 using Beagle.Util;
36 namespace Beagle.Daemon {
38 public class Filter {
// Enables the verbose Logger.Log.Debug () output emitted by this class.
private static bool Debug = false;

// Lucene fields allow a maximum of 10000 words.  Some of the words
// will be stop words, so use a failsafe maximum of 40000 words and
// do not accept more words than that.
// (Lucene.Net.Index.IndexWriter.DEFAULT_MAX_FIELD_LENGTH * 4)
private const int MAXWORDS = 40000;

// Derived classes must always have a constructor that takes
// no arguments.
public Filter ()
{
}
50 //////////////////////////
// Backing store for Identifier.
private string identifier;

// Identifier of this filter instance.  GetTextReader () copies it
// onto the reader it hands out.
public string Identifier {
	get {
		return this.identifier;
	}
	set {
		this.identifier = value;
	}
}
59 //////////////////////////
// Flavors registered through AddSupportedFlavor ().
private ArrayList supported_flavors = new ArrayList ();

// Registers a flavor as one this filter can handle.
protected void AddSupportedFlavor (FilterFlavor flavor)
{
	this.supported_flavors.Add (flavor);
}

// The flavors registered so far.  This is the live list, not a copy.
public ICollection SupportedFlavors {
	get {
		return this.supported_flavors;
	}
}
72 //////////////////////////
// Filters are versioned.  This allows us to automatically re-index
// files when a newer filter is available.

// The filter's name: the type name of the concrete filter class.
public string Name {
	get {
		Type concrete_type = this.GetType ();
		return concrete_type.Name;
	}
}

// -1 means that SetVersion () has not been called yet.
private int version = -1;

// The filter's version; reported as 0 while no version has been set.
public int Version {
	get {
		if (version < 0)
			return 0;

		return version;
	}
}
// Sets the filter's version.  The version must be non-negative and
// may be set only once per instance.
//
// Throws ArgumentOutOfRangeException when v is negative, and
// InvalidOperationException when a version has already been set.
// Both derive from Exception, so callers that catch Exception are
// unaffected.
protected void SetVersion (int v)
{
	if (v < 0) {
		string msg;
		msg = String.Format ("Attempt to set invalid version {0} on Filter {1}", v, Name);
		throw new ArgumentOutOfRangeException ("v", msg);
	}

	// -1 is the "never set" marker; anything else means that
	// SetVersion () already ran for this instance.
	if (version != -1) {
		string msg;
		msg = String.Format ("Attempt to re-set version from {0} to {1} on Filter {2}", version, v, Name);
		throw new InvalidOperationException (msg);
	}

	version = v;
}
106 //////////////////////////
private string this_mime_type = null;
private string this_extension = null;
private ArrayList indexable_properties = null;
private DateTime timestamp = DateTime.MinValue;

// Mime type associated with this filter instance (null until set).
public string MimeType {
	get {
		return this.this_mime_type;
	}
	set {
		this.this_mime_type = value;
	}
}

// File extension associated with this filter instance (null until set).
public string Extension {
	get {
		return this.this_extension;
	}
	set {
		this.this_extension = value;
	}
}

// Allow the filter to access the properties
// set by the indexable.
public ArrayList IndexableProperties {
	get {
		return this.indexable_properties;
	}
	set {
		this.indexable_properties = value;
	}
}

// Allow the filter to access the timestamp;
// sometimes filters know better.  Defaults to DateTime.MinValue.
public DateTime Timestamp {
	get {
		return this.timestamp;
	}
	set {
		this.timestamp = value;
	}
}
137 //////////////////////////
// Once set, crawl mode stays on for the life of the instance.
private bool crawl_mode = false;

// Switches the filter into crawl mode.
public void EnableCrawlMode ()
{
	this.crawl_mode = true;
}

// True once EnableCrawlMode () has been called.  Close () uses it
// to decide whether to flush the file from the page cache.
protected bool CrawlMode {
	get {
		return this.crawl_mode;
	}
}
150 //////////////////////////
// Filters which deal with big files, and that don't need
// to read in whole files, may want to set this to false
// to avoid wasting cycles in disk wait.

private bool preload = true;

// Whether Open (FileSystemInfo) issues the read-ahead and preload
// hints for the file it opens.  Defaults to true.
protected bool PreLoad {
	get {
		return this.preload;
	}
	set {
		this.preload = value;
	}
}
163 //////////////////////////
// Nesting depth counters; neither ever drops below zero.
private int hotCount = 0;
private int freezeCount = 0;

// Enters one more level of "hot" text.
public void HotUp ()
{
	hotCount++;
}

// Leaves one level of "hot" text; a no-op when not hot.
public void HotDown ()
{
	if (hotCount <= 0)
		return;

	hotCount--;
}

// True while at least one HotUp () is not yet matched by a HotDown ().
public bool IsHot {
	get {
		return hotCount >= 1;
	}
}

// Enters one more level of "frozen" state.
public void FreezeUp ()
{
	freezeCount++;
}

// Leaves one level of "frozen" state; a no-op when not frozen.
public void FreezeDown ()
{
	if (freezeCount <= 0)
		return;

	freezeCount--;
}

// True while at least one FreezeUp () is not yet matched by a
// FreezeDown ().  While frozen, AppendText () ignores its input.
public bool IsFrozen {
	get {
		return freezeCount >= 1;
	}
}
198 //////////////////////////
private bool snippetMode = false;
private bool originalIsText = false;
private TextWriter snippetWriter = null;

// When true, AttachSnippetWriter () accepts a writer.
public bool SnippetMode {
	get {
		return this.snippetMode;
	}
	set {
		this.snippetMode = value;
	}
}

// Flag recording whether the original content is text.
public bool OriginalIsText {
	get {
		return this.originalIsText;
	}
	set {
		this.originalIsText = value;
	}
}

// Attaches the writer that receives a copy of the appended text.
// The writer is ignored unless SnippetMode is on.
public void AttachSnippetWriter (TextWriter writer)
{
	if (!snippetMode)
		return;

	snippetWriter = writer;
}
220 //////////////////////////
// Pools filled by the Append* / AddProperty methods.  They are
// (re)created by Open (FileSystemInfo).
private ArrayList textPool;
private ArrayList hotPool;
private ArrayList propertyPool;

// Running totals of the words accepted into the text and hot pools.
private int word_count = 0;
private int hotword_count = 0;

// True while the text word limit (MAXWORDS) has not been reached.
protected bool AllowMoreWords ()
{
	bool below_limit = word_count < MAXWORDS;
	return below_limit;
}

// Starts out true so that no whitespace is emitted before any text.
private bool last_was_structural_break = true;
/*
 * This two-arg AppendText () gives filters the flexibility to
 * segregate the hot contents and the normal contents of a
 * paragraph and to call this method with the respective contents.
 *
 * str   : Holds both the normal contents and the hot contents.
 * strHot: Holds only the hot contents.
 * Both arguments can be null.
 *
 * Ex:- suppose the actual content is "one <b>two</b> three"
 *      str    = "one two three"
 *      strHot = "two"
 *
 * NOTE: HotUp () or HotDown () has NO EFFECT on this variant
 * of AppendText ().
 *
 * Returns the word count reported by StringFu.CountWords () for
 * str, or 0 when str was not accepted (filter frozen, word limit
 * reached, or str null/empty).
 */
public int AppendText (string str, string strHot)
{
	int num_words = 0;

	if (!IsFrozen && word_count < MAXWORDS && str != null && str != String.Empty) {
		// Only split when a line terminator is present, to avoid
		// allocating an array for the common single-line case.
		if (str.IndexOf ('\n') > -1 || str.IndexOf ('\r') > -1) {
			// Splitting on both characters handles "\n", "\r" and
			// "\r\n"; the empty piece between '\r' and '\n' is
			// dropped by the length check below, so no stray
			// carriage returns reach the pool or the snippet.
			string[] lines = str.Split ('\n', '\r');
			foreach (string line in lines) {
				if (line.Length > 0) {
					ReallyAppendText (line, null);
					AppendStructuralBreak ();
				}
			}
		} else
			ReallyAppendText (str, null);

		num_words = StringFu.CountWords (str, 3, -1);
		word_count += num_words;
	}

	// Only count hot words that ReallyAppendText () will actually
	// add: it ignores null/empty hot text and anything passed
	// while the filter is frozen.
	if (!IsFrozen && hotword_count < MAXWORDS && strHot != null && strHot != String.Empty) {
		ReallyAppendText (null, strHot);
		hotword_count += StringFu.CountWords (strHot, 3, -1);
	}

	return num_words;
}
/*
 * Appends text to the text pool.  If IsHot is true, the same text
 * is also added to the hot text pool.  Handles null str.
 *
 * Returns whatever the two-argument AppendText () returns, or 0
 * when the filter is frozen or str is null/empty.
 */
public int AppendText (string str)
{
	if (Debug)
		Logger.Log.Debug ("AppendText (\"{0}\")", str);

	if (IsFrozen || str == null || str == String.Empty)
		return 0;

	string hot_text = null;
	if (IsHot)
		hot_text = str;

	return AppendText (str, hot_text);
}
// Does the actual adding to the text and hot pools.  Either argument
// may be null.  Hot text is trimmed and stored with one trailing
// space, and is dropped while the filter is frozen.  Normal text is
// stored as-is and echoed to the snippet writer, if one is attached.
private void ReallyAppendText (string str, string strHot)
{
	bool have_hot_text = strHot != null && strHot != String.Empty;
	if (have_hot_text && !IsFrozen)
		hotPool.Add (strHot.Trim() + " ");

	if (str == null)
		return;

	textPool.Add (str);

	if (snippetWriter != null)
		snippetWriter.Write (str);

	last_was_structural_break = false;
}
// Returns false only when the last string in the array ends with
// a whitespace character; an empty array or an empty last string
// yields true.
private bool NeedsWhiteSpace (ArrayList array)
{
	int last_index = array.Count - 1;
	if (last_index < 0)
		return true;

	string tail = (string) array [last_index];
	bool ends_with_space = tail.Length > 0
		&& char.IsWhiteSpace (tail [tail.Length - 1]);

	return !ends_with_space;
}
/*
 * Adds a single space to the text pool (and to the snippet writer,
 * if attached).  Nothing is added right after a structural break
 * or when the pool already ends in whitespace.
 */
public void AppendWhiteSpace ()
{
	if (last_was_structural_break)
		return;

	if (Debug)
		Logger.Log.Debug ("AppendWhiteSpace ()");

	if (!NeedsWhiteSpace (textPool))
		return;

	textPool.Add (" ");

	if (snippetWriter != null)
		snippetWriter.Write (" ");

	last_was_structural_break = false;
}
/*
 * Adds property prop to the property pool.
 * prop can be null or can have a null or empty value; in those
 * cases nothing is added.
 */
public void AddProperty (Property prop)
{
	if (prop == null)
		return;

	if (prop.Value == null || prop.Value == String.Empty)
		return;

	propertyPool.Add (prop);
}
/*
 * Creates a new paragraph.  Mainly useful for storing cached contents.
 */
public void AppendStructuralBreak ()
{
	// The newline goes to the snippet writer only, and only once
	// per run of consecutive breaks.
	bool write_newline = snippetWriter != null && !last_was_structural_break;
	if (write_newline) {
		snippetWriter.WriteLine ();
		last_was_structural_break = true;
	}

	// When adding a "newline" to the text cache, we need to
	// append a "whitespace" to the text pool.
	if (!NeedsWhiteSpace (textPool))
		return;

	textPool.Add (" ");
}
370 //////////////////////////
private bool isFinished = false;

// True once the filter has called Finished ().  Reset to false by
// Open (FileSystemInfo).
public bool IsFinished {
	get {
		return this.isFinished;
	}
}

// Called by a filter to signal that it has no more data to produce.
protected void Finished ()
{
	this.isFinished = true;
}

private bool has_error = false;

// True once the filter has called Error ().
public bool HasError {
	get {
		return this.has_error;
	}
}

// Called by a filter to signal a failure.
protected void Error ()
{
	// Force the clean-up of temporary files on an error.
	Cleanup ();
	this.has_error = true;
}
395 //////////////////////////
// Default dispatcher: forwards to the FileInfo or DirectoryInfo
// overload depending on the run-time type of info.  Anything else
// (including null) is ignored.
protected virtual void DoOpen (FileSystemInfo info)
{
	FileInfo file = info as FileInfo;
	if (file != null) {
		DoOpen (file);
		return;
	}

	DirectoryInfo directory = info as DirectoryInfo;
	if (directory != null)
		DoOpen (directory);
}

// Hook: open a file.  Does nothing by default.
protected virtual void DoOpen (FileInfo info)
{
}

// Hook: open a directory.  Does nothing by default.
protected virtual void DoOpen (DirectoryInfo info)
{
}

// Hook: extract properties.  Does nothing by default.
protected virtual void DoPullProperties ()
{
}

// Hook: prepare for pulling text.  Does nothing by default.
protected virtual void DoPullSetup ()
{
}

// Hook: produce more text.  By default there is nothing to
// produce, so the filter is immediately marked as finished.
protected virtual void DoPull ()
{
	Finished ();
}

// Hook: release whatever DoOpen () acquired.  Does nothing by default.
protected virtual void DoClose ()
{
}
416 //////////////////////////
/*
  Open () calls:
  (1) DoOpen (FileSystemInfo), which by default dispatches to
      DoOpen (FileInfo) or DoOpen (DirectoryInfo)
  (2) DoPullProperties ()
  (3) DoPullSetup ()
  At this point all properties must be in place

  Once someone starts reading from the TextReader,
  the following are called:
  DoPull () [until Finished () is called]
  DoClose () [when finished]
*/

// Path of the temporary copy made by Open (TextReader) and
// Open (Stream); null when there is none.
private string tempFile = null;

// What is currently being filtered, and the stream/reader over it.
private FileSystemInfo currentInfo = null;
private FileStream currentStream = null;
private StreamReader currentReader = null;
// Copies everything that can be read from reader into a fresh
// temporary file and then opens that file through
// Open (FileSystemInfo).  Returns that call's result.
//
// If the copy fails, the temporary file is closed and deleted
// before the exception is passed on to the caller.
public bool Open (TextReader reader)
{
	tempFile = Path.GetTempFileName ();

	try {
		// The using blocks make sure the file is closed even when
		// reading from the reader or writing to disk throws.
		using (FileStream file_stream = File.OpenWrite (tempFile)) {
			if (Debug)
				Logger.Log.Debug ("Storing text in tempFile {0}", tempFile);

			// When we dump the contents of a reader into a file, we
			// expect to use it again soon.
			FileAdvise.PreLoad (file_stream);

			// Make sure the temporary file is only readable by the owner
			// (256 == 0400 octal).
			// FIXME: There is probably a race here.  Could some malicious program
			// do something to the file between creation and the chmod?
			Mono.Unix.Native.Syscall.chmod (tempFile, (Mono.Unix.Native.FilePermissions) 256);

			using (StreamWriter writer = new StreamWriter (new BufferedStream (file_stream))) {
				const int BUFFER_SIZE = 8192;
				char [] buffer = new char [BUFFER_SIZE];

				int read;
				while ((read = reader.Read (buffer, 0, BUFFER_SIZE)) > 0)
					writer.Write (buffer, 0, read);
			}
		}
	} catch {
		// Do not leave a half-written temporary file behind.
		Cleanup ();
		throw;
	}

	return Open (new FileInfo (tempFile));
}
// Copies everything that can be read from stream into a fresh
// temporary file and then opens that file through
// Open (FileSystemInfo).  Returns that call's result.
//
// If the copy fails, the temporary file is closed and deleted
// before the exception is passed on to the caller.
public bool Open (Stream stream)
{
	tempFile = Path.GetTempFileName ();

	try {
		// The using blocks make sure the file is closed even when
		// reading from the stream or writing to disk throws.
		using (FileStream file_stream = File.OpenWrite (tempFile)) {
			if (Debug)
				Logger.Log.Debug ("Storing stream in tempFile {0}", tempFile);

			// When we dump the contents of a stream into a file, we
			// expect to use it again soon.
			FileAdvise.PreLoad (file_stream);

			// Make sure the temporary file is only readable by the owner
			// (256 == 0400 octal).
			// FIXME: There is probably a race here.  Could some malicious program
			// do something to the file between creation and the chmod?
			Mono.Unix.Native.Syscall.chmod (tempFile, (Mono.Unix.Native.FilePermissions) 256);

			using (BufferedStream buffered_stream = new BufferedStream (file_stream)) {
				const int BUFFER_SIZE = 8192;
				byte [] buffer = new byte [BUFFER_SIZE];

				int read;
				while ((read = stream.Read (buffer, 0, BUFFER_SIZE)) > 0)
					buffered_stream.Write (buffer, 0, read);
			}
		}
	} catch {
		// Do not leave a half-written temporary file behind.
		Cleanup ();
		throw;
	}

	return Open (new FileInfo (tempFile));
}
// Starts filtering the given file or directory: resets the pools,
// opens a read stream when info is a file, and runs DoOpen (),
// DoPullProperties () and DoPullSetup () in that order.
//
// Returns true when the filter finished early or the setup
// completed, and false when the filter reported an error or an
// exception was thrown by one of the hooks (the exception is
// logged and temporary files are cleaned up).
public bool Open (FileSystemInfo info)
{
	isFinished = false;
	textPool = new ArrayList ();
	hotPool = new ArrayList ();
	propertyPool = new ArrayList ();

	currentInfo = info;

	if (info is FileInfo) {
		// Open a stream for this file.
		currentStream = new FileStream (info.FullName,
						FileMode.Open,
						FileAccess.Read,
						FileShare.Read);

		if (preload) {
			// Our default assumption is sequential reads.
			// FIXME: Is this the right thing to do here?
			FileAdvise.IncreaseReadAhead (currentStream);

			// Give the OS a hint that we will be reading this
			// file soon.
			FileAdvise.PreLoad (currentStream);
		}
	}

	try {
		DoOpen (info);

		if (IsFinished)
			return true;
		else if (HasError)
			return false;

		DoPullProperties ();

		if (IsFinished)
			return true;
		else if (HasError)
			return false;

		// Reset our TextReader.
		// Don't close the StreamReader, as
		// that would also close the stream.
		if (currentReader != null) {
			currentReader.DiscardBufferedData ();
		}

		// Seek back to the beginning of our stream.  No stream is
		// opened for directories, so there may be nothing to rewind;
		// seeking unconditionally made every such Open () fail with
		// a NullReferenceException.
		if (currentStream != null)
			currentStream.Seek (0, SeekOrigin.Begin);

		DoPullSetup ();

		if (HasError)
			return false;
	} catch (Exception e) {
		Log.Warn (e, "Unable to filter {0}:", info.FullName);
		Cleanup (); // clean up temporary files on an exception
		return false;
	}

	return true;
}
// Opens the file or directory at path.  Returns false when the
// path is neither an existing file nor an existing directory;
// otherwise returns the result of Open (FileSystemInfo).
public bool Open (string path)
{
	FileSystemInfo target = null;

	if (File.Exists (path))
		target = new FileInfo (path);
	else if (Directory.Exists (path))
		target = new DirectoryInfo (path);

	if (target == null)
		return false;

	return Open (target);
}
// The item being filtered, if it is a file; null otherwise.
public FileInfo FileInfo {
	get {
		return this.currentInfo as FileInfo;
	}
}

// The item being filtered, if it is a directory; null otherwise.
public DirectoryInfo DirectoryInfo {
	get {
		return this.currentInfo as DirectoryInfo;
	}
}

// The stream over the file being filtered; null when none is open.
public Stream Stream {
	get {
		return this.currentStream;
	}
}

// A reader over Stream, created on first use.  Null as long as
// there is no open stream to read from.
public TextReader TextReader {
	get {
		bool must_create = currentReader == null && currentStream != null;
		if (must_create)
			currentReader = new StreamReader (currentStream);

		return currentReader;
	}
}
// Asks the filter for more data.  Returns false, after closing the
// filter, when it is already finished or in error; otherwise runs
// DoPull () and returns whether it completed without an error.
private bool Pull ()
{
	if (IsFinished || HasError) {
		Close ();
		return false;
	}

	DoPull ();

	return !HasError;
}
// Removes the temporary file and, if a stream is still open, shuts
// the filter down: DoClose (), then the reader, the stream and the
// snippet writer.  Once the stream has been released, later calls
// only repeat the temporary file clean-up.
private void Close ()
{
	Cleanup ();

	if (currentStream != null) {
		DoClose ();

		// When crawling, give the OS a hint that we don't
		// need to keep this file around in the page cache.
		if (CrawlMode)
			FileAdvise.FlushCache (currentStream);

		if (currentReader != null)
			currentReader.Close ();

		currentStream.Close ();
		currentStream = null;

		if (snippetWriter != null)
			snippetWriter.Close ();
	}
}
// Deletes the temporary file created by Open (TextReader) or
// Open (Stream), if there is one, and forgets about it.  Safe to
// call repeatedly.  Never throws: a failed delete is only logged.
public void Cleanup ()
{
	if (tempFile == null)
		return;

	try {
		File.Delete (tempFile);
	} catch (Exception ex) {
		// Deleting is best-effort (the file may be gone already),
		// but leave a trace instead of hiding the failure.
		Logger.Log.Debug (ex, "Unable to delete tempFile {0}", tempFile);
	}

	tempFile = null;
}
// Pulls from the filter until the given pool has content or the
// filter has nothing more to give, then moves the pool's strings
// into sb.  Returns true if anything was appended.
private bool PullFromArray (ArrayList array, StringBuilder sb)
{
	while (array.Count == 0) {
		if (!Pull ())
			break;
	}

	// FIXME: Do we want to try to extract as much data as
	// possible from the filter if we get an error, or
	// should we just give up afterward entirely?

	if (array.Count == 0)
		return false;

	for (int i = 0; i < array.Count; ++i)
		sb.Append ((string) array [i]);

	array.Clear ();
	return true;
}
// Same as PullFromArray (), except that an exception thrown while
// pulling is logged and reported as "nothing pulled".
private bool PullTextCarefully (ArrayList array, StringBuilder sb)
{
	try {
		return PullFromArray (array, sb);
	} catch (Exception ex) {
		Logger.Log.Debug (ex, "Caught exception while pulling text in filter '{0}'", Name);
		return false;
	}
}
// Pull callback for the normal text pool.
private bool PullText (StringBuilder sb)
{
	bool pulled = PullTextCarefully (textPool, sb);
	return pulled;
}

// Pull callback for the hot text pool.
private bool PullHotText (StringBuilder sb)
{
	bool pulled = PullTextCarefully (hotPool, sb);
	return pulled;
}
// Returns a reader fed by the normal text pool.  The reader
// carries this filter's Identifier.
public TextReader GetTextReader ()
{
	PullingReader.Pull text_source = new PullingReader.Pull (PullText);
	PullingReader pr = new PullingReader (text_source);
	pr.Identifier = Identifier;
	return pr;
}

// Returns a reader fed by the hot text pool.
public TextReader GetHotTextReader ()
{
	PullingReader.Pull hot_source = new PullingReader.Pull (PullHotText);
	return new PullingReader (hot_source);
}
// The properties collected through AddProperty () (the live pool,
// not a copy).
public IEnumerable Properties {
	get {
		return this.propertyPool;
	}
}
711 //////////////////////////////
// This is used primarily for the generation of URIs for the
// child indexables that can be created as a result of the
// filtering process.

private Uri uri = null;

public Uri Uri {
	get {
		return this.uri;
	}
	set {
		this.uri = value;
	}
}

private Uri display_uri = null;

public Uri DisplayUri {
	get {
		return this.display_uri;
	}
	set {
		this.display_uri = value;
	}
}
731 //////////////////////////////
// Indexables produced as a by-product of filtering.
private ArrayList child_indexables = new ArrayList ();

// Queues a single child indexable.
protected void AddChildIndexable (Indexable indexable)
{
	child_indexables.Add (indexable);
}

// Queues every element of the given collection.
protected void AddChildIndexables (ICollection indexables)
{
	child_indexables.AddRange (indexables);
}

// The child indexables queued so far (the live list, not a copy).
public ArrayList ChildIndexables {
	get {
		return child_indexables;
	}
}
// Assembly-level attribute carrying a list of filter types; the
// list is handed unchanged to the TypeCacheAttribute base class.
[AttributeUsage (AttributeTargets.Assembly)]
public class FilterTypesAttribute : TypeCacheAttribute {
	public FilterTypesAttribute (params Type[] filter_types)
		: base (filter_types)
	{
	}
}