4 // Copyright (C) 2004 Novell, Inc.
8 // Permission is hereby granted, free of charge, to any person obtaining a
9 // copy of this software and associated documentation files (the "Software"),
10 // to deal in the Software without restriction, including without limitation
11 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 // and/or sell copies of the Software, and to permit persons to whom the
13 // Software is furnished to do so, subject to the following conditions:
15 // The above copyright notice and this permission notice shall be included in
16 // all copies or substantial portions of the Software.
18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 // DEALINGS IN THE SOFTWARE.
29 using System
.Collections
;
32 using System
.Reflection
;
36 namespace Beagle
.Daemon
{
40 static private bool Debug
= false;
41 // Lucene fields allow a maximum of 10000 words.
42 // Some of the words will be stop words, so use a failsafe maximum of 40000 words.
43 // Don't accept more words than that.
44 const int MAXWORDS
= 40000; // Lucene.Net.Index.IndexWriter.DEFAULT_MAX_FIELD_LENGTH * 4
46 // Derived classes always must have a constructor that
47 // takes no arguments.
50 //////////////////////////
52 private string identifier
;
// Gets or sets the value stored in the private `identifier` field.
54 public string Identifier
{
55 get { return identifier; }
56 set { identifier = value; }
59 //////////////////////////
61 private ArrayList supported_flavors
= new ArrayList ();
// Appends `flavor` to the supported_flavors list. Protected, so only this
// class and its subclasses can register flavors.
63 protected void AddSupportedFlavor (FilterFlavor flavor
)
65 supported_flavors
.Add (flavor
);
// Read-only view of the flavors registered through AddSupportedFlavor().
// Returns the backing ArrayList itself, not a copy.
68 public ICollection SupportedFlavors
{
69 get { return supported_flavors; }
72 //////////////////////////
74 // Filters are versioned. This allows us to automatically re-index
75 // files when a newer filter is available.
78 get { return this.GetType ().Name; }
81 private int version
= -1;
84 get { return version < 0 ? 0 : version; }
// Sets the filter's version from `v`.
// Two failure paths are visible, each throwing a plain System.Exception
// with a formatted message: one for an invalid version value and one for an
// attempt to change a version that has already been set.
// NOTE(review): the conditions guarding each throw and the final assignment
// to `version` are not visible in this excerpt -- confirm against the full file.
87 protected void SetVersion (int v
)
// Failure path 1: `v` is rejected as invalid.
91 msg
= String
.Format ("Attempt to set invalid version {0} on Filter {1}", v
, Name
);
92 throw new Exception (msg
);
// Failure path 2: a version was already recorded and must not be re-set.
97 msg
= String
.Format ("Attempt to re-set version from {0} to {1} on Filter {2}", version
, v
, Name
);
98 throw new Exception (msg
);
106 //////////////////////////
108 private string this_mime_type
= null;
109 private string this_extension
= null;
110 private ArrayList indexable_properties
= null;
111 private DateTime timestamp
= DateTime
.MinValue
;
// Gets or sets the value stored in this_mime_type (null until assigned).
113 public string MimeType
{
114 get { return this_mime_type; }
115 set { this_mime_type = value; }
// Gets or sets the value stored in this_extension (null until assigned).
118 public string Extension
{
119 get { return this_extension; }
120 set { this_extension = value; }
123 // Allow the filter to access the properties.
// Gets or sets the indexable_properties list (null until assigned); the
// getter hands out the stored ArrayList itself, not a copy.
125 public ArrayList IndexableProperties
{
126 get { return indexable_properties; }
127 set { indexable_properties = value; }
130 // Allow the filter to access the timestamp;
131 // sometimes filters know better.
// Backed by the `timestamp` field, which starts out as DateTime.MinValue.
132 public DateTime Timestamp
{
133 get { return timestamp; }
134 set { timestamp = value; }
137 //////////////////////////
139 private bool crawl_mode
= false;
141 public void EnableCrawlMode ()
// Read-only accessor for the crawl_mode flag (initially false), visible to
// this class and its subclasses.
146 protected bool CrawlMode
{
147 get { return crawl_mode; }
150 //////////////////////////
152 // Filters which deal with big files, and that don't need
153 // to read in whole files may want to set this to false
154 // to avoid wasting cycles in disk wait.
156 private bool preload
= true;
// Gets or sets the `preload` flag, which defaults to true. Subclasses can
// clear it to opt out of preloading.
158 protected bool PreLoad
{
159 get { return preload; }
160 set { preload = value; }
163 //////////////////////////
173 public void HotDown ()
180 get { return hotCount > 0; }
183 public void FreezeUp ()
188 public void FreezeDown ()
// True while freezeCount is positive.
// NOTE(review): freezeCount is presumably adjusted by FreezeUp()/FreezeDown(),
// whose bodies are not visible in this excerpt -- confirm.
194 public bool IsFrozen
{
195 get { return freezeCount > 0; }
198 //////////////////////////
200 private bool snippetMode
= false;
201 private bool originalIsText
= false;
202 private TextWriter snippetWriter
= null;
// Gets or sets the snippetMode flag (initially false).
204 public bool SnippetMode
{
205 get { return snippetMode; }
206 set { snippetMode = value; }
// Gets or sets the originalIsText flag (initially false).
209 public bool OriginalIsText
{
210 get { return originalIsText; }
211 set { originalIsText = value; }
// Stores `writer` as the snippetWriter. The text-appending methods write to
// it whenever it is non-null, and Close() closes it.
// NOTE(review): one or more lines between the signature and the assignment
// are missing from this excerpt, so the assignment may be conditional.
214 public void AttachSnippetWriter (TextWriter writer
)
217 snippetWriter
= writer
;
220 //////////////////////////
222 private ArrayList textPool
;
223 private ArrayList hotPool
;
224 private ArrayList propertyPool
;
226 private int word_count
= 0;
227 private int hotword_count
= 0;
// Returns true while the running word_count is still below the MAXWORDS cap.
229 protected bool AllowMoreWords ()
231 return (word_count
< MAXWORDS
);
234 private bool last_was_structural_break
= true;
237 * This two-arg AppendText() gives filters the flexibility
238 * to separate the hot contents of a paragraph from its
239 * normal contents and to call this method with the
240 * respective contents.
242 * str : Holds both the normal-contents and hot contents.
243 * strHot: Holds only hot-contents.
244 * Both arguments can be null.
246 * Ex:- suppose the actual-content is "one <b>two</b> three"
247 * str = "one two three"
250 * NOTE: HotUp() or HotDown() has NO-EFFECT on this variant
// Appends `str` to the normal text and `strHot` to the hot text, each subject
// to its own MAXWORDS budget (word_count and hotword_count respectively).
// NOTE(review): the return statement, the declarations of `lines` and
// `num_words`, and several braces are missing from this excerpt, so the exact
// nesting of the branches below cannot be confirmed from here.
253 public int AppendText (string str
, string strHot
)
// Normal text is only accepted when the filter is not frozen, the word budget
// is not exhausted, and `str` is neither null nor empty.
257 if (!IsFrozen
&& word_count
< MAXWORDS
&& str
!= null && str
!= String
.Empty
) {
260 // Avoid unnecessary allocation of a string
261 // FIXME: Handle \r, \r\n cases.
262 if (str
.IndexOf ('\n') > -1) {
263 lines
= str
.Split ('\n');
// Each non-empty line is appended on its own and followed by a structural break.
264 foreach (string line
in lines
) {
265 if (line
.Length
> 0) {
266 ReallyAppendText (line
, null);
267 AppendStructuralBreak ();
// NOTE(review): presumably the else-branch for text without '\n' -- confirm.
271 ReallyAppendText (str
, null);
// Charge the appended text against the normal-text budget.
272 num_words
= StringFu
.CountWords (str
, 3, -1);
273 word_count
+= num_words
;
// Hot text has a separate budget, tracked in hotword_count.
// NOTE(review): strHot is documented as possibly null and is passed to
// StringFu.CountWords here -- confirm that helper tolerates null.
276 if (hotword_count
< MAXWORDS
) {
277 ReallyAppendText (null, strHot
);
278 hotword_count
+= StringFu
.CountWords (strHot
, 3, -1);
284 /* Append text to the textpool. If IsHot is true, then also add to the hottext pool.
// Single-argument convenience overload: forwards to AppendText (str, strHot),
// passing `str` as the hot text too when IsHot is true, and null otherwise.
// NOTE(review): the value returned when the guard below fails is not visible
// in this excerpt.
287 public int AppendText (string str
)
// NOTE(review): the line before this log call is missing; the call may be
// guarded (e.g. by the Debug flag) -- confirm.
290 Logger
.Log
.Debug ("AppendText (\"{0}\")", str
);
// Only forward when not frozen and `str` is neither null nor empty.
292 if (! IsFrozen
&& str
!= null && str
!= String
.Empty
)
293 return AppendText (str
, IsHot
? str
: null);
298 // Adds text to the text pool and/or the hot pool, respectively.
// NOTE(review): the lines that add `str` to the text pool are missing from
// this excerpt; only the hot-pool and snippet-writer handling is visible.
299 private void ReallyAppendText (string str
, string strHot
)
// Hot text is stored trimmed, with a single trailing space as a separator.
301 if (!IsFrozen
&& strHot
!= null && strHot
!= String
.Empty
)
302 hotPool
.Add (strHot
.Trim()+" ");
// Mirror the normal text to the snippet writer when one is attached.
// NOTE(review): the enclosing condition, if any, is not visible here.
307 if (snippetWriter
!= null)
308 snippetWriter
.Write (str
);
// Something was appended, so the last item is no longer a structural break.
310 last_was_structural_break
= false;
// Inspects the last string stored in `array` to decide whether a whitespace
// separator is needed before further text is appended.
// NOTE(review): the return statements and the first half of the final
// condition are missing from this excerpt, so the value returned in each case
// cannot be confirmed from here.
313 private bool NeedsWhiteSpace (ArrayList array
)
// Empty pool: handled as a special case (result not visible).
315 if (array
.Count
== 0)
// Elements are cast to string, so the pool is assumed to hold only strings.
318 string last
= (string) array
[array
.Count
-1];
// Tests whether the final character of the last entry is whitespace.
320 && char.IsWhiteSpace (last
[last
.Length
-1]))
327 * Adds whitespace to the textpool.
// Adds a whitespace separator when NeedsWhiteSpace (textPool) says one is
// needed, mirroring a single space to the snippet writer if one is attached.
// NOTE(review): the statement that adds to textPool and the action taken when
// last_was_structural_break is true are missing from this excerpt.
329 public void AppendWhiteSpace ()
331 if (last_was_structural_break
)
// NOTE(review): this log call may be guarded by a line not visible here.
335 Logger
.Log
.Debug ("AppendWhiteSpace ()");
337 if (NeedsWhiteSpace (textPool
)) {
339 if (snippetWriter
!= null)
340 snippetWriter
.Write (" ");
341 last_was_structural_break
= false;
346 * Adds property prop.
347 * prop can be null or can have null value; in both cases nothing is added.
// Adds `prop` to propertyPool. The add is skipped when `prop` is null or
// when its Value is null or the empty string.
349 public void AddProperty (Property prop
)
351 if (prop
!= null && prop
.Value
!= null && prop
.Value
!= String
.Empty
)
352 propertyPool
.Add (prop
);
356 * Creates a new paragraph. Mainly useful for storing cached contents.
// Emits a paragraph break: writes a newline to the snippet writer (when one
// is attached and the previous item was not already a break) and records that
// the last item was a structural break.
// NOTE(review): the statement executed when NeedsWhiteSpace (textPool) is true
// is missing from this excerpt, as are the closing braces.
358 public void AppendStructuralBreak ()
// The flag check prevents consecutive blank lines in the snippet output.
360 if (snippetWriter
!= null && ! last_was_structural_break
) {
361 snippetWriter
.WriteLine ();
362 last_was_structural_break
= true;
364 // When adding a "newline" to the textCache, we need to
365 // append a "Whitespace" to the text pool.
366 if (NeedsWhiteSpace (textPool
))
370 //////////////////////////
372 private bool isFinished
= false;
// Read-only accessor for the isFinished flag (initially false).
374 public bool IsFinished
{
375 get { return isFinished; }
378 protected void Finished ()
383 private bool has_error
= false;
// Read-only accessor for the has_error flag (initially false).
385 public bool HasError
{
386 get { return has_error; }
// Error hook for subclasses. The visible statement calls Cleanup() so that
// any temporary file is removed.
// NOTE(review): presumably this also sets has_error; that line is not visible
// in this excerpt -- confirm.
389 protected void Error ()
391 Cleanup (); // force the clean-up of temporary files on an error
395 //////////////////////////
// Dispatches to the DoOpen overload matching the runtime type of `info`:
// FileInfo first, then DirectoryInfo. Nothing is called for any other
// FileSystemInfo subtype. Virtual, so subclasses may replace the dispatch.
397 protected virtual void DoOpen (FileSystemInfo info
) {
398 if (info
is FileInfo
)
399 DoOpen (info
as FileInfo
);
400 else if (info
is DirectoryInfo
)
401 DoOpen (info
as DirectoryInfo
);
// Overridable hook for opening a file; the base implementation does nothing.
404 protected virtual void DoOpen (FileInfo info
) { }
// Overridable hook for opening a directory; the base implementation does nothing.
406 protected virtual void DoOpen (DirectoryInfo info
) { }
// Overridable hook for property extraction; the base implementation does nothing.
408 protected virtual void DoPullProperties () { }
// Overridable setup hook; the base implementation does nothing.
410 protected virtual void DoPullSetup () { }
// Overridable pull hook. The base implementation produces no text and
// immediately calls Finished().
412 protected virtual void DoPull () { Finished (); }
// Overridable hook for closing; the base implementation does nothing.
414 protected virtual void DoClose () { }
416 //////////////////////////
420 (1) DoOpen (FileInfo info) or DoOpen (Stream)
421 (2) DoPullProperties ()
423 At this point all properties must be in place
425 Once someone starts reading from the TextReader,
426 the following are called:
427 DoPull () [until Finished() is called]
428 DoClose () [when finished]
432 private string tempFile
= null;
433 private FileSystemInfo currentInfo
= null;
434 private FileStream currentStream
= null;
435 private StreamReader currentReader
= null;
// Opens the filter on text supplied by `reader`: the text is copied into a
// freshly created temporary file (remembered in tempFile so Cleanup() can
// delete it), and the filter is then opened on that file via
// Open (FileSystemInfo).
// NOTE(review): the declaration of `read`, the copy loop's header and exit
// test, and whatever closes `writer` are missing from this excerpt. Confirm
// the writer is flushed/closed before the temp file is re-opened.
437 public bool Open (TextReader reader
)
439 tempFile
= Path
.GetTempFileName ();
440 FileStream file_stream
= File
.OpenWrite (tempFile
);
// NOTE(review): this log call may be guarded by a line not visible here.
443 Logger
.Log
.Debug ("Storing text in tempFile {0}", tempFile
);
445 // When we dump the contents of a reader into a file, we
446 // expect to use it again soon.
447 FileAdvise
.PreLoad (file_stream
);
449 // Make sure the temporary file is only readable by the owner.
450 // FIXME: There is probably a race here. Could some malicious program
451 // do something to the file between creation and the chmod?
// 256 decimal is 0400 octal, i.e. owner-read only. The return value of chmod
// is not checked in the visible code.
452 Mono
.Unix
.Native
.Syscall
.chmod (tempFile
, (Mono
.Unix
.Native
.FilePermissions
) 256);
454 BufferedStream buffered_stream
= new BufferedStream (file_stream
);
455 StreamWriter writer
= new StreamWriter (buffered_stream
);
// Copy the reader's contents in chunks of up to BUFFER_SIZE characters.
457 const int BUFFER_SIZE
= 8192;
458 char [] buffer
= new char [BUFFER_SIZE
];
462 read
= reader
.Read (buffer
, 0, BUFFER_SIZE
);
464 writer
.Write (buffer
, 0, read
);
// Continue as if the caller had handed us the temporary file directly.
469 return Open (new FileInfo (tempFile
));
// Opens the filter on bytes supplied by `stream`: the bytes are copied into a
// freshly created temporary file (remembered in tempFile so Cleanup() can
// delete it), and the filter is then opened on that file via
// Open (FileSystemInfo).
// NOTE(review): the declaration of `read` and the copy loop's header and exit
// test are missing from this excerpt.
472 public bool Open (Stream stream
)
474 tempFile
= Path
.GetTempFileName ();
475 FileStream file_stream
= File
.OpenWrite (tempFile
);
// NOTE(review): this log call may be guarded by a line not visible here.
478 Logger
.Log
.Debug ("Storing stream in tempFile {0}", tempFile
);
480 // When we dump the contents of a stream into a file, we
481 // expect to use it again soon.
482 FileAdvise
.PreLoad (file_stream
);
484 // Make sure the temporary file is only readable by the owner.
485 // FIXME: There is probably a race here. Could some malicious program
486 // do something to the file between creation and the chmod?
// 256 decimal is 0400 octal, i.e. owner-read only. The return value of chmod
// is not checked in the visible code.
487 Mono
.Unix
.Native
.Syscall
.chmod (tempFile
, (Mono
.Unix
.Native
.FilePermissions
) 256);
489 BufferedStream buffered_stream
= new BufferedStream (file_stream
);
// Copy the stream's contents in chunks of up to BUFFER_SIZE bytes.
491 const int BUFFER_SIZE
= 8192;
492 byte [] buffer
= new byte [BUFFER_SIZE
];
496 read
= stream
.Read (buffer
, 0, BUFFER_SIZE
);
498 buffered_stream
.Write (buffer
, 0, read
);
// Close the buffered stream before the file is re-opened for reading.
501 buffered_stream
.Close ();
// Continue as if the caller had handed us the temporary file directly.
503 return Open (new FileInfo (tempFile
));
// Main entry point for opening a file or directory. It resets the text, hot
// and property pools, opens a FileStream when `info` is a file, and rewinds
// the stream and its reader. On an exception it logs a warning and cleans up
// temporary files.
// NOTE(review): the `try` header, the remaining FileStream constructor
// arguments, the calls into the DoOpen/DoPull* hooks and every return
// statement are missing from this excerpt.
506 public bool Open (FileSystemInfo info
)
// Start from empty pools for each newly opened item.
509 textPool
= new ArrayList ();
510 hotPool
= new ArrayList ();
511 propertyPool
= new ArrayList ();
515 if (info
is FileInfo
) {
516 // Open a stream for this file.
517 currentStream
= new FileStream (info
.FullName
,
523 // Our default assumption is sequential reads.
524 // FIXME: Is this the right thing to do here?
525 FileAdvise
.IncreaseReadAhead (currentStream
);
527 // Give the OS a hint that we will be reading this
// NOTE(review): the rest of the comment above and any guard on the call below
// (e.g. the `preload` flag) are not visible here.
529 FileAdvise
.PreLoad (currentStream
);
548 // Reset our TextReader
549 // Don't close the StreamReader, as
550 // that will also close the stream
551 if (currentReader
!= null) {
552 currentReader
.DiscardBufferedData ();
555 // Seek back to the beginning of our stream
556 currentStream
.Seek (0, SeekOrigin
.Begin
);
// Any exception raised in the protected region ends up here.
562 } catch (Exception e
) {
563 Log
.Warn (e
, "Unable to filter {0}:", info
.FullName
);
564 Cleanup (); // clean up temporary files on an exception
// Path-based convenience overload: wraps `path` in a FileInfo if it names an
// existing file, or a DirectoryInfo if it names an existing directory, and
// forwards to Open (FileSystemInfo).
// NOTE(review): the value returned when `path` is neither is not visible in
// this excerpt.
571 public bool Open (string path
)
573 if (File
.Exists (path
))
574 return Open (new FileInfo (path
));
575 else if (Directory
.Exists (path
))
576 return Open (new DirectoryInfo (path
));
// Returns currentInfo as a FileInfo, or null when currentInfo is null or is
// not a FileInfo.
581 public FileInfo FileInfo
{
582 get { return currentInfo as FileInfo; }
// Returns currentInfo as a DirectoryInfo, or null when currentInfo is null
// or is not a DirectoryInfo.
585 public DirectoryInfo DirectoryInfo
{
586 get { return currentInfo as DirectoryInfo; }
// Returns the FileStream currently held in currentStream; null when no
// stream is open.
589 public Stream Stream
{
590 get { return currentStream; }
// Lazily creates a StreamReader over currentStream on first access and
// returns it. When no stream is open and no reader exists yet, the result
// is null.
// NOTE(review): the `get {` line and closing braces are missing from this
// excerpt.
593 public TextReader TextReader
{
595 if (currentReader
== null
596 && currentStream
!= null) {
597 currentReader
= new StreamReader (currentStream
);
600 return currentReader
;
606 if (IsFinished
|| HasError
) {
// Releases the resources held for the current item: closes the reader (if
// any), closes and forgets the stream, and closes the snippet writer (if any).
// NOTE(review): the action taken when currentStream is null, any call to
// DoClose(), and the condition guarding FlushCache are missing from this
// excerpt.
619 private void Close ()
623 if (currentStream
== null)
628 // When crawling, give the OS a hint that we don't
629 // need to keep this file around in the page cache.
631 FileAdvise
.FlushCache (currentStream
);
633 if (currentReader
!= null)
634 currentReader
.Close ();
// Null the field after closing so the stream cannot be reused.
636 currentStream
.Close ();
637 currentStream
= null;
639 if (snippetWriter
!= null)
640 snippetWriter
.Close ();
// Deletes the temporary file created by the Open (TextReader) and
// Open (Stream) overloads, if there is one. Any exception raised by the
// delete is caught; the visible handler contains only a comment.
// NOTE(review): the catch covers every Exception type, not just "file already
// gone", and `ex` is unused. Whether tempFile is reset to null afterwards is
// not visible in this excerpt.
643 public void Cleanup ()
645 if (tempFile
!= null) {
647 File
.Delete (tempFile
);
648 } catch (Exception ex
) {
649 // Just in case it is gone already
// Keeps calling Pull() until `array` holds at least one entry or Pull()
// returns false, then iterates over the entries of `array` as strings.
// NOTE(review): the loop body (presumably appending each string to `sb` and
// clearing the array) and the return statements are missing from this excerpt.
655 private bool PullFromArray (ArrayList array
, StringBuilder sb
)
// Empty-bodied loop: all the work happens inside Pull().
657 while (array
.Count
== 0 && Pull ()) { }
659 // FIXME: Do we want to try to extract as much data as
660 // possible from the filter if we get an error, or
661 // should we just give up afterward entirely?
663 if (array
.Count
> 0) {
664 foreach (string str
in array
)
// Exception-safe wrapper around PullFromArray(): any exception raised while
// pulling is caught and logged at debug level with the filter's Name.
// NOTE(review): the declaration of `pulled`, the `try` header and the return
// statement are missing from this excerpt.
673 private bool PullTextCarefully (ArrayList array
, StringBuilder sb
)
677 pulled
= PullFromArray (array
, sb
);
678 } catch (Exception ex
) {
679 Logger
.Log
.Debug (ex
, "Caught exception while pulling text in filter '{0}'", Name
);
// Pull callback for normal text: delegates to PullTextCarefully() on textPool.
685 private bool PullText (StringBuilder sb
)
687 return PullTextCarefully (textPool
, sb
);
// Pull callback for hot text: delegates to PullTextCarefully() on hotPool.
690 private bool PullHotText (StringBuilder sb
)
692 return PullTextCarefully (hotPool
, sb
);
// Builds a PullingReader driven by the PullText callback and tags it with
// this filter's Identifier.
// NOTE(review): the return statement is missing from this excerpt;
// presumably `pr` is returned -- confirm.
695 public TextReader
GetTextReader ()
697 PullingReader pr
= new PullingReader (new PullingReader
.Pull (PullText
));
698 pr
.Identifier
= Identifier
;
// Returns a new PullingReader driven by the PullHotText callback. Unlike
// GetTextReader(), no Identifier is assigned to the reader in the visible code.
702 public TextReader
GetHotTextReader ()
704 return new PullingReader (new PullingReader
.Pull (PullHotText
));
// Read-only enumeration over propertyPool. The pool is (re)created in
// Open (FileSystemInfo), so this is null before the first Open call.
707 public IEnumerable Properties
{
708 get { return propertyPool; }
711 //////////////////////////////
713 // This is used primarily for the generation of URIs for the
714 // child indexables that can be created as a result of the
715 // filtering process.
717 private Uri uri
= null;
724 private Uri display_uri
= null;
// Gets or sets the value stored in display_uri (null until assigned).
726 public Uri DisplayUri
{
727 get { return display_uri; }
728 set { display_uri = value; }
731 //////////////////////////////
733 private ArrayList child_indexables
= new ArrayList ();
// Appends a single child indexable to child_indexables. No null check is
// performed on `indexable`.
735 protected void AddChildIndexable (Indexable indexable
)
737 this.child_indexables
.Add (indexable
);
// Appends every element of `indexables` to child_indexables.
// ArrayList.AddRange throws ArgumentNullException when `indexables` is null.
740 protected void AddChildIndexables (ICollection indexables
)
742 this.child_indexables
.AddRange (indexables
);
// Returns the child_indexables list itself (not a copy), so callers can
// mutate it.
745 public ArrayList ChildIndexables
{
746 get { return this.child_indexables; }
750 [AttributeUsage (AttributeTargets
.Assembly
)]
751 public class FilterTypesAttribute
: TypeCacheAttribute
{
752 public FilterTypesAttribute (params Type
[] filter_types
) : base (filter_types
) { }