4 // Copyright (C) 2004 Novell, Inc.
8 // Permission is hereby granted, free of charge, to any person obtaining a
9 // copy of this software and associated documentation files (the "Software"),
10 // to deal in the Software without restriction, including without limitation
11 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 // and/or sell copies of the Software, and to permit persons to whom the
13 // Software is furnished to do so, subject to the following conditions:
15 // The above copyright notice and this permission notice shall be included in
16 // all copies or substantial portions of the Software.
18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 // DEALINGS IN THE SOFTWARE.
29 using System
.Collections
;
32 using System
.Reflection
;
36 namespace Beagle
.Daemon
{
// Debug flag for this class; disabled by default.
private static bool Debug = false;

// Lucene fields allow a maximum of 10000 words. Some of the words will
// be stop words, so we use a failsafe maximum of 40000 words and don't
// accept more words than that.
// (Lucene.Net.Index.IndexWriter.DEFAULT_MAX_FIELD_LENGTH * 4)
private const int MAXWORDS = 40000;
// Derived classes must always have a constructor that
// takes no arguments.
50 //////////////////////////
// Backing store for Identifier; unset (null) until a caller assigns it.
private string identifier;

// Identifier string attached to this filter. Plain read/write
// accessor with no validation.
public string Identifier {
	get {
		return this.identifier;
	}
	set {
		this.identifier = value;
	}
}
59 //////////////////////////
// Flavors registered through AddSupportedFlavor(), in the order
// they were added.
private ArrayList supported_flavors = new ArrayList ();

// Appends a flavor to the list exposed by SupportedFlavors.
// No de-duplication or null check is performed.
protected void AddSupportedFlavor (FilterFlavor flavor)
{
	this.supported_flavors.Add (flavor);
}

// The live list of registered flavors (not a copy).
public ICollection SupportedFlavors {
	get {
		return this.supported_flavors;
	}
}
72 //////////////////////////
74 // Filters are versioned. This allows us to automatically re-index
75 // files when a newer filter is available.
78 get { return this.GetType ().Name; }
81 private int version
= -1;
84 get { return version < 0 ? 0 : version; }
87 protected void SetVersion (int v
)
91 msg
= String
.Format ("Attempt to set invalid version {0} on Filter {1}", v
, Name
);
92 throw new Exception (msg
);
97 msg
= String
.Format ("Attempt to re-set version from {0} to {1} on Filter {2}", version
, v
, Name
);
98 throw new Exception (msg
);
106 //////////////////////////
// Backing fields; all three stay null until a caller sets them.
private string this_mime_type = null;
private string this_extension = null;
private ArrayList indexable_properties = null;

// MIME type associated with this filter instance.
public string MimeType {
	get {
		return this.this_mime_type;
	}
	set {
		this.this_mime_type = value;
	}
}

// File extension associated with this filter instance.
public string Extension {
	get {
		return this.this_extension;
	}
	set {
		this.this_extension = value;
	}
}

// Gives the filter access to the indexable's properties.
// The list is stored and returned by reference, not copied.
public ArrayList IndexableProperties {
	get {
		return this.indexable_properties;
	}
	set {
		this.indexable_properties = value;
	}
}
129 //////////////////////////
131 private bool crawl_mode
= false;
133 public void EnableCrawlMode ()
138 protected bool CrawlMode
{
139 get { return crawl_mode; }
142 //////////////////////////
// Pre-loading is on by default. Filters that deal with big files and
// do not need to read whole files may want to set this to false, to
// avoid wasting cycles in disk wait.
private bool preload = true;

protected bool PreLoad {
	get {
		return this.preload;
	}
	set {
		this.preload = value;
	}
}
155 //////////////////////////
165 public void HotDown ()
172 get { return hotCount > 0; }
175 public void FreezeUp ()
180 public void FreezeDown ()
186 public bool IsFrozen
{
187 get { return freezeCount > 0; }
190 //////////////////////////
// Snippet state. Both flags default to false; snippetWriter stays
// null until one is attached.
private bool snippetMode = false;
private bool originalIsText = false;
private TextWriter snippetWriter = null;

// Whether snippet mode is turned on for this filter.
public bool SnippetMode {
	get {
		return this.snippetMode;
	}
	set {
		this.snippetMode = value;
	}
}

// Flag recording that the original content is text.
// NOTE(review): how consumers use this flag is not visible here.
public bool OriginalIsText {
	get {
		return this.originalIsText;
	}
	set {
		this.originalIsText = value;
	}
}
206 public void AttachSnippetWriter (TextWriter writer
)
209 snippetWriter
= writer
;
212 //////////////////////////
214 private ArrayList textPool
;
215 private ArrayList hotPool
;
216 private ArrayList propertyPool
;
218 private int word_count
= 0;
219 private int hotword_count
= 0;
// Returns true while the running word count is still below MAXWORDS,
// i.e. while the filter may keep appending text.
protected bool AllowMoreWords ()
{
	bool under_limit = word_count < MAXWORDS;
	return under_limit;
}
226 private bool last_was_structural_break
= true;
// This two-argument AppendText() lets filters separate the
// hot content of a paragraph from its normal content and
// pass each to this method explicitly.
233 // str : Holds both the normal-contents and hot contents.
234 // strHot: Holds only hot-contents.
236 // Ex:- suppose the actual-content is "one <b>two</b> three"
237 // str = "one two three"
// NOTE: HotUp() and HotDown() have no effect on this variant.
243 public int AppendText (string str
, string strHot
)
247 if (!IsFrozen
&& word_count
< MAXWORDS
&& str
!= null && str
!= "") {
250 // Avoid unnecessary allocation of a string
251 // FIXME: Handle \r, \r\n cases.
252 if (str
.IndexOf ('\n') > -1) {
253 lines
= str
.Split ('\n');
254 foreach (string line
in lines
) {
255 if (line
.Length
> 0) {
256 ReallyAppendText (line
, null);
257 AppendStructuralBreak ();
261 ReallyAppendText (str
, null);
262 num_words
= StringFu
.CountWords (str
, 3, -1);
263 word_count
+= num_words
;
266 if (hotword_count
< MAXWORDS
) {
267 ReallyAppendText (null, strHot
);
268 hotword_count
+= StringFu
.CountWords (strHot
, 3, -1);
274 public int AppendText (string str
)
277 Logger
.Log
.Debug ("AppendText (\"{0}\")", str
);
279 if (! IsFrozen
&& str
!= null && str
!= "")
280 return AppendText (str
, IsHot
? str
: null);
// Adds text to the text pool and the hot pool respectively.
286 private void ReallyAppendText (string str
, string strHot
)
288 if (!IsFrozen
&& strHot
!= null && strHot
!= "")
289 hotPool
.Add (strHot
.Trim()+" ");
294 if (snippetWriter
!= null)
295 snippetWriter
.Write (str
);
297 last_was_structural_break
= false;
300 private bool NeedsWhiteSpace (ArrayList array
)
302 if (array
.Count
== 0)
305 string last
= (string) array
[array
.Count
-1];
307 && char.IsWhiteSpace (last
[last
.Length
-1]))
313 public void AppendWhiteSpace ()
315 if (last_was_structural_break
)
319 Logger
.Log
.Debug ("AppendWhiteSpace ()");
321 if (NeedsWhiteSpace (textPool
)) {
323 if (snippetWriter
!= null)
324 snippetWriter
.Write (" ");
325 last_was_structural_break
= false;
// Adds a property to the property pool. A null property, or one whose
// Value is null or the empty string, is silently dropped.
public void AddProperty (Property prop)
{
	if (prop == null)
		return;

	if (prop.Value == null || prop.Value == "")
		return;

	propertyPool.Add (prop);
}
335 public void AppendStructuralBreak ()
337 if (snippetWriter
!= null && ! last_was_structural_break
) {
338 snippetWriter
.WriteLine ();
339 last_was_structural_break
= true;
341 // When adding a "newline" to the textCache, we need to
342 // append a "Whitespace" to the text pool.
343 if (NeedsWhiteSpace (textPool
))
347 //////////////////////////
// False until the filter marks itself as finished.
private bool isFinished = false;

// Read-only view of the finished flag.
public bool IsFinished {
	get {
		return this.isFinished;
	}
}
355 protected void Finished ()
// False until the filter reports an error.
private bool has_error = false;

// Read-only view of the error flag.
public bool HasError {
	get {
		return this.has_error;
	}
}
366 protected void Error ()
368 Cleanup (); // force the clean-up of temporary files on an error
372 //////////////////////////
// Dispatches to the DoOpen() overload matching the runtime type of
// info: FileInfo first, then DirectoryInfo. Any other value
// (including null) is ignored.
protected virtual void DoOpen (FileSystemInfo info)
{
	FileInfo file = info as FileInfo;
	if (file != null) {
		DoOpen (file);
		return;
	}

	DirectoryInfo dir = info as DirectoryInfo;
	if (dir != null)
		DoOpen (dir);
}
// Overridable hooks. Every default is a no-op except DoPull(), which
// immediately calls Finished().

// Hook for opening a file. Does nothing by default.
protected virtual void DoOpen (FileInfo info)
{
}

// Hook for opening a directory. Does nothing by default.
protected virtual void DoOpen (DirectoryInfo info)
{
}

// Hook for extracting properties. Does nothing by default.
protected virtual void DoPullProperties ()
{
}

// Hook for setting up before pulling. Does nothing by default.
protected virtual void DoPullSetup ()
{
}

// Hook for producing content. The default produces nothing and
// signals completion straight away.
protected virtual void DoPull ()
{
	Finished ();
}

// Hook for releasing resources. Does nothing by default.
protected virtual void DoClose ()
{
}
393 //////////////////////////
397 (1) DoOpen (FileInfo info) or DoOpen (Stream)
398 (2) DoPullProperties ()
400 At this point all properties must be in place
402 Once someone starts reading from the TextReader,
403 the following are called:
404 DoPull () [until Finished() is called]
405 DoClose () [when finished]
409 private string tempFile
= null;
410 private FileSystemInfo currentInfo
= null;
411 private FileStream currentStream
= null;
412 private StreamReader currentReader
= null;
414 public bool Open (TextReader reader
)
416 tempFile
= Path
.GetTempFileName ();
417 FileStream file_stream
= File
.OpenWrite (tempFile
);
420 Logger
.Log
.Debug ("Storing text in tempFile {0}", tempFile
);
422 // When we dump the contents of a reader into a file, we
423 // expect to use it again soon.
424 FileAdvise
.PreLoad (file_stream
);
426 // Make sure the temporary file is only readable by the owner.
427 // FIXME: There is probably a race here. Could some malicious program
428 // do something to the file between creation and the chmod?
429 Mono
.Unix
.Native
.Syscall
.chmod (tempFile
, (Mono
.Unix
.Native
.FilePermissions
) 256);
431 BufferedStream buffered_stream
= new BufferedStream (file_stream
);
432 StreamWriter writer
= new StreamWriter (buffered_stream
);
434 const int BUFFER_SIZE
= 8192;
435 char [] buffer
= new char [BUFFER_SIZE
];
439 read
= reader
.Read (buffer
, 0, BUFFER_SIZE
);
441 writer
.Write (buffer
, 0, read
);
446 return Open (new FileInfo (tempFile
));
449 public bool Open (Stream stream
)
451 tempFile
= Path
.GetTempFileName ();
452 FileStream file_stream
= File
.OpenWrite (tempFile
);
455 Logger
.Log
.Debug ("Storing stream in tempFile {0}", tempFile
);
// When we dump the contents of a stream into a file, we
// expect to use it again soon.
459 FileAdvise
.PreLoad (file_stream
);
461 // Make sure the temporary file is only readable by the owner.
462 // FIXME: There is probably a race here. Could some malicious program
463 // do something to the file between creation and the chmod?
464 Mono
.Unix
.Native
.Syscall
.chmod (tempFile
, (Mono
.Unix
.Native
.FilePermissions
) 256);
466 BufferedStream buffered_stream
= new BufferedStream (file_stream
);
468 const int BUFFER_SIZE
= 8192;
469 byte [] buffer
= new byte [BUFFER_SIZE
];
473 read
= stream
.Read (buffer
, 0, BUFFER_SIZE
);
475 buffered_stream
.Write (buffer
, 0, read
);
478 buffered_stream
.Close ();
480 return Open (new FileInfo (tempFile
));
483 public bool Open (FileSystemInfo info
)
486 textPool
= new ArrayList ();
487 hotPool
= new ArrayList ();
488 propertyPool
= new ArrayList ();
492 if (info
is FileInfo
) {
493 // Open a stream for this file.
494 currentStream
= new FileStream (info
.FullName
,
500 // Our default assumption is sequential reads.
501 // FIXME: Is this the right thing to do here?
502 FileAdvise
.IncreaseReadAhead (currentStream
);
504 // Give the OS a hint that we will be reading this
506 FileAdvise
.PreLoad (currentStream
);
525 // Close and reset our TextReader
526 if (currentReader
!= null) {
527 currentReader
.Close ();
528 currentReader
= null;
531 // Seek back to the beginning of our stream
532 currentStream
.Seek (0, SeekOrigin
.Begin
);
538 } catch (Exception e
) {
539 Logger
.Log
.Warn ("Unable to filter {0}: {1}", info
.FullName
, e
.Message
);
540 Cleanup (); // clean up temporary files on an exception
547 public bool Open (string path
)
549 if (File
.Exists (path
))
550 return Open (new FileInfo (path
));
551 else if (Directory
.Exists (path
))
552 return Open (new DirectoryInfo (path
));
// The current item as a FileInfo, or null when it is not a file
// (or nothing is open).
public FileInfo FileInfo {
	get {
		return this.currentInfo as FileInfo;
	}
}

// The current item as a DirectoryInfo, or null when it is not a
// directory (or nothing is open).
public DirectoryInfo DirectoryInfo {
	get {
		return this.currentInfo as DirectoryInfo;
	}
}

// The stream currently open on the item; null when there is none.
public Stream Stream {
	get {
		return this.currentStream;
	}
}
569 public TextReader TextReader
{
571 if (currentReader
== null
572 && currentStream
!= null) {
573 currentReader
= new StreamReader (currentStream
);
576 return currentReader
;
582 if (IsFinished
|| HasError
) {
595 private void Close ()
599 if (currentStream
== null)
604 // When crawling, give the OS a hint that we don't
605 // need to keep this file around in the page cache.
607 FileAdvise
.FlushCache (currentStream
);
609 if (currentReader
!= null)
610 currentReader
.Close ();
612 currentStream
.Close ();
613 currentStream
= null;
615 if (snippetWriter
!= null)
616 snippetWriter
.Close ();
619 public void Cleanup ()
621 if (tempFile
!= null) {
623 File
.Delete (tempFile
);
624 } catch (Exception ex
) {
625 // Just in case it is gone already
631 private bool PullFromArray (ArrayList array
, StringBuilder sb
)
633 while (array
.Count
== 0 && Pull ()) { }
635 // FIXME: Do we want to try to extract as much data as
636 // possible from the filter if we get an error, or
637 // should we just give up afterward entirely?
639 if (array
.Count
> 0) {
640 foreach (string str
in array
)
649 private bool PullTextCarefully (ArrayList array
, StringBuilder sb
)
653 pulled
= PullFromArray (array
, sb
);
654 } catch (Exception ex
) {
655 Logger
.Log
.Debug ("Caught exception while pulling text in filter '{0}'", Name
);
656 Logger
.Log
.Debug (ex
);
// Pull callback for normal text: hands the text pool to
// PullTextCarefully() and returns its result unchanged.
private bool PullText (StringBuilder sb)
{
	bool pulled = PullTextCarefully (textPool, sb);
	return pulled;
}

// Pull callback for hot text: same as PullText(), but reads from the
// hot pool.
private bool PullHotText (StringBuilder sb)
{
	bool pulled = PullTextCarefully (hotPool, sb);
	return pulled;
}
672 public TextReader
GetTextReader ()
674 PullingReader pr
= new PullingReader (new PullingReader
.Pull (PullText
));
675 pr
.Identifier
= Identifier
;
// Creates a new PullingReader whose pull callback is PullHotText().
// A fresh reader is built on every call.
public TextReader GetHotTextReader ()
{
	PullingReader.Pull pull_hot = new PullingReader.Pull (PullHotText);
	PullingReader reader = new PullingReader (pull_hot);
	return reader;
}
// The properties collected through AddProperty(). This is the live
// pool, not a copy.
public IEnumerable Properties {
	get {
		return this.propertyPool;
	}
}
688 //////////////////////////////
690 // This is used primarily for the generation of URIs for the
691 // child indexables that can be created as a result of the
692 // filtering process.
694 private Uri uri
= null;
701 //////////////////////////////
// Child indexables produced while filtering, in the order added.
private ArrayList child_indexables = new ArrayList ();

// Records a child indexable created as a result of the filtering
// process. A null indexable is ignored, matching the way
// AddProperty() drops null properties, so that ChildIndexables
// never contains null entries.
protected void AddChildIndexable (Indexable indexable)
{
	if (indexable == null)
		return;

	this.child_indexables.Add (indexable);
}
// Records a batch of child indexables created as a result of the
// filtering process. A null collection is treated as "nothing to
// add": ArrayList.AddRange() would otherwise throw
// ArgumentNullException.
protected void AddChildIndexables (ICollection indexables)
{
	if (indexables == null)
		return;

	this.child_indexables.AddRange (indexables);
}
// The child indexables recorded so far. Returns the live list, so
// callers see later additions.
public ArrayList ChildIndexables {
	get {
		ArrayList children = this.child_indexables;
		return children;
	}
}