4 // Copyright (C) 2004 Novell, Inc.
8 // Permission is hereby granted, free of charge, to any person obtaining a
9 // copy of this software and associated documentation files (the "Software"),
10 // to deal in the Software without restriction, including without limitation
11 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 // and/or sell copies of the Software, and to permit persons to whom the
13 // Software is furnished to do so, subject to the following conditions:
15 // The above copyright notice and this permission notice shall be included in
16 // all copies or substantial portions of the Software.
18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 // DEALINGS IN THE SOFTWARE.
29 using System
.Collections
;
32 using System
.Reflection
;
36 namespace Beagle
.Daemon
{
// Debug switch for this class; off by default.
private static bool Debug = false;

// Lucene fields allow a maximum of 10000 words.  Some of those words
// will be stop words, so use a failsafe maximum of 40000 words and
// don't accept more words than that.
private const int MAXWORDS = 40000; // Lucene.Net.Index.IndexWriter.DEFAULT_MAX_FIELD_LENGTH * 4
46 // Derived classes always must have a constructor that
47 // takes no arguments.
50 //////////////////////////
private string identifier;

// Read/write access to the identifier string stored on this filter.
public string Identifier {
	get {
		return this.identifier;
	}
	set {
		this.identifier = value;
	}
}
59 //////////////////////////
// Flavors registered through AddSupportedFlavor().
private ArrayList supported_flavors = new ArrayList ();

// Registers a flavor by appending it to the supported-flavor list.
protected void AddSupportedFlavor (FilterFlavor flavor)
{
	this.supported_flavors.Add (flavor);
}
// Exposes the registered flavors as an ICollection; there is no setter.
public ICollection SupportedFlavors {
	get {
		return this.supported_flavors;
	}
}
72 //////////////////////////
74 // Filters are versioned. This allows us to automatically re-index
75 // files when a newer filter is available.
78 get { return this.GetType ().Name; }
81 private int version
= -1;
84 get { return version < 0 ? 0 : version; }
87 protected void SetVersion (int v
)
91 msg
= String
.Format ("Attempt to set invalid version {0} on Filter {1}", v
, Name
);
92 throw new Exception (msg
);
97 msg
= String
.Format ("Attempt to re-set version from {0} to {1} on Filter {2}", version
, v
, Name
);
98 throw new Exception (msg
);
106 //////////////////////////
// Backing fields for the MimeType, Extension, IndexableProperties and
// Timestamp properties.  The reference-typed fields start out null.
private string this_mime_type;
private string this_extension;
private ArrayList indexable_properties;
private DateTime timestamp = DateTime.MinValue;
// Read/write access to the mime type string stored on this filter.
public string MimeType {
	get {
		return this.this_mime_type;
	}
	set {
		this.this_mime_type = value;
	}
}
// Read/write access to the extension string stored on this filter.
public string Extension {
	get {
		return this.this_extension;
	}
	set {
		this.this_extension = value;
	}
}
// Gives the filter access to the indexable's properties.
public ArrayList IndexableProperties {
	get {
		return this.indexable_properties;
	}
	set {
		this.indexable_properties = value;
	}
}
// Gives the filter access to the timestamp; sometimes filters
// know better.
public DateTime Timestamp {
	get {
		return this.timestamp;
	}
	set {
		this.timestamp = value;
	}
}
137 //////////////////////////
139 private bool crawl_mode
= false;
141 public void EnableCrawlMode ()
// Current value of the crawl_mode flag; visible to derived filters only.
protected bool CrawlMode {
	get {
		return this.crawl_mode;
	}
}
150 //////////////////////////
// Filters that deal with big files, and that do not need to read
// whole files in, may want to set this to false to avoid wasting
// cycles in disk wait.  The default is true.
private bool preload = true;

protected bool PreLoad {
	get {
		return this.preload;
	}
	set {
		this.preload = value;
	}
}
163 //////////////////////////
173 public void HotDown ()
180 get { return hotCount > 0; }
183 public void FreezeUp ()
188 public void FreezeDown ()
// True while freezeCount is above zero.
public bool IsFrozen {
	get {
		return this.freezeCount > 0;
	}
}
198 //////////////////////////
// Snippet-related state: the two flags are exposed through the
// SnippetMode and OriginalIsText properties, and snippetWriter receives
// the writer handed to AttachSnippetWriter().
private bool snippetMode;
private bool originalIsText;
private TextWriter snippetWriter;
// Read/write access to the snippetMode flag.
public bool SnippetMode {
	get {
		return this.snippetMode;
	}
	set {
		this.snippetMode = value;
	}
}
// Read/write access to the originalIsText flag.
public bool OriginalIsText {
	get {
		return this.originalIsText;
	}
	set {
		this.originalIsText = value;
	}
}
214 public void AttachSnippetWriter (TextWriter writer
)
217 snippetWriter
= writer
;
220 //////////////////////////
// Pools for regular text, hot text and properties.  Each one is
// replaced with a fresh ArrayList by Open (FileSystemInfo).
private ArrayList textPool;
private ArrayList hotPool;
private ArrayList propertyPool;

// Running word totals that are compared against MAXWORDS.
private int word_count;
private int hotword_count;
// Tells derived filters whether the word budget still has room:
// true while word_count is below MAXWORDS.
protected bool AllowMoreWords ()
{
	bool below_limit = word_count < MAXWORDS;
	return below_limit;
}
234 private bool last_was_structural_break
= true;
236 // This two-arg AppendText() will give flexibility to
237 // filters to segregate hot-contents and
238 // normal-contents of a para and call this method with
239 // respective contents.
241 // str : Holds both the normal-contents and hot contents.
242 // strHot: Holds only hot-contents.
244 // Ex:- suppose the actual-content is "one <b>two</b> three"
245 // str = "one two three"
248 // NOTE: HotUp() or HotDown() has NO-EFFECT on this variant
251 public int AppendText (string str
, string strHot
)
255 if (!IsFrozen
&& word_count
< MAXWORDS
&& str
!= null && str
!= "") {
258 // Avoid unnecessary allocation of a string
259 // FIXME: Handle \r, \r\n cases.
260 if (str
.IndexOf ('\n') > -1) {
261 lines
= str
.Split ('\n');
262 foreach (string line
in lines
) {
263 if (line
.Length
> 0) {
264 ReallyAppendText (line
, null);
265 AppendStructuralBreak ();
269 ReallyAppendText (str
, null);
270 num_words
= StringFu
.CountWords (str
, 3, -1);
271 word_count
+= num_words
;
274 if (hotword_count
< MAXWORDS
) {
275 ReallyAppendText (null, strHot
);
276 hotword_count
+= StringFu
.CountWords (strHot
, 3, -1);
282 public int AppendText (string str
)
285 Logger
.Log
.Debug ("AppendText (\"{0}\")", str
);
287 if (! IsFrozen
&& str
!= null && str
!= "")
288 return AppendText (str
, IsHot
? str
: null);
293 // Adds text to the text and hot pools respectively.
294 private void ReallyAppendText (string str
, string strHot
)
296 if (!IsFrozen
&& strHot
!= null && strHot
!= "")
297 hotPool
.Add (strHot
.Trim()+" ");
302 if (snippetWriter
!= null)
303 snippetWriter
.Write (str
);
305 last_was_structural_break
= false;
308 private bool NeedsWhiteSpace (ArrayList array
)
310 if (array
.Count
== 0)
313 string last
= (string) array
[array
.Count
-1];
315 && char.IsWhiteSpace (last
[last
.Length
-1]))
321 public void AppendWhiteSpace ()
323 if (last_was_structural_break
)
327 Logger
.Log
.Debug ("AppendWhiteSpace ()");
329 if (NeedsWhiteSpace (textPool
)) {
331 if (snippetWriter
!= null)
332 snippetWriter
.Write (" ");
333 last_was_structural_break
= false;
// Queues a property in the property pool.  A null property, or one
// whose value is null or the empty string, is ignored.
public void AddProperty (Property prop)
{
	if (prop == null || prop.Value == null || prop.Value == "")
		return;

	propertyPool.Add (prop);
}
343 public void AppendStructuralBreak ()
345 if (snippetWriter
!= null && ! last_was_structural_break
) {
346 snippetWriter
.WriteLine ();
347 last_was_structural_break
= true;
349 // When adding a "newline" to the textCache, we need to
350 // append a "Whitespace" to the text pool.
351 if (NeedsWhiteSpace (textPool
))
355 //////////////////////////
private bool isFinished;

// Read-only view of the isFinished flag.
public bool IsFinished {
	get {
		return this.isFinished;
	}
}
363 protected void Finished ()
private bool has_error;

// Read-only view of the has_error flag.
public bool HasError {
	get {
		return this.has_error;
	}
}
374 protected void Error ()
376 Cleanup (); // force the clean-up of temporary files on an error
380 //////////////////////////
// Default dispatcher: forwards to the FileInfo or the DirectoryInfo
// overload of DoOpen, depending on the runtime type of info.  Any other
// value (including null) is ignored.
protected virtual void DoOpen (FileSystemInfo info)
{
	FileInfo file = info as FileInfo;
	if (file != null) {
		DoOpen (file);
		return;
	}

	DirectoryInfo dir = info as DirectoryInfo;
	if (dir != null)
		DoOpen (dir);
}
// Hook for filters that work on a file.  Does nothing by default.
protected virtual void DoOpen (FileInfo info)
{
}
// Hook for filters that work on a directory.  Does nothing by default.
protected virtual void DoOpen (DirectoryInfo info)
{
}
// Hook in which a filter supplies its properties.  Does nothing by
// default.
protected virtual void DoPullProperties ()
{
}
// Optional setup hook.  Does nothing by default.
protected virtual void DoPullSetup ()
{
}
// Hook that produces content.  The default implementation has nothing
// to extract, so it immediately calls Finished().
protected virtual void DoPull ()
{
	Finished ();
}
// Hook for releasing whatever the filter opened.  Does nothing by
// default.
protected virtual void DoClose ()
{
}
401 //////////////////////////
405 (1) DoOpen (FileInfo info) or DoOpen (Stream)
406 (2) DoPullProperties ()
408 At this point all properties must be in place
410 Once someone starts reading from the TextReader,
411 the following are called:
412 DoPull () [until Finished() is called]
413 DoClose () [when finished]
// State for the item currently being filtered.  tempFile is the path
// obtained from Path.GetTempFileName() by the Open (TextReader) and
// Open (Stream) overloads, and it is deleted by Cleanup().
private string tempFile;
private FileSystemInfo currentInfo;
private FileStream currentStream;
private StreamReader currentReader;
422 public bool Open (TextReader reader
)
424 tempFile
= Path
.GetTempFileName ();
425 FileStream file_stream
= File
.OpenWrite (tempFile
);
428 Logger
.Log
.Debug ("Storing text in tempFile {0}", tempFile
);
430 // When we dump the contents of a reader into a file, we
431 // expect to use it again soon.
432 FileAdvise
.PreLoad (file_stream
);
434 // Make sure the temporary file is only readable by the owner.
435 // FIXME: There is probably a race here. Could some malicious program
436 // do something to the file between creation and the chmod?
437 Mono
.Unix
.Native
.Syscall
.chmod (tempFile
, (Mono
.Unix
.Native
.FilePermissions
) 256);
439 BufferedStream buffered_stream
= new BufferedStream (file_stream
);
440 StreamWriter writer
= new StreamWriter (buffered_stream
);
442 const int BUFFER_SIZE
= 8192;
443 char [] buffer
= new char [BUFFER_SIZE
];
447 read
= reader
.Read (buffer
, 0, BUFFER_SIZE
);
449 writer
.Write (buffer
, 0, read
);
454 return Open (new FileInfo (tempFile
));
457 public bool Open (Stream stream
)
459 tempFile
= Path
.GetTempFileName ();
460 FileStream file_stream
= File
.OpenWrite (tempFile
);
463 Logger
.Log
.Debug ("Storing stream in tempFile {0}", tempFile
);
465 // When we dump the contents of a reader into a file, we
466 // expect to use it again soon.
467 FileAdvise
.PreLoad (file_stream
);
469 // Make sure the temporary file is only readable by the owner.
470 // FIXME: There is probably a race here. Could some malicious program
471 // do something to the file between creation and the chmod?
472 Mono
.Unix
.Native
.Syscall
.chmod (tempFile
, (Mono
.Unix
.Native
.FilePermissions
) 256);
474 BufferedStream buffered_stream
= new BufferedStream (file_stream
);
476 const int BUFFER_SIZE
= 8192;
477 byte [] buffer
= new byte [BUFFER_SIZE
];
481 read
= stream
.Read (buffer
, 0, BUFFER_SIZE
);
483 buffered_stream
.Write (buffer
, 0, read
);
486 buffered_stream
.Close ();
488 return Open (new FileInfo (tempFile
));
491 public bool Open (FileSystemInfo info
)
494 textPool
= new ArrayList ();
495 hotPool
= new ArrayList ();
496 propertyPool
= new ArrayList ();
500 if (info
is FileInfo
) {
501 // Open a stream for this file.
502 currentStream
= new FileStream (info
.FullName
,
508 // Our default assumption is sequential reads.
509 // FIXME: Is this the right thing to do here?
510 FileAdvise
.IncreaseReadAhead (currentStream
);
512 // Give the OS a hint that we will be reading this
514 FileAdvise
.PreLoad (currentStream
);
533 // Reset our TextReader
534 // Don't close the StreamReader, as
535 // that will also close the stream
536 if (currentReader
!= null) {
537 currentReader
.DiscardBufferedData ();
540 // Seek back to the beginning of our stream
541 currentStream
.Seek (0, SeekOrigin
.Begin
);
547 } catch (Exception e
) {
548 Log
.Warn (e
, "Unable to filter {0}:", info
.FullName
);
549 Cleanup (); // clean up temporary files on an exception
556 public bool Open (string path
)
558 if (File
.Exists (path
))
559 return Open (new FileInfo (path
));
560 else if (Directory
.Exists (path
))
561 return Open (new DirectoryInfo (path
));
// currentInfo viewed as a FileInfo; null when it is not a FileInfo.
public FileInfo FileInfo {
	get {
		return this.currentInfo as FileInfo;
	}
}
// currentInfo viewed as a DirectoryInfo; null when it is not a
// DirectoryInfo.
public DirectoryInfo DirectoryInfo {
	get {
		return this.currentInfo as DirectoryInfo;
	}
}
// The stream currently held in currentStream, typed as a plain Stream.
public Stream Stream {
	get {
		return this.currentStream;
	}
}
578 public TextReader TextReader
{
580 if (currentReader
== null
581 && currentStream
!= null) {
582 currentReader
= new StreamReader (currentStream
);
585 return currentReader
;
591 if (IsFinished
|| HasError
) {
604 private void Close ()
608 if (currentStream
== null)
613 // When crawling, give the OS a hint that we don't
614 // need to keep this file around in the page cache.
616 FileAdvise
.FlushCache (currentStream
);
618 if (currentReader
!= null)
619 currentReader
.Close ();
621 currentStream
.Close ();
622 currentStream
= null;
624 if (snippetWriter
!= null)
625 snippetWriter
.Close ();
628 public void Cleanup ()
630 if (tempFile
!= null) {
632 File
.Delete (tempFile
);
633 } catch (Exception ex
) {
634 // Just in case it is gone already
640 private bool PullFromArray (ArrayList array
, StringBuilder sb
)
642 while (array
.Count
== 0 && Pull ()) { }
644 // FIXME: Do we want to try to extract as much data as
645 // possible from the filter if we get an error, or
646 // should we just give up afterward entirely?
648 if (array
.Count
> 0) {
649 foreach (string str
in array
)
658 private bool PullTextCarefully (ArrayList array
, StringBuilder sb
)
662 pulled
= PullFromArray (array
, sb
);
663 } catch (Exception ex
) {
664 Logger
.Log
.Debug (ex
, "Caught exception while pulling text in filter '{0}'", Name
);
// Pull callback for regular text: delegates to PullTextCarefully()
// with the text pool and passes its result straight back.
private bool PullText (StringBuilder sb)
{
	bool pulled = PullTextCarefully (textPool, sb);
	return pulled;
}
// Pull callback for hot text: delegates to PullTextCarefully() with
// the hot pool and passes its result straight back.
private bool PullHotText (StringBuilder sb)
{
	bool pulled = PullTextCarefully (hotPool, sb);
	return pulled;
}
680 public TextReader
GetTextReader ()
682 PullingReader pr
= new PullingReader (new PullingReader
.Pull (PullText
));
683 pr
.Identifier
= Identifier
;
// Builds a new PullingReader whose pull delegate is PullHotText, and
// returns it as a TextReader.
public TextReader GetHotTextReader ()
{
	PullingReader.Pull pull_hot = new PullingReader.Pull (PullHotText);
	return new PullingReader (pull_hot);
}
// Exposes the property pool as an IEnumerable; there is no setter.
public IEnumerable Properties {
	get {
		return this.propertyPool;
	}
}
696 //////////////////////////////
698 // This is used primarily for the generation of URIs for the
699 // child indexables that can be created as a result of the
700 // filtering process.
702 private Uri uri
= null;
709 //////////////////////////////
// Child indexables added through AddChildIndexable() and
// AddChildIndexables().
private ArrayList child_indexables = new ArrayList ();

// Appends a single child indexable to the list.
protected void AddChildIndexable (Indexable indexable)
{
	child_indexables.Add (indexable);
}
// Appends every element of the given collection to the child
// indexable list, using ArrayList.AddRange.
protected void AddChildIndexables (ICollection indexables)
{
	child_indexables.AddRange (indexables);
}
723 public ArrayList ChildIndexables
{
724 get { return this.child_indexables; }