4 // Copyright (C) 2004 Novell, Inc.
8 // Permission is hereby granted, free of charge, to any person obtaining a
9 // copy of this software and associated documentation files (the "Software"),
10 // to deal in the Software without restriction, including without limitation
11 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 // and/or sell copies of the Software, and to permit persons to whom the
13 // Software is furnished to do so, subject to the following conditions:
15 // The above copyright notice and this permission notice shall be included in
16 // all copies or substantial portions of the Software.
18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 // DEALINGS IN THE SOFTWARE.
29 using System
.Collections
;
32 using System
.Threading
;
34 using System
.Xml
.Serialization
;
39 public enum IndexableType
{
// Policy values describing when an indexable should be filtered.
public enum IndexableFiltering {
    // Never try to filter this indexable; it contains no content.
    Never = 0,

    // The readers promise to return nice clean text, so do nothing.
    AlreadyFiltered = 1,

    // Try to determine automatically whether this needs to be filtered.
    Automatic = 2,

    // Always try to filter this indexable.
    Always = 3
}
52 public class Indexable
: Versioned
, IComparable
{
// Static debug flag; initialised to false.
private static bool Debug = false;

// The kind of indexing operation this Indexable represents.
// Defaults to Add, for historical reasons.
private IndexableType type = IndexableType.Add;

// URI of the item being indexed.
private Uri uri = null;

// URI of the parent indexable, if there is one.
private Uri parent_uri = null;

// URI of the contents to index.
private Uri contentUri = null;

// URI of the hot contents to index.
private Uri hotContentUri = null;

// Whether the content should be deleted once indexing is done.
private bool deleteContent = false;

// File, WebLink, MailMessage, IMLog, etc.
private string hit_type = null;

// MIME type if applicable; otherwise null.
private string mimeType = null;

// The source backend that generated this indexable.
private string source = null;

// List of Property objects.
private ArrayList properties = new ArrayList ();

// Is this being indexed because of crawling or other
// background activity?
private bool crawled = true;

// Is this object inherently contentless?
private bool no_content = false;

// If necessary, should this object's content be cached?
// The cached version is used to generate snippets.
private bool cache_content = true;

// Is this indexable a child of another indexable?
// If true, parent_uri points to the uri of the parent.
// Note that an indexable can have parent_uri set without being a child.
private bool is_child = false;

// A stream of the content to index.
private TextReader textReader;

// A stream of the hot content to index.
private TextReader hotTextReader;

// A stream of binary data to filter.
private Stream binary_stream;

// When should we try to filter this indexable?
private IndexableFiltering filtering = IndexableFiltering.Automatic;

// Local state: key/value pairs that never get serialized.
private Hashtable local_state = new Hashtable ();
120 //////////////////////////
122 static private XmlSerializer our_serializer
;
126 our_serializer
= new XmlSerializer (typeof (Indexable
));
129 //////////////////////////
131 public Indexable (IndexableType type
,
136 this.hit_type
= "File"; // FIXME: Why do we default to this?
139 public Indexable (Uri uri
) : this (IndexableType
.Add
, uri
)
144 // Only used when reading from xml
// Builds an Indexable by deserializing the given XML text with the
// class-wide serializer.
//
// xml: the XML text to deserialize.
// Returns the deserialized object, cast to Indexable.
public static Indexable NewFromXml (string xml)
{
    // The reader is IDisposable; release it deterministically once
    // deserialization has finished (or failed).
    using (StringReader reader = new StringReader (xml)) {
        return (Indexable) our_serializer.Deserialize (reader);
    }
}
153 //////////////////////////
155 [XmlAttribute ("Type")]
156 public IndexableType Type
{
158 set { type = value; }
// String form of the uri field, serialized as the "Uri" XML attribute.
// Conversion in both directions is delegated to UriFu.
[XmlAttribute ("Uri")]
public string UriString {
    get {
        return UriFu.UriToEscapedString (this.uri);
    }
    set {
        this.uri = UriFu.EscapedStringToUri (value);
    }
}
// The URI of the parent indexable, if any (null when unset).
public Uri ParentUri {
    get {
        return this.parent_uri;
    }
    set {
        this.parent_uri = value;
    }
}
179 [XmlAttribute ("ParentUri")]
180 public string ParentUriString
{
182 if (parent_uri
== null)
185 return UriFu
.UriToEscapedString (parent_uri
);
192 parent_uri
= UriFu
.EscapedStringToUri (value);
// The URI of the contents to index.  When no content URI has been
// set explicitly, the getter falls back to the Uri property.
public Uri ContentUri {
    get {
        if (this.contentUri == null)
            return this.Uri;

        return this.contentUri;
    }
    set {
        this.contentUri = value;
    }
}
// String form of ContentUri, serialized as the "ContentUri" XML
// attribute.  The getter goes through the ContentUri property (and so
// sees its fallback); the setter writes the contentUri field directly.
[XmlAttribute ("ContentUri")]
public string ContentUriString {
    get {
        return UriFu.UriToEscapedString (this.ContentUri);
    }
    set {
        this.contentUri = UriFu.EscapedStringToUri (value);
    }
}
// The URI of the hot contents to index (null when unset).
private Uri HotContentUri {
    get {
        return this.hotContentUri;
    }
    set {
        this.hotContentUri = value;
    }
}
// String form of the hot content URI, serialized as the
// "HotContentUri" XML attribute.  An unset URI is represented by the
// empty string in both directions.
[XmlAttribute ("HotContentUri")]
public string HotContentUriString {
    get {
        if (HotContentUri != null)
            return UriFu.UriToEscapedString (HotContentUri);

        return "";
    }
    set {
        if (value != "")
            hotContentUri = UriFu.EscapedStringToUri (value);
        else
            hotContentUri = null;
    }
}
// Backing field for DisplayUri; null until one is assigned.
private Uri display_uri = null;

// The URI to display.  When none has been assigned, the getter falls
// back to the Uri property.
public Uri DisplayUri {
    get {
        if (this.display_uri == null)
            return this.Uri;

        return this.display_uri;
    }
    set {
        this.display_uri = value;
    }
}
// String form of DisplayUri, serialized as the "DisplayUri" XML
// attribute.  Both accessors go through the DisplayUri property.
[XmlAttribute ("DisplayUri")]
public string DisplayUriString {
    get {
        return UriFu.UriToEscapedString (this.DisplayUri);
    }
    set {
        this.DisplayUri = UriFu.EscapedStringToUri (value);
    }
}
// Whether the content should be deleted after indexing.
public bool DeleteContent {
    get {
        return this.deleteContent;
    }
    set {
        this.deleteContent = value;
    }
}
// The hit type: File, WebLink, MailMessage, IMLog, etc.
public String HitType {
    get {
        return this.hit_type;
    }
    set {
        this.hit_type = value;
    }
}
// The MIME type if applicable; otherwise null.
public String MimeType {
    get {
        return this.mimeType;
    }
    set {
        this.mimeType = value;
    }
}
// The source backend that generated this indexable.
public string Source {
    get {
        return this.source;
    }
    set {
        this.source = value;
    }
}
// True only when all of the following hold:
//  - content should not be deleted after indexing, and
//  - the actual source of data (data might be stored in temporary
//    files for indexing) is a file, and
//  - there is no parent uri set.
public bool IsNonTransient {
    get {
        // Checks run in the same order as the conditions above and
        // stop at the first one that fails.
        if (DeleteContent)
            return false;

        if (! ContentUri.IsFile)
            return false;

        return ParentUri == null;
    }
}
// Is this being indexed because of crawling or other background
// activity?
public bool Crawled {
    get {
        return this.crawled;
    }
    set {
        this.crawled = value;
    }
}
// Is this object inherently contentless?
public bool NoContent {
    get {
        return this.no_content;
    }
    set {
        this.no_content = value;
    }
}
// If necessary, should this object's content be cached?
public bool CacheContent {
    get {
        return this.cache_content;
    }
    set {
        this.cache_content = value;
    }
}
// When should we try to filter this indexable?
public IndexableFiltering Filtering {
    get {
        return this.filtering;
    }
    set {
        this.filtering = value;
    }
}
// Read-only access to the local_state table, exposed as an
// IDictionary.  There is no setter.
public IDictionary LocalState {
    get {
        return this.local_state;
    }
}
// Is this indexable a child of another indexable?
public bool IsChild {
    get {
        return this.is_child;
    }
    set {
        this.is_child = value;
    }
}
303 //////////////////////////
305 public void Cleanup ()
308 if (contentUri
!= null) {
310 Logger
.Log
.Debug ("Cleaning up {0}", contentUri
.LocalPath
);
313 File
.Delete (contentUri
.LocalPath
);
315 // It might be gone already, so catch the exception.
321 if (hotContentUri
!= null) {
323 Logger
.Log
.Debug ("Cleaning up {0}", hotContentUri
.LocalPath
);
326 File
.Delete (hotContentUri
.LocalPath
);
331 hotContentUri
= null;
336 private Stream
StreamFromUri (Uri uri
)
338 Stream stream
= null;
340 if (uri
!= null && uri
.IsFile
&& ! no_content
) {
341 stream
= new FileStream (uri
.LocalPath
,
350 private TextReader
ReaderFromUri (Uri uri
)
352 Stream stream
= StreamFromUri (uri
);
357 return new StreamReader (stream
);
361 public TextReader
GetTextReader ()
366 if (textReader
== null)
367 textReader
= ReaderFromUri (ContentUri
);
372 public void SetTextReader (TextReader reader
)
377 public TextReader
GetHotTextReader ()
382 if (hotTextReader
== null)
383 hotTextReader
= ReaderFromUri (HotContentUri
);
384 return hotTextReader
;
// Stores the given reader in the hotTextReader field, replacing any
// previous value.
public void SetHotTextReader (TextReader reader)
{
    this.hotTextReader = reader;
}
392 public Stream
GetBinaryStream ()
397 if (binary_stream
== null)
398 binary_stream
= StreamFromUri (ContentUri
);
400 return binary_stream
;
// Stores the given stream in the binary_stream field, replacing any
// previous value.
public void SetBinaryStream (Stream stream)
{
    this.binary_stream = stream;
}
// The list of Property objects attached to this indexable.  Each item
// is serialized as a "Property" element.  There is no setter.
[XmlArrayItem (ElementName="Property", Type=typeof (Property))]
public ArrayList Properties {
    get {
        return this.properties;
    }
}
413 public void AddProperty (Property prop
) {
416 if (type
== IndexableType
.PropertyChange
&& ! prop
.IsMutable
)
417 throw new ArgumentException ("Non-mutable properties aren't allowed in this indexable");
419 // If this is a mutable property, make sure that
420 // we don't already contain another mutable property
421 // with the same name. If we do, replace it.
422 if (prop
.IsMutable
) {
423 for (int i
= 0; i
< properties
.Count
; ++i
) {
424 Property other_prop
= properties
[i
] as Property
;
425 if (other_prop
.IsMutable
&& prop
.Key
== other_prop
.Key
) {
426 properties
[i
] = prop
;
432 properties
.Add (prop
);
436 public bool HasProperty (string keyword
) {
437 foreach (Property property
in properties
)
438 if (property
.Key
== keyword
)
// Folds another indexable into this one.  No check is made that
// merging the two makes sense: it is simply done.
//
// The other indexable's Timestamp overwrites ours, each of its
// properties is passed to AddProperty, and every entry of its local
// state is copied over ours (replacing entries with the same key).
public void Merge (Indexable other)
{
    Timestamp = other.Timestamp;

    foreach (Property incoming in other.Properties) {
        AddProperty (incoming);
    }

    foreach (DictionaryEntry pair in other.local_state) {
        local_state [pair.Key] = pair.Value;
    }
}
457 //////////////////////////
459 public void SetChildOf (Indexable parent
)
463 this.ParentUri
= parent
.ParentUri
;
465 this.ParentUri
= parent
.Uri
;
467 if (!this.ValidTimestamp
)
468 this.Timestamp
= parent
.Timestamp
;
470 // FIXME: Set all of the parent's properties on the
471 // child so that we get matches against the child
472 // that otherwise would match only the parent, at
473 // least until we have proper RDF support.
476 CopyPropertyChildToChild (parent
);
478 CopyPropertyParentToChild (parent
);
481 // FIXME: Copying the correct properties from parent to child:
482 // (This is not perfect yet)
483 // It does not make sense to have parent:parent:parent:...:parent:foo
484 // for property names of a nested child
485 // Moreover, if indexable a.mbox has child b.zip which has child c.zip,
486 // then upon matching c.zip, we would like to get the information from
487 // a.mbox (i.e. the toplevel indexable) only. Intermediate parent information
488 // is not necessary for displaying results; in fact, storing them would cause
489 // confusion during display.
// E.g. storing parent:beagle:filename for every parent would produce both
// parent:beagle:filename=a.mbox and parent:beagle:filename=b.zip,
// whereas we are only interested in the toplevel parent:beagle:filename=a.mbox
493 // For indexables which need to store the intermediate/immediate parent info
494 // separately, explicitly store them.
495 // Another problem is, toplevel indexable might want to store information
496 // which should not be matched when searching for its child. Copying those
497 // properties in all children will incorrectly match them.
500 private void CopyPropertyChildToChild (Indexable parent
)
502 // If parent itself is a child,
503 // then only copy parents' parent:xxx and _private:xxx properties
504 foreach (Property prop
in parent
.Properties
) {
506 if (prop
.Key
.StartsWith ("parent:") ||
507 prop
.Key
.StartsWith (Property
.PrivateNamespace
)) {
509 Property new_prop
= (Property
) prop
.Clone ();
510 this.AddProperty (new_prop
);
513 Property new_prop
= (Property
) prop
.Clone ();
514 new_prop
.IsStored
= false;
515 this.AddProperty (new_prop
);
// Copies every property of a top-level parent onto this indexable.
//
// Each property is cloned first.  The clone's key gets a "parent:"
// prefix unless it is a private property (those are not properties of
// the file content) or its name already starts with "parent:".
private void CopyPropertyParentToChild (Indexable parent)
{
    foreach (Property source_prop in parent.Properties) {
        Property copy = (Property) source_prop.Clone ();

        // Keys in the private namespace, and keys that are already
        // prefixed, keep their name unchanged.
        bool keep_name = copy.Key.StartsWith (Property.PrivateNamespace)
            || copy.Key.StartsWith ("parent:");

        if (! keep_name)
            copy.Key = "parent:" + copy.Key;

        AddProperty (copy);
    }
}
538 //////////////////////////
// Returns this object serialized to XML text by the class-wide
// serializer.
public override string ToString ()
{
    // The writer is IDisposable; release it deterministically even if
    // serialization throws.
    using (StringWriter writer = new StringWriter ()) {
        our_serializer.Serialize (writer, this);
        return writer.ToString ();
    }
}
548 //////////////////////////
// Length, in elements, of the scratch buffers allocated by
// GetCharBuffer and GetByteBuffer.
private const int BUFFER_SIZE = 8192;
552 private static char [] GetCharBuffer ()
554 LocalDataStoreSlot slot
;
555 slot
= Thread
.GetNamedDataSlot ("Char Buffer");
559 obj
= Thread
.GetData (slot
);
561 buffer
= new char [BUFFER_SIZE
];
562 Thread
.SetData (slot
, buffer
);
564 buffer
= (char []) obj
;
570 private static byte [] GetByteBuffer ()
572 LocalDataStoreSlot slot
;
573 slot
= Thread
.GetNamedDataSlot ("Byte Buffer");
577 obj
= Thread
.GetData (slot
);
579 buffer
= new byte [BUFFER_SIZE
];
580 Thread
.SetData (slot
, buffer
);
582 buffer
= (byte []) obj
;
588 //////////////////////////
590 private static Uri
TextReaderToTempFileUri (TextReader reader
)
595 string filename
= Path
.GetTempFileName ();
596 FileStream fileStream
= File
.OpenWrite (filename
);
598 // When we dump the contents of an indexable into a file, we
599 // expect to use it again soon.
600 FileAdvise
.PreLoad (fileStream
);
602 // Make sure the temporary file is only readable by the owner.
603 // FIXME: There is probably a race here. Could some malicious program
604 // do something to the file between creation and the chmod?
605 Mono
.Unix
.Native
.Syscall
.chmod (filename
, (Mono
.Unix
.Native
.FilePermissions
) 256);
607 BufferedStream bufferedStream
= new BufferedStream (fileStream
);
608 StreamWriter writer
= new StreamWriter (bufferedStream
);
612 buffer
= GetCharBuffer ();
616 read
= reader
.Read (buffer
, 0, buffer
.Length
);
618 writer
.Write (buffer
, 0, read
);
623 return UriFu
.PathToFileUri (filename
);
626 private static Uri
BinaryStreamToTempFileUri (Stream stream
)
631 string filename
= Path
.GetTempFileName ();
632 FileStream fileStream
= File
.OpenWrite (filename
);
634 // When we dump the contents of an indexable into a file, we
635 // expect to use it again soon.
636 FileAdvise
.PreLoad (fileStream
);
638 // Make sure the temporary file is only readable by the owner.
639 // FIXME: There is probably a race here. Could some malicious program
640 // do something to the file between creation and the chmod?
641 Mono
.Unix
.Native
.Syscall
.chmod (filename
, (Mono
.Unix
.Native
.FilePermissions
) 256);
643 BufferedStream bufferedStream
= new BufferedStream (fileStream
);
646 buffer
= GetByteBuffer ();
650 read
= stream
.Read (buffer
, 0, buffer
.Length
);
652 bufferedStream
.Write (buffer
, 0, read
);
655 bufferedStream
.Close ();
657 return UriFu
.PathToFileUri (filename
);
660 public void StoreStream () {
661 if (textReader
!= null) {
662 ContentUri
= TextReaderToTempFileUri (textReader
);
665 Logger
.Log
.Debug ("Storing text content from {0} in {1}", Uri
, ContentUri
);
667 DeleteContent
= true;
668 } else if (binary_stream
!= null) {
669 ContentUri
= BinaryStreamToTempFileUri (binary_stream
);
672 Logger
.Log
.Debug ("Storing binary content from {0} in {1}", Uri
, ContentUri
);
674 DeleteContent
= true;
677 if (hotTextReader
!= null) {
678 HotContentUri
= TextReaderToTempFileUri (hotTextReader
);
681 Logger
.Log
.Debug ("Storing hot content from {0} in {1}", Uri
, HotContentUri
);
683 DeleteContent
= true;
687 public void CloseStreams ()
689 if (textReader
!= null)
691 else if (binary_stream
!= null)
692 binary_stream
.Close ();
694 if (hotTextReader
!= null)
695 hotTextReader
.Close ();
698 //////////////////////////
// Hash code combining the uri field's hash (0 when the uri is null)
// with the hash of the type field, using XOR.
public override int GetHashCode ()
{
    int uri_hash = 0;
    if (uri != null)
        uri_hash = uri.GetHashCode ();

    return uri_hash ^ type.GetHashCode ();
}
// Orders indexables by their Timestamp, using DateTime.Compare.
//
// obj: the object to compare against; must be an Indexable or null.
// Returns a negative value, zero or a positive value when this
// instance's Timestamp is earlier than, equal to or later than the
// other's.  A null argument yields 1.
// Throws InvalidCastException when obj is not an Indexable.
public int CompareTo (object obj)
{
    // IComparable contract: any instance compares greater than null.
    // Without this guard a null argument failed with a
    // NullReferenceException on other.Timestamp.
    if (obj == null)
        return 1;

    Indexable other = (Indexable) obj;
    return DateTime.Compare (this.Timestamp, other.Timestamp);
}