Don't reindex already indexed files. Yet another bug uncovered by the DateTime fixes.
[beagle.git] / BeagleClient / Indexable.cs
blob28bd8817ffb9f4806b10eb259a73d73c21127b98
1 //
2 // Indexable.cs
3 //
4 // Copyright (C) 2004 Novell, Inc.
5 //
7 //
8 // Permission is hereby granted, free of charge, to any person obtaining a
9 // copy of this software and associated documentation files (the "Software"),
10 // to deal in the Software without restriction, including without limitation
11 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 // and/or sell copies of the Software, and to permit persons to whom the
13 // Software is furnished to do so, subject to the following conditions:
15 // The above copyright notice and this permission notice shall be included in
16 // all copies or substantial portions of the Software.
18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 // DEALINGS IN THE SOFTWARE.
28 using System;
29 using System.Collections;
30 using System.IO;
31 using System.Text;
32 using System.Threading;
33 using System.Xml;
34 using System.Xml.Serialization;
35 using Beagle.Util;
37 namespace Beagle {
39 public enum IndexableType {
40 Add,
41 Remove,
42 PropertyChange
45 public enum IndexableFiltering {
46 Never, // Never try to filter this indexable, it contains no content
47 AlreadyFiltered, // The readers promise to return nice clean text, so do nothing
48 Automatic, // Try to determine automatically if this needs to be filtered
49 Always // Always try to filter this indexable
52 public class Indexable : Versioned, IComparable {
54 static private bool Debug = false;
56 // This is the type of indexing operation represented by
57 // this Indexable object. We default to Add, for historical
58 // reasons.
59 private IndexableType type = IndexableType.Add;
61 // The URI of the item being indexed.
62 private Uri uri = null;
64 // The URI of the parent indexable, if any.
65 private Uri parent_uri = null;
67 // The URI of the contents to index
68 private Uri contentUri = null;
70 // The URI of the hot contents to index
71 private Uri hotContentUri = null;
73 // Whether the content should be deleted after indexing
74 private bool deleteContent = false;
76 // File, WebLink, MailMessage, IMLog, etc.
77 private String hit_type = null;
79 // If applicable, otherwise set to null.
80 private String mimeType = null;
82 // The source backend that generated this indexable
83 private string source = null;
85 // List of Property objects
86 private ArrayList properties = new ArrayList ();
88 // Is this being indexed because of crawling or other
89 // background activity?
90 private bool crawled = true;
92 // Is this object inherently contentless?
93 private bool no_content = false;
95 // If necessary, should we cache this object's content?
96 // The cached version is used to generate snippets.
97 private bool cache_content = true;
99 // Is this indexable a child of another indexable ?
100 // If true, then parent_uri points to the uri of the parent
101 // However, an indexable can have parent_uri set but may not be a child
102 private bool is_child = false;
104 // A stream of the content to index
105 private TextReader textReader;
107 // A stream of the hot content to index
108 private TextReader hotTextReader;
110 // A stream of binary data to filter
111 private Stream binary_stream;
113 // When should we try to filter this indexable?
114 private IndexableFiltering filtering = IndexableFiltering.Automatic;
116 // Local state: these are key/value pairs that never get serialized
117 // into XML
118 Hashtable local_state = new Hashtable ();
120 //////////////////////////
122 static private XmlSerializer our_serializer;
124 static Indexable ()
126 our_serializer = new XmlSerializer (typeof (Indexable));
129 //////////////////////////
131 public Indexable (IndexableType type,
132 Uri uri)
134 this.type = type;
135 this.uri = uri;
136 this.hit_type = "File"; // FIXME: Why do we default to this?
139 public Indexable (Uri uri) : this (IndexableType.Add, uri)
142 public Indexable ()
144 // Only used when reading from xml
147 public static Indexable NewFromXml (string xml)
149 StringReader reader = new StringReader (xml);
150 return (Indexable) our_serializer.Deserialize (reader);
153 //////////////////////////
155 [XmlAttribute ("Type")]
156 public IndexableType Type {
157 get { return type; }
158 set { type = value; }
161 [XmlIgnore]
162 public Uri Uri {
163 get { return uri; }
164 set { uri = value; }
167 [XmlAttribute ("Uri")]
168 public string UriString {
169 get { return UriFu.UriToEscapedString (uri); }
170 set { uri = UriFu.EscapedStringToUri (value); }
173 [XmlIgnore]
174 public Uri ParentUri {
175 get { return parent_uri; }
176 set { parent_uri = value; }
179 [XmlAttribute ("ParentUri")]
180 public string ParentUriString {
181 get {
182 if (parent_uri == null)
183 return null;
185 return UriFu.UriToEscapedString (parent_uri);
188 set {
189 if (value == null)
190 parent_uri = null;
191 else
192 parent_uri = UriFu.EscapedStringToUri (value);
196 [XmlIgnore]
197 public Uri ContentUri {
198 get { return contentUri != null ? contentUri : Uri; }
199 set { contentUri = value; }
202 [XmlAttribute ("ContentUri")]
203 public string ContentUriString {
204 get { return UriFu.UriToEscapedString (ContentUri); }
205 set { contentUri = UriFu.EscapedStringToUri (value); }
208 [XmlIgnore]
209 private Uri HotContentUri {
210 get { return hotContentUri; }
211 set { hotContentUri = value; }
214 [XmlAttribute ("HotContentUri")]
215 public string HotContentUriString {
216 get { return HotContentUri != null ? UriFu.UriToEscapedString (HotContentUri) : ""; }
217 set { hotContentUri = (value != "") ? UriFu.EscapedStringToUri (value) : null; }
220 private Uri display_uri = null;
222 [XmlIgnore]
223 public Uri DisplayUri {
224 get { return display_uri != null ? display_uri : Uri; }
225 set { display_uri = value; }
228 [XmlAttribute ("DisplayUri")]
229 public string DisplayUriString {
230 get { return UriFu.UriToEscapedString (DisplayUri); }
231 set { DisplayUri = UriFu.EscapedStringToUri (value); }
234 [XmlAttribute]
235 public bool DeleteContent {
236 get { return deleteContent; }
237 set { deleteContent = value; }
240 [XmlAttribute]
241 public String HitType {
242 get { return hit_type; }
243 set { hit_type = value; }
246 [XmlAttribute]
247 public String MimeType {
248 get { return mimeType; }
249 set { mimeType = value; }
252 [XmlAttribute]
253 public string Source {
254 get { return source; }
255 set { source = value; }
258 [XmlIgnore]
259 public bool IsNonTransient {
260 /* Not transient if
261 * - content should not be deleted after indexing and
262 * - actual source of data (data might be stored in temporary files for indexing) is a file and
263 * - there is no parent uri set.
265 get { return ! DeleteContent && ContentUri.IsFile && ParentUri == null; }
268 [XmlAttribute]
269 public bool Crawled {
270 get { return crawled; }
271 set { crawled = value; }
274 [XmlAttribute]
275 public bool NoContent {
276 get { return no_content; }
277 set { no_content = value; }
280 [XmlAttribute]
281 public bool CacheContent {
282 get { return cache_content; }
283 set { cache_content = value; }
286 [XmlAttribute]
287 public IndexableFiltering Filtering {
288 get { return filtering; }
289 set { filtering = value; }
292 [XmlIgnore]
293 public IDictionary LocalState {
294 get { return local_state; }
297 [XmlAttribute]
298 public bool IsChild {
299 get { return is_child; }
300 set { is_child = value; }
303 //////////////////////////
305 public void Cleanup ()
307 if (DeleteContent) {
308 if (contentUri != null) {
309 if (Debug)
310 Logger.Log.Debug ("Cleaning up {0}", contentUri.LocalPath);
312 try {
313 File.Delete (contentUri.LocalPath);
314 } catch {
315 // It might be gone already, so catch the exception.
318 contentUri = null;
321 if (hotContentUri != null) {
322 if (Debug)
323 Logger.Log.Debug ("Cleaning up {0}", hotContentUri.LocalPath);
325 try {
326 File.Delete (hotContentUri.LocalPath);
327 } catch {
328 // Ditto
331 hotContentUri = null;
336 private Stream StreamFromUri (Uri uri)
338 Stream stream = null;
340 if (uri != null && uri.IsFile && ! no_content) {
341 stream = new FileStream (uri.LocalPath,
342 FileMode.Open,
343 FileAccess.Read,
344 FileShare.Read);
347 return stream;
350 private TextReader ReaderFromUri (Uri uri)
352 Stream stream = StreamFromUri (uri);
354 if (stream == null)
355 return null;
357 return new StreamReader (stream);
361 public TextReader GetTextReader ()
363 if (NoContent)
364 return null;
366 if (textReader == null)
367 textReader = ReaderFromUri (ContentUri);
369 return textReader;
372 public void SetTextReader (TextReader reader)
374 textReader = reader;
377 public TextReader GetHotTextReader ()
379 if (NoContent)
380 return null;
382 if (hotTextReader == null)
383 hotTextReader = ReaderFromUri (HotContentUri);
384 return hotTextReader;
387 public void SetHotTextReader (TextReader reader)
389 hotTextReader = reader;
392 public Stream GetBinaryStream ()
394 if (NoContent)
395 return null;
397 if (binary_stream == null)
398 binary_stream = StreamFromUri (ContentUri);
400 return binary_stream;
403 public void SetBinaryStream (Stream stream)
405 binary_stream = stream;
408 [XmlArrayItem (ElementName="Property", Type=typeof (Property))]
409 public ArrayList Properties {
410 get { return properties; }
413 public void AddProperty (Property prop) {
414 if (prop != null) {
416 if (type == IndexableType.PropertyChange && ! prop.IsMutable)
417 throw new ArgumentException ("Non-mutable properties aren't allowed in this indexable");
419 // If this is a mutable property, make sure that
420 // we don't already contain another mutable property
421 // with the same name. If we do, replace it.
422 if (prop.IsMutable) {
423 for (int i = 0; i < properties.Count; ++i) {
424 Property other_prop = properties [i] as Property;
425 if (other_prop.IsMutable && prop.Key == other_prop.Key) {
426 properties [i] = prop;
427 return;
432 properties.Add (prop);
436 public bool HasProperty (string keyword) {
437 foreach (Property property in properties)
438 if (property.Key == keyword)
439 return true;
441 return false;
444 // This doesn't check if it makes sense to actually
445 // merge the two indexables: it just does it.
446 public void Merge (Indexable other)
448 this.Timestamp = other.Timestamp;
450 foreach (Property prop in other.Properties)
451 this.AddProperty (prop);
453 foreach (DictionaryEntry entry in other.local_state)
454 this.local_state [entry.Key] = entry.Value;
457 //////////////////////////
459 public void SetChildOf (Indexable parent)
461 this.IsChild = true;
462 if (parent.IsChild)
463 this.ParentUri = parent.ParentUri;
464 else
465 this.ParentUri = parent.Uri;
467 if (!this.ValidTimestamp)
468 this.Timestamp = parent.Timestamp;
470 // FIXME: Set all of the parent's properties on the
471 // child so that we get matches against the child
472 // that otherwise would match only the parent, at
473 // least until we have proper RDF support.
475 if (parent.IsChild)
476 CopyPropertyChildToChild (parent);
477 else
478 CopyPropertyParentToChild (parent);
481 // FIXME: Copying the correct properties from parent to child:
482 // (This is not perfect yet)
483 // It does not make sense to have parent:parent:parent:...:parent:foo
484 // for property names of a nested child
485 // Moreover, if indexable a.mbox has child b.zip which has child c.zip,
486 // then upon matching c.zip, we would like to get the information from
487 // a.mbox (i.e. the toplevel indexable) only. Intermediate parent information
488 // is not necessary for displaying results; in fact, storing them would cause
489 // confusion during display.
490 // E.g. storing parent:beagle:filename for all parents
491 // would cause, parent:beagle:filename=a.mbox, parent.beagle.filename=b.zip
492 // whereas we are only interested in toplevel parent:beagle:filename=a.mbox
493 // For indexables which need to store the intermediate/immediate parent info
494 // separately, explicitly store them.
495 // Another problem is, toplevel indexable might want to store information
496 // which should not be matched when searching for its child. Copying those
497 // properties in all children will incorrectly match them.
500 private void CopyPropertyChildToChild (Indexable parent)
502 // If parent itself is a child,
503 // then only copy parents' parent:xxx and _private:xxx properties
504 foreach (Property prop in parent.Properties) {
506 if (prop.Key.StartsWith ("parent:") ||
507 prop.Key.StartsWith (Property.PrivateNamespace)) {
509 Property new_prop = (Property) prop.Clone ();
510 this.AddProperty (new_prop);
511 } else {
513 Property new_prop = (Property) prop.Clone ();
514 new_prop.IsStored = false;
515 this.AddProperty (new_prop);
520 private void CopyPropertyParentToChild (Indexable parent)
522 // Parent is a top level indexable
523 // Copy all properties
524 foreach (Property prop in parent.Properties) {
526 Property new_prop = (Property) prop.Clone ();
527 // Add parent: to property names ONLY IF
528 // - not private property (these are not properties of the file content)
529 // - property name does not already start with parent:
530 if (! new_prop.Key.StartsWith (Property.PrivateNamespace) &&
531 ! new_prop.Key.StartsWith ("parent:"))
532 new_prop.Key = "parent:" + new_prop.Key;
534 this.AddProperty (new_prop);
538 //////////////////////////
540 public override string ToString ()
542 StringWriter writer = new StringWriter ();
543 our_serializer.Serialize (writer, this);
544 writer.Close ();
545 return writer.ToString ();
548 //////////////////////////
550 const int BUFFER_SIZE = 8192;
552 private static char [] GetCharBuffer ()
554 LocalDataStoreSlot slot;
555 slot = Thread.GetNamedDataSlot ("Char Buffer");
557 object obj;
558 char [] buffer;
559 obj = Thread.GetData (slot);
560 if (obj == null) {
561 buffer = new char [BUFFER_SIZE];
562 Thread.SetData (slot, buffer);
563 } else {
564 buffer = (char []) obj;
567 return buffer;
570 private static byte [] GetByteBuffer ()
572 LocalDataStoreSlot slot;
573 slot = Thread.GetNamedDataSlot ("Byte Buffer");
575 object obj;
576 byte [] buffer;
577 obj = Thread.GetData (slot);
578 if (obj == null) {
579 buffer = new byte [BUFFER_SIZE];
580 Thread.SetData (slot, buffer);
581 } else {
582 buffer = (byte []) obj;
585 return buffer;
588 //////////////////////////
590 private static Uri TextReaderToTempFileUri (TextReader reader)
592 if (reader == null)
593 return null;
595 string filename = Path.GetTempFileName ();
596 FileStream fileStream = File.OpenWrite (filename);
598 // When we dump the contents of an indexable into a file, we
599 // expect to use it again soon.
600 FileAdvise.PreLoad (fileStream);
602 // Make sure the temporary file is only readable by the owner.
603 // FIXME: There is probably a race here. Could some malicious program
604 // do something to the file between creation and the chmod?
605 Mono.Unix.Native.Syscall.chmod (filename, (Mono.Unix.Native.FilePermissions) 256);
607 BufferedStream bufferedStream = new BufferedStream (fileStream);
608 StreamWriter writer = new StreamWriter (bufferedStream);
611 char [] buffer;
612 buffer = GetCharBuffer ();
614 int read;
615 do {
616 read = reader.Read (buffer, 0, buffer.Length);
617 if (read > 0)
618 writer.Write (buffer, 0, read);
619 } while (read > 0);
621 writer.Close ();
623 return UriFu.PathToFileUri (filename);
626 private static Uri BinaryStreamToTempFileUri (Stream stream)
628 if (stream == null)
629 return null;
631 string filename = Path.GetTempFileName ();
632 FileStream fileStream = File.OpenWrite (filename);
634 // When we dump the contents of an indexable into a file, we
635 // expect to use it again soon.
636 FileAdvise.PreLoad (fileStream);
638 // Make sure the temporary file is only readable by the owner.
639 // FIXME: There is probably a race here. Could some malicious program
640 // do something to the file between creation and the chmod?
641 Mono.Unix.Native.Syscall.chmod (filename, (Mono.Unix.Native.FilePermissions) 256);
643 BufferedStream bufferedStream = new BufferedStream (fileStream);
645 byte [] buffer;
646 buffer = GetByteBuffer ();
648 int read;
649 do {
650 read = stream.Read (buffer, 0, buffer.Length);
651 if (read > 0)
652 bufferedStream.Write (buffer, 0, read);
653 } while (read > 0);
655 bufferedStream.Close ();
657 return UriFu.PathToFileUri (filename);
660 public void StoreStream () {
661 if (textReader != null) {
662 ContentUri = TextReaderToTempFileUri (textReader);
664 if (Debug)
665 Logger.Log.Debug ("Storing text content from {0} in {1}", Uri, ContentUri);
667 DeleteContent = true;
668 } else if (binary_stream != null) {
669 ContentUri = BinaryStreamToTempFileUri (binary_stream);
671 if (Debug)
672 Logger.Log.Debug ("Storing binary content from {0} in {1}", Uri, ContentUri);
674 DeleteContent = true;
677 if (hotTextReader != null) {
678 HotContentUri = TextReaderToTempFileUri (hotTextReader);
680 if (Debug)
681 Logger.Log.Debug ("Storing hot content from {0} in {1}", Uri, HotContentUri);
683 DeleteContent = true;
687 public void CloseStreams ()
689 if (textReader != null)
690 textReader.Close ();
691 else if (binary_stream != null)
692 binary_stream.Close ();
694 if (hotTextReader != null)
695 hotTextReader.Close ();
698 //////////////////////////
700 public override int GetHashCode ()
702 return (uri != null ? uri.GetHashCode () : 0) ^ type.GetHashCode ();
705 public int CompareTo (object obj)
707 Indexable other = (Indexable) obj;
708 return DateTime.Compare (this.Timestamp, other.Timestamp);