cvsimport
[beagle.git] / beagled / LuceneCommon.cs
blob0b58cde2fadcfbeeb4530944f14f1b4bc6d1a129
1 //
2 // LuceneCommon.cs
3 //
4 // Copyright (C) 2004-2005 Novell, Inc.
5 //
7 //
8 // Permission is hereby granted, free of charge, to any person obtaining a
9 // copy of this software and associated documentation files (the "Software"),
10 // to deal in the Software without restriction, including without limitation
11 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 // and/or sell copies of the Software, and to permit persons to whom the
13 // Software is furnished to do so, subject to the following conditions:
15 // The above copyright notice and this permission notice shall be included in
16 // all copies or substantial portions of the Software.
18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 // DEALINGS IN THE SOFTWARE.
27 using System;
28 using System.Collections;
29 using System.Diagnostics;
30 using System.Globalization;
31 using System.IO;
32 using System.Text;
33 using System.Threading;
34 using System.Xml;
35 using System.Xml.Serialization;
37 using Lucene.Net.Analysis;
38 using Lucene.Net.Analysis.Standard;
39 using Lucene.Net.Documents;
40 using Lucene.Net.Index;
41 using Lucene.Net.QueryParsers;
42 using LNS = Lucene.Net.Search;
44 using Beagle.Util;
46 namespace Beagle.Daemon {
48 public class LuceneCommon {
// Predicate used to post-filter hits produced by a Lucene query;
// returns true if 'hit' should be kept.
public delegate bool HitFilter (Hit hit);
// VERSION HISTORY
// ---------------
// 1: Original
// 2: Changed format of timestamp strings
// 3: Schema changed to be more Dashboard-Match-like
// 4: Schema changed for files to include _Directory property
// 5: Changed analyzer to support stemming.  Bumped version # to
//    force everyone to re-index.
// 6: lots of schema changes as part of the general refactoring
// 7: incremented to force a re-index after our upgrade to lucene 1.4
//    (in theory the file formats are compatible, we are seeing 'term
//    out of order' exceptions in some cases)
// 8: another forced re-index, this time because of massive changes
//    in the file system backend (it would be nice to have per-backend
//    versioning so that we didn't have to purge all indexes just
//    because one changed)
// 9: changed the way properties are stored, changed in conjunction
//    with sane handling of multiple properties on hits.
// 10: changed to support typed and mutable properties
// 11: moved mime type and hit type into properties
// 12: added year-month and year-month-day resolutions for all
//     date properties
// 13: moved source into a property
// 14: allow wildcard queries to also match keywords
// 15: analyze PropertyKeyword field, and store all properties as
//     lower case so that we're truly case insensitive.
// 16: add inverted timestamp to make querying substantially faster
// 17: add boolean property to denote a child indexable
private const int MAJOR_VERSION = 17;
private int minor_version = 0;

private string index_name;
private string top_dir;

private string fingerprint;
// Cached document count; -1 means "not yet known".
private int last_item_count = -1;

// This is the big index, containing document full-texts and
// data that is expensive to index.
private Lucene.Net.Store.Directory primary_store = null;

// This is the small index, containing document info that we
// expect to have change.  Canonical example: file names.
private Lucene.Net.Store.Directory secondary_store = null;
//////////////////////////////////////////////////////////////////////////////

// 'index_name' may be a bare name (placed under PathFinder.IndexDir)
// or an absolute path, which is used as-is.
protected LuceneCommon (string index_name, int minor_version)
{
	this.index_name = index_name;
	this.minor_version = minor_version;

	this.top_dir = Path.IsPathRooted (index_name)
		? index_name
		: Path.Combine (PathFinder.IndexDir, index_name);
}
//////////////////////////////////////////////////////////////////////////////

// Simple read-only views of the index identity and its stores.

protected string IndexName { get { return index_name; } }

public Lucene.Net.Store.Directory PrimaryStore { get { return primary_store; } }

public Lucene.Net.Store.Directory SecondaryStore { get { return secondary_store; } }

public string Fingerprint { get { return fingerprint; } }

public string TopDirectory { get { return top_dir; } }

//////////////////////////////////////////////////////////////////////////////
// Optional cache of document full text, settable by subclasses/owners.
protected TextCache text_cache = null;

public TextCache TextCache {
	get { return text_cache; }
	set { text_cache = value; }
}
//////////////////////////////////////////////////////////////////////////////

// Well-known files and directories inside 'top_dir'.

private string VersionFile {
	get { return Path.Combine (top_dir, "version"); }
}

private string FingerprintFile {
	get { return Path.Combine (top_dir, "fingerprint"); }
}

// Shouldn't really be public
public string PrimaryIndexDirectory {
	get { return Path.Combine (top_dir, "PrimaryIndex"); }
}

// Shouldn't really be public
public string SecondaryIndexDirectory {
	get { return Path.Combine (top_dir, "SecondaryIndex"); }
}

public string LockDirectory {
	get { return Path.Combine (top_dir, "Locks"); }
}

//////////////////////////////////////////////////////////////////////////////
// Deal with dangling locks

// Returns true if 'info' looks like a Lucene lock file left behind by
// a process that is gone (or that is no longer an IndexHelper).
// Fix: the StreamReaders are now wrapped in 'using' so they are closed
// even when ReadLine throws (the old code leaked the handle in that case).
private bool IsDanglingLock (FileInfo info)
{
	Log.Debug ("Checking for dangling locks...");

	// It isn't even a lock file
	if (! info.Name.EndsWith (".lock"))
		return false;

	string pid = null;

	try {
		using (StreamReader reader = new StreamReader (info.FullName))
			pid = reader.ReadLine ();
	} catch {
		// We couldn't read the lockfile, so it probably went away.
		return false;
	}

	if (pid == null) {
		// Looks like the lock file was empty, which really
		// shouldn't happen.  It should contain the PID of
		// the process which locked it.  Lets be on the safe
		// side and assume it's a dangling lock.
		Log.Warn ("Found an empty lock file, that shouldn't happen: {0}", info.FullName);
		return true;
	}

	string cmdline_file = String.Format ("/proc/{0}/cmdline", pid);

	string cmdline = "";
	try {
		using (StreamReader reader = new StreamReader (cmdline_file))
			cmdline = reader.ReadLine ();
	} catch {
		// If we can't open that file, either:
		// (1) The process doesn't exist
		// (2) It does exist, but it doesn't belong to us.
		//     Thus it isn't an IndexHelper
		// In either case, the lock is dangling --- if it
		// still exists.
		return info.Exists;
	}

	// The process exists, but isn't an IndexHelper.
	// If the lock file is still there, it is dangling.
	// FIXME: During one run of bludgeon I got a null reference
	// exception here, so I added the cmdline == null check.
	// Why exactly would that happen?  Is this logic correct
	// in that (odd and presumably rare) case?
	if (cmdline == null || cmdline.IndexOf ("IndexHelper.exe") == -1)
		return info.Exists;

	// If we reach this point, we know:
	// (1) The process still exists
	// (2) We own it
	// (3) It is an IndexHelper process
	// Thus it almost certainly isn't a dangling lock.
	// The process might be wedged, but that is
	// another issue...
	return false;
}
// Returns true if a usable index of the expected version exists on disk.
// Fixes: the version-file reader is now disposed deterministically, and a
// corrupt/empty version file (null line or non-numeric text) now causes the
// index to be declared non-existent instead of throwing out of this method.
protected bool Exists ()
{
	if (! (Directory.Exists (top_dir)
	       && File.Exists (VersionFile)
	       && File.Exists (FingerprintFile)
	       && Directory.Exists (PrimaryIndexDirectory)
	       && IndexReader.IndexExists (PrimaryIndexDirectory)
	       && Directory.Exists (SecondaryIndexDirectory)
	       && IndexReader.IndexExists (SecondaryIndexDirectory)
	       && Directory.Exists (LockDirectory)))
		return false;

	// Check the index's version number.  If it is wrong,
	// declare the index non-existent.

	string version_str;
	using (StreamReader version_reader = new StreamReader (VersionFile))
		version_str = version_reader.ReadLine ();

	if (version_str == null)
		return false; // empty version file: treat as corrupt

	int current_major_version, current_minor_version;
	int i = version_str.IndexOf ('.');

	try {
		if (i != -1) {
			current_major_version = Convert.ToInt32 (version_str.Substring (0, i));
			current_minor_version = Convert.ToInt32 (version_str.Substring (i+1));
		} else {
			current_minor_version = Convert.ToInt32 (version_str);
			current_major_version = 0;
		}
	} catch (FormatException) {
		// Unparsable version file: treat the index as corrupt.
		return false;
	}

	if (current_major_version != MAJOR_VERSION
	    || (minor_version >= 0 && current_minor_version != minor_version)) {
		Logger.Log.Debug ("Version mismatch in {0}", index_name);
		Logger.Log.Debug ("Index has version {0}.{1}, expected {2}.{3}",
				  current_major_version, current_minor_version,
				  MAJOR_VERSION, minor_version);
		return false;
	}

	// Check the lock directory: If there is a dangling write lock,
	// assume that the index is corrupted and declare it non-existent.
	DirectoryInfo lock_dir_info = new DirectoryInfo (LockDirectory);
	foreach (FileInfo info in lock_dir_info.GetFiles ()) {
		if (IsDanglingLock (info)) {
			Logger.Log.Warn ("Found a dangling index lock on {0}", info.FullName);
			return false;
		}
	}

	return true;
}
// Creates a brand-new, empty Lucene index at 'path' and returns its store.
private Lucene.Net.Store.Directory CreateIndex (string path)
{
	// Create a directory to put the index in.
	Directory.CreateDirectory (path);

	// Create a new store.
	Lucene.Net.Store.Directory store =
		Lucene.Net.Store.FSDirectory.GetDirectory (path, LockDirectory, true);

	// Create an empty index in that store.
	IndexWriter writer = new IndexWriter (store, null, true);
	writer.Close ();

	return store;
}
// Create will kill your index dead.  Use it with care.
// You don't need to call Open after calling Create.
// Fix: the fingerprint/version writers are wrapped in 'using' so the
// files are flushed and closed even if a write throws mid-way.
protected void Create ()
{
	if (minor_version < 0)
		minor_version = 0;

	// Purge any existing directories.
	if (Directory.Exists (top_dir)) {
		Logger.Log.Debug ("Purging {0}", top_dir);
		Directory.Delete (top_dir, true);
	}

	// Create any necessary directories.
	Directory.CreateDirectory (top_dir);
	Directory.CreateDirectory (LockDirectory);

	// Create the indexes.
	primary_store = CreateIndex (PrimaryIndexDirectory);
	secondary_store = CreateIndex (SecondaryIndexDirectory);

	// Generate and store the index fingerprint.
	fingerprint = GuidFu.ToShortString (Guid.NewGuid ());
	using (TextWriter writer = new StreamWriter (FingerprintFile, false))
		writer.WriteLine (fingerprint);

	// Store our index version information.
	using (TextWriter writer = new StreamWriter (VersionFile, false))
		writer.WriteLine ("{0}.{1}", MAJOR_VERSION, minor_version);
}
// Opens an existing index read/write.
protected void Open ()
{
	Open (false);
}

// Opens an existing index; pass read_only_mode = true for query-only use.
// Fix: the fingerprint reader is disposed via 'using' so the file handle
// is released even when ReadLine throws.
protected void Open (bool read_only_mode)
{
	// Read our index fingerprint.
	using (TextReader reader = new StreamReader (FingerprintFile))
		fingerprint = reader.ReadLine ();

	// Create stores for our indexes.
	primary_store = Lucene.Net.Store.FSDirectory.GetDirectory (PrimaryIndexDirectory, LockDirectory, false, read_only_mode);
	secondary_store = Lucene.Net.Store.FSDirectory.GetDirectory (SecondaryIndexDirectory, LockDirectory, false, read_only_mode);
}
////////////////////////////////////////////////////////////////

// Custom Analyzers

// A TokenStream that yields its string as a single token, then is exhausted.
private class SingletonTokenStream : TokenStream {

	private string singleton_str;

	public SingletonTokenStream (string singleton_str)
	{
		this.singleton_str = singleton_str;
	}

	override public Lucene.Net.Analysis.Token Next ()
	{
		// After the one token is emitted, the stream is empty.
		if (singleton_str == null)
			return null;

		Lucene.Net.Analysis.Token token =
			new Lucene.Net.Analysis.Token (singleton_str, 0, singleton_str.Length);
		singleton_str = null;
		return token;
	}
}
// FIXME: This assumes everything being indexed is in English!
internal class BeagleAnalyzer : StandardAnalyzer {

	private char [] buffer = new char [2];
	private bool strip_extra_property_info = false;
	private bool tokenize_email_hostname = false;

	public BeagleAnalyzer (bool is_indexing_analyzer)
	{
		// Indexing analyzers strip the stored type prefix from property
		// values and tokenize e-mail hostnames; query analyzers do neither.
		this.strip_extra_property_info = is_indexing_analyzer;
		this.tokenize_email_hostname = is_indexing_analyzer;
	}

	public override TokenStream TokenStream (string fieldName, TextReader reader)
	{
		bool is_text_prop = false;

		// Strip off the first two characters in a property.
		// We store type information in those two characters, so we don't
		// want to index them.
		if (fieldName.StartsWith ("prop:")) {

			if (strip_extra_property_info) {
				// Skip everything up to and including the first :
				int c;
				do {
					c = reader.Read ();
				} while (c != -1 && c != ':');
			}

			is_text_prop = fieldName.StartsWith ("prop:t");

			// If this is non-text property, just return one token
			// containing the entire string.  We do this to avoid
			// tokenizing keywords.
			if (! is_text_prop) {
				// We don't want to lower case the token if it's
				// not in the private namespace.
				TokenStream singleton_stream = new SingletonTokenStream (reader.ReadToEnd ());

				if (fieldName.StartsWith ("prop:k:" + Property.PrivateNamespace))
					return singleton_stream;
				else
					return new LowerCaseFilter (singleton_stream);
			}
		} else if (fieldName == "PropertyKeyword")
			return new LowerCaseFilter (new SingletonTokenStream (reader.ReadToEnd ()));

		TokenStream outstream = base.TokenStream (fieldName, reader);

		// Full-text fields additionally get noise/email filtering and stemming.
		if (fieldName == "Text"
		    || fieldName == "HotText"
		    || fieldName == "PropertyText"
		    || is_text_prop) {
			outstream = new NoiseEmailHostFilter (outstream, tokenize_email_hostname);
			outstream = new PorterStemFilter (outstream);
		}

		return outstream;
	}
}
// Shared analyzer instances: one configured for indexing, one for queries.
static private Analyzer indexing_analyzer = new BeagleAnalyzer (true);
static private Analyzer query_analyzer = new BeagleAnalyzer (false);

static protected Analyzer IndexingAnalyzer { get { return indexing_analyzer; } }
static protected Analyzer QueryAnalyzer { get { return query_analyzer; } }
////////////////////////////////////////////////////////////////

// Dealing with properties

// Maps a property type to the one-character code embedded in field names.
static private char TypeToCode (PropertyType type)
{
	switch (type) {
	case PropertyType.Text:    return 't';
	case PropertyType.Keyword: return 'k';
	case PropertyType.Date:    return 'd';
	}

	throw new Exception ("Bad property type: " + type);
}
// Inverse of TypeToCode: maps a one-character code back to a property type.
static private PropertyType CodeToType (char c)
{
	switch (c) {
	case 't': return PropertyType.Text;
	case 'k': return PropertyType.Keyword;
	case 'd': return PropertyType.Date;
	}

	throw new Exception ("Bad property code: " + c);
}
// Name of the catch-all field that indexes every property of a given type.
static private string TypeToWildcardField (PropertyType type)
{
	switch (type) {
	case PropertyType.Text:    return "PropertyText";
	case PropertyType.Keyword: return "PropertyKeyword";
	case PropertyType.Date:    return "PropertyDate";
	}

	throw new Exception ("Bad property type: " + type);
}
// Chooses how a property value of the given type is indexed.
static private Field.Index TypeToIndexInstruction (PropertyType type)
{
	switch (type) {
	case PropertyType.Text:    return Field.Index.TOKENIZED; // Full analysis
	case PropertyType.Keyword: return Field.Index.TOKENIZED; // Lowercases keywords
	case PropertyType.Date:    return Field.Index.NO_NORMS;  // Do nothing
	}

	throw new Exception ("Bad property type: " + type);
}
// Exposing this is a little bit suspicious.
// Builds the Lucene field name for a property: "prop:<type-code>:<key>".
static protected string PropertyToFieldName (PropertyType type, string key)
{
	return String.Format ("prop:{0}:{1}", TypeToCode (type), key);
}
// Adds coarser-grained companions (year-month and day) for a date
// property so date-range queries can be decomposed into cheap terms.
static private void AddDateFields (string field_name, Property prop, Document doc)
{
	DateTime dt = StringFu.StringToDateTime (prop.Value);

	doc.Add (new Field ("YM:" + field_name,
			    StringFu.DateTimeToYearMonthString (dt),
			    Field.Store.NO,
			    Field.Index.NO_NORMS));

	doc.Add (new Field ("D:" + field_name,
			    StringFu.DateTimeToDayString (dt),
			    Field.Store.NO,
			    Field.Index.NO_NORMS));
}
// Writes one property into 'doc': a wildcard (per-type) field when the
// property is searched, plus a per-key field whose value is prefixed with
// 's:' or '_:' to record searchability.  Null/empty values are ignored.
static protected void AddPropertyToDocument (Property prop, Document doc)
{
	if (prop == null || prop.Value == null || prop.Value == String.Empty)
		return;

	// Don't actually put properties in the UnindexedNamespace
	// in the document.  A horrible (and yet lovely!) hack.
	if (prop.Key.StartsWith (StringFu.UnindexedNamespace))
		return;

	Field f;

	if (prop.IsSearched) {
		string wildcard_field = TypeToWildcardField (prop.Type);

		f = new Field (wildcard_field,
			       prop.Value,
			       Field.Store.NO,
			       TypeToIndexInstruction (prop.Type));

		// We don't want to include norms for non-text
		// fields, even if we do tokenize them.
		if (prop.Type == PropertyType.Keyword || prop.Type == PropertyType.Date)
			f.SetOmitNorms (true);

		doc.Add (f);

		if (prop.Type == PropertyType.Date)
			AddDateFields (wildcard_field, prop, doc);
	}

	// Encode searchability into the stored value: "s:<value>" or "_:<value>".
	string coded_value = String.Format ("{0}:{1}",
					    prop.IsSearched ? 's' : '_',
					    prop.Value);

	string field_name = PropertyToFieldName (prop.Type, prop.Key);

	f = new Field (field_name,
		       coded_value,
		       prop.IsStored ? Field.Store.YES : Field.Store.NO,
		       Field.Index.TOKENIZED);
	doc.Add (f);

	if (prop.Type == PropertyType.Date)
		AddDateFields (field_name, prop, doc);
}
// Reconstructs a Property from a "prop:<code>:<key>" field.  Returns null
// for non-property fields.  Mutability is inferred from which index the
// field came from (secondary index fields are mutable).
static protected Property GetPropertyFromDocument (Field f, Document doc, bool from_primary_index)
{
	// Note: we don't use the document that we pass in,
	// but in theory we could.  At some later point we
	// might need to split a property's data across two or
	// more fields in the document.

	if (f == null)
		return null;

	string field_name = f.Name ();
	if (field_name.Length < 7
	    || ! field_name.StartsWith ("prop:"))
		return null;

	string field_value = f.StringValue ();

	Property prop = new Property ();
	prop.Type = CodeToType (field_name [5]);        // "prop:X:..." -> X
	prop.Key = field_name.Substring (7);            // everything after "prop:X:"
	prop.Value = field_value.Substring (2);         // strip "s:" / "_:" prefix
	prop.IsSearched = (field_value [0] == 's');
	prop.IsMutable = ! from_primary_index;
	prop.IsStored = f.IsStored ();

	return prop;
}
//////////////////////////////////////////////////////////////////////////////

// Dealing with documents

// Builds the Lucene documents for an Indexable.  'primary_doc' always
// comes back non-null; 'secondary_doc' is created lazily, only when the
// indexable carries mutable properties.
static protected void BuildDocuments (Indexable indexable,
				      out Document primary_doc,
				      out Document secondary_doc)
{
	primary_doc = new Document ();
	secondary_doc = null;

	Field f;

	f = new Field ("Uri", UriFu.UriToEscapedString (indexable.Uri),
		       Field.Store.YES, Field.Index.NO_NORMS);
	primary_doc.Add (f);

	if (indexable.ParentUri != null) {
		f = new Field ("ParentUri", UriFu.UriToEscapedString (indexable.ParentUri),
			       Field.Store.YES, Field.Index.NO_NORMS);
		primary_doc.Add (f);
	}

	if (indexable.ValidTimestamp) {
		// Note that we also want to search in the
		// Timestamp field when we do a wildcard date
		// query, so that's why we also add a wildcard
		// field for each item here.

		string wildcard_field = TypeToWildcardField (PropertyType.Date);

		string str = StringFu.DateTimeToString (indexable.Timestamp);
		f = new Field ("Timestamp", str, Field.Store.YES, Field.Index.NO_NORMS);
		primary_doc.Add (f);
		f = new Field (wildcard_field, str, Field.Store.NO, Field.Index.NO_NORMS);
		primary_doc.Add (f);

		// Create an inverted timestamp so that we can
		// sort by timestamp at search-time.
		long timeval = Convert.ToInt64 (str);
		f = new Field ("InvertedTimestamp", (Int64.MaxValue - timeval).ToString (),
			       Field.Store.NO, Field.Index.NO_NORMS);
		primary_doc.Add (f);

		str = StringFu.DateTimeToYearMonthString (indexable.Timestamp);
		f = new Field ("YM:Timestamp", str, Field.Store.YES, Field.Index.NO_NORMS);
		primary_doc.Add (f);
		f = new Field ("YM:" + wildcard_field, str,
			       Field.Store.NO, Field.Index.NO_NORMS);
		primary_doc.Add (f);

		str = StringFu.DateTimeToDayString (indexable.Timestamp);
		f = new Field ("D:Timestamp", str, Field.Store.YES, Field.Index.NO_NORMS);
		primary_doc.Add (f);
		f = new Field ("D:" + wildcard_field, str,
			       Field.Store.NO, Field.Index.NO_NORMS);
		primary_doc.Add (f);
	}

	if (indexable.NoContent) {
		// If there is no content, make a note of that
		// in a special property.
		Property no_content_prop = Property.NewBool ("beagle:NoContent", true);
		AddPropertyToDocument (no_content_prop, primary_doc);
	} else {
		// Since we might have content, add our text
		// readers.

		TextReader reader;

		reader = indexable.GetTextReader ();
		if (reader != null) {
			f = new Field ("Text", reader);
			primary_doc.Add (f);
		}

		reader = indexable.GetHotTextReader ();
		if (reader != null) {
			f = new Field ("HotText", reader);
			primary_doc.Add (f);
		}
	}

	// Store the Type and MimeType in special properties

	if (indexable.HitType != null) {
		Property hit_type_prop = Property.NewUnsearched ("beagle:HitType", indexable.HitType);
		AddPropertyToDocument (hit_type_prop, primary_doc);
	}

	if (indexable.MimeType != null) {
		Property mime_type_prop = Property.NewUnsearched ("beagle:MimeType", indexable.MimeType);
		AddPropertyToDocument (mime_type_prop, primary_doc);
	}

	if (indexable.Source != null) {
		Property source_prop = Property.NewUnsearched ("beagle:Source", indexable.Source);
		AddPropertyToDocument (source_prop, primary_doc);
	}

	Property is_child_prop = Property.NewBool (Property.IsChildPropKey, indexable.IsChild);
	AddPropertyToDocument (is_child_prop, primary_doc);

	// Store the other properties

	foreach (Property prop in indexable.Properties) {
		Document target_doc = primary_doc;
		if (prop.IsMutable) {
			// Mutable properties live in the (lazily created) secondary doc.
			if (secondary_doc == null)
				secondary_doc = CreateSecondaryDocument (indexable.Uri, indexable.ParentUri);

			target_doc = secondary_doc;
		}

		AddPropertyToDocument (prop, target_doc);
	}
}
// Builds the skeleton of a secondary-index document: just the Uri and
// (when present) the ParentUri.
static private Document CreateSecondaryDocument (Uri uri, Uri parent_uri)
{
	Document secondary_doc = new Document ();

	Field f = new Field ("Uri", UriFu.UriToEscapedString (uri), Field.Store.YES, Field.Index.NO_NORMS);
	secondary_doc.Add (f);

	if (parent_uri != null) {
		// Store both Uri and ParentUri in secondary index for easy removal
		f = new Field ("ParentUri", UriFu.UriToEscapedString (parent_uri), Field.Store.YES, Field.Index.NO_NORMS);
		secondary_doc.Add (f);
	}

	return secondary_doc;
}
// Builds a replacement secondary-index document: properties from the
// property-only indexable win; any property in 'old_secondary_doc' whose
// key was not mentioned is carried over unchanged.
static protected Document RewriteDocument (Document old_secondary_doc,
					   Indexable prop_only_indexable)
{
	Hashtable seen_props = new Hashtable ();

	Document new_doc = new Document ();

	Field uri_f;
	uri_f = new Field ("Uri", UriFu.UriToEscapedString (prop_only_indexable.Uri), Field.Store.YES, Field.Index.NO_NORMS);
	new_doc.Add (uri_f);

	Logger.Log.Debug ("Rewriting {0}", prop_only_indexable.DisplayUri);

	if (prop_only_indexable.ParentUri != null) {
		uri_f = new Field ("ParentUri", UriFu.UriToEscapedString (prop_only_indexable.ParentUri), Field.Store.YES, Field.Index.NO_NORMS);
		new_doc.Add (uri_f);
		Logger.Log.Debug ("Parent Uri {0}", prop_only_indexable.ParentUri);
	}

	// Add the new properties to the new document.  To
	// delete a property, set the Value to null... then it
	// will be added to seen_props (so the old value will
	// be ignored below), but AddPropertyToDocument will
	// return w/o doing anything.
	foreach (Property prop in prop_only_indexable.Properties) {
		seen_props [prop.Key] = prop;

		// Don't add properties that are empty; they
		// essentially mean "reset this property"
		if (prop.Value == String.Empty) {
			Logger.Log.Debug ("Resetting prop '{0}'", prop.Key);
			continue;
		}

		AddPropertyToDocument (prop, new_doc);
		Logger.Log.Debug ("New prop '{0}' = '{1}'", prop.Key, prop.Value);
	}

	// Copy the other properties from the old document to the
	// new one, skipping any properties that we got new values
	// for out of the Indexable.
	if (old_secondary_doc != null) {
		foreach (Field f in old_secondary_doc.Fields ()) {
			Property prop = GetPropertyFromDocument (f, old_secondary_doc, false);
			if (prop != null && ! seen_props.Contains (prop.Key)) {
				Logger.Log.Debug ("Old prop '{0}' = '{1}'", prop.Key, prop.Value);
				AddPropertyToDocument (prop, new_doc);
			}
		}
	}

	return new_doc;
}
// Extracts the mandatory "Uri" field from a document; throws if absent.
static protected Uri GetUriFromDocument (Document doc)
{
	string uri = doc.Get ("Uri");
	if (uri == null)
		throw new Exception ("Got document from Lucene w/o a URI!");
	return UriFu.EscapedStringToUri (uri);
}
// Converts a primary-index document into a Hit, including its properties
// and the special beagle:HitType / beagle:MimeType / beagle:Source values.
static protected Hit DocumentToHit (Document doc)
{
	Hit hit = new Hit ();

	hit.Uri = GetUriFromDocument (doc);

	string parent = doc.Get ("ParentUri");
	if (parent != null)
		hit.ParentUri = UriFu.EscapedStringToUri (parent);

	hit.Timestamp = StringFu.StringToDateTime (doc.Get ("Timestamp"));

	AddPropertiesToHit (hit, doc, true);

	// Get the Type and MimeType from the properties.
	hit.Type = hit.GetFirstProperty ("beagle:HitType");
	hit.MimeType = hit.GetFirstProperty ("beagle:MimeType");
	hit.Source = hit.GetFirstProperty ("beagle:Source");

	return hit;
}
// Copies every property field in 'doc' onto 'hit'; non-property fields
// are ignored (GetPropertyFromDocument returns null for them).
static protected void AddPropertiesToHit (Hit hit, Document doc, bool from_primary_index)
{
	foreach (Field f in doc.Fields ()) {
		Property prop = GetPropertyFromDocument (f, doc, from_primary_index);
		if (prop != null)
			hit.AddProperty (prop);
	}
}
855 // Handle the index's item count
858 public int GetItemCount ()
860 if (last_item_count < 0) {
861 IndexReader reader;
862 reader = GetReader (PrimaryStore);
863 last_item_count = reader.NumDocs ();
864 ReleaseReader (reader);
866 return last_item_count;
869 // We should set the cached count of index items when IndexReaders
870 // are open and available, so calls to GetItemCount will return immediately.
872 protected bool HaveItemCount { get { return last_item_count >= 0; } }
874 protected void SetItemCount (IndexReader reader)
876 last_item_count = reader.NumDocs ();
879 public void SetItemCount (int count)
881 last_item_count = count;
884 protected void AdjustItemCount (int delta)
886 if (last_item_count >= 0)
887 last_item_count += delta;
//////////////////////////////////////////////////////////////////////////////

// Access to the stemmer and list of stop words

// NOTE(review): single shared stemmer instance — confirm PorterStemmer
// is safe for concurrent callers.
static PorterStemmer stemmer = new PorterStemmer ();

// Returns the Porter stem of 'str'.
static public string Stem (string str)
{
	return stemmer.Stem (str);
}

// True if the (already stemmed) word is one of Lucene's English stop words.
public static bool IsStopWord (string stemmed_word)
{
	return ArrayFu.IndexOfString (StopAnalyzer.ENGLISH_STOP_WORDS, stemmed_word) != -1;
}
//////////////////////////////////////////////////////////////////////////////

// Special Hit Filtering classes

// A filter that accepts every hit.
static private bool TrueHitFilter (Hit hit)
{
	return true;
}

// Canonical always-true filter; OrHitFilter short-circuits on it.
static private HitFilter true_hit_filter = new HitFilter (TrueHitFilter);
// Combines filters with OR semantics; short-circuits entirely when the
// known-true filter has been added.
public class OrHitFilter {

	private ArrayList all = new ArrayList ();
	private bool contains_known_true = false;

	public void Add (HitFilter hit_filter)
	{
		if (hit_filter == true_hit_filter)
			contains_known_true = true;
		all.Add (hit_filter);
	}

	// True if any contained filter accepts the hit.
	public bool HitFilter (Hit hit)
	{
		if (contains_known_true)
			return true;

		foreach (HitFilter hit_filter in all) {
			if (hit_filter (hit))
				return true;
		}

		return false;
	}
}
// Combines filters with AND semantics: a hit passes only if every
// contained filter accepts it.
public class AndHitFilter {

	private ArrayList all = new ArrayList ();

	public void Add (HitFilter hit_filter)
	{
		all.Add (hit_filter);
	}

	public bool HitFilter (Hit hit)
	{
		foreach (HitFilter hit_filter in all) {
			if (! hit_filter (hit))
				return false;
		}

		return true;
	}
}
// Negates another filter.
public class NotHitFilter {

	HitFilter original;

	public NotHitFilter (HitFilter original)
	{
		this.original = original;
	}

	public bool HitFilter (Hit hit)
	{
		return ! original (hit);
	}
}
//////////////////////////////////////////////////////////////////////////////

// Queries

// Analyzes 'text' with the query analyzer and builds a PhraseQuery from
// the resulting tokens.  Returns null if no tokens survive analysis.
// Each Term is also appended to 'term_list' when one is supplied.
static private LNS.Query StringToQuery (string field_name,
					string text,
					ArrayList term_list)
{
	ArrayList tokens = new ArrayList ();

	// Use the analyzer to extract the query's tokens.
	// This code is taken from Lucene's query parser.
	TokenStream source = QueryAnalyzer.TokenStream (field_name, new StringReader (text));
	while (true) {
		Lucene.Net.Analysis.Token token;
		try {
			token = source.Next ();
			if (token == null)
				break;
		} catch (IOException) {
			break;
		}
		if (token != null)
			tokens.Add (token.TermText ());
	}
	try {
		source.Close ();
	} catch (IOException) {
		// ignore
	}

	if (tokens.Count == 0)
		return null;

	LNS.PhraseQuery query = new LNS.PhraseQuery ();

	foreach (string token in tokens) {
		Term term = new Term (field_name, token);
		query.Add (term);
		if (term_list != null)
			term_list.Add (term);
	}

	return query;
}
// Date Range Handling

// This function will break down dates to discrete chunks of
// time to avoid expanding RangeQuerys as much as possible.
// For example, searching for
//
// YMD(5 May 2005, 16 Oct 2006)
//
// would break down into three queries:
//
// (YM(May 2005) AND D(5,31)) OR
// YM(Jun 2005, Sep 2006) OR
// (YM(Oct 2006) AND D(1,16))

// Dates outside [unix epoch, end of 2038] are clamped before querying.
static private DateTime lower_bound = DateTimeUtil.UnixToDateTimeUtc (0);

// FIXME: we should probably boost this sometime around 2030.
// Mark your calendar.
static private DateTime upper_bound = new DateTime (2038, 12, 31);
// Term for the year-month ("YM:") companion field, e.g. "200505".
static private Term NewYearMonthTerm (string field_name, int y, int m)
{
	return new Term ("YM:" + field_name, String.Format ("{0}{1:00}", y, m));
}

static private LNS.Query NewYearMonthQuery (string field_name, int y, int m)
{
	return new LNS.TermQuery (NewYearMonthTerm (field_name, y, m));
}

static private LNS.Query NewYearMonthQuery (string field_name, int y1, int m1, int y2, int m2)
{
	return new LNS.RangeQuery (NewYearMonthTerm (field_name, y1, m1),
				   NewYearMonthTerm (field_name, y2, m2),
				   true); // query is inclusive
}

// Term for the day-of-month ("D:") companion field, zero-padded.
static private Term NewDayTerm (string field_name, int d)
{
	return new Term ("D:" + field_name, String.Format ("{0:00}", d));
}

static private LNS.Query NewDayQuery (string field_name, int d1, int d2)
{
	return new LNS.RangeQuery (NewDayTerm (field_name, d1),
				   NewDayTerm (field_name, d2),
				   true); // query is inclusive
}
// Post-filters hits for a date-range query part: verifies that the
// timestamp and/or the relevant date properties really fall within
// [StartDate, EndDate].
private class DateRangeHitFilter {
	public string Key;
	public DateTime StartDate;
	public DateTime EndDate;

	public bool HitFilter (Hit hit)
	{
		// First, check the Timestamp
		if (Key == QueryPart_DateRange.AllPropertiesKey
		    || Key == QueryPart_DateRange.TimestampKey) {
			DateTime dt = hit.Timestamp;
			if (StartDate <= dt && dt <= EndDate)
				return true;
			if (Key == QueryPart_DateRange.TimestampKey)
				return false;
		}

		if (Key == QueryPart_DateRange.AllPropertiesKey) {
			// Walk through all of the properties, and see if any
			// date properties fall inside the range.
			foreach (Property prop in hit.Properties) {
				if (prop.Type == PropertyType.Date) {
					DateTime dt = StringFu.StringToDateTime (prop.Value);
					if (StartDate <= dt && dt <= EndDate)
						return true;
				}
			}
			return false;
		} else {
			// Walk through all of the properties with the given key,
			// and see if any of them fall inside of the range.
			string[] values = hit.GetProperties (Key);
			foreach (string v in values) {
				DateTime dt = StringFu.StringToDateTime (v);
				if (StartDate <= dt && dt <= EndDate)
					return true;
			}
			return false;
		}
	}
}
1122 static private LNS.Query GetDateRangeQuery (QueryPart_DateRange part, out HitFilter hit_filter)
1124 string field_name;
1125 if (part.Key == QueryPart_DateRange.AllPropertiesKey)
1126 field_name = TypeToWildcardField (PropertyType.Date);
1127 else if (part.Key == QueryPart_DateRange.TimestampKey)
1128 field_name = "Timestamp";
1129 else
1130 field_name = PropertyToFieldName (PropertyType.Date, part.Key);
1132 // FIXME: We could optimize this and reduce the size of our range
1133 // queries if we actually new the min and max date that appear in
1134 // any properties in the index. We would need to inspect the index to
1135 // determine that at start-up, and then track it as new documents
1136 // get added to the index.
1137 if (part.StartDate < lower_bound)
1138 part.StartDate = lower_bound;
1139 if (part.EndDate > upper_bound || part.EndDate == DateTime.MinValue)
1140 part.EndDate = upper_bound;
1142 // Swap the start and end dates if they come in reversed.
1143 if (part.StartDate > part.EndDate) {
1144 DateTime swap;
1145 swap = part.StartDate;
1146 part.StartDate = part.EndDate;
1147 part.EndDate = swap;
1150 // Set up our hit filter to cull out the bad dates.
1151 DateRangeHitFilter drhf;
1152 drhf = new DateRangeHitFilter ();
1153 drhf.Key = part.Key;
1154 drhf.StartDate = part.StartDate;
1155 drhf.EndDate = part.EndDate;
1156 hit_filter = new HitFilter (drhf.HitFilter);
1158 Logger.Log.Debug ("Building new date range query");
1159 Logger.Log.Debug ("Start: {0}", part.StartDate);
1160 Logger.Log.Debug ("End: {0}", part.EndDate);
1162 int y1, m1, d1, y2, m2, d2;
1163 y1 = part.StartDate.Year;
1164 m1 = part.StartDate.Month;
1165 d1 = part.StartDate.Day;
1166 y2 = part.EndDate.Year;
1167 m2 = part.EndDate.Month;
1168 d2 = part.EndDate.Day;
1170 LNS.BooleanQuery top_level_query;
1171 top_level_query = new LNS.BooleanQuery ();
1173 // A special case: both the start and the end of our range fall
1174 // in the same month.
1175 if (y1 == y2 && m1 == m2) {
1176 LNS.Query ym_query;
1177 ym_query = NewYearMonthQuery (field_name, y1, m1);
1179 // If our range only covers a part of the month, do a range query on the days.
1180 if (d1 != 1 || d2 != DateTime.DaysInMonth (y2, m2)) {
1181 LNS.BooleanQuery sub_query;
1182 sub_query = new LNS.BooleanQuery ();
1183 sub_query.Add (ym_query, true, false);
1184 sub_query.Add (NewDayQuery (field_name, d1, d2), true, false);
1185 top_level_query.Add (sub_query, false, false);
1186 } else {
1187 top_level_query.Add (ym_query, false, false);
1190 } else {
1192 // Handle a partial month at the beginning of our range.
1193 if (d1 > 1) {
1194 LNS.BooleanQuery sub_query;
1195 sub_query = new LNS.BooleanQuery ();
1196 sub_query.Add (NewYearMonthQuery (field_name, y1, m1), true, false);
1197 sub_query.Add (NewDayQuery (field_name, d1, DateTime.DaysInMonth (y1, m1)), true, false);
1198 top_level_query.Add (sub_query, false, false);
1200 ++m1;
1201 if (m1 == 13) {
1202 m1 = 1;
1203 ++y1;
1207 // And likewise, handle a partial month at the end of our range.
1208 if (d2 < DateTime.DaysInMonth (y2, m2)) {
1209 LNS.BooleanQuery sub_query;
1210 sub_query = new LNS.BooleanQuery ();
1211 sub_query.Add (NewYearMonthQuery (field_name, y2, m2), true, false);
1212 sub_query.Add (NewDayQuery (field_name, 1, d2), true, false);
1213 top_level_query.Add (sub_query, false, false);
1215 --m2;
1216 if (m2 == 0) {
1217 m2 = 12;
1218 --y2;
1222 // Generate the query for the "middle" of our period, if it is non-empty
1223 if (y1 < y2 || ((y1 == y2) && m1 <= m2))
1224 top_level_query.Add (NewYearMonthQuery (field_name, y1, m1, y2, m2),
1225 false, false);
1228 return top_level_query;
1231 // search_subset_uris is a list of Uris that this search should be
1232 // limited to.
1233 static protected void QueryPartToQuery (QueryPart abstract_part,
1234 bool only_build_primary_query,
1235 ArrayList term_list,
1236 out LNS.Query primary_query,
1237 out LNS.Query secondary_query,
1238 out HitFilter hit_filter)
1240 primary_query = null;
1241 secondary_query = null;
1243 // By default, we assume that our lucene queries will return exactly the
1244 // matching set of objects. We need to set the hit filter if further
1245 // refinement of the search results is required. (As in the case of
1246 // date range queries, for example.) We essentially have to do this
1247 // to make OR queries work correctly.
1248 hit_filter = true_hit_filter;
1250 // The exception is when dealing with a prohibited part. Just return
1251 // null for the hit filter in that case. This works since
1252 // prohibited parts are not allowed inside of OR queries.
1253 if (abstract_part.Logic == QueryPartLogic.Prohibited)
1254 hit_filter = null;
1256 if (abstract_part == null)
1257 return;
1259 if (abstract_part is QueryPart_Text) {
1260 QueryPart_Text part = (QueryPart_Text) abstract_part;
1262 if (! (part.SearchFullText || part.SearchTextProperties))
1263 return;
1265 LNS.BooleanQuery p_query = new LNS.BooleanQuery ();
1266 LNS.BooleanQuery s_query = new LNS.BooleanQuery ();
1268 if (part.SearchFullText) {
1269 LNS.Query subquery;
1270 subquery = StringToQuery ("Text", part.Text, term_list);
1271 if (subquery != null)
1272 p_query.Add (subquery, false, false);
1274 // FIXME: HotText is ignored for now!
1275 // subquery = StringToQuery ("HotText", part.Text);
1276 // if (subquery != null)
1277 // p_query.Add (subquery, false, false);
1280 if (part.SearchTextProperties) {
1281 LNS.Query subquery;
1282 subquery = StringToQuery ("PropertyText", part.Text, term_list);
1283 if (subquery != null) {
1284 p_query.Add (subquery, false, false);
1285 // Properties can live in either index
1286 if (! only_build_primary_query)
1287 s_query.Add (subquery.Clone () as LNS.Query, false, false);
1290 Term term;
1291 term = new Term ("PropertyKeyword", part.Text.ToLower ()); // make sure text is lowercased
1292 // FIXME: terms are already added in term_list. But they may have been tokenized
1293 // The term here is non-tokenized version. Should this be added to term_list ?
1294 // term_list is used to calculate scores
1295 if (term_list != null)
1296 term_list.Add (term);
1297 subquery = new LNS.TermQuery (term);
1298 p_query.Add (subquery, false, false);
1299 // Properties can live in either index
1300 if (! only_build_primary_query)
1301 s_query.Add (subquery.Clone () as LNS.Query, false, false);
1304 primary_query = p_query;
1305 if (! only_build_primary_query)
1306 secondary_query = s_query;
1308 return;
1311 if (abstract_part is QueryPart_Wildcard) {
1312 QueryPart_Wildcard part = (QueryPart_Wildcard) abstract_part;
1314 LNS.BooleanQuery p_query = new LNS.BooleanQuery ();
1315 LNS.BooleanQuery s_query = new LNS.BooleanQuery ();
1317 Term term;
1318 LNS.Query subquery;
1320 // Lower case the terms for searching
1321 string query_string_lower = part.QueryString.ToLower ();
1323 // Search text content
1324 term = new Term ("Text", query_string_lower);
1325 subquery = new LNS.WildcardQuery (term);
1326 p_query.Add (subquery, false, false);
1327 term_list.Add (term);
1329 // Search text properties
1330 term = new Term ("PropertyText", query_string_lower);
1331 subquery = new LNS.WildcardQuery (term);
1332 p_query.Add (subquery, false, false);
1333 // Properties can live in either index
1334 if (! only_build_primary_query)
1335 s_query.Add (subquery.Clone () as LNS.Query, false, false);
1336 term_list.Add (term);
1338 // Search property keywords
1339 term = new Term ("PropertyKeyword", query_string_lower);
1340 term_list.Add (term);
1341 subquery = new LNS.WildcardQuery (term);
1342 p_query.Add (subquery, false, false);
1343 // Properties can live in either index
1344 if (! only_build_primary_query)
1345 s_query.Add (subquery.Clone () as LNS.Query, false, false);
1347 primary_query = p_query;
1348 if (! only_build_primary_query)
1349 secondary_query = s_query;
1351 return;
1354 if (abstract_part is QueryPart_Property) {
1355 QueryPart_Property part = (QueryPart_Property) abstract_part;
1357 string field_name;
1358 if (part.Key == QueryPart_Property.AllProperties)
1359 field_name = TypeToWildcardField (part.Type);
1360 else
1361 field_name = PropertyToFieldName (part.Type, part.Key);
1363 if (part.Type == PropertyType.Text)
1364 primary_query = StringToQuery (field_name, part.Value, term_list);
1365 else {
1366 Term term;
1367 term = new Term (field_name, part.Value.ToLower ());
1368 if (term_list != null)
1369 term_list.Add (term);
1370 primary_query = new LNS.TermQuery (term);
1373 // Properties can live in either index
1374 if (! only_build_primary_query && primary_query != null)
1375 secondary_query = primary_query.Clone () as LNS.Query;
1377 return;
1380 if (abstract_part is QueryPart_DateRange) {
1382 QueryPart_DateRange part = (QueryPart_DateRange) abstract_part;
1384 primary_query = GetDateRangeQuery (part, out hit_filter);
1385 // Date properties can live in either index
1386 if (! only_build_primary_query && primary_query != null)
1387 secondary_query = primary_query.Clone () as LNS.Query;
1389 // If this is a prohibited part, invert our hit filter.
1390 if (part.Logic == QueryPartLogic.Prohibited) {
1391 NotHitFilter nhf;
1392 nhf = new NotHitFilter (hit_filter);
1393 hit_filter = new HitFilter (nhf.HitFilter);
1396 return;
1399 if (abstract_part is QueryPart_Or) {
1400 QueryPart_Or part = (QueryPart_Or) abstract_part;
1402 // Assemble a new BooleanQuery combining all of the sub-parts.
1403 LNS.BooleanQuery p_query;
1404 p_query = new LNS.BooleanQuery ();
1406 LNS.BooleanQuery s_query = null;
1407 if (! only_build_primary_query)
1408 s_query = new LNS.BooleanQuery ();
1410 primary_query = p_query;
1411 secondary_query = s_query;
1413 OrHitFilter or_hit_filter = null;
1415 foreach (QueryPart sub_part in part.SubParts) {
1416 LNS.Query p_subq, s_subq;
1417 HitFilter sub_hit_filter; // FIXME: This is (and must be) ignored
1418 // FIXME: Any subpart in an OR which has a hit filter won't work
1419 // correctly, because we can't tell which part of an OR we matched
1420 // against to filter correctly. This affects date range queries.
1421 QueryPartToQuery (sub_part, only_build_primary_query,
1422 term_list,
1423 out p_subq, out s_subq, out sub_hit_filter);
1424 if (p_subq != null)
1425 p_query.Add (p_subq, false, false);
1426 if (s_subq != null)
1427 s_query.Add (s_subq, false, false);
1428 if (sub_hit_filter != null) {
1429 if (or_hit_filter == null)
1430 or_hit_filter = new OrHitFilter ();
1431 or_hit_filter.Add (sub_hit_filter);
1435 if (or_hit_filter != null)
1436 hit_filter = new HitFilter (or_hit_filter.HitFilter);
1438 return;
1441 throw new Exception ("Unhandled QueryPart type! " + abstract_part.ToString ());
1444 static protected LNS.Query UriQuery (string field_name, Uri uri)
1446 return new LNS.TermQuery (new Term (field_name, UriFu.UriToEscapedString (uri)));
1449 static protected LNS.Query UriQuery (string field_name, ICollection uri_list)
1451 return UriQuery (field_name, uri_list, null);
1454 static protected LNS.Query UriQuery (string field_name, ICollection uri_list, LNS.Query extra_requirement)
1456 if (uri_list.Count == 0)
1457 return null;
1459 int max_clauses;
1460 max_clauses = LNS.BooleanQuery.GetMaxClauseCount ();
1462 int N;
1463 N = 1 + (uri_list.Count - 1) / max_clauses;
1465 LNS.BooleanQuery top_query;
1466 top_query = new LNS.BooleanQuery ();
1468 int cursor = 0;
1469 if (extra_requirement != null) {
1470 top_query.Add (extra_requirement, true, false);
1471 ++cursor;
1474 ArrayList bottom_queries = null;
1476 if (N > 1) {
1477 bottom_queries = new ArrayList ();
1478 for (int i = 0; i < N; ++i) {
1479 LNS.BooleanQuery bq;
1480 bq = new LNS.BooleanQuery ();
1481 bottom_queries.Add (bq);
1482 top_query.Add (bq, false, false);
1486 foreach (Uri uri in uri_list) {
1487 LNS.Query subquery;
1488 subquery = UriQuery (field_name, uri);
1490 LNS.BooleanQuery target;
1491 if (N == 1)
1492 target = top_query;
1493 else {
1494 target = (LNS.BooleanQuery) bottom_queries [cursor];
1495 ++cursor;
1496 if (cursor >= N)
1497 cursor = 0;
1500 target.Add (subquery, false, false);
1503 return top_query;
1506 ///////////////////////////////////////////////////////////////////////////////////
1508 public int SegmentCount {
1509 get {
1510 DirectoryInfo dir_info;
1511 int p_count = 0, s_count = 0;
1513 dir_info = new DirectoryInfo (PrimaryIndexDirectory);
1514 foreach (FileInfo file_info in dir_info.GetFiles ())
1515 if (file_info.Extension == ".cfs")
1516 ++p_count;
1518 dir_info = new DirectoryInfo (SecondaryIndexDirectory);
1519 foreach (FileInfo file_info in dir_info.GetFiles ())
1520 if (file_info.Extension == ".cfs")
1521 ++s_count;
1523 return p_count > s_count ? p_count : s_count;
1527 ///////////////////////////////////////////////////////////////////////////////////
1529 // Cache IndexReaders on a per-Lucene index basis, since they
1530 // are extremely expensive to create. Note that using this
1531 // only makes sense in situations where the index only
1532 // possibly might change from underneath us, but most of the
1533 // time probably won't. This means it makes sense to do
1534 // this in LuceneQueryingDriver.cs, but it doesn't in
1535 // LuceneIndexingDriver.cs.
1537 private class ReaderAndVersion {
1539 public IndexReader Reader;
1540 public long Version;
1541 public int Refcount;
1543 public ReaderAndVersion (IndexReader reader, long version)
1545 this.Reader = reader;
1546 this.Version = version;
1547 this.Refcount = 1;
		// directory -> ReaderAndVersion: the freshest cached reader per index.
		// reader -> ReaderAndVersion: lets ReleaseReader find the cache entry.
		// Both maps are accessed under lock (reader_rav_map).
		static private Hashtable directory_rav_map = new Hashtable ();
		static private Hashtable reader_rav_map = new Hashtable ();
1554 static public LNS.IndexSearcher GetSearcher (Lucene.Net.Store.Directory directory)
1556 IndexReader reader = GetReader (directory);
1558 return new LNS.IndexSearcher (reader);
1561 static public IndexReader GetReader (Lucene.Net.Store.Directory directory)
1563 IndexReader reader;
1564 long version;
1566 lock (reader_rav_map) {
1567 ReaderAndVersion rav = (ReaderAndVersion) directory_rav_map [directory];
1569 if (rav == null) {
1570 version = IndexReader.GetCurrentVersion (directory);
1571 reader = IndexReader.Open (directory);
1573 rav = new ReaderAndVersion (reader, version);
1575 directory_rav_map [directory] = rav;
1576 reader_rav_map [reader] = rav;
1578 return reader;
1581 version = IndexReader.GetCurrentVersion (directory);
1583 if (version != rav.Version) {
1584 reader = IndexReader.Open (directory);
1586 rav = new ReaderAndVersion (reader, version);
1588 directory_rav_map [directory] = rav;
1589 reader_rav_map [reader] = rav;
1590 } else {
1591 rav.Refcount++;
1594 return rav.Reader;
1598 static private void UnrefReaderAndVersion_Unlocked (ReaderAndVersion rav)
1600 rav.Refcount--;
1602 if (rav.Refcount == 0) {
1603 reader_rav_map.Remove (rav.Reader);
1604 directory_rav_map.Remove (rav.Reader.Directory ());
1605 rav.Reader.Close ();
1609 static public void ReleaseReader (IndexReader reader)
1611 lock (reader_rav_map) {
1612 ReaderAndVersion rav = (ReaderAndVersion) reader_rav_map [reader];
1614 if (rav != null)
1615 UnrefReaderAndVersion_Unlocked (rav);
1616 else
1617 reader.Close ();
1621 static public void ReleaseSearcher (LNS.IndexSearcher searcher)
1623 IndexReader reader = searcher.GetIndexReader ();
1625 searcher.Close ();
1626 ReleaseReader (reader);
1629 ///////////////////////////////////////////////////////////////////////////////////
1632 // Various ways to grab lots of hits at once.
1633 // These should never be used for querying, only for utility
1634 // functions.
1637 public int GetBlockOfHits (int cookie,
1638 Hit [] block_of_hits)
1640 IndexReader primary_reader;
1641 IndexReader secondary_reader;
1642 primary_reader = GetReader (PrimaryStore);
1643 secondary_reader = GetReader (SecondaryStore);
1645 int request_size;
1646 request_size = block_of_hits.Length;
1647 if (request_size > primary_reader.NumDocs ())
1648 request_size = primary_reader.NumDocs ();
1650 int max_doc;
1651 max_doc = primary_reader.MaxDoc ();
1653 if (cookie < 0) {
1654 Random random;
1655 random = new Random ();
1656 cookie = random.Next (max_doc);
1659 int original_cookie;
1660 original_cookie = cookie;
1662 Hashtable primary_docs, secondary_docs;
1663 primary_docs = UriFu.NewHashtable ();
1664 secondary_docs = UriFu.NewHashtable ();
1666 // Load the primary documents
1667 for (int i = 0; i < request_size; ++i) {
1669 if (! primary_reader.IsDeleted (cookie)) {
1670 Document doc;
1671 doc = primary_reader.Document (cookie);
1672 primary_docs [GetUriFromDocument (doc)] = doc;
1675 ++cookie;
1676 if (cookie >= max_doc) // wrap around
1677 cookie = 0;
1679 // If we somehow end up back where we started,
1680 // give up.
1681 if (cookie == original_cookie)
1682 break;
1685 // If necessary, load the secondary documents
1686 if (secondary_reader != null) {
1687 LNS.IndexSearcher searcher;
1688 searcher = new LNS.IndexSearcher (secondary_reader);
1690 LNS.Query uri_query;
1691 uri_query = UriQuery ("Uri", primary_docs.Keys);
1693 LNS.Hits hits;
1694 hits = searcher.Search (uri_query);
1695 for (int i = 0; i < hits.Length (); ++i) {
1696 Document doc;
1697 doc = hits.Doc (i);
1698 secondary_docs [GetUriFromDocument (doc)] = doc;
1701 searcher.Close ();
1704 ReleaseReader (primary_reader);
1705 ReleaseReader (secondary_reader);
1707 // Now assemble the hits
1708 int j = 0;
1709 foreach (Uri uri in primary_docs.Keys) {
1710 Document primary_doc, secondary_doc;
1711 primary_doc = primary_docs [uri] as Document;
1712 secondary_doc = secondary_docs [uri] as Document;
1714 Hit hit;
1715 hit = DocumentToHit (primary_doc);
1716 if (secondary_doc != null)
1717 AddPropertiesToHit (hit, secondary_doc, false);
1719 block_of_hits [j] = hit;
1720 ++j;
1723 // null-pad the array, if necessary
1724 for (; j < block_of_hits.Length; ++j)
1725 block_of_hits [j] = null;
1728 // Return the new cookie
1729 return cookie;
1732 // For a large index, this will be very slow and will consume
1733 // a lot of memory. Don't call it without a good reason!
1734 // We return a hashtable indexed by Uri.
1735 public Hashtable GetAllHitsByUri ()
1737 Hashtable all_hits;
1738 all_hits = UriFu.NewHashtable ();
1740 IndexReader primary_reader;
1741 IndexReader secondary_reader;
1742 primary_reader = GetReader (PrimaryStore);
1743 secondary_reader = GetReader (SecondaryStore);
1745 // Load everything from the primary index
1746 int max_doc;
1747 max_doc = primary_reader.MaxDoc ();
1748 for (int i = 0; i < max_doc; ++i) {
1750 if (primary_reader.IsDeleted (i))
1751 continue;
1753 Document doc;
1754 doc = primary_reader.Document (i);
1756 Hit hit;
1757 hit = DocumentToHit (doc);
1758 all_hits [hit.Uri] = hit;
1761 // Now add in everything from the secondary index, if it exists
1762 if (secondary_reader != null) {
1763 max_doc = secondary_reader.MaxDoc ();
1764 for (int i = 0; i < max_doc; ++i) {
1766 if (secondary_reader.IsDeleted (i))
1767 continue;
1769 Document doc;
1770 doc = secondary_reader.Document (i);
1772 Uri uri;
1773 uri = GetUriFromDocument (doc);
1775 Hit hit;
1776 hit = (Hit) all_hits [uri];
1777 if (hit != null)
1778 AddPropertiesToHit (hit, doc, false);
1782 ReleaseReader (primary_reader);
1783 ReleaseReader (secondary_reader);
1785 return all_hits;