1 //
2 // LuceneCommon.cs
3 //
4 // Copyright (C) 2004-2005 Novell, Inc.
5 //
7 //
8 // Permission is hereby granted, free of charge, to any person obtaining a
9 // copy of this software and associated documentation files (the "Software"),
10 // to deal in the Software without restriction, including without limitation
11 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 // and/or sell copies of the Software, and to permit persons to whom the
13 // Software is furnished to do so, subject to the following conditions:
15 // The above copyright notice and this permission notice shall be included in
16 // all copies or substantial portions of the Software.
18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 // DEALINGS IN THE SOFTWARE.
27 using System;
28 using System.Collections;
29 using System.Diagnostics;
30 using System.Globalization;
31 using System.IO;
32 using System.Text;
33 using System.Threading;
34 using System.Xml;
35 using System.Xml.Serialization;
37 using Lucene.Net.Analysis;
38 using Lucene.Net.Analysis.Standard;
39 using Lucene.Net.Documents;
40 using Lucene.Net.Index;
41 using Lucene.Net.QueryParsers;
42 using LNS = Lucene.Net.Search;
44 using Beagle.Util;
46 namespace Beagle.Daemon {
48 public class LuceneCommon {
50 public delegate bool HitFilter (Hit hit);
52 // VERSION HISTORY
53 // ---------------
55 // 1: Original
56 // 2: Changed format of timestamp strings
57 // 3: Schema changed to be more Dashboard-Match-like
58 // 4: Schema changed for files to include _Directory property
59 // 5: Changed analyzer to support stemming. Bumped version # to
60 // force everyone to re-index.
61 // 6: lots of schema changes as part of the general refactoring
62 // 7: incremented to force a re-index after our upgrade to lucene 1.4
63 //     (in theory the file formats are compatible, but we are seeing 'term
64 // out of order' exceptions in some cases)
65 // 8: another forced re-index, this time because of massive changes
66 // in the file system backend (it would be nice to have per-backend
67 // versioning so that we didn't have to purge all indexes just
68 // because one changed)
69 // 9: changed the way properties are stored, changed in conjunction
70 // with sane handling of multiple properties on hits.
71 // 10: changed to support typed and mutable properties
72 // 11: moved mime type and hit type into properties
73 // 12: added year-month and year-month-day resolutions for all
74 // date properties
75 // 13: moved source into a property
76 // 14: allow wildcard queries to also match keywords
77 // 15: analyze PropertyKeyword field, and store all properties as
78 // lower case so that we're truly case insensitive.
79 // 16: add inverted timestamp to make querying substantially faster
80 private const int MAJOR_VERSION = 16;
81 private int minor_version = 0;
83 private string index_name;
84 private string top_dir;
86 private string fingerprint;
87 private int last_item_count = -1;
89 // This is the big index, containing document full-texts and
90 // data that is expensive to index.
91 private Lucene.Net.Store.Directory primary_store = null;
93 // This is the small index, containing document info that we
94 // expect to have change. Canonical example: file names.
95 private Lucene.Net.Store.Directory secondary_store = null;
97 //////////////////////////////////////////////////////////////////////////////
99 protected LuceneCommon (string index_name, int minor_version)
101 this.index_name = index_name;
102 this.minor_version = minor_version;
104 this.top_dir = (Path.IsPathRooted (index_name)) ? index_name : Path.Combine (PathFinder.IndexDir, index_name);
107 //////////////////////////////////////////////////////////////////////////////
109 protected string IndexName { get { return index_name; } }
111 public Lucene.Net.Store.Directory PrimaryStore { get { return primary_store; } }
113 public Lucene.Net.Store.Directory SecondaryStore { get { return secondary_store; } }
115 public string Fingerprint { get { return fingerprint; } }
117 public string TopDirectory { get { return top_dir; } }
119 //////////////////////////////////////////////////////////////////////////////
121 protected TextCache text_cache = null;
123 public TextCache TextCache {
124 get { return text_cache; }
125 set { text_cache = value; }
128 //////////////////////////////////////////////////////////////////////////////
130 private string VersionFile {
131 get { return Path.Combine (top_dir, "version"); }
134 private string FingerprintFile {
135 get { return Path.Combine (top_dir, "fingerprint"); }
138 // Shouldn't really be public
139 public string PrimaryIndexDirectory {
140 get { return Path.Combine (top_dir, "PrimaryIndex"); }
143 // Shouldn't really be public
144 public string SecondaryIndexDirectory {
145 get { return Path.Combine (top_dir, "SecondaryIndex"); }
148 public string LockDirectory {
149 get { return Path.Combine (top_dir, "Locks"); }
152 //////////////////////////////////////////////////////////////////////////////
154 // Deal with dangling locks
156 private bool IsDanglingLock (FileInfo info)
158 Log.Debug ("Checking for dangling locks...");
160 // It isn't even a lock file
161 if (! info.Name.EndsWith (".lock"))
162 return false;
164 StreamReader reader;
165 string pid = null;
167 try {
168 reader = new StreamReader (info.FullName);
169 pid = reader.ReadLine ();
170 reader.Close ();
172 } catch {
173 // We couldn't read the lockfile, so it probably went away.
174 return false;
178 if (pid == null) {
179 // Looks like the lock file was empty, which really
180 // shouldn't happen. It should contain the PID of
181                                 // the process which locked it.  Let's be on the safe
182 // side and assume it's a dangling lock.
183 Log.Warn ("Found an empty lock file, that shouldn't happen: {0}", info.FullName);
184 return true;
187 string cmdline_file;
188 cmdline_file = String.Format ("/proc/{0}/cmdline", pid);
190 string cmdline = "";
191 try {
192 reader = new StreamReader (cmdline_file);
193 cmdline = reader.ReadLine ();
194 reader.Close ();
195 } catch {
196 // If we can't open that file, either:
197 // (1) The process doesn't exist
198 // (2) It does exist, but it doesn't belong to us.
199 // Thus it isn't an IndexHelper
200 // In either case, the lock is dangling --- if it
201 // still exists.
202 return info.Exists;
205 // The process exists, but isn't an IndexHelper.
206 // If the lock file is still there, it is dangling.
207 // FIXME: During one run of bludgeon I got a null reference
208 // exception here, so I added the cmdline == null check.
209 // Why exactly would that happen? Is this logic correct
210 // in that (odd and presumably rare) case?
211 if (cmdline == null || cmdline.IndexOf ("IndexHelper.exe") == -1)
212 return info.Exists;
214 // If we reach this point, we know:
215 // (1) The process still exists
216 // (2) We own it
217 // (3) It is an IndexHelper process
218 // Thus it almost certainly isn't a dangling lock.
219 // The process might be wedged, but that is
220 // another issue...
221 return false;
224 protected bool Exists ()
226 if (! (Directory.Exists (top_dir)
227 && File.Exists (VersionFile)
228 && File.Exists (FingerprintFile)
229 && Directory.Exists (PrimaryIndexDirectory)
230 && IndexReader.IndexExists (PrimaryIndexDirectory)
231 && Directory.Exists (SecondaryIndexDirectory)
232 && IndexReader.IndexExists (SecondaryIndexDirectory)
233 && Directory.Exists (LockDirectory)))
234 return false;
236 // Check the index's version number. If it is wrong,
237 // declare the index non-existent.
239 StreamReader version_reader;
240 string version_str;
241 version_reader = new StreamReader (VersionFile);
242 version_str = version_reader.ReadLine ();
243 version_reader.Close ();
245 int current_major_version, current_minor_version;
246 int i = version_str.IndexOf ('.');
248 if (i != -1) {
249 current_major_version = Convert.ToInt32 (version_str.Substring (0, i));
250 current_minor_version = Convert.ToInt32 (version_str.Substring (i+1));
251 } else {
252 current_minor_version = Convert.ToInt32 (version_str);
253 current_major_version = 0;
256 if (current_major_version != MAJOR_VERSION
257 || (minor_version >= 0 && current_minor_version != minor_version)) {
258 Logger.Log.Debug ("Version mismatch in {0}", index_name);
259 Logger.Log.Debug ("Index has version {0}.{1}, expected {2}.{3}",
260 current_major_version, current_minor_version,
261 MAJOR_VERSION, minor_version);
262 return false;
265 // Check the lock directory: If there is a dangling write lock,
266 // assume that the index is corrupted and declare it non-existent.
267 DirectoryInfo lock_dir_info;
268 lock_dir_info = new DirectoryInfo (LockDirectory);
269 foreach (FileInfo info in lock_dir_info.GetFiles ()) {
270 if (IsDanglingLock (info)) {
271 Logger.Log.Warn ("Found a dangling index lock on {0}", info.FullName);
272 return false;
276 return true;
279 private Lucene.Net.Store.Directory CreateIndex (string path)
281 // Create a directory to put the index in.
282 Directory.CreateDirectory (path);
284 // Create a new store.
285 Lucene.Net.Store.Directory store;
286 store = Lucene.Net.Store.FSDirectory.GetDirectory (path, LockDirectory, true);
288 // Create an empty index in that store.
289 IndexWriter writer;
290 writer = new IndexWriter (store, null, true);
291 writer.Close ();
293 return store;
296 // Create will kill your index dead. Use it with care.
297 // You don't need to call Open after calling Create.
298 protected void Create ()
300 if (minor_version < 0)
301 minor_version = 0;
303 // Purge any existing directories.
304 if (Directory.Exists (top_dir)) {
305 Logger.Log.Debug ("Purging {0}", top_dir);
306 Directory.Delete (top_dir, true);
309 // Create any necessary directories.
310 Directory.CreateDirectory (top_dir);
311 Directory.CreateDirectory (LockDirectory);
313 // Create the indexes.
314 primary_store = CreateIndex (PrimaryIndexDirectory);
315 secondary_store = CreateIndex (SecondaryIndexDirectory);
317 // Generate and store the index fingerprint.
318 fingerprint = GuidFu.ToShortString (Guid.NewGuid ());
319 TextWriter writer;
320 writer = new StreamWriter (FingerprintFile, false);
321 writer.WriteLine (fingerprint);
322 writer.Close ();
324 // Store our index version information.
325 writer = new StreamWriter (VersionFile, false);
326 writer.WriteLine ("{0}.{1}", MAJOR_VERSION, minor_version);
327 writer.Close ();
330 protected void Open ()
332 Open (false);
335 protected void Open (bool read_only_mode)
337 // Read our index fingerprint.
338 TextReader reader;
339 reader = new StreamReader (FingerprintFile);
340 fingerprint = reader.ReadLine ();
341 reader.Close ();
343 // Create stores for our indexes.
344 primary_store = Lucene.Net.Store.FSDirectory.GetDirectory (PrimaryIndexDirectory, LockDirectory, false, read_only_mode);
345 secondary_store = Lucene.Net.Store.FSDirectory.GetDirectory (SecondaryIndexDirectory, LockDirectory, false, read_only_mode);
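                // A minimal usage sketch (hypothetical subclass, not part of this file):
                // drivers built on LuceneCommon typically probe for a usable index first
                // and only wipe and recreate it when that probe fails.
                //
                //     class MyDriver : LuceneCommon {
                //         public MyDriver () : base ("MyIndex", 1)
                //         {
                //             if (Exists ())
                //                 Open ();      // attach to the existing stores
                //             else
                //                 Create ();    // purge top_dir and build fresh, empty indexes
                //         }
                //     }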
348 ////////////////////////////////////////////////////////////////
351 // Custom Analyzers
354 private class SingletonTokenStream : TokenStream {
356 private string singleton_str;
358 public SingletonTokenStream (string singleton_str)
360 this.singleton_str = singleton_str;
363 override public Lucene.Net.Analysis.Token Next ()
365 if (singleton_str == null)
366 return null;
368 Lucene.Net.Analysis.Token token;
369 token = new Lucene.Net.Analysis.Token (singleton_str, 0, singleton_str.Length);
371 singleton_str = null;
373 return token;
377 // FIXME: This assumes everything being indexed is in English!
378 internal class BeagleAnalyzer : StandardAnalyzer {
380 private char [] buffer = new char [2];
381 private bool strip_extra_property_info = false;
382 private bool tokenize_email_hostname = false;
384 public BeagleAnalyzer (bool is_indexing_analyzer)
386 if (is_indexing_analyzer) {
387 this.strip_extra_property_info = true;
388 this.tokenize_email_hostname = true;
389 } else {
390 this.strip_extra_property_info = false;
391 this.tokenize_email_hostname = false;
395 public override TokenStream TokenStream (string fieldName, TextReader reader)
397 bool is_text_prop = false;
399                         // Strip off the first two characters of a stored property value.
400                         // We store the is-searched flag in those two characters, so we don't
401                         // want to index them.
402 if (fieldName.StartsWith ("prop:")) {
404 if (strip_extra_property_info) {
405 // Skip everything up to and including the first :
406 int c;
407 do {
408 c = reader.Read ();
409 } while (c != -1 && c != ':');
412 is_text_prop = fieldName.StartsWith ("prop:t");
414                         // If this is a non-text property, just return one token
415 // containing the entire string. We do this to avoid
416 // tokenizing keywords.
417 if (! is_text_prop) {
418                                 // We don't want to lower case the token if it's
419                                 // in Beagle's private namespace.
421 TokenStream singleton_stream = new SingletonTokenStream (reader.ReadToEnd ());
423 if (fieldName.StartsWith ("prop:k:" + Property.PrivateNamespace))
424 return singleton_stream;
425 else
426 return new LowerCaseFilter (singleton_stream);
428 } else if (fieldName == "PropertyKeyword")
429 return new LowerCaseFilter (new SingletonTokenStream (reader.ReadToEnd ()));
431 TokenStream outstream;
432 outstream = base.TokenStream (fieldName, reader);
434 if (fieldName == "Text"
435 || fieldName == "HotText"
436 || fieldName == "PropertyText"
437 || is_text_prop) {
438 outstream = new NoiseEmailHostFilter (outstream, tokenize_email_hostname);
439 outstream = new PorterStemFilter (outstream);
442 return outstream;
446 static private Analyzer indexing_analyzer = new BeagleAnalyzer (true);
447 static private Analyzer query_analyzer = new BeagleAnalyzer (false);
449 static protected Analyzer IndexingAnalyzer { get { return indexing_analyzer; } }
450 static protected Analyzer QueryAnalyzer { get { return query_analyzer; } }
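                // A hedged sketch of how the two shared analyzers are meant to be used:
                // the indexing analyzer is handed to IndexWriters, while the query
                // analyzer tokenizes user-supplied text (exactly what StringToQuery does
                // further down).
                //
                //     IndexWriter writer = new IndexWriter (PrimaryStore, IndexingAnalyzer, false);
                //     TokenStream ts = QueryAnalyzer.TokenStream ("Text", new StringReader ("moby dick"));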
452 ////////////////////////////////////////////////////////////////
455 // Dealing with properties
458 static private char TypeToCode (PropertyType type)
460 switch (type) {
461 case PropertyType.Text: return 't';
462 case PropertyType.Keyword: return 'k';
463 case PropertyType.Date: return 'd';
465 throw new Exception ("Bad property type: " + type);
468 static private PropertyType CodeToType (char c)
470 switch (c) {
471 case 't': return PropertyType.Text;
472 case 'k': return PropertyType.Keyword;
473 case 'd': return PropertyType.Date;
476 throw new Exception ("Bad property code: " + c);
479 static private string TypeToWildcardField (PropertyType type)
481 switch (type) {
482 case PropertyType.Text: return "PropertyText";
483 case PropertyType.Keyword: return "PropertyKeyword";
484 case PropertyType.Date: return "PropertyDate";
487 return null;
490 // Exposing this is a little bit suspicious.
491 static protected string PropertyToFieldName (PropertyType type, string key)
493 return String.Format ("prop:{0}:{1}", TypeToCode (type), key);
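                // A few concrete examples of the resulting field names (the property
                // keys are illustrative only):
                //
                //     PropertyToFieldName (PropertyType.Text,    "dc:title")    -->  "prop:t:dc:title"
                //     PropertyToFieldName (PropertyType.Keyword, "fixme:host")  -->  "prop:k:fixme:host"
                //     PropertyToFieldName (PropertyType.Date,    "fixme:when")  -->  "prop:d:fixme:when"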
497 static private void AddDateFields (string field_name, Property prop, Document doc)
499 DateTime dt = StringFu.StringToDateTime (prop.Value);
501 Field f;
502 f = new Field ("YM:" + field_name,
503 StringFu.DateTimeToYearMonthString (dt),
504 false, // never store
505 true, // always index
506 false); // never tokenize
507 doc.Add (f);
509 f = new Field ("D:" + field_name,
510 StringFu.DateTimeToDayString (dt),
511 false, // never store
512 true, // always index
513 false); // never tokenize
514 doc.Add (f);
517 static protected void AddPropertyToDocument (Property prop, Document doc)
519 if (prop == null || prop.Value == null)
520 return;
522 // Don't actually put properties in the UnindexedNamespace
523 // in the document. A horrible (and yet lovely!) hack.
524 if (prop.Key.StartsWith (StringFu.UnindexedNamespace))
525 return;
527 Field f;
529 if (prop.IsSearched) {
530 string wildcard_field = TypeToWildcardField (prop.Type);
531 if (wildcard_field != null) {
532 f = new Field (wildcard_field,
533 prop.Value,
534 false, // never stored
535 true, // always indexed
536 true); // always tokenize (just lowercases for keywords; full analysis for text)
537 doc.Add (f);
539 if (prop.Type == PropertyType.Date)
540 AddDateFields (wildcard_field, prop, doc);
544 string coded_value;
545 coded_value = String.Format ("{0}:{1}",
546 prop.IsSearched ? 's' : '_',
547 prop.Value);
549 string field_name = PropertyToFieldName (prop.Type, prop.Key);
551 f = new Field (field_name,
552 coded_value,
553 prop.IsStored,
554 true, // always index
555 true); // always tokenize (strips off type code for keywords and lowercases)
556 doc.Add (f);
558 if (prop.Type == PropertyType.Date)
559 AddDateFields (field_name, prop, doc);
562 static protected Property GetPropertyFromDocument (Field f, Document doc, bool from_primary_index)
564 // Note: we don't use the document that we pass in,
565 // but in theory we could. At some later point we
566 // might need to split a property's data across two or
567 // more fields in the document.
569 if (f == null)
570 return null;
572 string field_name;
573 field_name = f.Name ();
574 if (field_name.Length < 7
575 || ! field_name.StartsWith ("prop:"))
576 return null;
578 string field_value;
579 field_value = f.StringValue ();
581 Property prop;
582 prop = new Property ();
583 prop.Type = CodeToType (field_name [5]);
584 prop.Key = field_name.Substring (7);
585 prop.Value = field_value.Substring (2);
586 prop.IsSearched = (field_value [0] == 's');
587 prop.IsMutable = ! from_primary_index;
588 prop.IsStored = f.IsStored ();
590 return prop;
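                // To illustrate the round-trip (property key and value are illustrative):
                // a searched text property ("dc:title", "Moby Dick") is stored as
                //
                //     field name:  "prop:t:dc:title"
                //     field value: "s:Moby Dick"     ('_' instead of 's' when not searched)
                //
                // and GetPropertyFromDocument recovers the type from field_name [5],
                // the key from field_name.Substring (7), the value from
                // field_value.Substring (2) and IsSearched from the leading character.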
593 //////////////////////////////////////////////////////////////////////////////
596 // Dealing with documents
599 static protected void BuildDocuments (Indexable indexable,
600 out Document primary_doc,
601 out Document secondary_doc)
603 primary_doc = new Document ();
604 secondary_doc = null;
606 Field f;
608 f = Field.Keyword ("Uri", UriFu.UriToEscapedString (indexable.Uri));
609 primary_doc.Add (f);
611 if (indexable.ParentUri != null) {
612 f = Field.Keyword ("ParentUri", UriFu.UriToEscapedString (indexable.ParentUri));
613 primary_doc.Add (f);
616 if (indexable.ValidTimestamp) {
617 // Note that we also want to search in the
618 // Timestamp field when we do a wildcard date
619 // query, so that's why we also add a wildcard
620 // field for each item here.
622 string wildcard_field = TypeToWildcardField (PropertyType.Date);
624 string str = StringFu.DateTimeToString (indexable.Timestamp);
625 f = Field.Keyword ("Timestamp", str);
626 primary_doc.Add (f);
627 f = Field.UnStored (wildcard_field, str);
628 primary_doc.Add (f);
630 // Create an inverted timestamp so that we can
631 // sort by timestamp at search-time.
632 long timeval = Convert.ToInt64 (str);
633 f = Field.UnStored ("InvertedTimestamp", (Int64.MaxValue - timeval).ToString ());
634 primary_doc.Add (f);
636 str = StringFu.DateTimeToYearMonthString (indexable.Timestamp);
637 f = Field.Keyword ("YM:Timestamp", str);
638 primary_doc.Add (f);
639 f = Field.UnStored ("YM:" + wildcard_field, str);
640 primary_doc.Add (f);
642 str = StringFu.DateTimeToDayString (indexable.Timestamp);
643 f = Field.Keyword ("D:Timestamp", str);
644 primary_doc.Add (f);
645 f = Field.UnStored ("D:" + wildcard_field, str);
646 primary_doc.Add (f);
649 if (indexable.NoContent) {
650 // If there is no content, make a note of that
651 // in a special property.
652 Property prop;
653 prop = Property.NewBool ("beagle:NoContent", true);
654 AddPropertyToDocument (prop, primary_doc);
656 } else {
658 // Since we might have content, add our text
659 // readers.
661 TextReader reader;
663 reader = indexable.GetTextReader ();
664 if (reader != null) {
665 f = Field.Text ("Text", reader);
666 primary_doc.Add (f);
669 reader = indexable.GetHotTextReader ();
670 if (reader != null) {
671 f = Field.Text ("HotText", reader);
672 primary_doc.Add (f);
676 // Store the Type and MimeType in special properties
678 if (indexable.HitType != null) {
679 Property prop;
680 prop = Property.NewUnsearched ("beagle:HitType", indexable.HitType);
681 AddPropertyToDocument (prop, primary_doc);
684 if (indexable.MimeType != null) {
685 Property prop;
686 prop = Property.NewUnsearched ("beagle:MimeType", indexable.MimeType);
687 AddPropertyToDocument (prop, primary_doc);
690 if (indexable.Source != null) {
691 Property prop;
692 prop = Property.NewUnsearched ("beagle:Source", indexable.Source);
693 AddPropertyToDocument (prop, primary_doc);
697 Property prop;
698 prop = Property.NewBool (Property.IsChildPropKey, indexable.IsChild);
699 AddPropertyToDocument (prop, primary_doc);
702 // Store the other properties
704 foreach (Property prop in indexable.Properties) {
705 Document target_doc = primary_doc;
706 if (prop.IsMutable) {
707 if (secondary_doc == null)
708 secondary_doc = CreateSecondaryDocument (indexable.Uri, indexable.ParentUri);
710 target_doc = secondary_doc;
713 AddPropertyToDocument (prop, target_doc);
717 static private Document CreateSecondaryDocument (Uri uri, Uri parent_uri)
719 Document secondary_doc = new Document ();
721 Field f = Field.Keyword ("Uri", UriFu.UriToEscapedString (uri));
722 secondary_doc.Add (f);
724 if (parent_uri != null) {
725 // Store both Uri and ParentUri in secondary index for easy removal
726 f = Field.Keyword ("ParentUri", UriFu.UriToEscapedString (parent_uri));
727 secondary_doc.Add (f);
730 return secondary_doc;
733 static protected Document RewriteDocument (Document old_secondary_doc,
734 Indexable prop_only_indexable)
736 Hashtable seen_props;
737 seen_props = new Hashtable ();
739 Document new_doc;
740 new_doc = new Document ();
742 Field uri_f;
743 uri_f = Field.Keyword ("Uri", UriFu.UriToEscapedString (prop_only_indexable.Uri));
744 new_doc.Add (uri_f);
746 Logger.Log.Debug ("Rewriting {0}", prop_only_indexable.DisplayUri);
748 if (prop_only_indexable.ParentUri != null) {
749 uri_f = Field.Keyword ("ParentUri", UriFu.UriToEscapedString (prop_only_indexable.ParentUri));
750 new_doc.Add (uri_f);
751 Logger.Log.Debug ("Parent Uri {0}", prop_only_indexable.ParentUri);
754 // Add the new properties to the new document. To
755 // delete a property, set the Value to null... then it
756 // will be added to seen_props (so the old value will
757 // be ignored below), but AddPropertyToDocument will
758 // return w/o doing anything.
759 foreach (Property prop in prop_only_indexable.Properties) {
760 seen_props [prop.Key] = prop;
761 AddPropertyToDocument (prop, new_doc);
762 Logger.Log.Debug ("New prop '{0}' = '{1}'", prop.Key, prop.Value);
765 // Copy the other properties from the old document to the
766 // new one, skipping any properties that we got new values
767 // for out of the Indexable.
768 if (old_secondary_doc != null) {
769 foreach (Field f in old_secondary_doc.Fields ()) {
770 Property prop;
771 prop = GetPropertyFromDocument (f, old_secondary_doc, false);
772 if (prop != null && ! seen_props.Contains (prop.Key)) {
773 Logger.Log.Debug ("Old prop '{0}' = '{1}'", prop.Key, prop.Value);
774 AddPropertyToDocument (prop, new_doc);
779 return new_doc;
782 static protected Uri GetUriFromDocument (Document doc)
784 string uri;
785 uri = doc.Get ("Uri");
786 if (uri == null)
787 throw new Exception ("Got document from Lucene w/o a URI!");
788 return UriFu.EscapedStringToUri (uri);
791 static protected Hit DocumentToHit (Document doc)
793 Hit hit;
794 hit = new Hit ();
796 hit.Uri = GetUriFromDocument (doc);
798 string str;
799 str = doc.Get ("ParentUri");
800 if (str != null)
801 hit.ParentUri = UriFu.EscapedStringToUri (str);
803 hit.Timestamp = StringFu.StringToDateTime (doc.Get ("Timestamp"));
805 AddPropertiesToHit (hit, doc, true);
807 // Get the Type and MimeType from the properties.
808 hit.Type = hit.GetFirstProperty ("beagle:HitType");
809 hit.MimeType = hit.GetFirstProperty ("beagle:MimeType");
810 hit.Source = hit.GetFirstProperty ("beagle:Source");
812 return hit;
815 static protected void AddPropertiesToHit (Hit hit, Document doc, bool from_primary_index)
817 foreach (Field f in doc.Fields ()) {
818 Property prop;
819 prop = GetPropertyFromDocument (f, doc, from_primary_index);
820 if (prop != null)
821 hit.AddProperty (prop);
826 //////////////////////////////////////////////////////////////////////////////
829 // Handle the index's item count
832 public int GetItemCount ()
834 if (last_item_count < 0) {
835 IndexReader reader;
836 reader = GetReader (PrimaryStore);
837 last_item_count = reader.NumDocs ();
838 ReleaseReader (reader);
840 return last_item_count;
843 // We should set the cached count of index items when IndexReaders
844 // are open and available, so calls to GetItemCount will return immediately.
846 protected bool HaveItemCount { get { return last_item_count >= 0; } }
848 protected void SetItemCount (IndexReader reader)
850 last_item_count = reader.NumDocs ();
853 public void SetItemCount (int count)
855 last_item_count = count;
858 protected void AdjustItemCount (int delta)
860 if (last_item_count >= 0)
861 last_item_count += delta;
864 //////////////////////////////////////////////////////////////////////////////
867 // Access to the stemmer and list of stop words
870 static PorterStemmer stemmer = new PorterStemmer ();
872 static public string Stem (string str)
874 return stemmer.Stem (str);
877 public static bool IsStopWord (string stemmed_word)
879 return ArrayFu.IndexOfString (StopAnalyzer.ENGLISH_STOP_WORDS, stemmed_word) != -1;
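                // Illustrative usage (values are what the Porter algorithm is expected
                // to produce, not something verified here):
                //
                //     LuceneCommon.Stem ("running");     // presumably "run"
                //     LuceneCommon.IsStopWord ("the");   // true; "the" is in ENGLISH_STOP_WORDS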
882 //////////////////////////////////////////////////////////////////////////////
885 // Special Hit Filtering classes
888 static private bool TrueHitFilter (Hit hit)
890 return true;
893 static private HitFilter true_hit_filter = new HitFilter (TrueHitFilter);
895 public class OrHitFilter {
897 private ArrayList all = new ArrayList ();
898 private bool contains_known_true = false;
900 public void Add (HitFilter hit_filter)
902 if (hit_filter == true_hit_filter)
903 contains_known_true = true;
904 all.Add (hit_filter);
907 public bool HitFilter (Hit hit)
909 if (contains_known_true)
910 return true;
911 foreach (HitFilter hit_filter in all)
912 if (hit_filter (hit))
913 return true;
914 return false;
918 public class AndHitFilter {
920 private ArrayList all = new ArrayList ();
922 public void Add (HitFilter hit_filter)
924 all.Add (hit_filter);
927 public bool HitFilter (Hit hit)
929 foreach (HitFilter hit_filter in all)
930 if (! hit_filter (hit))
931 return false;
932 return true;
936 public class NotHitFilter {
937 HitFilter original;
939 public NotHitFilter (HitFilter original)
941 this.original = original;
944 public bool HitFilter (Hit hit)
946 return ! original (hit);
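                // A hedged composition sketch: these helpers exist so several HitFilter
                // delegates can be merged into one before being applied to query results.
                // (The variable names below are illustrative.)
                //
                //     AndHitFilter and_filter = new AndHitFilter ();
                //     and_filter.Add (date_filter);   // e.g. a filter from a date range part
                //     and_filter.Add (new HitFilter (new NotHitFilter (spam_filter).HitFilter));
                //     HitFilter combined = new HitFilter (and_filter.HitFilter);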
950 //////////////////////////////////////////////////////////////////////////////
953 // Queries
956 static private LNS.Query StringToQuery (string field_name,
957 string text,
958 ArrayList term_list)
960 ArrayList tokens = new ArrayList ();
962 // Use the analyzer to extract the query's tokens.
963 // This code is taken from Lucene's query parser.
964 TokenStream source = QueryAnalyzer.TokenStream (field_name, new StringReader (text));
965 while (true) {
966 Lucene.Net.Analysis.Token token;
967 try {
968 token = source.Next ();
969 if (token == null)
970 break;
971 } catch (IOException) {
972 break;
974 if (token != null)
975 tokens.Add (token.TermText ());
977 try {
978 source.Close ();
979 } catch (IOException) {
980 // ignore
983 if (tokens.Count == 0)
984 return null;
986 LNS.PhraseQuery query = new LNS.PhraseQuery ();
988 foreach (string token in tokens) {
989 Term term;
990 term = new Term (field_name, token);
991 query.Add (term);
992 if (term_list != null)
993 term_list.Add (term);
996 return query;
1000 // Date Range Handling
1003 // This function will break down dates to discrete chunks of
1004                // time to avoid expanding RangeQueries as much as possible.
1005 // For example, searching for
1007 // YMD(5 May 2005, 16 Oct 2006)
1009 // would break down into three queries:
1011 // (YM(May 2005) AND D(5,31)) OR
1012 // YM(Jun 2005, Sep 2006) OR
1013 // (YM(Oct 2006) AND D(1,16))
1015 static private DateTime lower_bound = DateTimeUtil.UnixToDateTimeUtc (0);
1017 // FIXME: we should probably boost this sometime around 2030.
1018 // Mark your calendar.
1019 static private DateTime upper_bound = new DateTime (2038, 12, 31);
1021 static private Term NewYearMonthTerm (string field_name, int y, int m)
1023 return new Term ("YM:" + field_name, String.Format ("{0}{1:00}", y, m));
1026 static private LNS.Query NewYearMonthQuery (string field_name, int y, int m)
1028 return new LNS.TermQuery (NewYearMonthTerm (field_name, y, m));
1031 static private LNS.Query NewYearMonthQuery (string field_name, int y1, int m1, int y2, int m2)
1033 return new LNS.RangeQuery (NewYearMonthTerm (field_name, y1, m1),
1034 NewYearMonthTerm (field_name, y2, m2),
1035 true); // query is inclusive
1038 static private Term NewDayTerm (string field_name, int d)
1040 return new Term ("D:" + field_name, String.Format ("{0:00}", d));
1043 static private LNS.Query NewDayQuery (string field_name, int d1, int d2)
1045 return new LNS.RangeQuery (NewDayTerm (field_name, d1),
1046 NewDayTerm (field_name, d2),
1047 true); // query is inclusive
1050 private class DateRangeHitFilter {
1051 public string Key;
1052 public DateTime StartDate;
1053 public DateTime EndDate;
1055 public bool HitFilter (Hit hit)
1057 // First, check the Timestamp
1058 if (Key == QueryPart_DateRange.AllPropertiesKey
1059 || Key == QueryPart_DateRange.TimestampKey) {
1060 DateTime dt;
1061 dt = hit.Timestamp;
1062 if (StartDate <= dt && dt <= EndDate)
1063 return true;
1064 if (Key == QueryPart_DateRange.TimestampKey)
1065 return false;
1068 if (Key == QueryPart_DateRange.AllPropertiesKey) {
1069 // Walk through all of the properties, and see if any
1070 // date properties fall inside the range.
1071 foreach (Property prop in hit.Properties) {
1072 if (prop.Type == PropertyType.Date) {
1073 DateTime dt;
1074 dt = StringFu.StringToDateTime (prop.Value);
1075 if (StartDate <= dt && dt <= EndDate)
1076 return true;
1079 return false;
1080 } else {
1081 // Walk through all of the properties with the given key,
1082 // and see if any of them fall inside of the range.
1083 string[] values;
1084 values = hit.GetProperties (Key);
1085 foreach (string v in values) {
1086 DateTime dt;
1087 dt = StringFu.StringToDateTime (v);
1088 if (StartDate <= dt && dt <= EndDate)
1089 return true;
1091 return false;
1096 static private LNS.Query GetDateRangeQuery (QueryPart_DateRange part, out HitFilter hit_filter)
1098 string field_name;
1099 if (part.Key == QueryPart_DateRange.AllPropertiesKey)
1100 field_name = TypeToWildcardField (PropertyType.Date);
1101 else if (part.Key == QueryPart_DateRange.TimestampKey)
1102 field_name = "Timestamp";
1103 else
1104 field_name = PropertyToFieldName (PropertyType.Date, part.Key);
1106 // FIXME: We could optimize this and reduce the size of our range
1107                        // queries if we actually knew the min and max date that appear in
1108 // any properties in the index. We would need to inspect the index to
1109 // determine that at start-up, and then track it as new documents
1110 // get added to the index.
1111 if (part.StartDate < lower_bound)
1112 part.StartDate = lower_bound;
1113 if (part.EndDate > upper_bound || part.EndDate == DateTime.MinValue)
1114 part.EndDate = upper_bound;
1116 // Swap the start and end dates if they come in reversed.
1117 if (part.StartDate > part.EndDate) {
1118 DateTime swap;
1119 swap = part.StartDate;
1120 part.StartDate = part.EndDate;
1121 part.EndDate = swap;
1124 // Set up our hit filter to cull out the bad dates.
1125 DateRangeHitFilter drhf;
1126 drhf = new DateRangeHitFilter ();
1127 drhf.Key = part.Key;
1128 drhf.StartDate = part.StartDate;
1129 drhf.EndDate = part.EndDate;
1130 hit_filter = new HitFilter (drhf.HitFilter);
1132 Logger.Log.Debug ("Building new date range query");
1133 Logger.Log.Debug ("Start: {0}", part.StartDate);
1134 Logger.Log.Debug ("End: {0}", part.EndDate);
1136 int y1, m1, d1, y2, m2, d2;
1137 y1 = part.StartDate.Year;
1138 m1 = part.StartDate.Month;
1139 d1 = part.StartDate.Day;
1140 y2 = part.EndDate.Year;
1141 m2 = part.EndDate.Month;
1142 d2 = part.EndDate.Day;
1144 LNS.BooleanQuery top_level_query;
1145 top_level_query = new LNS.BooleanQuery ();
1147 // A special case: both the start and the end of our range fall
1148 // in the same month.
1149 if (y1 == y2 && m1 == m2) {
1150 LNS.Query ym_query;
1151 ym_query = NewYearMonthQuery (field_name, y1, m1);
1153 // If our range only covers a part of the month, do a range query on the days.
1154 if (d1 != 1 || d2 != DateTime.DaysInMonth (y2, m2)) {
1155 LNS.BooleanQuery sub_query;
1156 sub_query = new LNS.BooleanQuery ();
1157 sub_query.Add (ym_query, true, false);
1158 sub_query.Add (NewDayQuery (field_name, d1, d2), true, false);
1159 top_level_query.Add (sub_query, false, false);
1160 } else {
1161 top_level_query.Add (ym_query, false, false);
1164 } else {
1166 // Handle a partial month at the beginning of our range.
1167 if (d1 > 1) {
1168 LNS.BooleanQuery sub_query;
1169 sub_query = new LNS.BooleanQuery ();
1170 sub_query.Add (NewYearMonthQuery (field_name, y1, m1), true, false);
1171 sub_query.Add (NewDayQuery (field_name, d1, DateTime.DaysInMonth (y1, m1)), true, false);
1172 top_level_query.Add (sub_query, false, false);
1174 ++m1;
1175 if (m1 == 13) {
1176 m1 = 1;
1177 ++y1;
1181 // And likewise, handle a partial month at the end of our range.
1182 if (d2 < DateTime.DaysInMonth (y2, m2)) {
1183 LNS.BooleanQuery sub_query;
1184 sub_query = new LNS.BooleanQuery ();
1185 sub_query.Add (NewYearMonthQuery (field_name, y2, m2), true, false);
1186 sub_query.Add (NewDayQuery (field_name, 1, d2), true, false);
1187 top_level_query.Add (sub_query, false, false);
1189 --m2;
1190 if (m2 == 0) {
1191 m2 = 12;
1192 --y2;
1196 // Generate the query for the "middle" of our period, if it is non-empty
1197 if (y1 < y2 || ((y1 == y2) && m1 <= m2))
1198 top_level_query.Add (NewYearMonthQuery (field_name, y1, m1, y2, m2),
1199 false, false);
1202 return top_level_query;
1205 // search_subset_uris is a list of Uris that this search should be
1206 // limited to.
1207 static protected void QueryPartToQuery (QueryPart abstract_part,
1208 bool only_build_primary_query,
1209 ArrayList term_list,
1210 out LNS.Query primary_query,
1211 out LNS.Query secondary_query,
1212 out HitFilter hit_filter)
1214 primary_query = null;
1215 secondary_query = null;
1217 // By default, we assume that our lucene queries will return exactly the
1218 // matching set of objects. We need to set the hit filter if further
1219 // refinement of the search results is required. (As in the case of
1220 // date range queries, for example.) We essentially have to do this
1221 // to make OR queries work correctly.
1222 hit_filter = true_hit_filter;
1224                        // The exception is when dealing with a prohibited part.  Just return
1225                        // null for the hit filter in that case.  This works since
1226                        // prohibited parts are not allowed inside of OR queries.
1227                        // (Check for a null part first, since we dereference it here.)
1228                        if (abstract_part == null)
1229                                return;
1230                        if (abstract_part.Logic == QueryPartLogic.Prohibited)
1231                                hit_filter = null;
1233 if (abstract_part is QueryPart_Text) {
1234 QueryPart_Text part = (QueryPart_Text) abstract_part;
1236 if (! (part.SearchFullText || part.SearchTextProperties))
1237 return;
1239 LNS.BooleanQuery p_query = new LNS.BooleanQuery ();
1240 LNS.BooleanQuery s_query = new LNS.BooleanQuery ();
1242 if (part.SearchFullText) {
1243 LNS.Query subquery;
1244 subquery = StringToQuery ("Text", part.Text, term_list);
1245 if (subquery != null)
1246 p_query.Add (subquery, false, false);
1248 // FIXME: HotText is ignored for now!
1249 // subquery = StringToQuery ("HotText", part.Text);
1250 // if (subquery != null)
1251 // p_query.Add (subquery, false, false);
1254 if (part.SearchTextProperties) {
1255 LNS.Query subquery;
1256 subquery = StringToQuery ("PropertyText", part.Text, term_list);
1257 if (subquery != null) {
1258 p_query.Add (subquery, false, false);
1259 // Properties can live in either index
1260 if (! only_build_primary_query)
1261 s_query.Add (subquery.Clone () as LNS.Query, false, false);
1264 Term term;
1265 term = new Term ("PropertyKeyword", part.Text.ToLower ()); // make sure text is lowercased
1266                                // FIXME: terms are already added to term_list, but they may have been tokenized.
1267                                // The term here is the non-tokenized version.  Should it also be added to term_list?
1268                                // term_list is used to calculate scores.
1269 if (term_list != null)
1270 term_list.Add (term);
1271 subquery = new LNS.TermQuery (term);
1272 p_query.Add (subquery, false, false);
1273 // Properties can live in either index
1274 if (! only_build_primary_query)
1275 s_query.Add (subquery.Clone () as LNS.Query, false, false);
1278 primary_query = p_query;
1279 if (! only_build_primary_query)
1280 secondary_query = s_query;
1282 return;
1285 if (abstract_part is QueryPart_Wildcard) {
1286 QueryPart_Wildcard part = (QueryPart_Wildcard) abstract_part;
1288 LNS.BooleanQuery p_query = new LNS.BooleanQuery ();
1289 LNS.BooleanQuery s_query = new LNS.BooleanQuery ();
1291 Term term;
1292 LNS.Query subquery;
1294 // Lower case the terms for searching
1295 string query_string_lower = part.QueryString.ToLower ();
1297 // Search text content
1298 term = new Term ("Text", query_string_lower);
1299 subquery = new LNS.WildcardQuery (term);
1300 p_query.Add (subquery, false, false);
1301 term_list.Add (term);
1303 // Search text properties
1304 term = new Term ("PropertyText", query_string_lower);
1305 subquery = new LNS.WildcardQuery (term);
1306 p_query.Add (subquery, false, false);
1307 // Properties can live in either index
1308 if (! only_build_primary_query)
1309 s_query.Add (subquery.Clone () as LNS.Query, false, false);
1310 term_list.Add (term);
1312 // Search property keywords
1313 term = new Term ("PropertyKeyword", query_string_lower);
1314 term_list.Add (term);
1315 subquery = new LNS.WildcardQuery (term);
1316 p_query.Add (subquery, false, false);
1317 // Properties can live in either index
1318 if (! only_build_primary_query)
1319 s_query.Add (subquery.Clone () as LNS.Query, false, false);
1321 primary_query = p_query;
1322 if (! only_build_primary_query)
1323 secondary_query = s_query;
1325 return;
1328 if (abstract_part is QueryPart_Property) {
1329 QueryPart_Property part = (QueryPart_Property) abstract_part;
1331 string field_name;
1332 if (part.Key == QueryPart_Property.AllProperties) {
1333 field_name = TypeToWildcardField (part.Type);
1334 // FIXME: probably shouldn't just return silently
1335 if (field_name == null)
1336 return;
1337 } else
1338 field_name = PropertyToFieldName (part.Type, part.Key);
1340 if (part.Type == PropertyType.Text)
1341 primary_query = StringToQuery (field_name, part.Value, term_list);
1342 else {
1343 Term term;
1344 term = new Term (field_name, part.Value.ToLower ());
1345 if (term_list != null)
1346 term_list.Add (term);
1347 primary_query = new LNS.TermQuery (term);
1350 // Properties can live in either index
1351 if (! only_build_primary_query && primary_query != null)
1352 secondary_query = primary_query.Clone () as LNS.Query;
1354 return;
1357 if (abstract_part is QueryPart_DateRange) {
1359 QueryPart_DateRange part = (QueryPart_DateRange) abstract_part;
1361 primary_query = GetDateRangeQuery (part, out hit_filter);
1362 // Date properties can live in either index
1363 if (! only_build_primary_query && primary_query != null)
1364 secondary_query = primary_query.Clone () as LNS.Query;
1366 // If this is a prohibited part, invert our hit filter.
1367 if (part.Logic == QueryPartLogic.Prohibited) {
1368 NotHitFilter nhf;
1369 nhf = new NotHitFilter (hit_filter);
1370 hit_filter = new HitFilter (nhf.HitFilter);
1373 return;
1376 if (abstract_part is QueryPart_Or) {
1377 QueryPart_Or part = (QueryPart_Or) abstract_part;
1379 // Assemble a new BooleanQuery combining all of the sub-parts.
1380 LNS.BooleanQuery p_query;
1381 p_query = new LNS.BooleanQuery ();
1383 LNS.BooleanQuery s_query = null;
1384 if (! only_build_primary_query)
1385 s_query = new LNS.BooleanQuery ();
1387 primary_query = p_query;
1388 secondary_query = s_query;
1390 OrHitFilter or_hit_filter = null;
1392 foreach (QueryPart sub_part in part.SubParts) {
1393 LNS.Query p_subq, s_subq;
1394 HitFilter sub_hit_filter; // FIXME: This is (and must be) ignored
1395 // FIXME: Any subpart in an OR which has a hit filter won't work
1396 // correctly, because we can't tell which part of an OR we matched
1397 // against to filter correctly. This affects date range queries.
1398 QueryPartToQuery (sub_part, only_build_primary_query,
1399 term_list,
1400 out p_subq, out s_subq, out sub_hit_filter);
1401 if (p_subq != null)
1402 p_query.Add (p_subq, false, false);
1403 if (s_subq != null)
1404 s_query.Add (s_subq, false, false);
1405 if (sub_hit_filter != null) {
1406 if (or_hit_filter == null)
1407 or_hit_filter = new OrHitFilter ();
1408 or_hit_filter.Add (sub_hit_filter);
1412 if (or_hit_filter != null)
1413 hit_filter = new HitFilter (or_hit_filter.HitFilter);
1415 return;
1418 throw new Exception ("Unhandled QueryPart type! " + abstract_part.ToString ());
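                // A hedged caller-side sketch (the real call sites are expected to live
                // in the querying driver, not in this file): each QueryPart is translated
                // on its own, and the resulting pieces are combined into per-index
                // BooleanQueries along with the hit filters.
                //
                //     LNS.Query primary_q, secondary_q;
                //     HitFilter filter;
                //     QueryPartToQuery (part, false, term_list, out primary_q, out secondary_q, out filter);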
1421 static protected LNS.Query UriQuery (string field_name, Uri uri)
1423 return new LNS.TermQuery (new Term (field_name, UriFu.UriToEscapedString (uri)));
1426 static protected LNS.Query UriQuery (string field_name, ICollection uri_list)
1428 return UriQuery (field_name, uri_list, null);
1431 static protected LNS.Query UriQuery (string field_name, ICollection uri_list, LNS.Query extra_requirement)
1433 if (uri_list.Count == 0)
1434 return null;
1436 int max_clauses;
1437 max_clauses = LNS.BooleanQuery.GetMaxClauseCount ();
1439 int N;
1440 N = 1 + (uri_list.Count - 1) / max_clauses;
1442 LNS.BooleanQuery top_query;
1443 top_query = new LNS.BooleanQuery ();
1445 int cursor = 0;
1446 if (extra_requirement != null) {
1447 top_query.Add (extra_requirement, true, false);
1448 ++cursor;
1451 ArrayList bottom_queries = null;
1453 if (N > 1) {
1454 bottom_queries = new ArrayList ();
1455 for (int i = 0; i < N; ++i) {
1456 LNS.BooleanQuery bq;
1457 bq = new LNS.BooleanQuery ();
1458 bottom_queries.Add (bq);
1459 top_query.Add (bq, false, false);
1463 foreach (Uri uri in uri_list) {
1464 LNS.Query subquery;
1465 subquery = UriQuery (field_name, uri);
1467 LNS.BooleanQuery target;
1468 if (N == 1)
1469 target = top_query;
1470 else {
1471 target = (LNS.BooleanQuery) bottom_queries [cursor];
1472 ++cursor;
1473 if (cursor >= N)
1474 cursor = 0;
1477 target.Add (subquery, false, false);
1480 return top_query;
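                // A worked example of the splitting above: with Lucene's default
                // MaxClauseCount of 1024, a list of 2500 uris gives
                // N = 1 + (2500 - 1) / 1024 = 3, so the per-uri term queries are dealt
                // round-robin across three nested BooleanQueries instead of overflowing
                // a single one.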
1483 ///////////////////////////////////////////////////////////////////////////////////
1485 public int SegmentCount {
1486 get {
1487 DirectoryInfo dir_info;
1488 int p_count = 0, s_count = 0;
1490 dir_info = new DirectoryInfo (PrimaryIndexDirectory);
1491 foreach (FileInfo file_info in dir_info.GetFiles ())
1492 if (file_info.Extension == ".cfs")
1493 ++p_count;
1495 dir_info = new DirectoryInfo (SecondaryIndexDirectory);
1496 foreach (FileInfo file_info in dir_info.GetFiles ())
1497 if (file_info.Extension == ".cfs")
1498 ++s_count;
1500 return p_count > s_count ? p_count : s_count;
1504 ///////////////////////////////////////////////////////////////////////////////////
1506 // Cache IndexReaders on a per-Lucene index basis, since they
1507 // are extremely expensive to create. Note that using this
1508        // only makes sense in situations where the index might
1509        // occasionally change from underneath us, but most of the
1510 // time probably won't. This means it makes sense to do
1511 // this in LuceneQueryingDriver.cs, but it doesn't in
1512 // LuceneIndexingDriver.cs.
1514 private class ReaderAndVersion {
1516 public IndexReader Reader;
1517 public long Version;
1518 public int Refcount;
1520 public ReaderAndVersion (IndexReader reader, long version)
1522 this.Reader = reader;
1523 this.Version = version;
1524 this.Refcount = 1;
1528 static private Hashtable directory_rav_map = new Hashtable ();
1529 static private Hashtable reader_rav_map = new Hashtable ();
1531 static public LNS.IndexSearcher GetSearcher (Lucene.Net.Store.Directory directory)
1533 IndexReader reader = GetReader (directory);
1535 return new LNS.IndexSearcher (reader);
1538 static public IndexReader GetReader (Lucene.Net.Store.Directory directory)
1540 IndexReader reader;
1541 long version;
1543 lock (reader_rav_map) {
1544 ReaderAndVersion rav = (ReaderAndVersion) directory_rav_map [directory];
1546 if (rav == null) {
1547 version = IndexReader.GetCurrentVersion (directory);
1548 reader = IndexReader.Open (directory);
1550 rav = new ReaderAndVersion (reader, version);
1551 rav.Refcount++;
1553 directory_rav_map [directory] = rav;
1554 reader_rav_map [reader] = rav;
1556 return reader;
1559 version = IndexReader.GetCurrentVersion (directory);
1561 if (version != rav.Version) {
1562 UnrefReaderAndVersion_Unlocked (rav);
1564 reader = IndexReader.Open (directory);
1566 rav = new ReaderAndVersion (reader, version);
1567 rav.Refcount++;
1569 directory_rav_map [directory] = rav;
1570 reader_rav_map [reader] = rav;
1571 } else
1572 rav.Refcount++;
1574 return rav.Reader;
1578 static private void UnrefReaderAndVersion_Unlocked (ReaderAndVersion rav)
1580 rav.Refcount--;
1582 if (rav.Refcount == 0) {
1583 rav.Reader.Close ();
1584 reader_rav_map.Remove (rav.Reader);
1588 static public void ReleaseReader (IndexReader reader)
1590 lock (reader_rav_map) {
1591 ReaderAndVersion rav = (ReaderAndVersion) reader_rav_map [reader];
1593 UnrefReaderAndVersion_Unlocked (rav);
1597 static public void ReleaseSearcher (LNS.IndexSearcher searcher)
1599 IndexReader reader = searcher.GetIndexReader ();
1601 searcher.Close ();
1602 ReleaseReader (reader);
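                // A minimal usage sketch: always pair GetSearcher with ReleaseSearcher so
                // the refcounted reader underneath can actually be closed once the index
                // version moves on.
                //
                //     LNS.IndexSearcher searcher = GetSearcher (PrimaryStore);
                //     try {
                //         // ... searcher.Search (...) and walk the hits ...
                //     } finally {
                //         ReleaseSearcher (searcher);
                //     }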
1605 ///////////////////////////////////////////////////////////////////////////////////
1608 // Various ways to grab lots of hits at once.
1609 // These should never be used for querying, only for utility
1610 // functions.
1613 public int GetBlockOfHits (int cookie,
1614 Hit [] block_of_hits)
1616 IndexReader primary_reader;
1617 IndexReader secondary_reader;
1618 primary_reader = GetReader (PrimaryStore);
1619 secondary_reader = GetReader (SecondaryStore);
1621 int request_size;
1622 request_size = block_of_hits.Length;
1623 if (request_size > primary_reader.NumDocs ())
1624 request_size = primary_reader.NumDocs ();
1626 int max_doc;
1627 max_doc = primary_reader.MaxDoc ();
1629 if (cookie < 0) {
1630 Random random;
1631 random = new Random ();
1632 cookie = random.Next (max_doc);
1635 int original_cookie;
1636 original_cookie = cookie;
1638 Hashtable primary_docs, secondary_docs;
1639 primary_docs = UriFu.NewHashtable ();
1640 secondary_docs = UriFu.NewHashtable ();
1642 // Load the primary documents
1643 for (int i = 0; i < request_size; ++i) {
1645 if (! primary_reader.IsDeleted (cookie)) {
1646 Document doc;
1647 doc = primary_reader.Document (cookie);
1648 primary_docs [GetUriFromDocument (doc)] = doc;
1651 ++cookie;
1652 if (cookie >= max_doc) // wrap around
1653 cookie = 0;
1655 // If we somehow end up back where we started,
1656 // give up.
1657 if (cookie == original_cookie)
1658 break;
1661 // If necessary, load the secondary documents
1662 if (secondary_reader != null) {
1663 LNS.IndexSearcher searcher;
1664 searcher = new LNS.IndexSearcher (secondary_reader);
1666 LNS.Query uri_query;
1667 uri_query = UriQuery ("Uri", primary_docs.Keys);
1669 LNS.Hits hits;
1670 hits = searcher.Search (uri_query);
1671 for (int i = 0; i < hits.Length (); ++i) {
1672 Document doc;
1673 doc = hits.Doc (i);
1674 secondary_docs [GetUriFromDocument (doc)] = doc;
1677 searcher.Close ();
1680 ReleaseReader (primary_reader);
1681 ReleaseReader (secondary_reader);
1683 // Now assemble the hits
1684 int j = 0;
1685 foreach (Uri uri in primary_docs.Keys) {
1686 Document primary_doc, secondary_doc;
1687 primary_doc = primary_docs [uri] as Document;
1688 secondary_doc = secondary_docs [uri] as Document;
1690 Hit hit;
1691 hit = DocumentToHit (primary_doc);
1692 if (secondary_doc != null)
1693 AddPropertiesToHit (hit, secondary_doc, false);
1695 block_of_hits [j] = hit;
1696 ++j;
1699 // null-pad the array, if necessary
1700 for (; j < block_of_hits.Length; ++j)
1701 block_of_hits [j] = null;
1704 // Return the new cookie
1705 return cookie;
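                // A usage sketch for the cookie protocol (illustrative; "driver" stands
                // for any LuceneCommon subclass): pass a negative cookie to start at a
                // random document, then feed the returned cookie back in to keep walking
                // the index in blocks.
                //
                //     Hit [] block = new Hit [100];
                //     int cookie = -1;
                //     for (int n = 0; n < batches_wanted; ++n) {
                //         cookie = driver.GetBlockOfHits (cookie, block);
                //         // ... consume the non-null entries in block ...
                //     }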
1708 // For a large index, this will be very slow and will consume
1709 // a lot of memory. Don't call it without a good reason!
1710 // We return a hashtable indexed by Uri.
1711 public Hashtable GetAllHitsByUri ()
1713 Hashtable all_hits;
1714 all_hits = UriFu.NewHashtable ();
1716 IndexReader primary_reader;
1717 IndexReader secondary_reader;
1718 primary_reader = GetReader (PrimaryStore);
1719 secondary_reader = GetReader (SecondaryStore);
1721 // Load everything from the primary index
1722 int max_doc;
1723 max_doc = primary_reader.MaxDoc ();
1724 for (int i = 0; i < max_doc; ++i) {
1726 if (primary_reader.IsDeleted (i))
1727 continue;
1729 Document doc;
1730 doc = primary_reader.Document (i);
1732 Hit hit;
1733 hit = DocumentToHit (doc);
1734 all_hits [hit.Uri] = hit;
1737 // Now add in everything from the secondary index, if it exists
1738 if (secondary_reader != null) {
1739 max_doc = secondary_reader.MaxDoc ();
1740 for (int i = 0; i < max_doc; ++i) {
1742 if (secondary_reader.IsDeleted (i))
1743 continue;
1745 Document doc;
1746 doc = secondary_reader.Document (i);
1748 Uri uri;
1749 uri = GetUriFromDocument (doc);
1751 Hit hit;
1752 hit = (Hit) all_hits [uri];
1753 if (hit != null)
1754 AddPropertiesToHit (hit, doc, false);
1758 ReleaseReader (primary_reader);
1759 ReleaseReader (secondary_reader);
1761 return all_hits;