beagled/LuceneCommon.cs

   1 //
   2 // LuceneCommon.cs
   3 //
   4 // Copyright (C) 2004-2005 Novell, Inc.
   5 //
   6
   7 //
   8 // Permission is hereby granted, free of charge, to any person obtaining a
   9 // copy of this software and associated documentation files (the "Software"),
  10 // to deal in the Software without restriction, including without limitation
  11 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
  12 // and/or sell copies of the Software, and to permit persons to whom the
  13 // Software is furnished to do so, subject to the following conditions:
  14 //
  15 // The above copyright notice and this permission notice shall be included in
  16 // all copies or substantial portions of the Software.
  17 //
  18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  23 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  24 // DEALINGS IN THE SOFTWARE.
  25 //
  26
  27 using System;
  28 using System.Collections;
  29 using System.Diagnostics;
  30 using System.Globalization;
  31 using System.IO;
  32 using System.Text;
  33 using System.Threading;
  34 using System.Xml;
  35 using System.Xml.Serialization;
  36
  37 using Lucene.Net.Analysis;
  38 using Lucene.Net.Analysis.Standard;
  39 using Lucene.Net.Documents;
  40 using Lucene.Net.Index;
  41 using Lucene.Net.QueryParsers;
  42 using LNS = Lucene.Net.Search;
  43
  44 using Beagle.Util;
  45
  46 namespace Beagle.Daemon {
  47
  48         public class LuceneCommon {
  49
  // Callback that inspects a single Hit and answers with a boolean.
  // NOTE(review): presumably true means "keep this hit" -- confirm
  // against the code that invokes the filter.
  public delegate bool HitFilter (Hit hit);
  51
  52                 // VERSION HISTORY
  53                 // ---------------
  54                 //
  55                 //  1: Original
  56                 //  2: Changed format of timestamp strings
  57                 //  3: Schema changed to be more Dashboard-Match-like
  58                 //  4: Schema changed for files to include _Directory property
  59                 //  5: Changed analyzer to support stemming.  Bumped version # to
  60                 //     force everyone to re-index.
  61                 //  6: lots of schema changes as part of the general refactoring
  62                 //  7: incremented to force a re-index after our upgrade to lucene 1.4
  63                 //     (in theory the file formats are compatible, we are seeing 'term
  64                 //     out of order' exceptions in some cases)
  65                 //  8: another forced re-index, this time because of massive changes
  66                 //     in the file system backend (it would be nice to have per-backend
  67                 //     versioning so that we didn't have to purge all indexes just
  68                 //     because one changed)
  69                 //  9: changed the way properties are stored, changed in conjunction
  70                 //     with sane handling of multiple properties on hits.
  71                 // 10: changed to support typed and mutable properties
  72                 // 11: moved mime type and hit type into properties
  73                 // 12: added year-month and year-month-day resolutions for all
  74                 //     date properties
  75                 // 13: moved source into a property
  76                 // 14: allow wildcard queries to also match keywords
  // Schema version shared by all indexes.  Exists () rejects any
  // on-disk index whose major version differs from this value.
  private const int MAJOR_VERSION = 14;

  // Version supplied by the subclass through the constructor.
  private int minor_version;

  // Name the index was constructed with, and the directory that holds
  // everything belonging to it.
  private string index_name;
  private string top_dir;

  // Identifier generated by Create () and re-read by Open ().
  private string fingerprint;
  private int last_item_count = -1;

  // The big index: document full-texts and other data that is
  // expensive to index.
  private Lucene.Net.Store.Directory primary_store;

  // The small index: document info that is expected to change.
  // Canonical example: file names.
  private Lucene.Net.Store.Directory secondary_store;
  93
  94                 //////////////////////////////////////////////////////////////////////////////
  95
  // Remembers the index name and minor version and works out the
  // top-level directory: a rooted index_name is used as it stands,
  // anything else is placed underneath PathFinder.IndexDir.
  protected LuceneCommon (string index_name, int minor_version)
  {
          this.index_name = index_name;
          this.minor_version = minor_version;

          if (Path.IsPathRooted (index_name))
                  this.top_dir = index_name;
          else
                  this.top_dir = Path.Combine (PathFinder.IndexDir, index_name);
  }
 103
 104                 //////////////////////////////////////////////////////////////////////////////
 105
 // The name this index was constructed with.
 protected string IndexName {
         get { return index_name; }
 }

 // Store backing the primary index; assigned by Create () and Open ().
 public Lucene.Net.Store.Directory PrimaryStore {
         get { return primary_store; }
 }

 // Store backing the secondary index; assigned by Create () and Open ().
 public Lucene.Net.Store.Directory SecondaryStore {
         get { return secondary_store; }
 }

 // Identifier generated by Create () or read back by Open ().
 public string Fingerprint {
         get { return fingerprint; }
 }

 // Directory under which all of this index's files live.
 public string TopDirectory {
         get { return top_dir; }
 }
 115
 116                 //////////////////////////////////////////////////////////////////////////////
 117
 // Optional text cache; stays null unless one is assigned.
 protected TextCache text_cache;

 public TextCache TextCache {
         get {
                 return text_cache;
         }
         set {
                 text_cache = value;
         }
 }
 124
 125                 //////////////////////////////////////////////////////////////////////////////
 126
 // File holding the "major.minor" version string written by Create ().
 private string VersionFile {
         get {
                 return Path.Combine (top_dir, "version");
         }
 }

 // File holding the index fingerprint written by Create ().
 private string FingerprintFile {
         get {
                 return Path.Combine (top_dir, "fingerprint");
         }
 }

 // Directory of the primary index.
 // Shouldn't really be public
 public string PrimaryIndexDirectory {
         get {
                 return Path.Combine (top_dir, "PrimaryIndex");
         }
 }

 // Directory of the secondary index.
 // Shouldn't really be public
 public string SecondaryIndexDirectory {
         get {
                 return Path.Combine (top_dir, "SecondaryIndex");
         }
 }

 // Directory handed to FSDirectory as the location for lock files.
 public string LockDirectory {
         get {
                 return Path.Combine (top_dir, "Locks");
         }
 }
 148
 149                 //////////////////////////////////////////////////////////////////////////////
 150
 151                 // Deal with dangling locks
 152
 // Decides whether a file found in the lock directory is a dangling
 // lock, i.e. a lock whose owner is not a running IndexHelper.
 //
 // The first line of a lock file is expected to be the PID of the
 // process that took the lock; that PID is looked up under /proc.
 //
 // Returns true when the lock should be treated as dangling.  Returns
 // false when the file is not a lock file, cannot be read (it probably
 // went away), no longer exists, or belongs to a live IndexHelper.
 private bool IsDanglingLock (FileInfo info)
 {
         Log.Debug ("Checking for dangling locks...");

         // It isn't even a lock file
         if (! info.Name.EndsWith (".lock"))
                 return false;

         string pid = null;

         try {
                 // The using block closes the reader even if ReadLine throws.
                 using (StreamReader reader = new StreamReader (info.FullName)) {
                         pid = reader.ReadLine ();
                 }
         } catch {
                 // We couldn't read the lockfile, so it probably went away.
                 return false;
         }

         if (pid == null) {
                 // Looks like the lock file was empty, which really
                 // shouldn't happen.  It should contain the PID of
                 // the process which locked it.  Lets be on the safe
                 // side and assume it's a dangling lock.
                 Log.Warn ("Found an empty lock file, that shouldn't happen: {0}", info.FullName);
                 return true;
         }

         string cmdline_file = String.Format ("/proc/{0}/cmdline", pid);
         string cmdline = null;
         bool cmdline_readable = true;

         try {
                 using (StreamReader reader = new StreamReader (cmdline_file)) {
                         cmdline = reader.ReadLine ();
                 }
         } catch {
                 // If we can't open that file, either:
                 // (1) The process doesn't exist
                 // (2) It does exist, but it doesn't belong to us.
                 //     Thus it isn't an IndexHelper
                 // In either case, the lock is dangling --- if it
                 // still exists.
                 cmdline_readable = false;
         }

         // ReadLine returns null when the cmdline file is empty, so the
         // null check is required before searching the string.  An empty
         // or foreign command line means the owner is not an IndexHelper.
         if (! cmdline_readable
             || cmdline == null
             || cmdline.IndexOf ("IndexHelper.exe") == -1) {
                 // FileInfo caches its state, so it has to be refreshed
                 // to learn whether the lock file is still there *now*.
                 info.Refresh ();
                 return info.Exists;
         }

         // If we reach this point, we know:
         // (1) The process still exists
         // (2) We own it
         // (3) It is an IndexHelper process
         // Thus it almost certainly isn't a dangling lock.
         // The process might be wedged, but that is
         // another issue...
         return false;
 }
 220
 // Reports whether a usable index is present on disk.
 //
 // Returns false when any expected file or directory is missing, when
 // the version file is empty, unparsable or does not match
 // MAJOR_VERSION (and minor_version, if that is non-negative), or when
 // the lock directory contains a dangling lock.  Returns true otherwise.
 protected bool Exists ()
 {
         if (! (Directory.Exists (top_dir)
                && File.Exists (VersionFile)
                && File.Exists (FingerprintFile)
                && Directory.Exists (PrimaryIndexDirectory)
                && IndexReader.IndexExists (PrimaryIndexDirectory)
                && Directory.Exists (SecondaryIndexDirectory)
                && IndexReader.IndexExists (SecondaryIndexDirectory)
                && Directory.Exists (LockDirectory)))
                 return false;

         // Check the index's version number.  If it is wrong,
         // declare the index non-existent.

         string version_str;
         using (StreamReader version_reader = new StreamReader (VersionFile)) {
                 version_str = version_reader.ReadLine ();
         }

         // An empty version file cannot match any version.
         if (version_str == null) {
                 Logger.Log.Debug ("Empty version file in {0}", index_name);
                 return false;
         }

         int current_major_version, current_minor_version;
         int i = version_str.IndexOf ('.');

         try {
                 if (i != -1) {
                         current_major_version = Convert.ToInt32 (version_str.Substring (0, i), CultureInfo.InvariantCulture);
                         current_minor_version = Convert.ToInt32 (version_str.Substring (i+1), CultureInfo.InvariantCulture);
                 } else {
                         // A version string without a dot carries only a minor version.
                         current_minor_version = Convert.ToInt32 (version_str, CultureInfo.InvariantCulture);
                         current_major_version = 0;
                 }
         } catch (FormatException) {
                 // A garbled version file is treated like a version mismatch.
                 Logger.Log.Debug ("Unparsable version '{0}' in {1}", version_str, index_name);
                 return false;
         } catch (OverflowException) {
                 Logger.Log.Debug ("Unparsable version '{0}' in {1}", version_str, index_name);
                 return false;
         }

         if (current_major_version != MAJOR_VERSION
             || (minor_version >= 0 && current_minor_version != minor_version)) {
                 Logger.Log.Debug ("Version mismatch in {0}", index_name);
                 Logger.Log.Debug ("Index has version {0}.{1}, expected {2}.{3}",
                                   current_major_version, current_minor_version,
                                   MAJOR_VERSION, minor_version);
                 return false;
         }

         // Check the lock directory: If there is a dangling write lock,
         // assume that the index is corrupted and declare it non-existent.
         DirectoryInfo lock_dir_info = new DirectoryInfo (LockDirectory);
         foreach (FileInfo info in lock_dir_info.GetFiles ()) {
                 if (IsDanglingLock (info)) {
                         Logger.Log.Warn ("Found a dangling index lock on {0}", info.FullName);
                         return false;
                 }
         }

         return true;
 }
 275
 // Makes the directory at the given path, obtains a freshly created
 // store for it (with locks kept in LockDirectory) and leaves an empty
 // index inside.  Returns the store.
 private Lucene.Net.Store.Directory CreateIndex (string path)
 {
         Directory.CreateDirectory (path);

         Lucene.Net.Store.Directory new_store =
                 Lucene.Net.Store.FSDirectory.GetDirectory (path, LockDirectory, true);

         // Opening a writer in create mode and closing it straight
         // away is what produces the empty index.
         new IndexWriter (new_store, null, true).Close ();

         return new_store;
 }
 292
 // Create will kill your index dead.  Use it with care.
 // You don't need to call Open after calling Create.
 //
 // Deletes top_dir if it exists, recreates it together with the lock
 // directory and two empty indexes, then writes a new fingerprint and
 // the current "major.minor" version to disk.  A negative
 // minor_version is reset to 0 first.
 protected void Create ()
 {
         if (minor_version < 0)
                 minor_version = 0;

         // Purge any existing directories.
         if (Directory.Exists (top_dir)) {
                 Logger.Log.Debug ("Purging {0}", top_dir);
                 Directory.Delete (top_dir, true);
         }

         // Create any necessary directories.
         Directory.CreateDirectory (top_dir);
         Directory.CreateDirectory (LockDirectory);

         // Create the indexes.
         primary_store = CreateIndex (PrimaryIndexDirectory);
         secondary_store = CreateIndex (SecondaryIndexDirectory);

         // Generate and store the index fingerprint.  The using blocks
         // close the files even if a write fails.
         fingerprint = GuidFu.ToShortString (Guid.NewGuid ());
         using (TextWriter writer = new StreamWriter (FingerprintFile, false)) {
                 writer.WriteLine (fingerprint);
         }

         // Store our index version information.
         using (TextWriter writer = new StreamWriter (VersionFile, false)) {
                 writer.WriteLine ("{0}.{1}", MAJOR_VERSION, minor_version);
         }
 }
 326
 // Opens the index with read_only_mode switched off.
 protected void Open ()
 {
         Open (false);
 }

 // Reads the fingerprint from disk and obtains the stores for the
 // primary and secondary indexes.  read_only_mode is passed straight
 // through to FSDirectory.GetDirectory.
 protected void Open (bool read_only_mode)
 {
         // Read our index fingerprint.  The using block closes the
         // file even if the read fails.
         using (TextReader reader = new StreamReader (FingerprintFile)) {
                 fingerprint = reader.ReadLine ();
         }

         // Create stores for our indexes.
         primary_store = Lucene.Net.Store.FSDirectory.GetDirectory (PrimaryIndexDirectory, LockDirectory, false, read_only_mode);
         secondary_store = Lucene.Net.Store.FSDirectory.GetDirectory (SecondaryIndexDirectory, LockDirectory, false, read_only_mode);
 }
 344
 345                 ////////////////////////////////////////////////////////////////
 346
 347                 //
 348                 // Custom Analyzers
 349                 //
 350
 // Token stream that hands out its whole input string as one token
 // and then reports the end of the stream.
 private class SingletonTokenStream : TokenStream {

         // The string still waiting to be emitted; null once it has
         // been handed out (or if null was passed in to begin with).
         private string pending;

         public SingletonTokenStream (string singleton_str)
         {
                 pending = singleton_str;
         }

         // Returns the single token on the first call and null on
         // every call after that.
         override public Lucene.Net.Analysis.Token Next ()
         {
                 string text = pending;
                 if (text == null)
                         return null;

                 Lucene.Net.Analysis.Token only_token =
                         new Lucene.Net.Analysis.Token (text, 0, text.Length);
                 pending = null;
                 return only_token;
         }
 }
 373
 374                 // FIXME: This assumes everything being indexed is in English!
 // Analyzer that treats property fields ("prop:...") specially:
 // non-text properties become a single untokenized token, while the
 // text fields get noise filtering and Porter stemming on top of the
 // standard analysis.
 private class BeagleAnalyzer : StandardAnalyzer {

         // When set, the leading flag of a property value (everything
         // up to and including the first ':') is dropped before analysis.
         private bool strip_extra_property_info = false;

         public BeagleAnalyzer (bool strip_extra_property_info)
         {
                 this.strip_extra_property_info = strip_extra_property_info;
         }

         // Builds the token stream for one field.
         //   fieldName: name of the field being analyzed
         //   reader:    source of the field's text; it is consumed here
         //              when a prefix is stripped or a singleton is built
         public override TokenStream TokenStream (string fieldName, TextReader reader)
         {
                 bool is_text_prop = false;

                 if (fieldName.StartsWith ("prop:")) {

                         // Property values are written with a flag in front
                         // of the actual value, terminated by ':'.  That flag
                         // is bookkeeping, so we don't want to index it.
                         if (strip_extra_property_info) {
                                 // Skip everything up to and including the first :
                                 int c;
                                 do {
                                         c = reader.Read ();
                                 } while (c != -1 && c != ':');
                         }

                         is_text_prop = fieldName.StartsWith ("prop:t");

                         // If this is non-text property, just return one token
                         // containing the entire string.  We do this to avoid
                         // tokenizing keywords.
                         if (! is_text_prop)
                                 return new SingletonTokenStream (reader.ReadToEnd ());
                 }

                 TokenStream outstream;
                 outstream = base.TokenStream (fieldName, reader);

                 // Only the free-text fields are noise-filtered and stemmed.
                 if (fieldName == "Text"
                     || fieldName == "HotText"
                     || fieldName == "PropertyText"
                     || is_text_prop) {
                         outstream = new NoiseFilter (outstream);
                         outstream = new PorterStemFilter (outstream);
                 }

                 return outstream;
         }
 }
 425
 // Shared analyzers: the indexing one strips the flag prefix from
 // property values, the query one leaves its input untouched.
 private static Analyzer indexing_analyzer = new BeagleAnalyzer (true);
 private static Analyzer query_analyzer = new BeagleAnalyzer (false);

 protected static Analyzer IndexingAnalyzer {
         get { return indexing_analyzer; }
 }

 protected static Analyzer QueryAnalyzer {
         get { return query_analyzer; }
 }
 431
 432                 ////////////////////////////////////////////////////////////////
 433
 434                 //
 435                 // Dealing with properties
 436                 //
 437
 // Maps a property type to the one-letter code used in field names:
 // Text -> 't', Keyword -> 'k', Date -> 'd'.  Any other value throws.
 static private char TypeToCode (PropertyType type)
 {
         if (type == PropertyType.Text)
                 return 't';

         if (type == PropertyType.Keyword)
                 return 'k';

         if (type == PropertyType.Date)
                 return 'd';

         throw new Exception ("Bad property type: " + type);
 }
 447
 // Inverse of TypeToCode: 't' -> Text, 'k' -> Keyword, 'd' -> Date.
 // Any other character throws.
 static private PropertyType CodeToType (char c)
 {
         if (c == 't')
                 return PropertyType.Text;

         if (c == 'k')
                 return PropertyType.Keyword;

         if (c == 'd')
                 return PropertyType.Date;

         throw new Exception ("Bad property code: " + c);
 }
 458
 // Names the wildcard field that collects all searched properties of
 // the given type, or returns null if the type has no such field.
 static private string TypeToWildcardField (PropertyType type)
 {
         if (type == PropertyType.Text)
                 return "PropertyText";

         if (type == PropertyType.Keyword)
                 return "PropertyKeyword";

         if (type == PropertyType.Date)
                 return "PropertyDate";

         return null;
 }
 469
 // Builds the field name for a property: "prop:", the one-letter type
 // code, ':' and finally the key.
 // Exposing this is a little bit suspicious.
 static protected string PropertyToFieldName (PropertyType type, string key)
 {
         char code = TypeToCode (type);
         return "prop:" + code + ":" + key;
 }
 476
 // Adds two companion fields for a date property to the document:
 // "YM:<field_name>" with the year-month string and "D:<field_name>"
 // with the day string.  Both are indexed but neither stored nor
 // tokenized.
 static private void AddDateFields (string field_name, Property prop, Document doc)
 {
         DateTime date = StringFu.StringToDateTime (prop.Value);

         // Field arguments: name, value, store, index, tokenize.
         string year_month = StringFu.DateTimeToYearMonthString (date);
         doc.Add (new Field ("YM:" + field_name, year_month, false, true, false));

         string day = StringFu.DateTimeToDayString (date);
         doc.Add (new Field ("D:" + field_name, day, false, true, false));
 }
 496
 // Writes one property into a Lucene document.
 //
 // Nothing is added for a null property, a null value, or a key in
 // the UnindexedNamespace.  Searched properties are also copied into
 // the wildcard field for their type.  The property itself is stored
 // under its "prop:" field name with a flag in front of the value:
 // "s:" if it is searched, "_:" if it is not.  Date properties get
 // extra year-month and day fields alongside each field added.
 static protected void AddPropertyToDocument (Property prop, Document doc)
 {
         if (prop == null || prop.Value == null)
                 return;

         // Don't actually put properties in the UnindexedNamespace
         // in the document.  A horrible (and yet lovely!) hack.
         if (prop.Key.StartsWith (StringFu.UnindexedNamespace))
                 return;

         bool is_date = (prop.Type == PropertyType.Date);

         if (prop.IsSearched) {
                 string wildcard_field = TypeToWildcardField (prop.Type);
                 if (wildcard_field != null) {
                         // Never stored, always indexed, tokenized only for text.
                         doc.Add (new Field (wildcard_field,
                                             prop.Value,
                                             false,
                                             true,
                                             prop.Type == PropertyType.Text));

                         if (is_date)
                                 AddDateFields (wildcard_field, prop, doc);
                 }
         }

         string coded_value = (prop.IsSearched ? "s:" : "_:") + prop.Value;
         string field_name = PropertyToFieldName (prop.Type, prop.Key);

         // Always indexed and always tokenized (for keywords the
         // analyzer just strips off the flag prefix).
         doc.Add (new Field (field_name, coded_value, prop.IsStored, true, true));

         if (is_date)
                 AddDateFields (field_name, prop, doc);
 }
 542
 // Rebuilds a Property from a "prop:" field.
 //
 //   f:                  the field to decode; may be null
 //   doc:                currently unused (see note below)
 //   from_primary_index: the resulting property is mutable only when
 //                       this is false
 //
 // Returns null when f is null, when its name is not a property field
 // name, or when its value is too short to carry the two-character
 // flag prefix.  An unknown type code in the name makes CodeToType throw.
 static protected Property GetPropertyFromDocument (Field f, Document doc, bool from_primary_index)
 {
         // Note: we don't use the document that we pass in,
         // but in theory we could.  At some later point we
         // might need to split a property's data across two or
         // more fields in the document.

         if (f == null)
                 return null;

         // A property field name is "prop:", a type code, ':' and a key,
         // so anything shorter than 7 characters cannot be one.
         string field_name = f.Name ();
         if (field_name.Length < 7
             || ! field_name.StartsWith ("prop:"))
                 return null;

         // The value must hold at least the flag character and the ':'
         // after it; without this check the indexing below would throw.
         string field_value = f.StringValue ();
         if (field_value == null || field_value.Length < 2)
                 return null;

         Property prop = new Property ();
         prop.Type = CodeToType (field_name [5]);
         prop.Key = field_name.Substring (7);
         prop.Value = field_value.Substring (2);
         prop.IsSearched = (field_value [0] == 's');
         prop.IsMutable = ! from_primary_index;
         prop.IsStored = f.IsStored ();

         return prop;
 }
 573
 574                 //////////////////////////////////////////////////////////////////////////////
 575
 576                 //
 577                 // Dealing with documents
 578                 //
 579
 580                 static protected void BuildDocuments (Indexable indexable,
 581                                                       out Document primary_doc,
 582                                                       out Document secondary_doc)
 583                 {
 584                         primary_doc = new Document ();
 585                         secondary_doc = null;
 586
 587                         Field f;
 588
 589                         f = Field.Keyword ("Uri", UriFu.UriToSerializableString (indexable.Uri));
 590                         primary_doc.Add (f);
 591
 592                         if (indexable.ParentUri != null) {
 593                                 f = Field.Keyword ("ParentUri", UriFu.UriToSerializableString (indexable.ParentUri));
 594                                 primary_doc.Add (f);
 595                         }
 596
 597                         if (indexable.ValidTimestamp) {
 598                                 // Note that we also want to search in the
 599                                 // Timestamp field when we do a wildcard date
 600                                 // query, so that's why we also add a wildcard
 601                                 // field for each item here.
 602
 603                                 string wildcard_field = TypeToWildcardField (PropertyType.Date);
 604
 605                                 string str = StringFu.DateTimeToString (indexable.Timestamp);
 606                                 f = Field.Keyword ("Timestamp", str);
 607                                 primary_doc.Add (f);
 608                                 f = Field.UnStored (wildcard_field, str);
 609                                 primary_doc.Add (f);
 610
 611                                 str = StringFu.DateTimeToYearMonthString (indexable.Timestamp);
 612                                 f = Field.Keyword ("YM:Timestamp", str);
 613                                 primary_doc.Add (f);
 614                                 f = Field.UnStored ("YM:" + wildcard_field, str);
 615                                 primary_doc.Add (f);
 616
 617                                 str = StringFu.DateTimeToDayString (indexable.Timestamp);
 618                                 f = Field.Keyword ("D:Timestamp", str);
 619                                 primary_doc.Add (f);
 620                                 f = Field.UnStored ("D:" + wildcard_field, str);
 621                                 primary_doc.Add (f);
 622                         }
 623
 624                         if (indexable.NoContent) {
 625                                 // If there is no content, make a note of that
 626                                 // in a special property.
 627                                 Property prop;
 628                                 prop = Property.NewBool ("beagle:NoContent", true);
 629                                 AddPropertyToDocument (prop, primary_doc);
 630
 631                         } else {
 632
 633                                 // Since we might have content, add our text
 634                                 // readers.
 635
 636                                 TextReader reader;
 637
 638                                 reader = indexable.GetTextReader ();
 639                                 if (reader != null) {
 640                                         f = Field.Text ("Text", reader);
 641                                         primary_doc.Add (f);
 642                                 }
 643
 644                                 reader = indexable.GetHotTextReader ();
 645                                 if (reader != null) {
 646                                         f = Field.Text ("HotText", reader);
 647                                         primary_doc.Add (f);
 648                                 }
 649                         }
 650
 651                         // Store the Type and MimeType in special properties
 652
 653                         if (indexable.HitType != null) {
 654                                 Property prop;
 655                                 prop = Property.NewUnsearched ("beagle:HitType", indexable.HitType);
 656                                 AddPropertyToDocument (prop, primary_doc);
 657                         }
 658
 659                         if (indexable.MimeType != null) {
 660                                 Property prop;
 661                                 prop = Property.NewUnsearched ("beagle:MimeType", indexable.MimeType);
 662                                 AddPropertyToDocument (prop, primary_doc);
 663                         }
 664
 665                         if (indexable.Source != null) {
 666                                 Property prop;
 667                                 prop = Property.NewUnsearched ("beagle:Source", indexable.Source);
 668                                 AddPropertyToDocument (prop, primary_doc);
 669                         }
 670
 671                         // Store the other properties
 672
 673                         foreach (Property prop in indexable.Properties) {
 674                                 Document target_doc = primary_doc;
 675                                 if (prop.IsMutable) {
 676                                         if (secondary_doc == null) {
 677                                                 secondary_doc = new Document ();
 678                                                 f = Field.Keyword ("Uri", UriFu.UriToSerializableString (indexable.Uri));
 679                                                 secondary_doc.Add (f);
 680                                         }
 681                                         target_doc = secondary_doc;
 682                                 }
 683
 684                                 AddPropertyToDocument (prop, target_doc);
 685                         }
 686                 }
 687
 688                 static protected Document RewriteDocument (Document old_secondary_doc,
 689                                                            Indexable prop_only_indexable)
 690                 {
 691                         Hashtable seen_props;
 692                         seen_props = new Hashtable ();
 693
 694                         Document new_doc;
 695                         new_doc = new Document ();
 696
 697                         Field uri_f;
 698                         uri_f = Field.Keyword ("Uri", UriFu.UriToSerializableString (prop_only_indexable.Uri));
 699                         new_doc.Add (uri_f);
 700
 701                         Logger.Log.Debug ("Rewriting {0}", prop_only_indexable.DisplayUri);
 702
 703                         // Add the new properties to the new document.  To
 704                         // delete a property, set the Value to null... then it
 705                         // will be added to seen_props (so the old value will
 706                         // be ignored below), but AddPropertyToDocument will
 707                         // return w/o doing anything.
 708                         foreach (Property prop in prop_only_indexable.Properties) {
 709                                 seen_props [prop.Key] = prop;
 710                                 AddPropertyToDocument (prop, new_doc);
 711                                 Logger.Log.Debug ("New prop '{0}' = '{1}'", prop.Key, prop.Value);
 712                         }
 713
 714                         // Copy the other properties from the old document to the
 715                         // new one, skipping any properties that we got new values
 716                         // for out of the Indexable.
 717                         if (old_secondary_doc != null) {
 718                                 foreach (Field f in old_secondary_doc.Fields ()) {
 719                                         Property prop;
 720                                         prop = GetPropertyFromDocument (f, old_secondary_doc, false);
 721                                         if (prop != null && ! seen_props.Contains (prop.Key)) {
 722                                                 Logger.Log.Debug ("Old prop '{0}' = '{1}'", prop.Key, prop.Value);
 723                                                 AddPropertyToDocument (prop, new_doc);
 724                                         }
 725                                 }
 726                         }
 727
 728                         return new_doc;
 729                 }
 730
 731                 static protected Uri GetUriFromDocument (Document doc)
 732                 {
 733                         string uri;
 734                         uri = doc.Get ("Uri");
 735                         if (uri == null)
 736                                 throw new Exception ("Got document from Lucene w/o a URI!");
 737                         return UriFu.UriStringToUri (uri);
 738                 }
 739
 740                 static protected Hit DocumentToHit (Document doc)
 741                 {
 742                         Hit hit;
 743                         hit = new Hit ();
 744
 745                         hit.Uri = GetUriFromDocument (doc);
 746
 747                         string str;
 748                         str = doc.Get ("ParentUri");
 749                         if (str != null)
 750                                 hit.ParentUri = UriFu.UriStringToUri (str);
 751
 752                         hit.Timestamp = StringFu.StringToDateTime (doc.Get ("Timestamp"));
 753
 754                         AddPropertiesToHit (hit, doc, true);
 755
 756                         // Get the Type and MimeType from the properties.
 757                         hit.Type = hit.GetFirstProperty ("beagle:HitType");
 758                         hit.MimeType = hit.GetFirstProperty ("beagle:MimeType");
 759                         hit.Source = hit.GetFirstProperty ("beagle:Source");
 760
 761                         return hit;
 762                 }
 763
 764                 static protected void AddPropertiesToHit (Hit hit, Document doc, bool from_primary_index)
 765                 {
 766                         foreach (Field f in doc.Fields ()) {
 767                                 Property prop;
 768                                 prop = GetPropertyFromDocument (f, doc, from_primary_index);
 769                                 if (prop != null)
 770                                         hit.AddProperty (prop);
 771                         }
 772                 }
 773
 774
 775                 //////////////////////////////////////////////////////////////////////////////
 776
 777                 //
 778                 // Handle the index's item count
 779                 //
 780
 781                 public int GetItemCount ()
 782                 {
 783                         if (last_item_count < 0) {
 784                                 IndexReader reader;
 785                                 reader = GetReader (PrimaryStore);
 786                                 last_item_count = reader.NumDocs ();
 787                                 ReleaseReader (reader);
 788                         }
 789                         return last_item_count;
 790                 }
 791
 792                 // We should set the cached count of index items when IndexReaders
 793                 // are open and available, so calls to GetItemCount will return immediately.
 794
 795                 protected bool HaveItemCount { get { return last_item_count >= 0; } }
 796
 797                 protected void SetItemCount (IndexReader reader)
 798                 {
 799                         last_item_count = reader.NumDocs ();
 800                 }
 801
 802                 public void SetItemCount (int count)
 803                 {
 804                         last_item_count = count;
 805                 }
 806
 807                 protected void AdjustItemCount (int delta)
 808                 {
 809                         if (last_item_count >= 0)
 810                                 last_item_count += delta;
 811                 }
 812
 813                 //////////////////////////////////////////////////////////////////////////////
 814
 815                 //
 816                 // Access to the stemmer and list of stop words
 817                 //
 818
 819                 static PorterStemmer stemmer = new PorterStemmer ();
 820
 821                 static public string Stem (string str)
 822                 {
 823                         return stemmer.Stem (str);
 824                 }
 825
 826                 public static bool IsStopWord (string stemmed_word)
 827                 {
 828                         return ArrayFu.IndexOfString (StopAnalyzer.ENGLISH_STOP_WORDS, stemmed_word) != -1;
 829                 }
 830
 831                 //////////////////////////////////////////////////////////////////////////////
 832
 833                 //
 834                 // Special Hit Filtering classes
 835                 //
 836
 837                 static private bool TrueHitFilter (Hit hit)
 838                 {
 839                         return true;
 840                 }
 841
 842                 static private HitFilter true_hit_filter = new HitFilter (TrueHitFilter);
 843
 844                 public class OrHitFilter {
 845
 846                         private ArrayList all = new ArrayList ();
 847                         private bool contains_known_true = false;
 848
 849                         public void Add (HitFilter hit_filter)
 850                         {
 851                                 if (hit_filter == true_hit_filter)
 852                                         contains_known_true = true;
 853                                 all.Add (hit_filter);
 854                         }
 855
 856                         public bool HitFilter (Hit hit)
 857                         {
 858                                 if (contains_known_true)
 859                                         return true;
 860                                 foreach (HitFilter hit_filter in all)
 861                                         if (hit_filter (hit))
 862                                                 return true;
 863                                 return false;
 864                         }
 865                 }
 866
 867                 public class AndHitFilter {
 868
 869                         private ArrayList all = new ArrayList ();
 870
 871                         public void Add (HitFilter hit_filter)
 872                         {
 873                                 all.Add (hit_filter);
 874                         }
 875
 876                         public bool HitFilter (Hit hit)
 877                         {
 878                                 foreach (HitFilter hit_filter in all)
 879                                         if (! hit_filter (hit))
 880                                                 return false;
 881                                 return true;
 882                         }
 883                 }
 884
 885                 public class NotHitFilter {
 886                         HitFilter original;
 887
 888                         public NotHitFilter (HitFilter original)
 889                         {
 890                                 this.original = original;
 891                         }
 892
 893                         public bool HitFilter (Hit hit)
 894                         {
 895                                 return ! original (hit);
 896                         }
 897                 }
 898
 899                 //////////////////////////////////////////////////////////////////////////////
 900
 901                 //
 902                 // Queries
 903                 //
 904
 905                 static private LNS.Query StringToQuery (string field_name,
 906                                                         string text,
 907                                                         ArrayList term_list)
 908                 {
 909                         ArrayList tokens = new ArrayList ();
 910
 911                         // Use the analyzer to extract the query's tokens.
 912                         // This code is taken from Lucene's query parser.
 913                         TokenStream source = QueryAnalyzer.TokenStream (field_name, new StringReader (text));
 914                         while (true) {
 915                                 Lucene.Net.Analysis.Token token;
 916                                 try {
 917                                         token = source.Next ();
 918                                         if (token == null)
 919                                                 break;
 920                                 } catch (IOException) {
 921                                         break;
 922                                 }
 923                                 if (token != null)
 924                                         tokens.Add (token.TermText ());
 925                         }
 926                         try {
 927                                 source.Close ();
 928                         } catch (IOException) {
 929                                 // ignore
 930                         }
 931
 932                         if (tokens.Count == 0)
 933                                 return null;
 934
 935                         LNS.PhraseQuery query = new LNS.PhraseQuery ();
 936
 937                         foreach (string token in tokens) {
 938                                 Term term;
 939                                 term = new Term (field_name, token);
 940                                 query.Add (term);
 941                                 if (term_list != null)
 942                                         term_list.Add (term);
 943                         }
 944
 945                         return query;
 946                 }
 947
 948                 //
 949                 // Date Range Handling
 950                 //
 951
 952                 // This function will break down dates to discrete chunks of
 953                 // time to avoid expanding RangeQuerys as much as possible.
 954                 // For example, searching for
 955                 //
 956                 // YMD(5 May 2005, 16 Oct 2006)
 957                 //
 958                 // would break down into three queries:
 959                 //
 960                 // (YM(May 2005) AND D(5,31)) OR
 961                 // YM(Jun 2005, Sep 2006) OR
 962                 // (YM(Oct 2006) AND D(1,16))
 963
 964                 static private DateTime lower_bound = new DateTime (1970, 1, 1);
 965
 966                 // FIXME: we should probably boost this sometime around 2030.
 967                 // Mark your calendar.
 968                 static private DateTime upper_bound = new DateTime (2038, 12, 31);
 969
 970                 static private Term NewYearMonthTerm (string field_name, int y, int m)
 971                 {
 972                         return new Term ("YM:" + field_name, String.Format ("{0}{1:00}", y, m));
 973                 }
 974
 975                 static private LNS.Query NewYearMonthQuery (string field_name, int y, int m)
 976                 {
 977                         return new LNS.TermQuery (NewYearMonthTerm (field_name, y, m));
 978                 }
 979
 980                 static private LNS.Query NewYearMonthQuery (string field_name, int y1, int m1, int y2, int m2)
 981                 {
 982                         return new LNS.RangeQuery (NewYearMonthTerm (field_name, y1, m1),
 983                                                    NewYearMonthTerm (field_name, y2, m2),
 984                                                    true); // query is inclusive
 985                 }
 986
 987                 static private Term NewDayTerm (string field_name, int d)
 988                 {
 989                         return new Term ("D:" + field_name, String.Format ("{0:00}", d));
 990                 }
 991
 992                 static private LNS.Query NewDayQuery (string field_name, int d1, int d2)
 993                 {
 994                         return new LNS.RangeQuery (NewDayTerm (field_name, d1),
 995                                                    NewDayTerm (field_name, d2),
 996                                                    true); // query is inclusive
 997                 }
 998
 999                 private class DateRangeHitFilter {
1000                         public string Key;
1001                         public DateTime StartDate;
1002                         public DateTime EndDate;
1003
1004                         public bool HitFilter (Hit hit)
1005                         {
1006                                 // First, check the Timestamp
1007                                 if (Key == QueryPart_DateRange.AllPropertiesKey
1008                                     || Key == QueryPart_DateRange.TimestampKey) {
1009                                         DateTime dt;
1010                                         dt = hit.Timestamp;
1011                                         if (StartDate <= dt && dt <= EndDate)
1012                                                 return true;
1013                                         if (Key == QueryPart_DateRange.TimestampKey)
1014                                                 return false;
1015                                 }
1016
1017                                 if (Key == QueryPart_DateRange.AllPropertiesKey) {
1018                                         // Walk through all of the properties, and see if any
1019                                         // date properties fall inside the range.
1020                                         foreach (Property prop in hit.Properties) {
1021                                                 if (prop.Type == PropertyType.Date) {
1022                                                         DateTime dt;
1023                                                         dt = StringFu.StringToDateTime (prop.Value);
1024                                                         if (StartDate <= dt && dt <= EndDate)
1025                                                                 return true;
1026                                                 }
1027                                         }
1028                                         return false;
1029                                 } else {
1030                                         // Walk through all of the properties with the given key,
1031                                         // and see if any of them fall inside of the range.
1032                                         string[] values;
1033                                         values = hit.GetProperties (Key);
1034                                         foreach (string v in values) {
1035                                                 DateTime dt;
1036                                                 dt = StringFu.StringToDateTime (v);
1037                                                 if (StartDate <= dt && dt <= EndDate)
1038                                                         return true;
1039                                         }
1040                                         return false;
1041                                 }
1042                         }
1043                 }
1044
1045                 static private LNS.Query GetDateRangeQuery (QueryPart_DateRange part, out HitFilter hit_filter)
1046                 {
1047                         string field_name;
1048                         if (part.Key == QueryPart_DateRange.AllPropertiesKey)
1049                                 field_name = TypeToWildcardField (PropertyType.Date);
1050                         else if (part.Key == QueryPart_DateRange.TimestampKey)
1051                                 field_name = "Timestamp";
1052                         else
1053                                 field_name = PropertyToFieldName (PropertyType.Date, part.Key);
1054
1055                         // FIXME: We could optimize this and reduce the size of our range
1056                         // queries if we actually new the min and max date that appear in
1057                         // any properties in the index.  We would need to inspect the index to
1058                         // determine that at start-up, and then track it as new documents
1059                         // get added to the index.
1060                         if (part.StartDate < lower_bound)
1061                                 part.StartDate = lower_bound;
1062                         if (part.EndDate > upper_bound || part.EndDate == DateTime.MinValue)
1063                                 part.EndDate = upper_bound;
1064
1065                         // Swap the start and end dates if they come in reversed.
1066                         if (part.StartDate > part.EndDate) {
1067                                 DateTime swap;
1068                                 swap = part.StartDate;
1069                                 part.StartDate = part.EndDate;
1070                                 part.EndDate = swap;
1071                         }
1072
1073                         // Set up our hit filter to cull out the bad dates.
1074                         DateRangeHitFilter drhf;
1075                         drhf = new DateRangeHitFilter ();
1076                         drhf.Key = part.Key;
1077                         drhf.StartDate = part.StartDate;
1078                         drhf.EndDate = part.EndDate;
1079                         hit_filter = new HitFilter (drhf.HitFilter);
1080
1081                         Logger.Log.Debug ("Building new date range query");
1082                         Logger.Log.Debug ("Start: {0}", part.StartDate);
1083                         Logger.Log.Debug ("End: {0}", part.EndDate);
1084
1085                         int y1, m1, d1, y2, m2, d2;
1086                         y1 = part.StartDate.Year;
1087                         m1 = part.StartDate.Month;
1088                         d1 = part.StartDate.Day;
1089                         y2 = part.EndDate.Year;
1090                         m2 = part.EndDate.Month;
1091                         d2 = part.EndDate.Day;
1092
1093                         LNS.BooleanQuery top_level_query;
1094                         top_level_query = new LNS.BooleanQuery ();
1095
1096                         // A special case: both the start and the end of our range fall
1097                         // in the same month.
1098                         if (y1 == y2 && m1 == m2) {
1099                                 LNS.Query ym_query;
1100                                 ym_query = NewYearMonthQuery (field_name, y1, m1);
1101
1102                                 // If our range only covers a part of the month, do a range query on the days.
1103                                 if (d1 != 1 || d2 != DateTime.DaysInMonth (y2, m2)) {
1104                                         LNS.BooleanQuery sub_query;
1105                                         sub_query = new LNS.BooleanQuery ();
1106                                         sub_query.Add (ym_query, true, false);
1107                                         sub_query.Add (NewDayQuery (field_name, d1, d2), true, false);
1108                                         top_level_query.Add (sub_query, false, false);
1109                                 } else {
1110                                         top_level_query.Add (ym_query, false, false);
1111                                 }
1112
1113                         } else {
1114
1115                                 // Handle a partial month at the beginning of our range.
1116                                 if (d1 > 1) {
1117                                         LNS.BooleanQuery sub_query;
1118                                         sub_query = new LNS.BooleanQuery ();
1119                                         sub_query.Add (NewYearMonthQuery (field_name, y1, m1), true, false);
1120                                         sub_query.Add (NewDayQuery (field_name, d1, DateTime.DaysInMonth (y1, m1)), true, false);
1121                                         top_level_query.Add (sub_query, false, false);
1122
1123                                         ++m1;
1124                                         if (m1 == 13) {
1125                                                 m1 = 1;
1126                                                 ++y1;
1127                                         }
1128                                 }
1129
1130                                 // And likewise, handle a partial month at the end of our range.
1131                                 if (d2 < DateTime.DaysInMonth (y2, m2)) {
1132                                         LNS.BooleanQuery sub_query;
1133                                         sub_query = new LNS.BooleanQuery ();
1134                                         sub_query.Add (NewYearMonthQuery (field_name, y2, m2), true, false);
1135                                         sub_query.Add (NewDayQuery (field_name, 1, d2), true, false);
1136                                         top_level_query.Add (sub_query, false, false);
1137
1138                                         --m2;
1139                                         if (m2 == 0) {
1140                                                 m2 = 12;
1141                                                 --y2;
1142                                         }
1143                                 }
1144
1145                                 // Generate the query for the "middle" of our period, if it is non-empty
1146                                 if (y1 < y2 || ((y1 == y2) && m1 <= m2))
1147                                         top_level_query.Add (NewYearMonthQuery (field_name, y1, m1, y2, m2),
1148                                                              false, false);
1149                         }
1150
1151                         return top_level_query;
1152                 }
1153
1154                 // search_subset_uris is a list of Uris that this search should be
1155                 // limited to.
1156                 static protected void QueryPartToQuery (QueryPart     abstract_part,
1157                                                         bool          only_build_primary_query,
1158                                                         ArrayList     term_list,
1159                                                         out LNS.Query primary_query,
1160                                                         out LNS.Query secondary_query,
1161                                                         out HitFilter hit_filter)
1162                 {
1163                         primary_query = null;
1164                         secondary_query = null;
1165
1166                         // By default, we assume that our lucene queries will return exactly the
1167                         // matching set of objects.  We need to set the hit filter if further
1168                         // refinement of the search results is required.  (As in the case of
1169                         // date range queries, for example.)  We essentially have to do this
1170                         // to make OR queries work correctly.
1171                         hit_filter = true_hit_filter;
1172
1173                         // The exception is when dealing with a prohibited part.  Just return
1174                         // null for the hit filter in that case.  This works since
1175                         // prohibited parts are not allowed inside of OR queries.
1176                         if (abstract_part.Logic == QueryPartLogic.Prohibited)
1177                                 hit_filter = null;
1178
1179                         if (abstract_part == null)
1180                                 return;
1181
1182                         if (abstract_part is QueryPart_Text) {
1183                                 QueryPart_Text part = (QueryPart_Text) abstract_part;
1184
1185                                 if (! (part.SearchFullText || part.SearchTextProperties))
1186                                         return;
1187
1188                                 LNS.BooleanQuery p_query = new LNS.BooleanQuery ();
1189                                 LNS.BooleanQuery s_query = new LNS.BooleanQuery ();
1190
1191                                 if (part.SearchFullText) {
1192                                         LNS.Query subquery;
1193                                         subquery = StringToQuery ("Text", part.Text, term_list);
1194                                         if (subquery != null)
1195                                                 p_query.Add (subquery, false, false);
1196
1197                                         // FIXME: HotText is ignored for now!
1198                                         // subquery = StringToQuery ("HotText", part.Text);
1199                                         // if (subquery != null)
1200                                         //    p_query.Add (subquery, false, false);
1201                                 }
1202
1203                                 if (part.SearchTextProperties) {
1204                                         LNS.Query subquery;
1205                                         subquery = StringToQuery ("PropertyText", part.Text, term_list);
1206                                         if (subquery != null) {
1207                                                 p_query.Add (subquery, false, false);
1208                                                 // Properties can live in either index
1209                                                 if (! only_build_primary_query)
1210                                                         s_query.Add (subquery.Clone () as LNS.Query, false, false);
1211                                         }
1212
1213                                         Term term;
1214                                         term = new Term ("PropertyKeyword", part.Text);
1215                                         // FIXME: terms are already added in term_list. But they may have been tokenized
1216                                         // The term here is non-tokenized version. Should this be added to term_list ?
1217                                         // term_list is used to calculate scores
1218                                         if (term_list != null)
1219                                                 term_list.Add (term);
1220                                         subquery = new LNS.TermQuery (term);
1221                                         p_query.Add (subquery, false, false);
1222                                         // Properties can live in either index
1223                                         if (! only_build_primary_query)
1224                                                 s_query.Add (subquery.Clone () as LNS.Query, false, false);
1225                                 }
1226
1227                                 primary_query = p_query;
1228                                 if (! only_build_primary_query)
1229                                         secondary_query = s_query;
1230
1231                                 return;
1232                         }
1233
1234                         if (abstract_part is QueryPart_Property) {
1235                                 QueryPart_Property part = (QueryPart_Property) abstract_part;
1236
1237                                 string field_name;
1238                                 if (part.Key == QueryPart_Property.AllProperties) {
1239                                         field_name = TypeToWildcardField (part.Type);
1240                                         // FIXME: probably shouldn't just return silently
1241                                         if (field_name == null)
1242                                                 return;
1243                                 } else
1244                                         field_name = PropertyToFieldName (part.Type, part.Key);
1245
1246                                 if (part.Type == PropertyType.Text)
1247                                         primary_query = StringToQuery (field_name, part.Value, term_list);
1248                                 else {
1249                                         Term term;
1250                                         term = new Term (field_name, part.Value);
1251                                         if (term_list != null)
1252                                                 term_list.Add (term);
1253                                         primary_query = new LNS.TermQuery (term);
1254                                 }
1255
1256                                 // Properties can live in either index
1257                                 if (! only_build_primary_query && primary_query != null)
1258                                         secondary_query = primary_query.Clone () as LNS.Query;
1259
1260                                 return;
1261                         }
1262
1263                         if (abstract_part is QueryPart_DateRange) {
1264
1265                                 QueryPart_DateRange part = (QueryPart_DateRange) abstract_part;
1266
1267                                 primary_query = GetDateRangeQuery (part, out hit_filter);
1268                                 // Date properties can live in either index
1269                                 if (! only_build_primary_query && primary_query != null)
1270                                         secondary_query = primary_query.Clone () as LNS.Query;
1271
1272                                 // If this is a prohibited part, invert our hit filter.
1273                                 if (part.Logic == QueryPartLogic.Prohibited) {
1274                                         NotHitFilter nhf;
1275                                         nhf = new NotHitFilter (hit_filter);
1276                                         hit_filter = new HitFilter (nhf.HitFilter);
1277                                 }
1278
1279                                 return;
1280                         }
1281
1282                         if (abstract_part is QueryPart_Or) {
1283                                 QueryPart_Or part = (QueryPart_Or) abstract_part;
1284
1285                                 // Assemble a new BooleanQuery combining all of the sub-parts.
1286                                 LNS.BooleanQuery p_query;
1287                                 p_query = new LNS.BooleanQuery ();
1288
1289                                 LNS.BooleanQuery s_query = null;
1290                                 if (! only_build_primary_query)
1291                                         s_query = new LNS.BooleanQuery ();
1292
1293                                 primary_query = p_query;
1294                                 secondary_query = s_query;
1295
1296                                 OrHitFilter or_hit_filter = null;
1297
1298                                 foreach (QueryPart  sub_part in part.SubParts) {
1299                                         LNS.Query p_subq, s_subq;
1300                                         HitFilter sub_hit_filter; // FIXME: This is (and must be) ignored
1301                                         // FIXME: Any subpart in an OR which has a hit filter won't work
1302                                         // correctly, because we can't tell which part of an OR we matched
1303                                         // against to filter correctly.  This affects date range queries.
1304                                         QueryPartToQuery (sub_part, only_build_primary_query,
1305                                                           term_list,
1306                                                           out p_subq, out s_subq, out sub_hit_filter);
1307                                         if (p_subq != null)
1308                                                 p_query.Add (p_subq, false, false);
1309                                         if (s_subq != null)
1310                                                 s_query.Add (s_subq, false, false);
1311                                         if (sub_hit_filter != null) {
1312                                                 if (or_hit_filter == null)
1313                                                         or_hit_filter = new OrHitFilter ();
1314                                                 or_hit_filter.Add (sub_hit_filter);
1315                                         }
1316                                 }
1317
1318                                 if (or_hit_filter != null)
1319                                         hit_filter = new HitFilter (or_hit_filter.HitFilter);
1320
1321                                 return;
1322                         }
1323
1324                         throw new Exception ("Unhandled QueryPart type! " + abstract_part.ToString ());
1325                 }
1326
1327                 static protected LNS.Query UriQuery (string field_name, Uri uri)
1328                 {
1329                         return new LNS.TermQuery (new Term (field_name, UriFu.UriToSerializableString (uri)));
1330                 }
1331
1332                 static protected LNS.Query UriQuery (string field_name, ICollection uri_list)
1333                 {
1334                         return UriQuery (field_name, uri_list, null);
1335                 }
1336
1337                 static protected LNS.Query UriQuery (string field_name, ICollection uri_list, LNS.Query extra_requirement)
1338                 {
1339                         if (uri_list.Count == 0)
1340                                 return null;
1341
1342                         int max_clauses;
1343                         max_clauses = LNS.BooleanQuery.GetMaxClauseCount ();
1344
1345                         int N;
1346                         N = 1 + (uri_list.Count - 1) / max_clauses;
1347
1348                         LNS.BooleanQuery top_query;
1349                         top_query = new LNS.BooleanQuery ();
1350
1351                         int cursor = 0;
1352                         if (extra_requirement != null) {
1353                                 top_query.Add (extra_requirement, true, false);
1354                                 ++cursor;
1355                         }
1356
1357                         ArrayList bottom_queries = null;
1358
1359                         if (N > 1) {
1360                                 bottom_queries = new ArrayList ();
1361                                 for (int i = 0; i < N; ++i) {
1362                                         LNS.BooleanQuery bq;
1363                                         bq = new LNS.BooleanQuery ();
1364                                         bottom_queries.Add (bq);
1365                                         top_query.Add (bq, false, false);
1366                                 }
1367                         }
1368
1369                         foreach (Uri uri in uri_list) {
1370                                 LNS.Query subquery;
1371                                 subquery = UriQuery (field_name, uri);
1372
1373                                 LNS.BooleanQuery target;
1374                                 if (N == 1)
1375                                         target = top_query;
1376                                 else {
1377                                         target = (LNS.BooleanQuery) bottom_queries [cursor];
1378                                         ++cursor;
1379                                         if (cursor >= N)
1380                                                 cursor = 0;
1381                                 }
1382
1383                                 target.Add (subquery, false, false);
1384                         }
1385
1386                         return top_query;
1387                 }
1388
1389                 ///////////////////////////////////////////////////////////////////////////////////
1390
1391                 public int SegmentCount {
1392                         get {
1393                                 DirectoryInfo dir_info;
1394                                 int p_count = 0, s_count = 0;
1395
1396                                 dir_info = new DirectoryInfo (PrimaryIndexDirectory);
1397                                 foreach (FileInfo file_info in dir_info.GetFiles ())
1398                                         if (file_info.Extension == ".cfs")
1399                                                 ++p_count;
1400
1401                                 dir_info = new DirectoryInfo (SecondaryIndexDirectory);
1402                                 foreach (FileInfo file_info in dir_info.GetFiles ())
1403                                         if (file_info.Extension == ".cfs")
1404                                                 ++s_count;
1405
1406                                 return p_count > s_count ? p_count : s_count;
1407                         }
1408                 }
1409
1410                 ///////////////////////////////////////////////////////////////////////////////////
1411
                // Cache IndexReaders on a per-Lucene index basis, since they
                // are extremely expensive to create.  Note that this caching
                // only makes sense when the index might occasionally change
                // from underneath us, but usually does not.  That makes it a
                // good fit for LuceneQueryingDriver.cs, but not for
                // LuceneIndexingDriver.cs.
1419
1420                 private class ReaderAndVersion {
1421
1422                         public IndexReader Reader;
1423                         public long Version;
1424                         public int Refcount;
1425
1426                         public ReaderAndVersion (IndexReader reader, long version)
1427                         {
1428                                 this.Reader = reader;
1429                                 this.Version = version;
1430                                 this.Refcount = 1;
1431                         }
1432                 }
1433
                // Lucene.Net.Store.Directory -> the ReaderAndVersion currently
                // cached for that directory.
                static private Hashtable directory_rav_map = new Hashtable ();
                // IndexReader -> its ReaderAndVersion.  This table is also the
                // object that GetReader and ReleaseReader lock on while touching
                // either table.
                static private Hashtable reader_rav_map = new Hashtable ();
1436
1437                 static public LNS.IndexSearcher GetSearcher (Lucene.Net.Store.Directory directory)
1438                 {
1439                         IndexReader reader = GetReader (directory);
1440
1441                         return new LNS.IndexSearcher (reader);
1442                 }
1443
1444                 static public IndexReader GetReader (Lucene.Net.Store.Directory directory)
1445                 {
1446                         IndexReader reader;
1447                         long version;
1448
1449                         lock (reader_rav_map) {
1450                                 ReaderAndVersion rav = (ReaderAndVersion) directory_rav_map [directory];
1451
1452                                 if (rav == null) {
1453                                         version = IndexReader.GetCurrentVersion (directory);
1454                                         reader = IndexReader.Open (directory);
1455
1456                                         rav = new ReaderAndVersion (reader, version);
1457                                         rav.Refcount++;
1458
1459                                         directory_rav_map [directory] = rav;
1460                                         reader_rav_map [reader] = rav;
1461
1462                                         return reader;
1463                                 }
1464
1465                                 version = IndexReader.GetCurrentVersion (directory);
1466
1467                                 if (version != rav.Version) {
1468                                         UnrefReaderAndVersion_Unlocked (rav);
1469
1470                                         reader = IndexReader.Open (directory);
1471
1472                                         rav = new ReaderAndVersion (reader, version);
1473                                         rav.Refcount++;
1474
1475                                         directory_rav_map [directory] = rav;
1476                                         reader_rav_map [reader] = rav;
1477                                 } else
1478                                         rav.Refcount++;
1479
1480                                 return rav.Reader;
1481                         }
1482                 }
1483
1484                 static private void UnrefReaderAndVersion_Unlocked (ReaderAndVersion rav)
1485                 {
1486                         rav.Refcount--;
1487
1488                         if (rav.Refcount == 0) {
1489                                 rav.Reader.Close ();
1490                                 reader_rav_map.Remove (rav.Reader);
1491                         }
1492                 }
1493
1494                 static public void ReleaseReader (IndexReader reader)
1495                 {
1496                         lock (reader_rav_map) {
1497                                 ReaderAndVersion rav = (ReaderAndVersion) reader_rav_map [reader];
1498
1499                                 UnrefReaderAndVersion_Unlocked (rav);
1500                         }
1501                 }
1502
1503                 static public void ReleaseSearcher (LNS.IndexSearcher searcher)
1504                 {
1505                         IndexReader reader = searcher.GetIndexReader ();
1506
1507                         searcher.Close ();
1508                         ReleaseReader (reader);
1509                 }
1510
1511                 ///////////////////////////////////////////////////////////////////////////////////
1512
1513                 //
1514                 // Various ways to grab lots of hits at once.
1515                 // These should never be used for querying, only for utility
1516                 // functions.
1517                 //
1518
1519                 public int GetBlockOfHits (int cookie,
1520                                            Hit [] block_of_hits)
1521                 {
1522                         IndexReader primary_reader;
1523                         IndexReader secondary_reader;
1524                         primary_reader = GetReader (PrimaryStore);
1525                         secondary_reader = GetReader (SecondaryStore);
1526
1527                         int request_size;
1528                         request_size = block_of_hits.Length;
1529                         if (request_size > primary_reader.NumDocs ())
1530                                 request_size = primary_reader.NumDocs ();
1531
1532                         int max_doc;
1533                         max_doc = primary_reader.MaxDoc ();
1534
1535                         if (cookie < 0) {
1536                                 Random random;
1537                                 random = new Random ();
1538                                 cookie = random.Next (max_doc);
1539                         }
1540
1541                         int original_cookie;
1542                         original_cookie = cookie;
1543
1544                         Hashtable primary_docs, secondary_docs;
1545                         primary_docs = UriFu.NewHashtable ();
1546                         secondary_docs = UriFu.NewHashtable ();
1547
1548                         // Load the primary documents
1549                         for (int i = 0; i < request_size; ++i) {
1550
1551                                 if (! primary_reader.IsDeleted (cookie)) {
1552                                         Document doc;
1553                                         doc = primary_reader.Document (cookie);
1554                                         primary_docs [GetUriFromDocument (doc)] = doc;
1555                                 }
1556
1557                                 ++cookie;
1558                                 if (cookie >= max_doc) // wrap around
1559                                         cookie = 0;
1560
1561                                 // If we somehow end up back where we started,
1562                                 // give up.
1563                                 if (cookie == original_cookie)
1564                                         break;
1565                         }
1566
1567                         // If necessary, load the secondary documents
1568                         if (secondary_reader != null) {
1569                                 LNS.IndexSearcher searcher;
1570                                 searcher = new LNS.IndexSearcher (secondary_reader);
1571
1572                                 LNS.Query uri_query;
1573                                 uri_query = UriQuery ("Uri", primary_docs.Keys);
1574
1575                                 LNS.Hits hits;
1576                                 hits = searcher.Search (uri_query);
1577                                 for (int i = 0; i < hits.Length (); ++i) {
1578                                         Document doc;
1579                                         doc = hits.Doc (i);
1580                                         secondary_docs [GetUriFromDocument (doc)] = doc;
1581                                 }
1582
1583                                 searcher.Close ();
1584                         }
1585
1586                         ReleaseReader (primary_reader);
1587                         ReleaseReader (secondary_reader);
1588
1589                         // Now assemble the hits
1590                         int j = 0;
1591                         foreach (Uri uri in primary_docs.Keys) {
1592                                 Document primary_doc, secondary_doc;
1593                                 primary_doc = primary_docs [uri] as Document;
1594                                 secondary_doc = secondary_docs [uri] as Document;
1595
1596                                 Hit hit;
1597                                 hit = DocumentToHit (primary_doc);
1598                                 if (secondary_doc != null)
1599                                         AddPropertiesToHit (hit, secondary_doc, false);
1600
1601                                 block_of_hits [j] = hit;
1602                                 ++j;
1603                         }
1604
1605                         // null-pad the array, if necessary
1606                         for (; j < block_of_hits.Length; ++j)
1607                                 block_of_hits [j] = null;
1608
1609
1610                         // Return the new cookie
1611                         return cookie;
1612                 }
1613
1614                 // For a large index, this will be very slow and will consume
1615                 // a lot of memory.  Don't call it without a good reason!
1616                 // We return a hashtable indexed by Uri.
1617                 public Hashtable GetAllHitsByUri ()
1618                 {
1619                         Hashtable all_hits;
1620                         all_hits = UriFu.NewHashtable ();
1621
1622                         IndexReader primary_reader;
1623                         IndexReader secondary_reader;
1624                         primary_reader = GetReader (PrimaryStore);
1625                         secondary_reader = GetReader (SecondaryStore);
1626
1627                         // Load everything from the primary index
1628                         int max_doc;
1629                         max_doc = primary_reader.MaxDoc ();
1630                         for (int i = 0; i < max_doc; ++i) {
1631
1632                                 if (primary_reader.IsDeleted (i))
1633                                         continue;
1634
1635                                 Document doc;
1636                                 doc = primary_reader.Document (i);
1637
1638                                 Hit hit;
1639                                 hit = DocumentToHit (doc);
1640                                 all_hits [hit.Uri] = hit;
1641                         }
1642
1643                         // Now add in everything from the secondary index, if it exists
1644                         if (secondary_reader != null) {
1645                                 max_doc = secondary_reader.MaxDoc ();
1646                                 for (int i = 0; i < max_doc; ++i) {
1647
1648                                         if (secondary_reader.IsDeleted (i))
1649                                                 continue;
1650
1651                                         Document doc;
1652                                         doc = secondary_reader.Document (i);
1653
1654                                         Uri uri;
1655                                         uri = GetUriFromDocument (doc);
1656
1657                                         Hit hit;
1658                                         hit = (Hit) all_hits [uri];
1659                                         if (hit != null)
1660                                                 AddPropertiesToHit (hit, doc, false);
1661                                 }
1662                         }
1663
1664                         ReleaseReader (primary_reader);
1665                         ReleaseReader (secondary_reader);
1666
1667                         return all_hits;
1668                 }
1669         }
1670 }