beagled/LuceneCommon.cs

   1 //
   2 // LuceneCommon.cs
   3 //
   4 // Copyright (C) 2004-2005 Novell, Inc.
   5 //
   6
   7 //
   8 // Permission is hereby granted, free of charge, to any person obtaining a
   9 // copy of this software and associated documentation files (the "Software"),
  10 // to deal in the Software without restriction, including without limitation
  11 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
  12 // and/or sell copies of the Software, and to permit persons to whom the
  13 // Software is furnished to do so, subject to the following conditions:
  14 //
  15 // The above copyright notice and this permission notice shall be included in
  16 // all copies or substantial portions of the Software.
  17 //
  18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  23 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  24 // DEALINGS IN THE SOFTWARE.
  25 //
  26
  27 using System;
  28 using System.Collections;
  29 using System.Diagnostics;
  30 using System.Globalization;
  31 using System.IO;
  32 using System.Text;
  33 using System.Threading;
  34 using System.Xml;
  35 using System.Xml.Serialization;
  36
  37 using Lucene.Net.Analysis;
  38 using Lucene.Net.Analysis.Standard;
  39 using Lucene.Net.Documents;
  40 using Lucene.Net.Index;
  41 using Lucene.Net.QueryParsers;
  42 using LNS = Lucene.Net.Search;
  43
  44 using Beagle.Util;
  45
  46 namespace Beagle.Daemon {
  47
  48         public class LuceneCommon {
  49
  50                 public delegate bool HitFilter (Hit hit);
  51
  52                 // VERSION HISTORY
  53                 // ---------------
  54                 //
  55                 //  1: Original
  56                 //  2: Changed format of timestamp strings
  57                 //  3: Schema changed to be more Dashboard-Match-like
  58                 //  4: Schema changed for files to include _Directory property
  59                 //  5: Changed analyzer to support stemming.  Bumped version # to
  60                 //     force everyone to re-index.
  61                 //  6: lots of schema changes as part of the general refactoring
  62                 //  7: incremented to force a re-index after our upgrade to lucene 1.4
  63                 //     (in theory the file formats are compatible, we are seeing 'term
  64                 //     out of order' exceptions in some cases)
  65                 //  8: another forced re-index, this time because of massive changes
  66                 //     in the file system backend (it would be nice to have per-backend
  67                 //     versioning so that we didn't have to purge all indexes just
  68                 //     because one changed)
  69                 //  9: changed the way properties are stored, changed in conjunction
  70                 //     with sane handling of multiple properties on hits.
  71                 // 10: changed to support typed and mutable properties
  72                 // 11: moved mime type and hit type into properties
  73                 // 12: added year-month and year-month-day resolutions for all
  74                 //     date properties
  75                 // 13: moved source into a property
  76                 // 14: allow wildcard queries to also match keywords
  77                 private const int MAJOR_VERSION = 14;
  78                 private int minor_version = 0;
  79
  80                 private string index_name;
  81                 private string top_dir;
  82
  83                 private string fingerprint;
  84                 private int last_item_count = -1;
  85
  86                 // This is the big index, containing document full-texts and
  87                 // data that is expensive to index.
  88                 private Lucene.Net.Store.Directory primary_store = null;
  89
  90                 // This is the small index, containing document info that we
  91                 // expect to have change.  Canonical example: file names.
  92                 private Lucene.Net.Store.Directory secondary_store = null;
  93
  94                 //////////////////////////////////////////////////////////////////////////////
  95
  96                 protected LuceneCommon (string index_name, int minor_version)
  97                 {
  98                         this.index_name = index_name;
  99                         this.minor_version = minor_version;
 100
 101                         this.top_dir = (Path.IsPathRooted (index_name)) ? index_name : Path.Combine (PathFinder.IndexDir, index_name);
 102                 }
 103
 104                 //////////////////////////////////////////////////////////////////////////////
 105
 106                 protected string IndexName { get { return index_name; } }
 107
 108                 public Lucene.Net.Store.Directory PrimaryStore { get { return primary_store; } }
 109
 110                 public Lucene.Net.Store.Directory SecondaryStore { get { return secondary_store; } }
 111
 112                 public string Fingerprint { get { return fingerprint; } }
 113
 114                 public string TopDirectory { get { return top_dir; } }
 115
 116                 //////////////////////////////////////////////////////////////////////////////
 117
 118                 protected TextCache text_cache = null;
 119
 120                 public TextCache TextCache {
 121                         get { return text_cache; }
 122                         set { text_cache = value; }
 123                 }
 124
 125                 //////////////////////////////////////////////////////////////////////////////
 126
 127                 private string VersionFile {
 128                         get { return Path.Combine (top_dir, "version"); }
 129                 }
 130
 131                 private string FingerprintFile {
 132                         get { return Path.Combine (top_dir, "fingerprint"); }
 133                 }
 134
 135                 // Shouldn't really be public
 136                 public string PrimaryIndexDirectory {
 137                         get { return Path.Combine (top_dir, "PrimaryIndex"); }
 138                 }
 139
 140                 // Shouldn't really be public
 141                 public string SecondaryIndexDirectory {
 142                         get { return Path.Combine (top_dir, "SecondaryIndex"); }
 143                 }
 144
 145                 public string LockDirectory {
 146                         get { return Path.Combine (top_dir, "Locks"); }
 147                 }
 148
 149                 //////////////////////////////////////////////////////////////////////////////
 150
 151                 // Deal with dangling locks
 152
 153                 private bool IsDanglingLock (FileInfo info)
 154                 {
 155                         // It isn't even a lock file
 156                         if (! info.Name.EndsWith (".lock"))
 157                                 return false;
 158
 159                         StreamReader reader;
 160                         string pid = null;
 161
 162                         try {
 163                                 reader = new StreamReader (info.FullName);
 164                                 pid = reader.ReadLine ();
 165                                 reader.Close ();
 166
 167                         } catch {
 168                                 // We couldn't read the lockfile, so it probably went away.
 169                                 return false;
 170                         }
 171
 172                         string cmdline_file;
 173                         cmdline_file = String.Format ("/proc/{0}/cmdline", pid);
 174
 175                         string cmdline = "";
 176                         try {
 177                                 reader = new StreamReader (cmdline_file);
 178                                 cmdline = reader.ReadLine ();
 179                                 reader.Close ();
 180                         } catch {
 181                                 // If we can't open that file, either:
 182                                 // (1) The process doesn't exist
 183                                 // (2) It does exist, but it doesn't belong to us.
 184                                 //     Thus it isn't an IndexHelper
 185                                 // In either case, the lock is dangling --- if it
 186                                 // still exists.
 187                                 return info.Exists;
 188                         }
 189
 190                         // The process exists, but isn't an IndexHelper.
 191                         // If the lock file is still there, it is dangling.
 192                         // FIXME: During one run of bludgeon I got a null reference
 193                         // exception here, so I added the cmdline == null check.
 194                         // Why exactly would that happen?  Is this logic correct
 195                         // in that (odd and presumably rare) case?
 196                         if (cmdline == null || cmdline.IndexOf ("IndexHelper.exe") == -1)
 197                                 return info.Exists;
 198
 199                         // If we reach this point, we know:
 200                         // (1) The process still exists
 201                         // (2) We own it
 202                         // (3) It is an IndexHelper process
 203                         // Thus it almost certainly isn't a dangling lock.
 204                         // The process might be wedged, but that is
 205                         // another issue...
 206                         return false;
 207                 }
 208
 209
 210                 // Return true if there are dangling locks
 211                 protected bool HaveDanglingLocks ()
 212                 {
 213                         return false;
 214                 }
 215
 216                 protected bool Exists ()
 217                 {
 218                         if (! (Directory.Exists (top_dir)
 219                                && File.Exists (VersionFile)
 220                                && File.Exists (FingerprintFile)
 221                                && Directory.Exists (PrimaryIndexDirectory)
 222                                && IndexReader.IndexExists (PrimaryIndexDirectory)
 223                                && Directory.Exists (SecondaryIndexDirectory)
 224                                && IndexReader.IndexExists (SecondaryIndexDirectory)
 225                                && Directory.Exists (LockDirectory)))
 226                                 return false;
 227
 228                         // Check the index's version number.  If it is wrong,
 229                         // declare the index non-existent.
 230
 231                         StreamReader version_reader;
 232                         string version_str;
 233                         version_reader = new StreamReader (VersionFile);
 234                         version_str = version_reader.ReadLine ();
 235                         version_reader.Close ();
 236
 237                         int current_major_version, current_minor_version;
 238                         int i = version_str.IndexOf ('.');
 239
 240                         if (i != -1) {
 241                                 current_major_version = Convert.ToInt32 (version_str.Substring (0, i));
 242                                 current_minor_version = Convert.ToInt32 (version_str.Substring (i+1));
 243                         } else {
 244                                 current_minor_version = Convert.ToInt32 (version_str);
 245                                 current_major_version = 0;
 246                         }
 247
 248                         if (current_major_version != MAJOR_VERSION
 249                             || (minor_version >= 0 && current_minor_version != minor_version)) {
 250                                 Logger.Log.Debug ("Version mismatch in {0}", index_name);
 251                                 Logger.Log.Debug ("Index has version {0}.{1}, expected {2}.{3}",
 252                                                   current_major_version, current_minor_version,
 253                                                   MAJOR_VERSION, minor_version);
 254                                 return false;
 255                         }
 256
 257                         // Check the lock directory: If there is a dangling write lock,
 258                         // assume that the index is corrupted and declare it non-existent.
 259                         DirectoryInfo lock_dir_info;
 260                         lock_dir_info = new DirectoryInfo (LockDirectory);
 261                         foreach (FileInfo info in lock_dir_info.GetFiles ()) {
 262                                 if (IsDanglingLock (info)) {
 263                                         Logger.Log.Warn ("Found a dangling index lock on {0}", info.FullName);
 264                                         return false;
 265                                 }
 266                         }
 267
 268                         return true;
 269                 }
 270
 271                 private Lucene.Net.Store.Directory CreateIndex (string path)
 272                 {
 273                         // Create a directory to put the index in.
 274                         Directory.CreateDirectory (path);
 275
 276                         // Create a new store.
 277                         Lucene.Net.Store.Directory store;
 278                         store = Lucene.Net.Store.FSDirectory.GetDirectory (path, LockDirectory, true);
 279
 280                         // Create an empty index in that store.
 281                         IndexWriter writer;
 282                         writer = new IndexWriter (store, null, true);
 283                         writer.Close ();
 284
 285                         return store;
 286                 }
 287
 288                 // Create will kill your index dead.  Use it with care.
 289                 // You don't need to call Open after calling Create.
 290                 protected void Create ()
 291                 {
 292                         if (minor_version < 0)
 293                                 minor_version = 0;
 294
 295                         // Purge any existing directories.
 296                         if (Directory.Exists (top_dir)) {
 297                                 Logger.Log.Debug ("Purging {0}", top_dir);
 298                                 Directory.Delete (top_dir, true);
 299                         }
 300
 301                         // Create any necessary directories.
 302                         Directory.CreateDirectory (top_dir);
 303                         Directory.CreateDirectory (LockDirectory);
 304
 305                         // Create the indexes.
 306                         primary_store = CreateIndex (PrimaryIndexDirectory);
 307                         secondary_store = CreateIndex (SecondaryIndexDirectory);
 308
 309                         // Generate and store the index fingerprint.
 310                         fingerprint = GuidFu.ToShortString (Guid.NewGuid ());
 311                         TextWriter writer;
 312                         writer = new StreamWriter (FingerprintFile, false);
 313                         writer.WriteLine (fingerprint);
 314                         writer.Close ();
 315
 316                         // Store our index version information.
 317                         writer = new StreamWriter (VersionFile, false);
 318                         writer.WriteLine ("{0}.{1}", MAJOR_VERSION, minor_version);
 319                         writer.Close ();
 320                 }
 321
 322                 protected void Open ()
 323                 {
 324                         Open (false);
 325                 }
 326
 327                 protected void Open (bool read_only_mode)
 328                 {
 329                         // Read our index fingerprint.
 330                         TextReader reader;
 331                         reader = new StreamReader (FingerprintFile);
 332                         fingerprint = reader.ReadLine ();
 333                         reader.Close ();
 334
 335                         // Create stores for our indexes.
 336                         primary_store = Lucene.Net.Store.FSDirectory.GetDirectory (PrimaryIndexDirectory, LockDirectory, false, read_only_mode);
 337                         secondary_store = Lucene.Net.Store.FSDirectory.GetDirectory (SecondaryIndexDirectory, LockDirectory, false, read_only_mode);
 338                 }
 339
 340                 ////////////////////////////////////////////////////////////////
 341
 342                 //
 343                 // Custom Analyzers
 344                 //
 345
 346                 private class SingletonTokenStream : TokenStream {
 347
 348                         private string singleton_str;
 349
 350                         public SingletonTokenStream (string singleton_str)
 351                         {
 352                                 this.singleton_str = singleton_str;
 353                         }
 354
 355                         override public Lucene.Net.Analysis.Token Next ()
 356                         {
 357                                 if (singleton_str == null)
 358                                         return null;
 359
 360                                 Lucene.Net.Analysis.Token token;
 361                                 token = new Lucene.Net.Analysis.Token (singleton_str, 0, singleton_str.Length);
 362
 363                                 singleton_str = null;
 364
 365                                 return token;
 366                         }
 367                 }
 368
 369                 // FIXME: This assumes everything being indexed is in English!
 370                 private class BeagleAnalyzer : StandardAnalyzer {
 371
 372                         private char [] buffer = new char [2];
 373                         private bool strip_extra_property_info = false;
 374
 375                         public BeagleAnalyzer (bool strip_extra_property_info)
 376                         {
 377                                 this.strip_extra_property_info = strip_extra_property_info;
 378                         }
 379
 380                         public override TokenStream TokenStream (string fieldName, TextReader reader)
 381                         {
 382                                 bool is_text_prop = false;
 383
 384                                 // Strip off the first two characters in a property.
 385                                 // We store type information in those two characters, so we don't
 386                                 // want to index them.
 387                                 if (fieldName.StartsWith ("prop:")) {
 388
 389                                         if (strip_extra_property_info) {
 390                                                 // Skip everything up to and including the first :
 391                                                 int c;
 392                                                 do {
 393                                                         c = reader.Read ();
 394                                                 } while (c != -1 && c != ':');
 395                                         }
 396
 397                                         is_text_prop = fieldName.StartsWith ("prop:t");
 398
 399                                         // If this is non-text property, just return one token
 400                                         // containing the entire string.  We do this to avoid
 401                                         // tokenizing keywords.
 402                                         if (! is_text_prop)
 403                                                 return new SingletonTokenStream (reader.ReadToEnd ());
 404                                 }
 405
 406                                 TokenStream outstream;
 407                                 outstream = base.TokenStream (fieldName, reader);
 408
 409                                 if (fieldName == "Text"
 410                                     || fieldName == "HotText"
 411                                     || fieldName == "PropertyText"
 412                                     || is_text_prop) {
 413                                         outstream = new NoiseFilter (outstream);
 414                                         outstream = new PorterStemFilter (outstream);
 415                                 }
 416
 417                                 return outstream;
 418                         }
 419                 }
 420
 421                 static private Analyzer indexing_analyzer = new BeagleAnalyzer (true);
 422                 static private Analyzer query_analyzer = new BeagleAnalyzer (false);
 423
 424                 static protected Analyzer IndexingAnalyzer { get { return indexing_analyzer; } }
 425                 static protected Analyzer QueryAnalyzer { get { return query_analyzer; } }
 426
 427                 ////////////////////////////////////////////////////////////////
 428
 429                 //
 430                 // Dealing with properties
 431                 //
 432
 433                 static private char TypeToCode (PropertyType type)
 434                 {
 435                         switch (type) {
 436                         case PropertyType.Text:    return 't';
 437                         case PropertyType.Keyword: return 'k';
 438                         case PropertyType.Date:    return 'd';
 439                         }
 440                         throw new Exception ("Bad property type: " + type);
 441                 }
 442
 443                 static private PropertyType CodeToType (char c)
 444                 {
 445                         switch (c) {
 446                         case 't': return PropertyType.Text;
 447                         case 'k': return PropertyType.Keyword;
 448                         case 'd': return PropertyType.Date;
 449                         }
 450
 451                         throw new Exception ("Bad property code: " + c);
 452                 }
 453
 454                 static private string TypeToWildcardField (PropertyType type)
 455                 {
 456                         switch (type) {
 457                         case PropertyType.Text:    return "PropertyText";
 458                         case PropertyType.Keyword: return "PropertyKeyword";
 459                         case PropertyType.Date:    return "PropertyDate";
 460                         }
 461
 462                         return null;
 463                 }
 464
 465                 // Exposing this is a little bit suspicious.
 466                 static protected string PropertyToFieldName (PropertyType type, string key)
 467                 {
 468                         return String.Format ("prop:{0}:{1}", TypeToCode (type), key);
 469
 470                 }
 471
 472                 static private void AddDateFields (string field_name, Property prop, Document doc)
 473                 {
 474                         DateTime dt = StringFu.StringToDateTime (prop.Value);
 475
 476                         Field f;
 477                         f = new Field ("YM:" + field_name,
 478                                        StringFu.DateTimeToYearMonthString (dt),
 479                                        false,   // never store
 480                                        true,    // always index
 481                                        false);  // never tokenize
 482                         doc.Add (f);
 483
 484                         f = new Field ("D:" + field_name,
 485                                        StringFu.DateTimeToDayString (dt),
 486                                        false,   // never store
 487                                        true,    // always index
 488                                        false);  // never tokenize
 489                         doc.Add (f);
 490                 }
 491
 492                 static protected void AddPropertyToDocument (Property prop, Document doc)
 493                 {
 494                         if (prop == null || prop.Value == null)
 495                                 return;
 496
 497                         // Don't actually put properties in the UnindexedNamespace
 498                         // in the document.  A horrible (and yet lovely!) hack.
 499                         if (prop.Key.StartsWith (StringFu.UnindexedNamespace))
 500                                 return;
 501
 502                         Field f;
 503
 504                         if (prop.IsSearched) {
 505                                 string wildcard_field = TypeToWildcardField (prop.Type);
 506                                 bool tokenize = (prop.Type == PropertyType.Text);
 507                                 if (wildcard_field != null) {
 508                                         f = new Field (wildcard_field,
 509                                                        prop.Value,
 510                                                        false, // never stored
 511                                                        true,  // always indexed
 512                                                        tokenize);
 513                                         doc.Add (f);
 514
 515                                         if (prop.Type == PropertyType.Date)
 516                                                 AddDateFields (wildcard_field, prop, doc);
 517                                 }
 518                         }
 519
 520                         string coded_value;
 521                         coded_value = String.Format ("{0}:{1}",
 522                                                      prop.IsSearched ? 's' : '_',
 523                                                      prop.Value);
 524
 525                         string field_name = PropertyToFieldName (prop.Type, prop.Key);
 526
 527                         f = new Field (field_name,
 528                                        coded_value,
 529                                        prop.IsStored,
 530                                        true,        // always index
 531                                        true);       // always tokenize (just strips off type code for keywords)
 532                         doc.Add (f);
 533
 534                         if (prop.Type == PropertyType.Date)
 535                                 AddDateFields (field_name, prop, doc);
 536                 }
 537
 538                 static protected Property GetPropertyFromDocument (Field f, Document doc, bool from_primary_index)
 539                 {
 540                         // Note: we don't use the document that we pass in,
 541                         // but in theory we could.  At some later point we
 542                         // might need to split a property's data across two or
 543                         // more fields in the document.
 544
 545                         if (f == null)
 546                                 return null;
 547
 548                         string field_name;
 549                         field_name = f.Name ();
 550                         if (field_name.Length < 7
 551                             || ! field_name.StartsWith ("prop:"))
 552                                 return null;
 553
 554                         string field_value;
 555                         field_value = f.StringValue ();
 556
 557                         Property prop;
 558                         prop = new Property ();
 559                         prop.Type = CodeToType (field_name [5]);
 560                         prop.Key = field_name.Substring (7);
 561                         prop.Value = field_value.Substring (2);
 562                         prop.IsSearched = (field_value [0] == 's');
 563                         prop.IsMutable = ! from_primary_index;
 564                         prop.IsStored = f.IsStored ();
 565
 566                         return prop;
 567                 }
 568
 569                 //////////////////////////////////////////////////////////////////////////////
 570
 571                 //
 572                 // Dealing with documents
 573                 //
 574
 575                 static protected void BuildDocuments (Indexable indexable,
 576                                                       out Document primary_doc,
 577                                                       out Document secondary_doc)
 578                 {
 579                         primary_doc = new Document ();
 580                         secondary_doc = null;
 581
 582                         Field f;
 583
 584                         f = Field.Keyword ("Uri", UriFu.UriToSerializableString (indexable.Uri));
 585                         primary_doc.Add (f);
 586
 587                         if (indexable.ParentUri != null) {
 588                                 f = Field.Keyword ("ParentUri", UriFu.UriToSerializableString (indexable.ParentUri));
 589                                 primary_doc.Add (f);
 590                         }
 591
 592                         if (indexable.ValidTimestamp) {
 593                                 // Note that we also want to search in the
 594                                 // Timestamp field when we do a wildcard date
 595                                 // query, so that's why we also add a wildcard
 596                                 // field for each item here.
 597
 598                                 string wildcard_field = TypeToWildcardField (PropertyType.Date);
 599
 600                                 string str = StringFu.DateTimeToString (indexable.Timestamp);
 601                                 f = Field.Keyword ("Timestamp", str);
 602                                 primary_doc.Add (f);
 603                                 f = Field.UnStored (wildcard_field, str);
 604                                 primary_doc.Add (f);
 605
 606                                 str = StringFu.DateTimeToYearMonthString (indexable.Timestamp);
 607                                 f = Field.Keyword ("YM:Timestamp", str);
 608                                 primary_doc.Add (f);
 609                                 f = Field.UnStored ("YM:" + wildcard_field, str);
 610                                 primary_doc.Add (f);
 611
 612                                 str = StringFu.DateTimeToDayString (indexable.Timestamp);
 613                                 f = Field.Keyword ("D:Timestamp", str);
 614                                 primary_doc.Add (f);
 615                                 f = Field.UnStored ("D:" + wildcard_field, str);
 616                                 primary_doc.Add (f);
 617                         }
 618
 619                         if (indexable.NoContent) {
 620                                 // If there is no content, make a note of that
 621                                 // in a special property.
 622                                 Property prop;
 623                                 prop = Property.NewBool ("beagle:NoContent", true);
 624                                 AddPropertyToDocument (prop, primary_doc);
 625
 626                         } else {
 627
 628                                 // Since we might have content, add our text
 629                                 // readers.
 630
 631                                 TextReader reader;
 632
 633                                 reader = indexable.GetTextReader ();
 634                                 if (reader != null) {
 635                                         f = Field.Text ("Text", reader);
 636                                         primary_doc.Add (f);
 637                                 }
 638
 639                                 reader = indexable.GetHotTextReader ();
 640                                 if (reader != null) {
 641                                         f = Field.Text ("HotText", reader);
 642                                         primary_doc.Add (f);
 643                                 }
 644                         }
 645
 646                         // Store the Type and MimeType in special properties
 647
 648                         if (indexable.HitType != null) {
 649                                 Property prop;
 650                                 prop = Property.NewUnsearched ("beagle:HitType", indexable.HitType);
 651                                 AddPropertyToDocument (prop, primary_doc);
 652                         }
 653
 654                         if (indexable.MimeType != null) {
 655                                 Property prop;
 656                                 prop = Property.NewUnsearched ("beagle:MimeType", indexable.MimeType);
 657                                 AddPropertyToDocument (prop, primary_doc);
 658                         }
 659
 660                         if (indexable.Source != null) {
 661                                 Property prop;
 662                                 prop = Property.NewUnsearched ("beagle:Source", indexable.Source);
 663                                 AddPropertyToDocument (prop, primary_doc);
 664                         }
 665
 666                         // Store the other properties
 667
 668                         foreach (Property prop in indexable.Properties) {
 669                                 Document target_doc = primary_doc;
 670                                 if (prop.IsMutable) {
 671                                         if (secondary_doc == null) {
 672                                                 secondary_doc = new Document ();
 673                                                 f = Field.Keyword ("Uri", UriFu.UriToSerializableString (indexable.Uri));
 674                                                 secondary_doc.Add (f);
 675                                         }
 676                                         target_doc = secondary_doc;
 677                                 }
 678
 679                                 AddPropertyToDocument (prop, target_doc);
 680                         }
 681                 }
 682
 683                 static protected Document RewriteDocument (Document old_secondary_doc,
 684                                                            Indexable prop_only_indexable)
 685                 {
 686                         Hashtable seen_props;
 687                         seen_props = new Hashtable ();
 688
 689                         Document new_doc;
 690                         new_doc = new Document ();
 691
 692                         Field uri_f;
 693                         uri_f = Field.Keyword ("Uri", UriFu.UriToSerializableString (prop_only_indexable.Uri));
 694                         new_doc.Add (uri_f);
 695
 696                         Logger.Log.Debug ("Rewriting {0}", prop_only_indexable.DisplayUri);
 697
 698                         // Add the new properties to the new document.  To
 699                         // delete a property, set the Value to null... then it
 700                         // will be added to seen_props (so the old value will
 701                         // be ignored below), but AddPropertyToDocument will
 702                         // return w/o doing anything.
 703                         foreach (Property prop in prop_only_indexable.Properties) {
 704                                 seen_props [prop.Key] = prop;
 705                                 AddPropertyToDocument (prop, new_doc);
 706                                 Logger.Log.Debug ("New prop '{0}' = '{1}'", prop.Key, prop.Value);
 707                         }
 708
 709                         // Copy the other properties from the old document to the
 710                         // new one, skipping any properties that we got new values
 711                         // for out of the Indexable.
 712                         if (old_secondary_doc != null) {
 713                                 foreach (Field f in old_secondary_doc.Fields ()) {
 714                                         Property prop;
 715                                         prop = GetPropertyFromDocument (f, old_secondary_doc, false);
 716                                         if (prop != null && ! seen_props.Contains (prop.Key)) {
 717                                                 Logger.Log.Debug ("Old prop '{0}' = '{1}'", prop.Key, prop.Value);
 718                                                 AddPropertyToDocument (prop, new_doc);
 719                                         }
 720                                 }
 721                         }
 722
 723                         return new_doc;
 724                 }
 725
 726                 static protected Uri GetUriFromDocument (Document doc)
 727                 {
 728                         string uri;
 729                         uri = doc.Get ("Uri");
 730                         if (uri == null)
 731                                 throw new Exception ("Got document from Lucene w/o a URI!");
 732                         return UriFu.UriStringToUri (uri);
 733                 }
 734
 735                 static protected Hit DocumentToHit (Document doc)
 736                 {
 737                         Hit hit;
 738                         hit = new Hit ();
 739
 740                         hit.Uri = GetUriFromDocument (doc);
 741
 742                         string str;
 743                         str = doc.Get ("ParentUri");
 744                         if (str != null)
 745                                 hit.ParentUri = UriFu.UriStringToUri (str);
 746
 747                         hit.Timestamp = StringFu.StringToDateTime (doc.Get ("Timestamp"));
 748
 749                         AddPropertiesToHit (hit, doc, true);
 750
 751                         // Get the Type and MimeType from the properties.
 752                         hit.Type = hit.GetFirstProperty ("beagle:HitType");
 753                         hit.MimeType = hit.GetFirstProperty ("beagle:MimeType");
 754                         hit.Source = hit.GetFirstProperty ("beagle:Source");
 755
 756                         return hit;
 757                 }
 758
 759                 static protected void AddPropertiesToHit (Hit hit, Document doc, bool from_primary_index)
 760                 {
 761                         foreach (Field f in doc.Fields ()) {
 762                                 Property prop;
 763                                 prop = GetPropertyFromDocument (f, doc, from_primary_index);
 764                                 if (prop != null)
 765                                         hit.AddProperty (prop);
 766                         }
 767                 }
 768
 769
 770                 //////////////////////////////////////////////////////////////////////////////
 771
 772                 //
 773                 // Handle the index's item count
 774                 //
 775
 776                 public int GetItemCount ()
 777                 {
 778                         if (last_item_count < 0) {
 779                                 IndexReader reader;
 780                                 reader = GetReader (PrimaryStore);
 781                                 last_item_count = reader.NumDocs ();
 782                                 ReleaseReader (reader);
 783                         }
 784                         return last_item_count;
 785                 }
 786
 787                 // We should set the cached count of index items when IndexReaders
 788                 // are open and available, so calls to GetItemCount will return immediately.
 789
 790                 protected bool HaveItemCount { get { return last_item_count >= 0; } }
 791
 792                 protected void SetItemCount (IndexReader reader)
 793                 {
 794                         last_item_count = reader.NumDocs ();
 795                 }
 796
 797                 public void SetItemCount (int count)
 798                 {
 799                         last_item_count = count;
 800                 }
 801
 802                 protected void AdjustItemCount (int delta)
 803                 {
 804                         if (last_item_count >= 0)
 805                                 last_item_count += delta;
 806                 }
 807
 808                 //////////////////////////////////////////////////////////////////////////////
 809
 810                 //
 811                 // Access to the stemmer and list of stop words
 812                 //
 813
 814                 static PorterStemmer stemmer = new PorterStemmer ();
 815
 816                 static public string Stem (string str)
 817                 {
 818                         return stemmer.Stem (str);
 819                 }
 820
 821                 public static bool IsStopWord (string stemmed_word)
 822                 {
 823                         return ArrayFu.IndexOfString (StopAnalyzer.ENGLISH_STOP_WORDS, stemmed_word) != -1;
 824                 }
 825
 826                 //////////////////////////////////////////////////////////////////////////////
 827
 828                 //
 829                 // Special Hit Filtering classes
 830                 //
 831
 832                 static private bool TrueHitFilter (Hit hit)
 833                 {
 834                         return true;
 835                 }
 836
 837                 static private HitFilter true_hit_filter = new HitFilter (TrueHitFilter);
 838
 839                 public class OrHitFilter {
 840
 841                         private ArrayList all = new ArrayList ();
 842                         private bool contains_known_true = false;
 843
 844                         public void Add (HitFilter hit_filter)
 845                         {
 846                                 if (hit_filter == true_hit_filter)
 847                                         contains_known_true = true;
 848                                 all.Add (hit_filter);
 849                         }
 850
 851                         public bool HitFilter (Hit hit)
 852                         {
 853                                 if (contains_known_true)
 854                                         return true;
 855                                 foreach (HitFilter hit_filter in all)
 856                                         if (hit_filter (hit))
 857                                                 return true;
 858                                 return false;
 859                         }
 860                 }
 861
 862                 public class AndHitFilter {
 863
 864                         private ArrayList all = new ArrayList ();
 865
 866                         public void Add (HitFilter hit_filter)
 867                         {
 868                                 all.Add (hit_filter);
 869                         }
 870
 871                         public bool HitFilter (Hit hit)
 872                         {
 873                                 foreach (HitFilter hit_filter in all)
 874                                         if (! hit_filter (hit))
 875                                                 return false;
 876                                 return true;
 877                         }
 878                 }
 879
 880                 public class NotHitFilter {
 881                         HitFilter original;
 882
 883                         public NotHitFilter (HitFilter original)
 884                         {
 885                                 this.original = original;
 886                         }
 887
 888                         public bool HitFilter (Hit hit)
 889                         {
 890                                 return ! original (hit);
 891                         }
 892                 }
 893
 894                 //////////////////////////////////////////////////////////////////////////////
 895
 896                 //
 897                 // Queries
 898                 //
 899
 900                 static private LNS.Query StringToQuery (string field_name,
 901                                                         string text,
 902                                                         ArrayList term_list)
 903                 {
 904                         ArrayList tokens = new ArrayList ();
 905
 906                         // Use the analyzer to extract the query's tokens.
 907                         // This code is taken from Lucene's query parser.
 908                         TokenStream source = QueryAnalyzer.TokenStream (field_name, new StringReader (text));
 909                         while (true) {
 910                                 Lucene.Net.Analysis.Token token;
 911                                 try {
 912                                         token = source.Next ();
 913                                         if (token == null)
 914                                                 break;
 915                                 } catch (IOException) {
 916                                         break;
 917                                 }
 918                                 if (token != null)
 919                                         tokens.Add (token.TermText ());
 920                         }
 921                         try {
 922                                 source.Close ();
 923                         } catch (IOException) {
 924                                 // ignore
 925                         }
 926
 927                         if (tokens.Count == 0)
 928                                 return null;
 929
 930                         LNS.PhraseQuery query = new LNS.PhraseQuery ();
 931
 932                         foreach (string token in tokens) {
 933                                 Term term;
 934                                 term = new Term (field_name, token);
 935                                 query.Add (term);
 936                                 if (term_list != null)
 937                                         term_list.Add (term);
 938                         }
 939
 940                         return query;
 941                 }
 942
 943                 //
 944                 // Date Range Handling
 945                 //
 946
 947                 // This function will break down dates to discrete chunks of
 948                 // time to avoid expanding RangeQuerys as much as possible.
 949                 // For example, searching for
 950                 //
 951                 // YMD(5 May 2005, 16 Oct 2006)
 952                 //
 953                 // would break down into three queries:
 954                 //
 955                 // (YM(May 2005) AND D(5,31)) OR
 956                 // YM(Jun 2005, Sep 2006) OR
 957                 // (YM(Oct 2006) AND D(1,16))
 958
 959                 static private DateTime lower_bound = new DateTime (1970, 1, 1);
 960
 961                 // FIXME: we should probably boost this sometime around 2030.
 962                 // Mark your calendar.
 963                 static private DateTime upper_bound = new DateTime (2038, 12, 31);
 964
 965                 static private Term NewYearMonthTerm (string field_name, int y, int m)
 966                 {
 967                         return new Term ("YM:" + field_name, String.Format ("{0}{1:00}", y, m));
 968                 }
 969
 970                 static private LNS.Query NewYearMonthQuery (string field_name, int y, int m)
 971                 {
 972                         return new LNS.TermQuery (NewYearMonthTerm (field_name, y, m));
 973                 }
 974
 975                 static private LNS.Query NewYearMonthQuery (string field_name, int y1, int m1, int y2, int m2)
 976                 {
 977                         return new LNS.RangeQuery (NewYearMonthTerm (field_name, y1, m1),
 978                                                    NewYearMonthTerm (field_name, y2, m2),
 979                                                    true); // query is inclusive
 980                 }
 981
 982                 static private Term NewDayTerm (string field_name, int d)
 983                 {
 984                         return new Term ("D:" + field_name, String.Format ("{0:00}", d));
 985                 }
 986
 987                 static private LNS.Query NewDayQuery (string field_name, int d1, int d2)
 988                 {
 989                         return new LNS.RangeQuery (NewDayTerm (field_name, d1),
 990                                                    NewDayTerm (field_name, d2),
 991                                                    true); // query is inclusive
 992                 }
 993
 994                 private class DateRangeHitFilter {
 995                         public string Key;
 996                         public DateTime StartDate;
 997                         public DateTime EndDate;
 998
 999                         public bool HitFilter (Hit hit)
1000                         {
1001                                 // First, check the Timestamp
1002                                 if (Key == QueryPart_DateRange.AllPropertiesKey
1003                                     || Key == QueryPart_DateRange.TimestampKey) {
1004                                         DateTime dt;
1005                                         dt = hit.Timestamp;
1006                                         if (StartDate <= dt && dt <= EndDate)
1007                                                 return true;
1008                                         if (Key == QueryPart_DateRange.TimestampKey)
1009                                                 return false;
1010                                 }
1011
1012                                 if (Key == QueryPart_DateRange.AllPropertiesKey) {
1013                                         // Walk through all of the properties, and see if any
1014                                         // date properties fall inside the range.
1015                                         foreach (Property prop in hit.Properties) {
1016                                                 if (prop.Type == PropertyType.Date) {
1017                                                         DateTime dt;
1018                                                         dt = StringFu.StringToDateTime (prop.Value);
1019                                                         if (StartDate <= dt && dt <= EndDate)
1020                                                                 return true;
1021                                                 }
1022                                         }
1023                                         return false;
1024                                 } else {
1025                                         // Walk through all of the properties with the given key,
1026                                         // and see if any of them fall inside of the range.
1027                                         string[] values;
1028                                         values = hit.GetProperties (Key);
1029                                         foreach (string v in values) {
1030                                                 DateTime dt;
1031                                                 dt = StringFu.StringToDateTime (v);
1032                                                 if (StartDate <= dt && dt <= EndDate)
1033                                                         return true;
1034                                         }
1035                                         return false;
1036                                 }
1037                         }
1038                 }
1039
1040                 static private LNS.Query GetDateRangeQuery (QueryPart_DateRange part, out HitFilter hit_filter)
1041                 {
1042                         string field_name;
1043                         if (part.Key == QueryPart_DateRange.AllPropertiesKey)
1044                                 field_name = TypeToWildcardField (PropertyType.Date);
1045                         else if (part.Key == QueryPart_DateRange.TimestampKey)
1046                                 field_name = "Timestamp";
1047                         else
1048                                 field_name = PropertyToFieldName (PropertyType.Date, part.Key);
1049
1050                         // FIXME: We could optimize this and reduce the size of our range
1051                         // queries if we actually new the min and max date that appear in
1052                         // any properties in the index.  We would need to inspect the index to
1053                         // determine that at start-up, and then track it as new documents
1054                         // get added to the index.
1055                         if (part.StartDate < lower_bound)
1056                                 part.StartDate = lower_bound;
1057                         if (part.EndDate > upper_bound || part.EndDate == DateTime.MinValue)
1058                                 part.EndDate = upper_bound;
1059
1060                         // Swap the start and end dates if they come in reversed.
1061                         if (part.StartDate > part.EndDate) {
1062                                 DateTime swap;
1063                                 swap = part.StartDate;
1064                                 part.StartDate = part.EndDate;
1065                                 part.EndDate = swap;
1066                         }
1067
1068                         // Set up our hit filter to cull out the bad dates.
1069                         DateRangeHitFilter drhf;
1070                         drhf = new DateRangeHitFilter ();
1071                         drhf.Key = part.Key;
1072                         drhf.StartDate = part.StartDate;
1073                         drhf.EndDate = part.EndDate;
1074                         hit_filter = new HitFilter (drhf.HitFilter);
1075
1076                         Logger.Log.Debug ("Building new date range query");
1077                         Logger.Log.Debug ("Start: {0}", part.StartDate);
1078                         Logger.Log.Debug ("End: {0}", part.EndDate);
1079
1080                         int y1, m1, d1, y2, m2, d2;
1081                         y1 = part.StartDate.Year;
1082                         m1 = part.StartDate.Month;
1083                         d1 = part.StartDate.Day;
1084                         y2 = part.EndDate.Year;
1085                         m2 = part.EndDate.Month;
1086                         d2 = part.EndDate.Day;
1087
1088                         LNS.BooleanQuery top_level_query;
1089                         top_level_query = new LNS.BooleanQuery ();
1090
1091                         // A special case: both the start and the end of our range fall
1092                         // in the same month.
1093                         if (y1 == y2 && m1 == m2) {
1094                                 LNS.Query ym_query;
1095                                 ym_query = NewYearMonthQuery (field_name, y1, m1);
1096
1097                                 // If our range only covers a part of the month, do a range query on the days.
1098                                 if (d1 != 1 || d2 != DateTime.DaysInMonth (y2, m2)) {
1099                                         LNS.BooleanQuery sub_query;
1100                                         sub_query = new LNS.BooleanQuery ();
1101                                         sub_query.Add (ym_query, true, false);
1102                                         sub_query.Add (NewDayQuery (field_name, d1, d2), true, false);
1103                                         top_level_query.Add (sub_query, false, false);
1104                                 } else {
1105                                         top_level_query.Add (ym_query, false, false);
1106                                 }
1107
1108                         } else {
1109
1110                                 // Handle a partial month at the beginning of our range.
1111                                 if (d1 > 1) {
1112                                         LNS.BooleanQuery sub_query;
1113                                         sub_query = new LNS.BooleanQuery ();
1114                                         sub_query.Add (NewYearMonthQuery (field_name, y1, m1), true, false);
1115                                         sub_query.Add (NewDayQuery (field_name, d1, DateTime.DaysInMonth (y1, m1)), true, false);
1116                                         top_level_query.Add (sub_query, false, false);
1117
1118                                         ++m1;
1119                                         if (m1 == 13) {
1120                                                 m1 = 1;
1121                                                 ++y1;
1122                                         }
1123                                 }
1124
1125                                 // And likewise, handle a partial month at the end of our range.
1126                                 if (d2 < DateTime.DaysInMonth (y2, m2)) {
1127                                         LNS.BooleanQuery sub_query;
1128                                         sub_query = new LNS.BooleanQuery ();
1129                                         sub_query.Add (NewYearMonthQuery (field_name, y2, m2), true, false);
1130                                         sub_query.Add (NewDayQuery (field_name, 1, d2), true, false);
1131                                         top_level_query.Add (sub_query, false, false);
1132
1133                                         --m2;
1134                                         if (m2 == 0) {
1135                                                 m2 = 12;
1136                                                 --y2;
1137                                         }
1138                                 }
1139
1140                                 // Generate the query for the "middle" of our period, if it is non-empty
1141                                 if (y1 < y2 || ((y1 == y2) && m1 <= m2))
1142                                         top_level_query.Add (NewYearMonthQuery (field_name, y1, m1, y2, m2),
1143                                                              false, false);
1144                         }
1145
1146                         return top_level_query;
1147                 }
1148
1149                 // search_subset_uris is a list of Uris that this search should be
1150                 // limited to.
1151                 static protected void QueryPartToQuery (QueryPart     abstract_part,
1152                                                         bool          only_build_primary_query,
1153                                                         ArrayList     term_list,
1154                                                         out LNS.Query primary_query,
1155                                                         out LNS.Query secondary_query,
1156                                                         out HitFilter hit_filter)
1157                 {
1158                         primary_query = null;
1159                         secondary_query = null;
1160
1161                         // By default, we assume that our lucene queries will return exactly the
1162                         // matching set of objects.  We need to set the hit filter if further
1163                         // refinement of the search results is required.  (As in the case of
1164                         // date range queries, for example.)  We essentially have to do this
1165                         // to make OR queries work correctly.
1166                         hit_filter = true_hit_filter;
1167
1168                         // The exception is when dealing with a prohibited part.  Just return
1169                         // null for the hit filter in that case.  This works since
1170                         // prohibited parts are not allowed inside of OR queries.
1171                         if (abstract_part.Logic == QueryPartLogic.Prohibited)
1172                                 hit_filter = null;
1173
1174                         if (abstract_part == null)
1175                                 return;
1176
1177                         if (abstract_part is QueryPart_Text) {
1178                                 QueryPart_Text part = (QueryPart_Text) abstract_part;
1179
1180                                 if (! (part.SearchFullText || part.SearchTextProperties))
1181                                         return;
1182
1183                                 LNS.BooleanQuery p_query = new LNS.BooleanQuery ();
1184                                 LNS.BooleanQuery s_query = new LNS.BooleanQuery ();
1185
1186                                 if (part.SearchFullText) {
1187                                         LNS.Query subquery;
1188                                         subquery = StringToQuery ("Text", part.Text, term_list);
1189                                         if (subquery != null)
1190                                                 p_query.Add (subquery, false, false);
1191
1192                                         // FIXME: HotText is ignored for now!
1193                                         // subquery = StringToQuery ("HotText", part.Text);
1194                                         // if (subquery != null)
1195                                         //    p_query.Add (subquery, false, false);
1196                                 }
1197
1198                                 if (part.SearchTextProperties) {
1199                                         LNS.Query subquery;
1200                                         subquery = StringToQuery ("PropertyText", part.Text, term_list);
1201                                         if (subquery != null) {
1202                                                 p_query.Add (subquery, false, false);
1203                                                 // Properties can live in either index
1204                                                 if (! only_build_primary_query)
1205                                                         s_query.Add (subquery.Clone () as LNS.Query, false, false);
1206                                         }
1207
1208                                         Term term;
1209                                         term = new Term ("PropertyKeyword", part.Text);
1210                                         // FIXME: terms are already added in term_list. But they may have been tokenized
1211                                         // The term here is non-tokenized version. Should this be added to term_list ?
1212                                         // term_list is used to calculate scores
1213                                         if (term_list != null)
1214                                                 term_list.Add (term);
1215                                         subquery = new LNS.TermQuery (term);
1216                                         p_query.Add (subquery, false, false);
1217                                         // Properties can live in either index
1218                                         if (! only_build_primary_query)
1219                                                 s_query.Add (subquery.Clone () as LNS.Query, false, false);
1220                                 }
1221
1222                                 primary_query = p_query;
1223                                 if (! only_build_primary_query)
1224                                         secondary_query = s_query;
1225
1226                                 return;
1227                         }
1228
1229                         if (abstract_part is QueryPart_Property) {
1230                                 QueryPart_Property part = (QueryPart_Property) abstract_part;
1231
1232                                 string field_name;
1233                                 if (part.Key == QueryPart_Property.AllProperties) {
1234                                         field_name = TypeToWildcardField (part.Type);
1235                                         // FIXME: probably shouldn't just return silently
1236                                         if (field_name == null)
1237                                                 return;
1238                                 } else
1239                                         field_name = PropertyToFieldName (part.Type, part.Key);
1240
1241                                 if (part.Type == PropertyType.Text)
1242                                         primary_query = StringToQuery (field_name, part.Value, term_list);
1243                                 else {
1244                                         Term term;
1245                                         term = new Term (field_name, part.Value);
1246                                         if (term_list != null)
1247                                                 term_list.Add (term);
1248                                         primary_query = new LNS.TermQuery (term);
1249                                 }
1250
1251                                 // Properties can live in either index
1252                                 if (! only_build_primary_query && primary_query != null)
1253                                         secondary_query = primary_query.Clone () as LNS.Query;
1254
1255                                 return;
1256                         }
1257
1258                         if (abstract_part is QueryPart_DateRange) {
1259
1260                                 QueryPart_DateRange part = (QueryPart_DateRange) abstract_part;
1261
1262                                 primary_query = GetDateRangeQuery (part, out hit_filter);
1263                                 // Date properties can live in either index
1264                                 if (! only_build_primary_query && primary_query != null)
1265                                         secondary_query = primary_query.Clone () as LNS.Query;
1266
1267                                 // If this is a prohibited part, invert our hit filter.
1268                                 if (part.Logic == QueryPartLogic.Prohibited) {
1269                                         NotHitFilter nhf;
1270                                         nhf = new NotHitFilter (hit_filter);
1271                                         hit_filter = new HitFilter (nhf.HitFilter);
1272                                 }
1273
1274                                 return;
1275                         }
1276
1277                         if (abstract_part is QueryPart_Or) {
1278                                 QueryPart_Or part = (QueryPart_Or) abstract_part;
1279
1280                                 // Assemble a new BooleanQuery combining all of the sub-parts.
1281                                 LNS.BooleanQuery p_query;
1282                                 p_query = new LNS.BooleanQuery ();
1283
1284                                 LNS.BooleanQuery s_query = null;
1285                                 if (! only_build_primary_query)
1286                                         s_query = new LNS.BooleanQuery ();
1287
1288                                 primary_query = p_query;
1289                                 secondary_query = s_query;
1290
1291                                 OrHitFilter or_hit_filter = null;
1292
1293                                 foreach (QueryPart  sub_part in part.SubParts) {
1294                                         LNS.Query p_subq, s_subq;
1295                                         HitFilter sub_hit_filter; // FIXME: This is (and must be) ignored
1296                                         // FIXME: Any subpart in an OR which has a hit filter won't work
1297                                         // correctly, because we can't tell which part of an OR we matched
1298                                         // against to filter correctly.  This affects date range queries.
1299                                         QueryPartToQuery (sub_part, only_build_primary_query,
1300                                                           term_list,
1301                                                           out p_subq, out s_subq, out sub_hit_filter);
1302                                         if (p_subq != null)
1303                                                 p_query.Add (p_subq, false, false);
1304                                         if (s_subq != null)
1305                                                 s_query.Add (s_subq, false, false);
1306                                         if (sub_hit_filter != null) {
1307                                                 if (or_hit_filter == null)
1308                                                         or_hit_filter = new OrHitFilter ();
1309                                                 or_hit_filter.Add (sub_hit_filter);
1310                                         }
1311                                 }
1312
1313                                 if (or_hit_filter != null)
1314                                         hit_filter = new HitFilter (or_hit_filter.HitFilter);
1315
1316                                 return;
1317                         }
1318
1319                         throw new Exception ("Unhandled QueryPart type! " + abstract_part.ToString ());
1320                 }
1321
1322                 static protected LNS.Query UriQuery (string field_name, Uri uri)
1323                 {
1324                         return new LNS.TermQuery (new Term (field_name, UriFu.UriToSerializableString (uri)));
1325                 }
1326
1327                 static protected LNS.Query UriQuery (string field_name, ICollection uri_list)
1328                 {
1329                         return UriQuery (field_name, uri_list, null);
1330                 }
1331
1332                 static protected LNS.Query UriQuery (string field_name, ICollection uri_list, LNS.Query extra_requirement)
1333                 {
1334                         if (uri_list.Count == 0)
1335                                 return null;
1336
1337                         int max_clauses;
1338                         max_clauses = LNS.BooleanQuery.GetMaxClauseCount ();
1339
1340                         int N;
1341                         N = 1 + (uri_list.Count - 1) / max_clauses;
1342
1343                         LNS.BooleanQuery top_query;
1344                         top_query = new LNS.BooleanQuery ();
1345
1346                         int cursor = 0;
1347                         if (extra_requirement != null) {
1348                                 top_query.Add (extra_requirement, true, false);
1349                                 ++cursor;
1350                         }
1351
1352                         ArrayList bottom_queries = null;
1353
1354                         if (N > 1) {
1355                                 bottom_queries = new ArrayList ();
1356                                 for (int i = 0; i < N; ++i) {
1357                                         LNS.BooleanQuery bq;
1358                                         bq = new LNS.BooleanQuery ();
1359                                         bottom_queries.Add (bq);
1360                                         top_query.Add (bq, false, false);
1361                                 }
1362                         }
1363
1364                         foreach (Uri uri in uri_list) {
1365                                 LNS.Query subquery;
1366                                 subquery = UriQuery (field_name, uri);
1367
1368                                 LNS.BooleanQuery target;
1369                                 if (N == 1)
1370                                         target = top_query;
1371                                 else {
1372                                         target = (LNS.BooleanQuery) bottom_queries [cursor];
1373                                         ++cursor;
1374                                         if (cursor >= N)
1375                                                 cursor = 0;
1376                                 }
1377
1378                                 target.Add (subquery, false, false);
1379                         }
1380
1381                         return top_query;
1382                 }
1383
1384                 ///////////////////////////////////////////////////////////////////////////////////
1385
1386                 public int SegmentCount {
1387                         get {
1388                                 DirectoryInfo dir_info;
1389                                 int p_count = 0, s_count = 0;
1390
1391                                 dir_info = new DirectoryInfo (PrimaryIndexDirectory);
1392                                 foreach (FileInfo file_info in dir_info.GetFiles ())
1393                                         if (file_info.Extension == ".cfs")
1394                                                 ++p_count;
1395
1396                                 dir_info = new DirectoryInfo (SecondaryIndexDirectory);
1397                                 foreach (FileInfo file_info in dir_info.GetFiles ())
1398                                         if (file_info.Extension == ".cfs")
1399                                                 ++s_count;
1400
1401                                 return p_count > s_count ? p_count : s_count;
1402                         }
1403                 }
1404
1405                 ///////////////////////////////////////////////////////////////////////////////////
1406
                // Cache IndexReaders on a per-Lucene-index basis, since they
                // are extremely expensive to create.  Note that this cache
                // only makes sense in situations where the index might
                // change underneath us, but most of the time won't.
                // This means it makes sense to use it in
                // LuceneQueryingDriver.cs, but not in
                // LuceneIndexingDriver.cs.
1414
1415                 private class ReaderAndVersion {
1416
1417                         public IndexReader Reader;
1418                         public long Version;
1419                         public int Refcount;
1420
1421                         public ReaderAndVersion (IndexReader reader, long version)
1422                         {
1423                                 this.Reader = reader;
1424                                 this.Version = version;
1425                                 this.Refcount = 1;
1426                         }
1427                 }
1428
1429                 static private Hashtable directory_rav_map = new Hashtable ();
1430                 static private Hashtable reader_rav_map = new Hashtable ();
1431
1432                 static public LNS.IndexSearcher GetSearcher (Lucene.Net.Store.Directory directory)
1433                 {
1434                         IndexReader reader = GetReader (directory);
1435
1436                         return new LNS.IndexSearcher (reader);
1437                 }
1438
1439                 static public IndexReader GetReader (Lucene.Net.Store.Directory directory)
1440                 {
1441                         IndexReader reader;
1442                         long version;
1443
1444                         lock (reader_rav_map) {
1445                                 ReaderAndVersion rav = (ReaderAndVersion) directory_rav_map [directory];
1446
1447                                 if (rav == null) {
1448                                         version = IndexReader.GetCurrentVersion (directory);
1449                                         reader = IndexReader.Open (directory);
1450
1451                                         rav = new ReaderAndVersion (reader, version);
1452                                         rav.Refcount++;
1453
1454                                         directory_rav_map [directory] = rav;
1455                                         reader_rav_map [reader] = rav;
1456
1457                                         return reader;
1458                                 }
1459
1460                                 version = IndexReader.GetCurrentVersion (directory);
1461
1462                                 if (version != rav.Version) {
1463                                         UnrefReaderAndVersion_Unlocked (rav);
1464
1465                                         reader = IndexReader.Open (directory);
1466
1467                                         rav = new ReaderAndVersion (reader, version);
1468                                         rav.Refcount++;
1469
1470                                         directory_rav_map [directory] = rav;
1471                                         reader_rav_map [reader] = rav;
1472                                 } else
1473                                         rav.Refcount++;
1474
1475                                 return rav.Reader;
1476                         }
1477                 }
1478
1479                 static private void UnrefReaderAndVersion_Unlocked (ReaderAndVersion rav)
1480                 {
1481                         rav.Refcount--;
1482
1483                         if (rav.Refcount == 0) {
1484                                 rav.Reader.Close ();
1485                                 reader_rav_map.Remove (rav.Reader);
1486                         }
1487                 }
1488
1489                 static public void ReleaseReader (IndexReader reader)
1490                 {
1491                         lock (reader_rav_map) {
1492                                 ReaderAndVersion rav = (ReaderAndVersion) reader_rav_map [reader];
1493
1494                                 UnrefReaderAndVersion_Unlocked (rav);
1495                         }
1496                 }
1497
1498                 static public void ReleaseSearcher (LNS.IndexSearcher searcher)
1499                 {
1500                         IndexReader reader = searcher.GetIndexReader ();
1501
1502                         searcher.Close ();
1503                         ReleaseReader (reader);
1504                 }
1505
1506                 ///////////////////////////////////////////////////////////////////////////////////
1507
1508                 //
1509                 // Various ways to grab lots of hits at once.
1510                 // These should never be used for querying, only for utility
1511                 // functions.
1512                 //
1513
1514                 public int GetBlockOfHits (int cookie,
1515                                            Hit [] block_of_hits)
1516                 {
1517                         IndexReader primary_reader;
1518                         IndexReader secondary_reader;
1519                         primary_reader = GetReader (PrimaryStore);
1520                         secondary_reader = GetReader (SecondaryStore);
1521
1522                         int request_size;
1523                         request_size = block_of_hits.Length;
1524                         if (request_size > primary_reader.NumDocs ())
1525                                 request_size = primary_reader.NumDocs ();
1526
1527                         int max_doc;
1528                         max_doc = primary_reader.MaxDoc ();
1529
1530                         if (cookie < 0) {
1531                                 Random random;
1532                                 random = new Random ();
1533                                 cookie = random.Next (max_doc);
1534                         }
1535
1536                         int original_cookie;
1537                         original_cookie = cookie;
1538
1539                         Hashtable primary_docs, secondary_docs;
1540                         primary_docs = UriFu.NewHashtable ();
1541                         secondary_docs = UriFu.NewHashtable ();
1542
1543                         // Load the primary documents
1544                         for (int i = 0; i < request_size; ++i) {
1545
1546                                 if (! primary_reader.IsDeleted (cookie)) {
1547                                         Document doc;
1548                                         doc = primary_reader.Document (cookie);
1549                                         primary_docs [GetUriFromDocument (doc)] = doc;
1550                                 }
1551
1552                                 ++cookie;
1553                                 if (cookie >= max_doc) // wrap around
1554                                         cookie = 0;
1555
1556                                 // If we somehow end up back where we started,
1557                                 // give up.
1558                                 if (cookie == original_cookie)
1559                                         break;
1560                         }
1561
1562                         // If necessary, load the secondary documents
1563                         if (secondary_reader != null) {
1564                                 LNS.IndexSearcher searcher;
1565                                 searcher = new LNS.IndexSearcher (secondary_reader);
1566
1567                                 LNS.Query uri_query;
1568                                 uri_query = UriQuery ("Uri", primary_docs.Keys);
1569
1570                                 LNS.Hits hits;
1571                                 hits = searcher.Search (uri_query);
1572                                 for (int i = 0; i < hits.Length (); ++i) {
1573                                         Document doc;
1574                                         doc = hits.Doc (i);
1575                                         secondary_docs [GetUriFromDocument (doc)] = doc;
1576                                 }
1577
1578                                 searcher.Close ();
1579                         }
1580
1581                         ReleaseReader (primary_reader);
1582                         ReleaseReader (secondary_reader);
1583
1584                         // Now assemble the hits
1585                         int j = 0;
1586                         foreach (Uri uri in primary_docs.Keys) {
1587                                 Document primary_doc, secondary_doc;
1588                                 primary_doc = primary_docs [uri] as Document;
1589                                 secondary_doc = secondary_docs [uri] as Document;
1590
1591                                 Hit hit;
1592                                 hit = DocumentToHit (primary_doc);
1593                                 if (secondary_doc != null)
1594                                         AddPropertiesToHit (hit, secondary_doc, false);
1595
1596                                 block_of_hits [j] = hit;
1597                                 ++j;
1598                         }
1599
1600                         // null-pad the array, if necessary
1601                         for (; j < block_of_hits.Length; ++j)
1602                                 block_of_hits [j] = null;
1603
1604
1605                         // Return the new cookie
1606                         return cookie;
1607                 }
1608
1609                 // For a large index, this will be very slow and will consume
1610                 // a lot of memory.  Don't call it without a good reason!
1611                 // We return a hashtable indexed by Uri.
1612                 public Hashtable GetAllHitsByUri ()
1613                 {
1614                         Hashtable all_hits;
1615                         all_hits = UriFu.NewHashtable ();
1616
1617                         IndexReader primary_reader;
1618                         IndexReader secondary_reader;
1619                         primary_reader = GetReader (PrimaryStore);
1620                         secondary_reader = GetReader (SecondaryStore);
1621
1622                         // Load everything from the primary index
1623                         int max_doc;
1624                         max_doc = primary_reader.MaxDoc ();
1625                         for (int i = 0; i < max_doc; ++i) {
1626
1627                                 if (primary_reader.IsDeleted (i))
1628                                         continue;
1629
1630                                 Document doc;
1631                                 doc = primary_reader.Document (i);
1632
1633                                 Hit hit;
1634                                 hit = DocumentToHit (doc);
1635                                 all_hits [hit.Uri] = hit;
1636                         }
1637
1638                         // Now add in everything from the secondary index, if it exists
1639                         if (secondary_reader != null) {
1640                                 max_doc = secondary_reader.MaxDoc ();
1641                                 for (int i = 0; i < max_doc; ++i) {
1642
1643                                         if (secondary_reader.IsDeleted (i))
1644                                                 continue;
1645
1646                                         Document doc;
1647                                         doc = secondary_reader.Document (i);
1648
1649                                         Uri uri;
1650                                         uri = GetUriFromDocument (doc);
1651
1652                                         Hit hit;
1653                                         hit = (Hit) all_hits [uri];
1654                                         if (hit != null)
1655                                                 AddPropertiesToHit (hit, doc, false);
1656                                 }
1657                         }
1658
1659                         ReleaseReader (primary_reader);
1660                         ReleaseReader (secondary_reader);
1661
1662                         return all_hits;
1663                 }
1664         }
1665 }