beagled/LuceneCommon.cs

   1 //
   2 // LuceneCommon.cs
   3 //
   4 // Copyright (C) 2004-2005 Novell, Inc.
   5 //
   6
   7 //
   8 // Permission is hereby granted, free of charge, to any person obtaining a
   9 // copy of this software and associated documentation files (the "Software"),
  10 // to deal in the Software without restriction, including without limitation
  11 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
  12 // and/or sell copies of the Software, and to permit persons to whom the
  13 // Software is furnished to do so, subject to the following conditions:
  14 //
  15 // The above copyright notice and this permission notice shall be included in
  16 // all copies or substantial portions of the Software.
  17 //
  18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  23 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  24 // DEALINGS IN THE SOFTWARE.
  25 //
  26
  27 using System;
  28 using System.Collections;
  29 using System.Diagnostics;
  30 using System.Globalization;
  31 using System.IO;
  32 using System.Text;
  33 using System.Threading;
  34 using System.Xml;
  35 using System.Xml.Serialization;
  36
  37 using Lucene.Net.Analysis;
  38 using Lucene.Net.Analysis.Standard;
  39 using Lucene.Net.Documents;
  40 using Lucene.Net.Index;
  41 using Lucene.Net.QueryParsers;
  42 using LNS = Lucene.Net.Search;
  43
  44 using Beagle.Util;
  45
  46 namespace Beagle.Daemon {
  47
  48         public class LuceneCommon {
  49
                // Predicate over a single Hit.  The meaning of the boolean result is
                // defined by the code that invokes the filter -- presumably true
                // means "keep this hit"; confirm against the callers.
                public delegate bool HitFilter (Hit hit);
  51
  52                 // VERSION HISTORY
  53                 // ---------------
  54                 //
  55                 //  1: Original
  56                 //  2: Changed format of timestamp strings
  57                 //  3: Schema changed to be more Dashboard-Match-like
  58                 //  4: Schema changed for files to include _Directory property
  59                 //  5: Changed analyzer to support stemming.  Bumped version # to
  60                 //     force everyone to re-index.
  61                 //  6: lots of schema changes as part of the general refactoring
  62                 //  7: incremented to force a re-index after our upgrade to lucene 1.4
  63                 //     (in theory the file formats are compatible, we are seeing 'term
  64                 //     out of order' exceptions in some cases)
  65                 //  8: another forced re-index, this time because of massive changes
  66                 //     in the file system backend (it would be nice to have per-backend
  67                 //     versioning so that we didn't have to purge all indexes just
  68                 //     because one changed)
  69                 //  9: changed the way properties are stored, changed in conjunction
  70                 //     with sane handling of multiple properties on hits.
  71                 // 10: changed to support typed and mutable properties
  72                 // 11: moved mime type and hit type into properties
  73                 // 12: added year-month and year-month-day resolutions for all
  74                 //     date properties
  75                 // 13: moved source into a property
  76                 // 14: allow wildcard queries to also match keywords
  77                 // 15: analyze PropertyKeyword field, and store all properties as
  78                 //     lower case so that we're truly case insensitive.
  79                 // 16: add inverted timestamp to make querying substantially faster
  80                 private const int MAJOR_VERSION = 16;
  81                 private int minor_version = 0;
  82
  83                 private string index_name;
  84                 private string top_dir;
  85
  86                 private string fingerprint;
  87                 private int last_item_count = -1;
  88
  89                 // This is the big index, containing document full-texts and
  90                 // data that is expensive to index.
  91                 private Lucene.Net.Store.Directory primary_store = null;
  92
  93                 // This is the small index, containing document info that we
  94                 // expect to have change.  Canonical example: file names.
  95                 private Lucene.Net.Store.Directory secondary_store = null;
  96
  97                 //////////////////////////////////////////////////////////////////////////////
  98
  99                 protected LuceneCommon (string index_name, int minor_version)
 100                 {
 101                         this.index_name = index_name;
 102                         this.minor_version = minor_version;
 103
 104                         this.top_dir = (Path.IsPathRooted (index_name)) ? index_name : Path.Combine (PathFinder.IndexDir, index_name);
 105                 }
 106
 107                 //////////////////////////////////////////////////////////////////////////////
 108
 109                 protected string IndexName { get { return index_name; } }
 110
 111                 public Lucene.Net.Store.Directory PrimaryStore { get { return primary_store; } }
 112
 113                 public Lucene.Net.Store.Directory SecondaryStore { get { return secondary_store; } }
 114
 115                 public string Fingerprint { get { return fingerprint; } }
 116
 117                 public string TopDirectory { get { return top_dir; } }
 118
 119                 //////////////////////////////////////////////////////////////////////////////
 120
 121                 protected TextCache text_cache = null;
 122
 123                 public TextCache TextCache {
 124                         get { return text_cache; }
 125                         set { text_cache = value; }
 126                 }
 127
 128                 //////////////////////////////////////////////////////////////////////////////
 129
 130                 private string VersionFile {
 131                         get { return Path.Combine (top_dir, "version"); }
 132                 }
 133
 134                 private string FingerprintFile {
 135                         get { return Path.Combine (top_dir, "fingerprint"); }
 136                 }
 137
 138                 // Shouldn't really be public
 139                 public string PrimaryIndexDirectory {
 140                         get { return Path.Combine (top_dir, "PrimaryIndex"); }
 141                 }
 142
 143                 // Shouldn't really be public
 144                 public string SecondaryIndexDirectory {
 145                         get { return Path.Combine (top_dir, "SecondaryIndex"); }
 146                 }
 147
 148                 public string LockDirectory {
 149                         get { return Path.Combine (top_dir, "Locks"); }
 150                 }
 151
 152                 //////////////////////////////////////////////////////////////////////////////
 153
 154                 // Deal with dangling locks
 155
 156                 private bool IsDanglingLock (FileInfo info)
 157                 {
 158                         Log.Debug ("Checking for dangling locks...");
 159
 160                         // It isn't even a lock file
 161                         if (! info.Name.EndsWith (".lock"))
 162                                 return false;
 163
 164                         StreamReader reader;
 165                         string pid = null;
 166
 167                         try {
 168                                 reader = new StreamReader (info.FullName);
 169                                 pid = reader.ReadLine ();
 170                                 reader.Close ();
 171
 172                         } catch {
 173                                 // We couldn't read the lockfile, so it probably went away.
 174                                 return false;
 175                         }
 176
 177
 178                         if (pid == null) {
 179                                 // Looks like the lock file was empty, which really
 180                                 // shouldn't happen.  It should contain the PID of
 181                                 // the process which locked it.  Lets be on the safe
 182                                 // side and assume it's a dangling lock.
 183                                 Log.Warn ("Found an empty lock file, that shouldn't happen: {0}", info.FullName);
 184                                 return true;
 185                         }
 186
 187                         string cmdline_file;
 188                         cmdline_file = String.Format ("/proc/{0}/cmdline", pid);
 189
 190                         string cmdline = "";
 191                         try {
 192                                 reader = new StreamReader (cmdline_file);
 193                                 cmdline = reader.ReadLine ();
 194                                 reader.Close ();
 195                         } catch {
 196                                 // If we can't open that file, either:
 197                                 // (1) The process doesn't exist
 198                                 // (2) It does exist, but it doesn't belong to us.
 199                                 //     Thus it isn't an IndexHelper
 200                                 // In either case, the lock is dangling --- if it
 201                                 // still exists.
 202                                 return info.Exists;
 203                         }
 204
 205                         // The process exists, but isn't an IndexHelper.
 206                         // If the lock file is still there, it is dangling.
 207                         // FIXME: During one run of bludgeon I got a null reference
 208                         // exception here, so I added the cmdline == null check.
 209                         // Why exactly would that happen?  Is this logic correct
 210                         // in that (odd and presumably rare) case?
 211                         if (cmdline == null || cmdline.IndexOf ("IndexHelper.exe") == -1)
 212                                 return info.Exists;
 213
 214                         // If we reach this point, we know:
 215                         // (1) The process still exists
 216                         // (2) We own it
 217                         // (3) It is an IndexHelper process
 218                         // Thus it almost certainly isn't a dangling lock.
 219                         // The process might be wedged, but that is
 220                         // another issue...
 221                         return false;
 222                 }
 223
 224                 protected bool Exists ()
 225                 {
 226                         if (! (Directory.Exists (top_dir)
 227                                && File.Exists (VersionFile)
 228                                && File.Exists (FingerprintFile)
 229                                && Directory.Exists (PrimaryIndexDirectory)
 230                                && IndexReader.IndexExists (PrimaryIndexDirectory)
 231                                && Directory.Exists (SecondaryIndexDirectory)
 232                                && IndexReader.IndexExists (SecondaryIndexDirectory)
 233                                && Directory.Exists (LockDirectory)))
 234                                 return false;
 235
 236                         // Check the index's version number.  If it is wrong,
 237                         // declare the index non-existent.
 238
 239                         StreamReader version_reader;
 240                         string version_str;
 241                         version_reader = new StreamReader (VersionFile);
 242                         version_str = version_reader.ReadLine ();
 243                         version_reader.Close ();
 244
 245                         int current_major_version, current_minor_version;
 246                         int i = version_str.IndexOf ('.');
 247
 248                         if (i != -1) {
 249                                 current_major_version = Convert.ToInt32 (version_str.Substring (0, i));
 250                                 current_minor_version = Convert.ToInt32 (version_str.Substring (i+1));
 251                         } else {
 252                                 current_minor_version = Convert.ToInt32 (version_str);
 253                                 current_major_version = 0;
 254                         }
 255
 256                         if (current_major_version != MAJOR_VERSION
 257                             || (minor_version >= 0 && current_minor_version != minor_version)) {
 258                                 Logger.Log.Debug ("Version mismatch in {0}", index_name);
 259                                 Logger.Log.Debug ("Index has version {0}.{1}, expected {2}.{3}",
 260                                                   current_major_version, current_minor_version,
 261                                                   MAJOR_VERSION, minor_version);
 262                                 return false;
 263                         }
 264
 265                         // Check the lock directory: If there is a dangling write lock,
 266                         // assume that the index is corrupted and declare it non-existent.
 267                         DirectoryInfo lock_dir_info;
 268                         lock_dir_info = new DirectoryInfo (LockDirectory);
 269                         foreach (FileInfo info in lock_dir_info.GetFiles ()) {
 270                                 if (IsDanglingLock (info)) {
 271                                         Logger.Log.Warn ("Found a dangling index lock on {0}", info.FullName);
 272                                         return false;
 273                                 }
 274                         }
 275
 276                         return true;
 277                 }
 278
 279                 private Lucene.Net.Store.Directory CreateIndex (string path)
 280                 {
 281                         // Create a directory to put the index in.
 282                         Directory.CreateDirectory (path);
 283
 284                         // Create a new store.
 285                         Lucene.Net.Store.Directory store;
 286                         store = Lucene.Net.Store.FSDirectory.GetDirectory (path, LockDirectory, true);
 287
 288                         // Create an empty index in that store.
 289                         IndexWriter writer;
 290                         writer = new IndexWriter (store, null, true);
 291                         writer.Close ();
 292
 293                         return store;
 294                 }
 295
 296                 // Create will kill your index dead.  Use it with care.
 297                 // You don't need to call Open after calling Create.
 298                 protected void Create ()
 299                 {
 300                         if (minor_version < 0)
 301                                 minor_version = 0;
 302
 303                         // Purge any existing directories.
 304                         if (Directory.Exists (top_dir)) {
 305                                 Logger.Log.Debug ("Purging {0}", top_dir);
 306                                 Directory.Delete (top_dir, true);
 307                         }
 308
 309                         // Create any necessary directories.
 310                         Directory.CreateDirectory (top_dir);
 311                         Directory.CreateDirectory (LockDirectory);
 312
 313                         // Create the indexes.
 314                         primary_store = CreateIndex (PrimaryIndexDirectory);
 315                         secondary_store = CreateIndex (SecondaryIndexDirectory);
 316
 317                         // Generate and store the index fingerprint.
 318                         fingerprint = GuidFu.ToShortString (Guid.NewGuid ());
 319                         TextWriter writer;
 320                         writer = new StreamWriter (FingerprintFile, false);
 321                         writer.WriteLine (fingerprint);
 322                         writer.Close ();
 323
 324                         // Store our index version information.
 325                         writer = new StreamWriter (VersionFile, false);
 326                         writer.WriteLine ("{0}.{1}", MAJOR_VERSION, minor_version);
 327                         writer.Close ();
 328                 }
 329
 330                 protected void Open ()
 331                 {
 332                         Open (false);
 333                 }
 334
 335                 protected void Open (bool read_only_mode)
 336                 {
 337                         // Read our index fingerprint.
 338                         TextReader reader;
 339                         reader = new StreamReader (FingerprintFile);
 340                         fingerprint = reader.ReadLine ();
 341                         reader.Close ();
 342
 343                         // Create stores for our indexes.
 344                         primary_store = Lucene.Net.Store.FSDirectory.GetDirectory (PrimaryIndexDirectory, LockDirectory, false, read_only_mode);
 345                         secondary_store = Lucene.Net.Store.FSDirectory.GetDirectory (SecondaryIndexDirectory, LockDirectory, false, read_only_mode);
 346                 }
 347
 348                 ////////////////////////////////////////////////////////////////
 349
 350                 //
 351                 // Custom Analyzers
 352                 //
 353
 354                 private class SingletonTokenStream : TokenStream {
 355
 356                         private string singleton_str;
 357
 358                         public SingletonTokenStream (string singleton_str)
 359                         {
 360                                 this.singleton_str = singleton_str;
 361                         }
 362
 363                         override public Lucene.Net.Analysis.Token Next ()
 364                         {
 365                                 if (singleton_str == null)
 366                                         return null;
 367
 368                                 Lucene.Net.Analysis.Token token;
 369                                 token = new Lucene.Net.Analysis.Token (singleton_str, 0, singleton_str.Length);
 370
 371                                 singleton_str = null;
 372
 373                                 return token;
 374                         }
 375                 }
 376
 377                 // FIXME: This assumes everything being indexed is in English!
 378                 private class BeagleAnalyzer : StandardAnalyzer {
 379
 380                         private char [] buffer = new char [2];
 381                         private bool strip_extra_property_info = false;
 382                         private bool tokenize_email_hostname = false;
 383
 384                         public BeagleAnalyzer (bool is_indexing_analyzer)
 385                         {
 386                                 if (is_indexing_analyzer) {
 387                                         this.strip_extra_property_info = true;
 388                                         this.tokenize_email_hostname = true;
 389                                 } else {
 390                                         this.strip_extra_property_info = false;
 391                                         this.tokenize_email_hostname = false;
 392                                 }
 393                         }
 394
 395                         public override TokenStream TokenStream (string fieldName, TextReader reader)
 396                         {
 397                                 bool is_text_prop = false;
 398
 399                                 // Strip off the first two characters in a property.
 400                                 // We store type information in those two characters, so we don't
 401                                 // want to index them.
 402                                 if (fieldName.StartsWith ("prop:")) {
 403
 404                                         if (strip_extra_property_info) {
 405                                                 // Skip everything up to and including the first :
 406                                                 int c;
 407                                                 do {
 408                                                         c = reader.Read ();
 409                                                 } while (c != -1 && c != ':');
 410                                         }
 411
 412                                         is_text_prop = fieldName.StartsWith ("prop:t");
 413
 414                                         // If this is non-text property, just return one token
 415                                         // containing the entire string.  We do this to avoid
 416                                         // tokenizing keywords.
 417                                         if (! is_text_prop) {
 418                                                 // We don't want to lower case the token if it's
 419                                                 // not in the private namespace.
 420
 421                                                 TokenStream singleton_stream = new SingletonTokenStream (reader.ReadToEnd ());
 422
 423                                                 if (fieldName.StartsWith ("prop:k:" + LuceneQueryingDriver.PrivateNamespace))
 424                                                         return singleton_stream;
 425                                                 else
 426                                                         return new LowerCaseFilter (singleton_stream);
 427                                         }
 428                                 } else if (fieldName == "PropertyKeyword")
 429                                         return new LowerCaseFilter (new SingletonTokenStream (reader.ReadToEnd ()));
 430
 431                                 TokenStream outstream;
 432                                 outstream = base.TokenStream (fieldName, reader);
 433
 434                                 if (fieldName == "Text"
 435                                     || fieldName == "HotText"
 436                                     || fieldName == "PropertyText"
 437                                     || is_text_prop) {
 438                                         outstream = new NoiseEmailHostFilter (outstream, tokenize_email_hostname);
 439                                         outstream = new PorterStemFilter (outstream);
 440                                 }
 441
 442                                 return outstream;
 443                         }
 444                 }
 445
 446                 static private Analyzer indexing_analyzer = new BeagleAnalyzer (true);
 447                 static private Analyzer query_analyzer = new BeagleAnalyzer (false);
 448
 449                 static protected Analyzer IndexingAnalyzer { get { return indexing_analyzer; } }
 450                 static protected Analyzer QueryAnalyzer { get { return query_analyzer; } }
 451
 452                 ////////////////////////////////////////////////////////////////
 453
 454                 //
 455                 // Dealing with properties
 456                 //
 457
 458                 static private char TypeToCode (PropertyType type)
 459                 {
 460                         switch (type) {
 461                         case PropertyType.Text:    return 't';
 462                         case PropertyType.Keyword: return 'k';
 463                         case PropertyType.Date:    return 'd';
 464                         }
 465                         throw new Exception ("Bad property type: " + type);
 466                 }
 467
 468                 static private PropertyType CodeToType (char c)
 469                 {
 470                         switch (c) {
 471                         case 't': return PropertyType.Text;
 472                         case 'k': return PropertyType.Keyword;
 473                         case 'd': return PropertyType.Date;
 474                         }
 475
 476                         throw new Exception ("Bad property code: " + c);
 477                 }
 478
 479                 static private string TypeToWildcardField (PropertyType type)
 480                 {
 481                         switch (type) {
 482                         case PropertyType.Text:    return "PropertyText";
 483                         case PropertyType.Keyword: return "PropertyKeyword";
 484                         case PropertyType.Date:    return "PropertyDate";
 485                         }
 486
 487                         return null;
 488                 }
 489
 490                 // Exposing this is a little bit suspicious.
 491                 static protected string PropertyToFieldName (PropertyType type, string key)
 492                 {
 493                         return String.Format ("prop:{0}:{1}", TypeToCode (type), key);
 494
 495                 }
 496
 497                 static private void AddDateFields (string field_name, Property prop, Document doc)
 498                 {
 499                         DateTime dt = StringFu.StringToDateTime (prop.Value);
 500
 501                         Field f;
 502                         f = new Field ("YM:" + field_name,
 503                                        StringFu.DateTimeToYearMonthString (dt),
 504                                        false,   // never store
 505                                        true,    // always index
 506                                        false);  // never tokenize
 507                         doc.Add (f);
 508
 509                         f = new Field ("D:" + field_name,
 510                                        StringFu.DateTimeToDayString (dt),
 511                                        false,   // never store
 512                                        true,    // always index
 513                                        false);  // never tokenize
 514                         doc.Add (f);
 515                 }
 516
 517                 static protected void AddPropertyToDocument (Property prop, Document doc)
 518                 {
 519                         if (prop == null || prop.Value == null)
 520                                 return;
 521
 522                         // Don't actually put properties in the UnindexedNamespace
 523                         // in the document.  A horrible (and yet lovely!) hack.
 524                         if (prop.Key.StartsWith (StringFu.UnindexedNamespace))
 525                                 return;
 526
 527                         Field f;
 528
 529                         if (prop.IsSearched) {
 530                                 string wildcard_field = TypeToWildcardField (prop.Type);
 531                                 if (wildcard_field != null) {
 532                                         f = new Field (wildcard_field,
 533                                                        prop.Value,
 534                                                        false, // never stored
 535                                                        true,  // always indexed
 536                                                        true); // always tokenize (just lowercases for keywords; full analysis for text)
 537                                         doc.Add (f);
 538
 539                                         if (prop.Type == PropertyType.Date)
 540                                                 AddDateFields (wildcard_field, prop, doc);
 541                                 }
 542                         }
 543
 544                         string coded_value;
 545                         coded_value = String.Format ("{0}:{1}",
 546                                                      prop.IsSearched ? 's' : '_',
 547                                                      prop.Value);
 548
 549                         string field_name = PropertyToFieldName (prop.Type, prop.Key);
 550
 551                         f = new Field (field_name,
 552                                        coded_value,
 553                                        prop.IsStored,
 554                                        true,        // always index
 555                                        true);       // always tokenize (strips off type code for keywords and lowercases)
 556                         doc.Add (f);
 557
 558                         if (prop.Type == PropertyType.Date)
 559                                 AddDateFields (field_name, prop, doc);
 560                 }
 561
 562                 static protected Property GetPropertyFromDocument (Field f, Document doc, bool from_primary_index)
 563                 {
 564                         // Note: we don't use the document that we pass in,
 565                         // but in theory we could.  At some later point we
 566                         // might need to split a property's data across two or
 567                         // more fields in the document.
 568
 569                         if (f == null)
 570                                 return null;
 571
 572                         string field_name;
 573                         field_name = f.Name ();
 574                         if (field_name.Length < 7
 575                             || ! field_name.StartsWith ("prop:"))
 576                                 return null;
 577
 578                         string field_value;
 579                         field_value = f.StringValue ();
 580
 581                         Property prop;
 582                         prop = new Property ();
 583                         prop.Type = CodeToType (field_name [5]);
 584                         prop.Key = field_name.Substring (7);
 585                         prop.Value = field_value.Substring (2);
 586                         prop.IsSearched = (field_value [0] == 's');
 587                         prop.IsMutable = ! from_primary_index;
 588                         prop.IsStored = f.IsStored ();
 589
 590                         return prop;
 591                 }
 592
 593                 //////////////////////////////////////////////////////////////////////////////
 594
 595                 //
 596                 // Dealing with documents
 597                 //
 598
 599                 static protected void BuildDocuments (Indexable indexable,
 600                                                       out Document primary_doc,
 601                                                       out Document secondary_doc)
 602                 {
 603                         primary_doc = new Document ();
 604                         secondary_doc = null;
 605
 606                         Field f;
 607
 608                         f = Field.Keyword ("Uri", UriFu.UriToEscapedString (indexable.Uri));
 609                         primary_doc.Add (f);
 610
 611                         if (indexable.ParentUri != null) {
 612                                 f = Field.Keyword ("ParentUri", UriFu.UriToEscapedString (indexable.ParentUri));
 613                                 primary_doc.Add (f);
 614                         }
 615
 616                         if (indexable.ValidTimestamp) {
 617                                 // Note that we also want to search in the
 618                                 // Timestamp field when we do a wildcard date
 619                                 // query, so that's why we also add a wildcard
 620                                 // field for each item here.
 621
 622                                 string wildcard_field = TypeToWildcardField (PropertyType.Date);
 623
 624                                 string str = StringFu.DateTimeToString (indexable.Timestamp);
 625                                 f = Field.Keyword ("Timestamp", str);
 626                                 primary_doc.Add (f);
 627                                 f = Field.UnStored (wildcard_field, str);
 628                                 primary_doc.Add (f);
 629
 630                                 // Create an inverted timestamp so that we can
 631                                 // sort by timestamp at search-time.
 632                                 long timeval = Convert.ToInt64 (str);
 633                                 f = Field.UnStored ("InvertedTimestamp", (Int64.MaxValue - timeval).ToString ());
 634                                 primary_doc.Add (f);
 635
 636                                 str = StringFu.DateTimeToYearMonthString (indexable.Timestamp);
 637                                 f = Field.Keyword ("YM:Timestamp", str);
 638                                 primary_doc.Add (f);
 639                                 f = Field.UnStored ("YM:" + wildcard_field, str);
 640                                 primary_doc.Add (f);
 641
 642                                 str = StringFu.DateTimeToDayString (indexable.Timestamp);
 643                                 f = Field.Keyword ("D:Timestamp", str);
 644                                 primary_doc.Add (f);
 645                                 f = Field.UnStored ("D:" + wildcard_field, str);
 646                                 primary_doc.Add (f);
 647                         }
 648
 649                         if (indexable.NoContent) {
 650                                 // If there is no content, make a note of that
 651                                 // in a special property.
 652                                 Property prop;
 653                                 prop = Property.NewBool ("beagle:NoContent", true);
 654                                 AddPropertyToDocument (prop, primary_doc);
 655
 656                         } else {
 657
 658                                 // Since we might have content, add our text
 659                                 // readers.
 660
 661                                 TextReader reader;
 662
 663                                 reader = indexable.GetTextReader ();
 664                                 if (reader != null) {
 665                                         f = Field.Text ("Text", reader);
 666                                         primary_doc.Add (f);
 667                                 }
 668
 669                                 reader = indexable.GetHotTextReader ();
 670                                 if (reader != null) {
 671                                         f = Field.Text ("HotText", reader);
 672                                         primary_doc.Add (f);
 673                                 }
 674                         }
 675
 676                         // Store the Type and MimeType in special properties
 677
 678                         if (indexable.HitType != null) {
 679                                 Property prop;
 680                                 prop = Property.NewUnsearched ("beagle:HitType", indexable.HitType);
 681                                 AddPropertyToDocument (prop, primary_doc);
 682                         }
 683
 684                         if (indexable.MimeType != null) {
 685                                 Property prop;
 686                                 prop = Property.NewUnsearched ("beagle:MimeType", indexable.MimeType);
 687                                 AddPropertyToDocument (prop, primary_doc);
 688                         }
 689
 690                         if (indexable.Source != null) {
 691                                 Property prop;
 692                                 prop = Property.NewUnsearched ("beagle:Source", indexable.Source);
 693                                 AddPropertyToDocument (prop, primary_doc);
 694                         }
 695
 696                         // Store the other properties
 697
 698                         foreach (Property prop in indexable.Properties) {
 699                                 Document target_doc = primary_doc;
 700                                 if (prop.IsMutable) {
 701                                         if (secondary_doc == null) {
 702                                                 secondary_doc = new Document ();
 703                                                 f = Field.Keyword ("Uri", UriFu.UriToEscapedString (indexable.Uri));
 704                                                 secondary_doc.Add (f);
 705                                         }
 706                                         target_doc = secondary_doc;
 707                                 }
 708
 709                                 AddPropertyToDocument (prop, target_doc);
 710                         }
 711                 }
 712
 713                 static protected Document RewriteDocument (Document old_secondary_doc,
 714                                                            Indexable prop_only_indexable)
 715                 {
 716                         Hashtable seen_props;
 717                         seen_props = new Hashtable ();
 718
 719                         Document new_doc;
 720                         new_doc = new Document ();
 721
 722                         Field uri_f;
 723                         uri_f = Field.Keyword ("Uri", UriFu.UriToEscapedString (prop_only_indexable.Uri));
 724                         new_doc.Add (uri_f);
 725
 726                         Logger.Log.Debug ("Rewriting {0}", prop_only_indexable.DisplayUri);
 727
 728                         // Add the new properties to the new document.  To
 729                         // delete a property, set the Value to null... then it
 730                         // will be added to seen_props (so the old value will
 731                         // be ignored below), but AddPropertyToDocument will
 732                         // return w/o doing anything.
 733                         foreach (Property prop in prop_only_indexable.Properties) {
 734                                 seen_props [prop.Key] = prop;
 735                                 AddPropertyToDocument (prop, new_doc);
 736                                 Logger.Log.Debug ("New prop '{0}' = '{1}'", prop.Key, prop.Value);
 737                         }
 738
 739                         // Copy the other properties from the old document to the
 740                         // new one, skipping any properties that we got new values
 741                         // for out of the Indexable.
 742                         if (old_secondary_doc != null) {
 743                                 foreach (Field f in old_secondary_doc.Fields ()) {
 744                                         Property prop;
 745                                         prop = GetPropertyFromDocument (f, old_secondary_doc, false);
 746                                         if (prop != null && ! seen_props.Contains (prop.Key)) {
 747                                                 Logger.Log.Debug ("Old prop '{0}' = '{1}'", prop.Key, prop.Value);
 748                                                 AddPropertyToDocument (prop, new_doc);
 749                                         }
 750                                 }
 751                         }
 752
 753                         return new_doc;
 754                 }
 755
 756                 static protected Uri GetUriFromDocument (Document doc)
 757                 {
 758                         string uri;
 759                         uri = doc.Get ("Uri");
 760                         if (uri == null)
 761                                 throw new Exception ("Got document from Lucene w/o a URI!");
 762                         return UriFu.EscapedStringToUri (uri);
 763                 }
 764
 765                 static protected Hit DocumentToHit (Document doc)
 766                 {
 767                         Hit hit;
 768                         hit = new Hit ();
 769
 770                         hit.Uri = GetUriFromDocument (doc);
 771
 772                         string str;
 773                         str = doc.Get ("ParentUri");
 774                         if (str != null)
 775                                 hit.ParentUri = UriFu.EscapedStringToUri (str);
 776
 777                         hit.Timestamp = StringFu.StringToDateTime (doc.Get ("Timestamp"));
 778
 779                         AddPropertiesToHit (hit, doc, true);
 780
 781                         // Get the Type and MimeType from the properties.
 782                         hit.Type = hit.GetFirstProperty ("beagle:HitType");
 783                         hit.MimeType = hit.GetFirstProperty ("beagle:MimeType");
 784                         hit.Source = hit.GetFirstProperty ("beagle:Source");
 785
 786                         return hit;
 787                 }
 788
 789                 static protected void AddPropertiesToHit (Hit hit, Document doc, bool from_primary_index)
 790                 {
 791                         foreach (Field f in doc.Fields ()) {
 792                                 Property prop;
 793                                 prop = GetPropertyFromDocument (f, doc, from_primary_index);
 794                                 if (prop != null)
 795                                         hit.AddProperty (prop);
 796                         }
 797                 }
 798
 799
 800                 //////////////////////////////////////////////////////////////////////////////
 801
 802                 //
 803                 // Handle the index's item count
 804                 //
 805
 806                 public int GetItemCount ()
 807                 {
 808                         if (last_item_count < 0) {
 809                                 IndexReader reader;
 810                                 reader = GetReader (PrimaryStore);
 811                                 last_item_count = reader.NumDocs ();
 812                                 ReleaseReader (reader);
 813                         }
 814                         return last_item_count;
 815                 }
 816
 817                 // We should set the cached count of index items when IndexReaders
 818                 // are open and available, so calls to GetItemCount will return immediately.
 819
 820                 protected bool HaveItemCount { get { return last_item_count >= 0; } }
 821
 822                 protected void SetItemCount (IndexReader reader)
 823                 {
 824                         last_item_count = reader.NumDocs ();
 825                 }
 826
 827                 public void SetItemCount (int count)
 828                 {
 829                         last_item_count = count;
 830                 }
 831
 832                 protected void AdjustItemCount (int delta)
 833                 {
 834                         if (last_item_count >= 0)
 835                                 last_item_count += delta;
 836                 }
 837
 838                 //////////////////////////////////////////////////////////////////////////////
 839
 840                 //
 841                 // Access to the stemmer and list of stop words
 842                 //
 843
 844                 static PorterStemmer stemmer = new PorterStemmer ();
 845
 846                 static public string Stem (string str)
 847                 {
 848                         return stemmer.Stem (str);
 849                 }
 850
 851                 public static bool IsStopWord (string stemmed_word)
 852                 {
 853                         return ArrayFu.IndexOfString (StopAnalyzer.ENGLISH_STOP_WORDS, stemmed_word) != -1;
 854                 }
 855
 856                 //////////////////////////////////////////////////////////////////////////////
 857
 858                 //
 859                 // Special Hit Filtering classes
 860                 //
 861
 862                 static private bool TrueHitFilter (Hit hit)
 863                 {
 864                         return true;
 865                 }
 866
 867                 static private HitFilter true_hit_filter = new HitFilter (TrueHitFilter);
 868
 869                 public class OrHitFilter {
 870
 871                         private ArrayList all = new ArrayList ();
 872                         private bool contains_known_true = false;
 873
 874                         public void Add (HitFilter hit_filter)
 875                         {
 876                                 if (hit_filter == true_hit_filter)
 877                                         contains_known_true = true;
 878                                 all.Add (hit_filter);
 879                         }
 880
 881                         public bool HitFilter (Hit hit)
 882                         {
 883                                 if (contains_known_true)
 884                                         return true;
 885                                 foreach (HitFilter hit_filter in all)
 886                                         if (hit_filter (hit))
 887                                                 return true;
 888                                 return false;
 889                         }
 890                 }
 891
 892                 public class AndHitFilter {
 893
 894                         private ArrayList all = new ArrayList ();
 895
 896                         public void Add (HitFilter hit_filter)
 897                         {
 898                                 all.Add (hit_filter);
 899                         }
 900
 901                         public bool HitFilter (Hit hit)
 902                         {
 903                                 foreach (HitFilter hit_filter in all)
 904                                         if (! hit_filter (hit))
 905                                                 return false;
 906                                 return true;
 907                         }
 908                 }
 909
 910                 public class NotHitFilter {
 911                         HitFilter original;
 912
 913                         public NotHitFilter (HitFilter original)
 914                         {
 915                                 this.original = original;
 916                         }
 917
 918                         public bool HitFilter (Hit hit)
 919                         {
 920                                 return ! original (hit);
 921                         }
 922                 }
 923
 924                 //////////////////////////////////////////////////////////////////////////////
 925
 926                 //
 927                 // Queries
 928                 //
 929
 930                 static private LNS.Query StringToQuery (string field_name,
 931                                                         string text,
 932                                                         ArrayList term_list)
 933                 {
 934                         ArrayList tokens = new ArrayList ();
 935
 936                         // Use the analyzer to extract the query's tokens.
 937                         // This code is taken from Lucene's query parser.
 938                         TokenStream source = QueryAnalyzer.TokenStream (field_name, new StringReader (text));
 939                         while (true) {
 940                                 Lucene.Net.Analysis.Token token;
 941                                 try {
 942                                         token = source.Next ();
 943                                         if (token == null)
 944                                                 break;
 945                                 } catch (IOException) {
 946                                         break;
 947                                 }
 948                                 if (token != null)
 949                                         tokens.Add (token.TermText ());
 950                         }
 951                         try {
 952                                 source.Close ();
 953                         } catch (IOException) {
 954                                 // ignore
 955                         }
 956
 957                         if (tokens.Count == 0)
 958                                 return null;
 959
 960                         LNS.PhraseQuery query = new LNS.PhraseQuery ();
 961
 962                         foreach (string token in tokens) {
 963                                 Term term;
 964                                 term = new Term (field_name, token);
 965                                 query.Add (term);
 966                                 if (term_list != null)
 967                                         term_list.Add (term);
 968                         }
 969
 970                         return query;
 971                 }
 972
 973                 //
 974                 // Date Range Handling
 975                 //
 976
 977                 // This function will break down dates to discrete chunks of
 978                 // time to avoid expanding RangeQuerys as much as possible.
 979                 // For example, searching for
 980                 //
 981                 // YMD(5 May 2005, 16 Oct 2006)
 982                 //
 983                 // would break down into three queries:
 984                 //
 985                 // (YM(May 2005) AND D(5,31)) OR
 986                 // YM(Jun 2005, Sep 2006) OR
 987                 // (YM(Oct 2006) AND D(1,16))
 988
 989                 static private DateTime lower_bound = new DateTime (1970, 1, 1);
 990
 991                 // FIXME: we should probably boost this sometime around 2030.
 992                 // Mark your calendar.
 993                 static private DateTime upper_bound = new DateTime (2038, 12, 31);
 994
 995                 static private Term NewYearMonthTerm (string field_name, int y, int m)
 996                 {
 997                         return new Term ("YM:" + field_name, String.Format ("{0}{1:00}", y, m));
 998                 }
 999
1000                 static private LNS.Query NewYearMonthQuery (string field_name, int y, int m)
1001                 {
1002                         return new LNS.TermQuery (NewYearMonthTerm (field_name, y, m));
1003                 }
1004
1005                 static private LNS.Query NewYearMonthQuery (string field_name, int y1, int m1, int y2, int m2)
1006                 {
1007                         return new LNS.RangeQuery (NewYearMonthTerm (field_name, y1, m1),
1008                                                    NewYearMonthTerm (field_name, y2, m2),
1009                                                    true); // query is inclusive
1010                 }
1011
1012                 static private Term NewDayTerm (string field_name, int d)
1013                 {
1014                         return new Term ("D:" + field_name, String.Format ("{0:00}", d));
1015                 }
1016
1017                 static private LNS.Query NewDayQuery (string field_name, int d1, int d2)
1018                 {
1019                         return new LNS.RangeQuery (NewDayTerm (field_name, d1),
1020                                                    NewDayTerm (field_name, d2),
1021                                                    true); // query is inclusive
1022                 }
1023
1024                 private class DateRangeHitFilter {
1025                         public string Key;
1026                         public DateTime StartDate;
1027                         public DateTime EndDate;
1028
1029                         public bool HitFilter (Hit hit)
1030                         {
1031                                 // First, check the Timestamp
1032                                 if (Key == QueryPart_DateRange.AllPropertiesKey
1033                                     || Key == QueryPart_DateRange.TimestampKey) {
1034                                         DateTime dt;
1035                                         dt = hit.Timestamp;
1036                                         if (StartDate <= dt && dt <= EndDate)
1037                                                 return true;
1038                                         if (Key == QueryPart_DateRange.TimestampKey)
1039                                                 return false;
1040                                 }
1041
1042                                 if (Key == QueryPart_DateRange.AllPropertiesKey) {
1043                                         // Walk through all of the properties, and see if any
1044                                         // date properties fall inside the range.
1045                                         foreach (Property prop in hit.Properties) {
1046                                                 if (prop.Type == PropertyType.Date) {
1047                                                         DateTime dt;
1048                                                         dt = StringFu.StringToDateTime (prop.Value);
1049                                                         if (StartDate <= dt && dt <= EndDate)
1050                                                                 return true;
1051                                                 }
1052                                         }
1053                                         return false;
1054                                 } else {
1055                                         // Walk through all of the properties with the given key,
1056                                         // and see if any of them fall inside of the range.
1057                                         string[] values;
1058                                         values = hit.GetProperties (Key);
1059                                         foreach (string v in values) {
1060                                                 DateTime dt;
1061                                                 dt = StringFu.StringToDateTime (v);
1062                                                 if (StartDate <= dt && dt <= EndDate)
1063                                                         return true;
1064                                         }
1065                                         return false;
1066                                 }
1067                         }
1068                 }
1069
1070                 static private LNS.Query GetDateRangeQuery (QueryPart_DateRange part, out HitFilter hit_filter)
1071                 {
1072                         string field_name;
1073                         if (part.Key == QueryPart_DateRange.AllPropertiesKey)
1074                                 field_name = TypeToWildcardField (PropertyType.Date);
1075                         else if (part.Key == QueryPart_DateRange.TimestampKey)
1076                                 field_name = "Timestamp";
1077                         else
1078                                 field_name = PropertyToFieldName (PropertyType.Date, part.Key);
1079
1080                         // FIXME: We could optimize this and reduce the size of our range
1081                         // queries if we actually new the min and max date that appear in
1082                         // any properties in the index.  We would need to inspect the index to
1083                         // determine that at start-up, and then track it as new documents
1084                         // get added to the index.
1085                         if (part.StartDate < lower_bound)
1086                                 part.StartDate = lower_bound;
1087                         if (part.EndDate > upper_bound || part.EndDate == DateTime.MinValue)
1088                                 part.EndDate = upper_bound;
1089
1090                         // Swap the start and end dates if they come in reversed.
1091                         if (part.StartDate > part.EndDate) {
1092                                 DateTime swap;
1093                                 swap = part.StartDate;
1094                                 part.StartDate = part.EndDate;
1095                                 part.EndDate = swap;
1096                         }
1097
1098                         // Set up our hit filter to cull out the bad dates.
1099                         DateRangeHitFilter drhf;
1100                         drhf = new DateRangeHitFilter ();
1101                         drhf.Key = part.Key;
1102                         drhf.StartDate = part.StartDate;
1103                         drhf.EndDate = part.EndDate;
1104                         hit_filter = new HitFilter (drhf.HitFilter);
1105
1106                         Logger.Log.Debug ("Building new date range query");
1107                         Logger.Log.Debug ("Start: {0}", part.StartDate);
1108                         Logger.Log.Debug ("End: {0}", part.EndDate);
1109
1110                         int y1, m1, d1, y2, m2, d2;
1111                         y1 = part.StartDate.Year;
1112                         m1 = part.StartDate.Month;
1113                         d1 = part.StartDate.Day;
1114                         y2 = part.EndDate.Year;
1115                         m2 = part.EndDate.Month;
1116                         d2 = part.EndDate.Day;
1117
1118                         LNS.BooleanQuery top_level_query;
1119                         top_level_query = new LNS.BooleanQuery ();
1120
1121                         // A special case: both the start and the end of our range fall
1122                         // in the same month.
1123                         if (y1 == y2 && m1 == m2) {
1124                                 LNS.Query ym_query;
1125                                 ym_query = NewYearMonthQuery (field_name, y1, m1);
1126
1127                                 // If our range only covers a part of the month, do a range query on the days.
1128                                 if (d1 != 1 || d2 != DateTime.DaysInMonth (y2, m2)) {
1129                                         LNS.BooleanQuery sub_query;
1130                                         sub_query = new LNS.BooleanQuery ();
1131                                         sub_query.Add (ym_query, true, false);
1132                                         sub_query.Add (NewDayQuery (field_name, d1, d2), true, false);
1133                                         top_level_query.Add (sub_query, false, false);
1134                                 } else {
1135                                         top_level_query.Add (ym_query, false, false);
1136                                 }
1137
1138                         } else {
1139
1140                                 // Handle a partial month at the beginning of our range.
1141                                 if (d1 > 1) {
1142                                         LNS.BooleanQuery sub_query;
1143                                         sub_query = new LNS.BooleanQuery ();
1144                                         sub_query.Add (NewYearMonthQuery (field_name, y1, m1), true, false);
1145                                         sub_query.Add (NewDayQuery (field_name, d1, DateTime.DaysInMonth (y1, m1)), true, false);
1146                                         top_level_query.Add (sub_query, false, false);
1147
1148                                         ++m1;
1149                                         if (m1 == 13) {
1150                                                 m1 = 1;
1151                                                 ++y1;
1152                                         }
1153                                 }
1154
1155                                 // And likewise, handle a partial month at the end of our range.
1156                                 if (d2 < DateTime.DaysInMonth (y2, m2)) {
1157                                         LNS.BooleanQuery sub_query;
1158                                         sub_query = new LNS.BooleanQuery ();
1159                                         sub_query.Add (NewYearMonthQuery (field_name, y2, m2), true, false);
1160                                         sub_query.Add (NewDayQuery (field_name, 1, d2), true, false);
1161                                         top_level_query.Add (sub_query, false, false);
1162
1163                                         --m2;
1164                                         if (m2 == 0) {
1165                                                 m2 = 12;
1166                                                 --y2;
1167                                         }
1168                                 }
1169
1170                                 // Generate the query for the "middle" of our period, if it is non-empty
1171                                 if (y1 < y2 || ((y1 == y2) && m1 <= m2))
1172                                         top_level_query.Add (NewYearMonthQuery (field_name, y1, m1, y2, m2),
1173                                                              false, false);
1174                         }
1175
1176                         return top_level_query;
1177                 }
1178
1179                 // search_subset_uris is a list of Uris that this search should be
1180                 // limited to.
1181                 static protected void QueryPartToQuery (QueryPart     abstract_part,
1182                                                         bool          only_build_primary_query,
1183                                                         ArrayList     term_list,
1184                                                         out LNS.Query primary_query,
1185                                                         out LNS.Query secondary_query,
1186                                                         out HitFilter hit_filter)
1187                 {
1188                         primary_query = null;
1189                         secondary_query = null;
1190
1191                         // By default, we assume that our lucene queries will return exactly the
1192                         // matching set of objects.  We need to set the hit filter if further
1193                         // refinement of the search results is required.  (As in the case of
1194                         // date range queries, for example.)  We essentially have to do this
1195                         // to make OR queries work correctly.
1196                         hit_filter = true_hit_filter;
1197
1198                         // The exception is when dealing with a prohibited part.  Just return
1199                         // null for the hit filter in that case.  This works since
1200                         // prohibited parts are not allowed inside of OR queries.
1201                         if (abstract_part.Logic == QueryPartLogic.Prohibited)
1202                                 hit_filter = null;
1203
1204                         if (abstract_part == null)
1205                                 return;
1206
1207                         if (abstract_part is QueryPart_Text) {
1208                                 QueryPart_Text part = (QueryPart_Text) abstract_part;
1209
1210                                 if (! (part.SearchFullText || part.SearchTextProperties))
1211                                         return;
1212
1213                                 LNS.BooleanQuery p_query = new LNS.BooleanQuery ();
1214                                 LNS.BooleanQuery s_query = new LNS.BooleanQuery ();
1215
1216                                 if (part.SearchFullText) {
1217                                         LNS.Query subquery;
1218                                         subquery = StringToQuery ("Text", part.Text, term_list);
1219                                         if (subquery != null)
1220                                                 p_query.Add (subquery, false, false);
1221
1222                                         // FIXME: HotText is ignored for now!
1223                                         // subquery = StringToQuery ("HotText", part.Text);
1224                                         // if (subquery != null)
1225                                         //    p_query.Add (subquery, false, false);
1226                                 }
1227
1228                                 if (part.SearchTextProperties) {
1229                                         LNS.Query subquery;
1230                                         subquery = StringToQuery ("PropertyText", part.Text, term_list);
1231                                         if (subquery != null) {
1232                                                 p_query.Add (subquery, false, false);
1233                                                 // Properties can live in either index
1234                                                 if (! only_build_primary_query)
1235                                                         s_query.Add (subquery.Clone () as LNS.Query, false, false);
1236                                         }
1237
1238                                         Term term;
1239                                         term = new Term ("PropertyKeyword", part.Text.ToLower ()); // make sure text is lowercased
1240                                         // FIXME: terms are already added in term_list. But they may have been tokenized
1241                                         // The term here is non-tokenized version. Should this be added to term_list ?
1242                                         // term_list is used to calculate scores
1243                                         if (term_list != null)
1244                                                 term_list.Add (term);
1245                                         subquery = new LNS.TermQuery (term);
1246                                         p_query.Add (subquery, false, false);
1247                                         // Properties can live in either index
1248                                         if (! only_build_primary_query)
1249                                                 s_query.Add (subquery.Clone () as LNS.Query, false, false);
1250                                 }
1251
1252                                 primary_query = p_query;
1253                                 if (! only_build_primary_query)
1254                                         secondary_query = s_query;
1255
1256                                 return;
1257                         }
1258
1259                         if (abstract_part is QueryPart_Wildcard) {
1260                                 QueryPart_Wildcard part = (QueryPart_Wildcard) abstract_part;
1261
1262                                 LNS.BooleanQuery p_query = new LNS.BooleanQuery ();
1263                                 LNS.BooleanQuery s_query = new LNS.BooleanQuery ();
1264
1265                                 Term term;
1266                                 LNS.Query subquery;
1267
1268                                 // Lower case the terms for searching
1269                                 string query_string_lower = part.QueryString.ToLower ();
1270
1271                                 // Search text content
1272                                 term = new Term ("Text", query_string_lower);
1273                                 subquery = new LNS.WildcardQuery (term);
1274                                 p_query.Add (subquery, false, false);
1275                                 term_list.Add (term);
1276
1277                                 // Search text properties
1278                                 term = new Term ("PropertyText", query_string_lower);
1279                                 subquery = new LNS.WildcardQuery (term);
1280                                 p_query.Add (subquery, false, false);
1281                                 // Properties can live in either index
1282                                 if (! only_build_primary_query)
1283                                         s_query.Add (subquery.Clone () as LNS.Query, false, false);
1284                                 term_list.Add (term);
1285
1286                                 // Search property keywords
1287                                 term = new Term ("PropertyKeyword", query_string_lower);
1288                                 term_list.Add (term);
1289                                 subquery = new LNS.WildcardQuery (term);
1290                                 p_query.Add (subquery, false, false);
1291                                 // Properties can live in either index
1292                                 if (! only_build_primary_query)
1293                                         s_query.Add (subquery.Clone () as LNS.Query, false, false);
1294
1295                                 primary_query = p_query;
1296                                 if (! only_build_primary_query)
1297                                         secondary_query = s_query;
1298
1299                                 return;
1300                         }
1301
1302                         if (abstract_part is QueryPart_Property) {
1303                                 QueryPart_Property part = (QueryPart_Property) abstract_part;
1304
1305                                 string field_name;
1306                                 if (part.Key == QueryPart_Property.AllProperties) {
1307                                         field_name = TypeToWildcardField (part.Type);
1308                                         // FIXME: probably shouldn't just return silently
1309                                         if (field_name == null)
1310                                                 return;
1311                                 } else
1312                                         field_name = PropertyToFieldName (part.Type, part.Key);
1313
1314                                 if (part.Type == PropertyType.Text)
1315                                         primary_query = StringToQuery (field_name, part.Value, term_list);
1316                                 else {
1317                                         Term term;
1318                                         term = new Term (field_name, part.Value.ToLower ());
1319                                         if (term_list != null)
1320                                                 term_list.Add (term);
1321                                         primary_query = new LNS.TermQuery (term);
1322                                 }
1323
1324                                 // Properties can live in either index
1325                                 if (! only_build_primary_query && primary_query != null)
1326                                         secondary_query = primary_query.Clone () as LNS.Query;
1327
1328                                 return;
1329                         }
1330
1331                         if (abstract_part is QueryPart_DateRange) {
1332
1333                                 QueryPart_DateRange part = (QueryPart_DateRange) abstract_part;
1334
1335                                 primary_query = GetDateRangeQuery (part, out hit_filter);
1336                                 // Date properties can live in either index
1337                                 if (! only_build_primary_query && primary_query != null)
1338                                         secondary_query = primary_query.Clone () as LNS.Query;
1339
1340                                 // If this is a prohibited part, invert our hit filter.
1341                                 if (part.Logic == QueryPartLogic.Prohibited) {
1342                                         NotHitFilter nhf;
1343                                         nhf = new NotHitFilter (hit_filter);
1344                                         hit_filter = new HitFilter (nhf.HitFilter);
1345                                 }
1346
1347                                 return;
1348                         }
1349
1350                         if (abstract_part is QueryPart_Or) {
1351                                 QueryPart_Or part = (QueryPart_Or) abstract_part;
1352
1353                                 // Assemble a new BooleanQuery combining all of the sub-parts.
1354                                 LNS.BooleanQuery p_query;
1355                                 p_query = new LNS.BooleanQuery ();
1356
1357                                 LNS.BooleanQuery s_query = null;
1358                                 if (! only_build_primary_query)
1359                                         s_query = new LNS.BooleanQuery ();
1360
1361                                 primary_query = p_query;
1362                                 secondary_query = s_query;
1363
1364                                 OrHitFilter or_hit_filter = null;
1365
1366                                 foreach (QueryPart  sub_part in part.SubParts) {
1367                                         LNS.Query p_subq, s_subq;
1368                                         HitFilter sub_hit_filter; // FIXME: This is (and must be) ignored
1369                                         // FIXME: Any subpart in an OR which has a hit filter won't work
1370                                         // correctly, because we can't tell which part of an OR we matched
1371                                         // against to filter correctly.  This affects date range queries.
1372                                         QueryPartToQuery (sub_part, only_build_primary_query,
1373                                                           term_list,
1374                                                           out p_subq, out s_subq, out sub_hit_filter);
1375                                         if (p_subq != null)
1376                                                 p_query.Add (p_subq, false, false);
1377                                         if (s_subq != null)
1378                                                 s_query.Add (s_subq, false, false);
1379                                         if (sub_hit_filter != null) {
1380                                                 if (or_hit_filter == null)
1381                                                         or_hit_filter = new OrHitFilter ();
1382                                                 or_hit_filter.Add (sub_hit_filter);
1383                                         }
1384                                 }
1385
1386                                 if (or_hit_filter != null)
1387                                         hit_filter = new HitFilter (or_hit_filter.HitFilter);
1388
1389                                 return;
1390                         }
1391
1392                         throw new Exception ("Unhandled QueryPart type! " + abstract_part.ToString ());
1393                 }
1394
1395                 static protected LNS.Query UriQuery (string field_name, Uri uri)
1396                 { // Term query on field_name for the escaped string form of uri.
1397                         return new LNS.TermQuery (new Term (field_name, UriFu.UriToEscapedString (uri)));
1398                 }
1399
1400                 static protected LNS.Query UriQuery (string field_name, ICollection uri_list)
1401                 { // Same as the three-argument overload, with no extra requirement.
1402                         return UriQuery (field_name, uri_list, null);
1403                 }
1404
1405                 static protected LNS.Query UriQuery (string field_name, ICollection uri_list, LNS.Query extra_requirement)
1406                 {
1407                         // Builds a query with one term clause per Uri of uri_list on
1408                         // field_name; extra_requirement (if non-null) is added with its
1409                         // "required" flag set.  Returns null when uri_list is empty.  A
1410                         // BooleanQuery accepts at most GetMaxClauseCount () clauses, so a
1411                         // long list is spread round-robin over several nested BooleanQuerys.
1412                         if (uri_list.Count == 0)
1413                                 return null;
1414
1415                         int max_clauses = LNS.BooleanQuery.GetMaxClauseCount ();
1416
1417                         // Number of nested queries needed to hold one clause per Uri.
1418                         int N = 1 + (uri_list.Count - 1) / max_clauses;
1419
1420                         LNS.BooleanQuery top_query = new LNS.BooleanQuery ();
1421
1422                         // The extra requirement takes up one clause slot of the top query.
1423                         int reserved = 0;
1424                         if (extra_requirement != null) {
1425                                 top_query.Add (extra_requirement, true, false);
1426                                 reserved = 1;
1427                         }
1428
1429                         // The Uri clauses may only go straight into the top query if they
1430                         // all fit next to the reserved slot; otherwise they are nested.
1431                         bool flat = (N == 1 && uri_list.Count + reserved <= max_clauses);
1432
1433                         ArrayList bottom_queries = null;
1434                         if (! flat) {
1435                                 bottom_queries = new ArrayList ();
1436                                 for (int i = 0; i < N; ++i) {
1437                                         LNS.BooleanQuery bq = new LNS.BooleanQuery ();
1438                                         bottom_queries.Add (bq);
1439                                         top_query.Add (bq, false, false);
1440                                 }
1441                         }
1442
1443                         int cursor = 0;
1444                         foreach (Uri uri in uri_list) {
1445                                 LNS.BooleanQuery target = top_query;
1446                                 if (! flat) {
1447                                         // Deal the clauses out evenly over the nested queries.
1448                                         target = (LNS.BooleanQuery) bottom_queries [cursor];
1449                                         cursor = (cursor + 1) % N;
1450                                 }
1451
1452                                 target.Add (UriQuery (field_name, uri), false, false);
1453                         }
1454                         return top_query;
1455                 }
1456
1457                 ///////////////////////////////////////////////////////////////////////////////////
1458
1459                 public int SegmentCount {
1460                         get {
1461                                 // Count the ".cfs" files in each of the two index directories
1462                                 // and report the larger of the two counts.
1463                                 int primary_segments = 0;
1464                                 foreach (FileInfo info in new DirectoryInfo (PrimaryIndexDirectory).GetFiles ()) {
1465                                         if (info.Extension == ".cfs")
1466                                                 primary_segments++;
1467                                 }
1468
1469                                 int secondary_segments = 0;
1470                                 foreach (FileInfo info in new DirectoryInfo (SecondaryIndexDirectory).GetFiles ()) {
1471                                         if (info.Extension == ".cfs")
1472                                                 secondary_segments++;
1473                                 }
1474                                 return Math.Max (primary_segments, secondary_segments);
1475                         }
1476                 }
1477
1478                 ///////////////////////////////////////////////////////////////////////////////////
1479
1480                 // Cache IndexReaders on a per-Lucene index basis, since they
1481                 // are extremely expensive to create.  Note that using this
1482                 // only makes sense in situations where the index only
1483                 // possibly might change from underneath us, but most of the
1484                 // time probably won't.  This means it makes sense to do
1485                 // this in LuceneQueryingDriver.cs, but it doesn't in
1486                 // LuceneIndexingDriver.cs.
1487
1488                 private class ReaderAndVersion {
1489                         // A cached IndexReader, the index version recorded for it, and
1490                         // the number of outstanding references to it.
1491                         public IndexReader Reader;
1492                         public long Version;
1493                         public int Refcount = 1; // a new entry starts with one reference
1494
1495                         public ReaderAndVersion (IndexReader reader, long version)
1496                         {
1497                                 Reader = reader;
1498                                 Version = version;
1499                         }
1500                 }
1501
1502                 static private Hashtable directory_rav_map = new Hashtable (); // Directory -> ReaderAndVersion currently cached for it
1503                 static private Hashtable reader_rav_map = new Hashtable (); // IndexReader -> ReaderAndVersion; also the object locked around both maps
1504
1505                 static public LNS.IndexSearcher GetSearcher (Lucene.Net.Store.Directory directory)
1506                 {
1507                         // Wrap the shared reader that GetReader hands out for this
1508                         // directory; ReleaseSearcher gives that reference back.
1509                         return new LNS.IndexSearcher (GetReader (directory));
1510                 }
1511
1512                 static public IndexReader GetReader (Lucene.Net.Store.Directory directory)
1513                 {
1514                         // Returns a shared, reference-counted IndexReader for directory.  The
1515                         // cached reader is reused while the index version is unchanged;
1516                         // otherwise a new reader is opened and replaces it in the cache.
1517                         // Each call should be balanced by a call to ReleaseReader.
1518
1519                         lock (reader_rav_map) {
1520                                 ReaderAndVersion cached = (ReaderAndVersion) directory_rav_map [directory];
1521                                 long version = IndexReader.GetCurrentVersion (directory);
1522
1523                                 // Cache hit: hand out one more reference to the same reader.
1524                                 if (cached != null && cached.Version == version) {
1525                                         cached.Refcount++;
1526                                         return cached.Reader;
1527                                 }
1528
1529                                 // Open the replacement before giving up the stale reader.  If
1530                                 // Open throws, the cache entry and its reference count are
1531                                 // left exactly as they were, instead of pointing at a reader
1532                                 // whose cache reference has already been dropped.
1533                                 IndexReader reader = IndexReader.Open (directory);
1534
1535                                 // A new ReaderAndVersion starts with one reference, which
1536                                 // belongs to the cache; the second one is for our caller.
1537                                 ReaderAndVersion fresh = new ReaderAndVersion (reader, version);
1538                                 fresh.Refcount++;
1539
1540                                 directory_rav_map [directory] = fresh;
1541                                 reader_rav_map [reader] = fresh;
1542
1543                                 // Drop the cache's reference to the stale reader.  It gets
1544                                 // closed once the last remaining user has released it.
1545                                 if (cached != null)
1546                                         UnrefReaderAndVersion_Unlocked (cached);
1547
1548                                 return reader;
1549                         }
1550                 }
1551
1552                 static private void UnrefReaderAndVersion_Unlocked (ReaderAndVersion rav)
1553                 {
1554                         // Drop one reference.  Callers in this file invoke this while
1555                         // holding the lock on reader_rav_map.  The last reference closes
1556                         // the reader and forgets it.
1557                         if (--rav.Refcount != 0)
1558                                 return;
1559                         rav.Reader.Close (); reader_rav_map.Remove (rav.Reader);
1560                 }
1561
1562                 static public void ReleaseReader (IndexReader reader)
1563                 {
1564                         // Give back one reference to a reader obtained from GetReader.
1565                         // The reader is looked up in reader_rav_map under the lock.
1566                         lock (reader_rav_map) {
1567                                 UnrefReaderAndVersion_Unlocked ((ReaderAndVersion) reader_rav_map [reader]);
1568                         }
1569                 }
1570
1571                 static public void ReleaseSearcher (LNS.IndexSearcher searcher)
1572                 {
1573                         // Fetch the reader first: its reference is given back after the searcher is closed.
1574                         IndexReader shared_reader = searcher.GetIndexReader ();
1575                         searcher.Close ();
1576                         ReleaseReader (shared_reader);
1577                 }
1578
1579                 ///////////////////////////////////////////////////////////////////////////////////
1580
1581                 //
1582                 // Various ways to grab lots of hits at once.
1583                 // These should never be used for querying, only for utility
1584                 // functions.
1585                 //
1586
1587                 public int GetBlockOfHits (int cookie,
1588                                            Hit [] block_of_hits)
1589                 {
1590                         // Fills block_of_hits with hits built from consecutive documents of
1591                         // the primary index, starting at document number cookie (a negative
1592                         // cookie picks a random start) and wrapping around at the end of the
1593                         // index.  Unused slots are set to null.  The return value is the
1594                         // document number at which the scan stopped.
1595                         Hashtable primary_docs = UriFu.NewHashtable ();
1596                         Hashtable secondary_docs = UriFu.NewHashtable ();
1597
1598                         IndexReader primary_reader = GetReader (PrimaryStore);
1599                         IndexReader secondary_reader = null;
1600
1601                         // Whatever happens below, hand the readers back to the cache.
1602                         try {
1603                                 secondary_reader = GetReader (SecondaryStore);
1604
1605                                 int request_size = block_of_hits.Length;
1606                                 if (request_size > primary_reader.NumDocs ())
1607                                         request_size = primary_reader.NumDocs ();
1608
1609                                 int max_doc = primary_reader.MaxDoc ();
1610
1611                                 if (cookie < 0) {
1612                                         Random random = new Random ();
1613                                         cookie = random.Next (max_doc);
1614                                 }
1615
1616                                 int original_cookie = cookie;
1617
1618                                 // Load the primary documents
1619                                 for (int i = 0; i < request_size; ++i) {
1620
1621                                         if (! primary_reader.IsDeleted (cookie)) {
1622                                                 Document doc = primary_reader.Document (cookie);
1623                                                 primary_docs [GetUriFromDocument (doc)] = doc;
1624                                         }
1625
1626                                         ++cookie;
1627                                         if (cookie >= max_doc) // wrap around
1628                                                 cookie = 0;
1629
1630                                         // If we somehow end up back where we started,
1631                                         // give up.
1632                                         if (cookie == original_cookie)
1633                                                 break;
1634                                 }
1635
1636                                 // UriQuery returns null for an empty collection, which happens
1637                                 // when the index is empty or every document we visited was
1638                                 // deleted.  There is nothing to look up in that case.
1639                                 LNS.Query uri_query = UriQuery ("Uri", primary_docs.Keys);
1640
1641                                 // If necessary, load the secondary documents
1642                                 if (secondary_reader != null && uri_query != null) {
1643                                         LNS.IndexSearcher searcher = new LNS.IndexSearcher (secondary_reader);
1644                                         try {
1645                                                 LNS.Hits hits = searcher.Search (uri_query);
1646                                                 for (int i = 0; i < hits.Length (); ++i) {
1647                                                         Document doc = hits.Doc (i);
1648                                                         secondary_docs [GetUriFromDocument (doc)] = doc;
1649                                                 }
1650                                         } finally {
1651                                                 searcher.Close ();
1652                                         }
1653                                 }
1654                         } finally {
1655                                 ReleaseReader (primary_reader);
1656                                 if (secondary_reader != null)
1657                                         ReleaseReader (secondary_reader);
1658                         }
1659
1660                         // Now assemble the hits
1661                         int j = 0;
1662                         foreach (Uri uri in primary_docs.Keys) {
1663                                 Document primary_doc = primary_docs [uri] as Document;
1664                                 Document secondary_doc = secondary_docs [uri] as Document;
1665
1666                                 Hit hit = DocumentToHit (primary_doc);
1667                                 if (secondary_doc != null)
1668                                         AddPropertiesToHit (hit, secondary_doc, false);
1669
1670                                 block_of_hits [j] = hit;
1671                                 ++j;
1672                         }
1673
1674                         // null-pad the array, if necessary
1675                         for (; j < block_of_hits.Length; ++j)
1676                                 block_of_hits [j] = null;
1677
1678                         // Return the new cookie
1679                         return cookie;
1680                 }
1681
1682                 // For a large index, this will be very slow and will consume
1683                 // a lot of memory.  Don't call it without a good reason!
1684                 // We return a hashtable indexed by Uri.
1685                 public Hashtable GetAllHitsByUri ()
1686                 {
1687                         // Builds one Hit per non-deleted document of the primary index,
1688                         // merges in the properties of the secondary document with the same
1689                         // Uri (if any), and returns the hits in a hashtable keyed by Uri.
1690                         Hashtable all_hits = UriFu.NewHashtable ();
1691
1692                         IndexReader primary_reader = GetReader (PrimaryStore);
1693                         IndexReader secondary_reader = null;
1694
1695                         // Release the readers even if reading a document throws, so the
1696                         // reference counts kept by GetReader stay balanced.
1697                         try {
1698                                 secondary_reader = GetReader (SecondaryStore);
1699
1700                                 // Load everything from the primary index
1701                                 int max_doc = primary_reader.MaxDoc ();
1702                                 for (int i = 0; i < max_doc; ++i) {
1703
1704                                         if (primary_reader.IsDeleted (i))
1705                                                 continue;
1706
1707                                         Hit hit = DocumentToHit (primary_reader.Document (i));
1708                                         all_hits [hit.Uri] = hit;
1709                                 }
1710
1711                                 // Now add in everything from the secondary index, if it exists.
1712                                 // Secondary documents without a primary counterpart are skipped.
1713                                 if (secondary_reader != null) {
1714                                         max_doc = secondary_reader.MaxDoc ();
1715                                         for (int i = 0; i < max_doc; ++i) {
1716
1717                                                 if (secondary_reader.IsDeleted (i))
1718                                                         continue;
1719
1720                                                 Document doc = secondary_reader.Document (i);
1721
1722                                                 // Only hits already loaded from the primary index
1723                                                 // can receive extra properties.
1724                                                 Hit hit = (Hit) all_hits [GetUriFromDocument (doc)];
1725                                                 if (hit != null)
1726                                                         AddPropertiesToHit (hit, doc, false);
1727                                         }
1728                                 }
1729                         } finally {
1730                                 ReleaseReader (primary_reader);
1731                                 if (secondary_reader != null)
1732                                         ReleaseReader (secondary_reader);
1733                         }
1734
1735                         return all_hits;
1736                 }
1737         }
1738 }