Oops, fix a broken part of the patch
[beagle.git] / beagled / LuceneQueryingDriver.cs
blob 65122d88f5e7b2a71b36494768f80a99a8500d19
1 //
2 // LuceneQueryingDriver.cs
3 //
4 // Copyright (C) 2004-2005 Novell, Inc.
5 //
7 //
8 // Permission is hereby granted, free of charge, to any person obtaining a
9 // copy of this software and associated documentation files (the "Software"),
10 // to deal in the Software without restriction, including without limitation
11 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 // and/or sell copies of the Software, and to permit persons to whom the
13 // Software is furnished to do so, subject to the following conditions:
15 // The above copyright notice and this permission notice shall be included in
16 // all copies or substantial portions of the Software.
18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 // DEALINGS IN THE SOFTWARE.
27 using System;
28 using System.Collections;
29 using System.Diagnostics;
30 using System.Globalization;
31 using System.IO;
32 using System.Text;
33 using System.Threading;
34 using System.Xml;
35 using System.Xml.Serialization;
37 using Lucene.Net.Analysis;
38 using Lucene.Net.Analysis.Standard;
39 using Lucene.Net.Documents;
40 using Lucene.Net.Index;
41 using Lucene.Net.QueryParsers;
42 using LNS = Lucene.Net.Search;
44 using Beagle.Util;
46 namespace Beagle.Daemon {
48 public class LuceneQueryingDriver : LuceneCommon {
// When true, verbose query timing/progress is written via Logger.Log.Debug.
50 static public bool Debug = false;
// Prefix for internal-only properties; properties whose key starts with
// this are stripped from hits before they are broadcast (see the final
// pass of GenerateQueryResults).
52 public const string PrivateNamespace = "_private:";
// Predicate deciding whether a given uri should be kept in the results.
54 public delegate bool UriFilter (Uri uri);
// Returns a multiplier for a hit's relevancy score.  Not used anywhere
// in this chunk -- presumably consumed by callers; verify against the
// rest of the file.
55 public delegate double RelevancyMultiplier (Hit hit);
public LuceneQueryingDriver (string index_name, int minor_version, bool read_only)
	: base (index_name, minor_version)
{
	// Open the index if it is already on disk; otherwise create it,
	// unless we were asked for read-only access.
	// FIXME: Maybe the LuceneQueryingDriver should never try to create the index?
	if (Exists ())
		Open (read_only);
	else if (! read_only)
		Create ();
	else {
		// We're in read-only mode, but we can't create an index.
		// Maybe a different exception would be better?  This one is caught
		// in QueryDriver.LoadStaticQueryable ()
		throw new InvalidOperationException ();
	}

	// Initialize the user text cache only if we're not in
	// read-only mode.  StaticQueryables instantiate their
	// own text caches that are stored in a separate
	// location.
	if (! read_only)
		text_cache = TextCache.UserCache;
}
80 ////////////////////////////////////////////////////////////////
83 ////////////////////////////////////////////////////////////////
// Returns the URIs of all documents in the primary index carrying an
// exactly-matching (type, key, value) property.
// FIXME: Should we support scanning the secondary index as well?
public Uri[] PropertyQuery (Property prop)
{
	IndexReader primary_reader;
	LNS.IndexSearcher primary_searcher;

	primary_reader = LuceneCommon.GetReader (PrimaryStore);
	primary_searcher = new LNS.IndexSearcher (primary_reader);

	try {
		Term term = new Term (PropertyToFieldName (prop.Type, prop.Key), prop.Value);
		LNS.TermQuery query = new LNS.TermQuery (term);
		LNS.Hits hits = primary_searcher.Search (query);

		Uri[] uri_list = new Uri [hits.Length ()];
		for (int i = 0; i < hits.Length (); i++) {
			Document doc;
			doc = hits.Doc (i);
			uri_list [i] = GetUriFromDocument (doc);
		}

		return uri_list;
	} finally {
		// Release the searcher and the reader even when the search
		// throws; otherwise the reader reference is leaked.
		primary_searcher.Close ();
		ReleaseReader (primary_reader);
	}
}
113 ////////////////////////////////////////////////////////////////
115 // Results are delivered through the IQueryResult argument; any
116 // truncation to query.MaxHits happens inside GenerateQueryResults.
// Executes the assembled query parts against the primary (and, when
// available, secondary) index and streams matching hits into `result`.
// NOTE(review): the extraction of this file dropped the closing-brace
// lines of several nested scopes below (the embedded line numbering
// jumps); the code is kept byte-for-byte as found.
117 public void DoQuery (Query query,
118 IQueryResult result,
119 ICollection search_subset_uris, // should be internal uris
120 UriFilter uri_filter,
121 HitFilter hit_filter)
// Times the low-level query phase; logged below when Debug is set.
123 Stopwatch sw;
124 sw = new Stopwatch ();
125 sw.Start ();
127 // Assemble all of the parts into a bunch of Lucene queries
129 ArrayList primary_required_part_queries = null;
130 ArrayList secondary_required_part_queries = null;
132 LNS.BooleanQuery primary_prohibited_part_query = null;
133 LNS.BooleanQuery secondary_prohibited_part_query = null;
// Per-part hit filters and the caller's hit_filter are folded into
// one AND filter, applied later in GenerateQueryResults.
135 AndHitFilter all_hit_filters;
136 all_hit_filters = new AndHitFilter ();
137 if (hit_filter != null)
138 all_hit_filters.Add (hit_filter);
// Terms from required parts are collected here for scoring later.
140 ArrayList term_list = new ArrayList ();
142 foreach (QueryPart part in query.Parts) {
143 LNS.Query primary_part_query;
144 LNS.Query secondary_part_query;
145 HitFilter part_hit_filter;
146 QueryPartToQuery (part,
147 false, // we want both primary and secondary queries
148 part.Logic == QueryPartLogic.Required ? term_list : null,
149 out primary_part_query,
150 out secondary_part_query,
151 out part_hit_filter);
153 if (primary_part_query == null)
154 continue;
156 switch (part.Logic) {
158 case QueryPartLogic.Required:
159 if (primary_required_part_queries == null) {
160 primary_required_part_queries = new ArrayList ();
161 secondary_required_part_queries = new ArrayList ();
163 primary_required_part_queries.Add (primary_part_query);
164 secondary_required_part_queries.Add (secondary_part_query);
166 if (part_hit_filter != null)
167 all_hit_filters.Add (part_hit_filter);
169 break;
171 case QueryPartLogic.Prohibited:
172 if (primary_prohibited_part_query == null)
173 primary_prohibited_part_query = new LNS.BooleanQuery ();
174 primary_prohibited_part_query.Add (primary_part_query, false, false);
176 if (secondary_part_query != null) {
177 if (secondary_prohibited_part_query == null)
178 secondary_prohibited_part_query = new LNS.BooleanQuery ();
179 secondary_prohibited_part_query.Add (secondary_part_query, false, false);
// A prohibited part's hit filter is inverted before being AND-ed in.
182 if (part_hit_filter != null) {
183 NotHitFilter nhf;
184 nhf = new NotHitFilter (part_hit_filter);
185 all_hit_filters.Add (new HitFilter (nhf.HitFilter));
188 break;
192 // If we have no required parts, give up.
193 if (primary_required_part_queries == null)
194 return;
197 // Now that we have all of these nice queries, let's execute them!
200 // Create the searchers that we will need.
202 IndexReader primary_reader;
203 LNS.IndexSearcher primary_searcher;
204 IndexReader secondary_reader = null;
205 LNS.IndexSearcher secondary_searcher = null;
207 primary_reader = LuceneCommon.GetReader (PrimaryStore);
208 primary_searcher = new LNS.IndexSearcher (primary_reader);
210 if (SecondaryStore != null) {
211 secondary_reader = LuceneCommon.GetReader (SecondaryStore);
// An empty secondary index is treated the same as no secondary index.
212 if (secondary_reader.NumDocs () == 0) {
213 ReleaseReader (secondary_reader);
214 secondary_reader = null;
218 if (secondary_reader != null)
219 secondary_searcher = new LNS.IndexSearcher (secondary_reader);
222 // Possibly create our whitelists from the search subset.
224 LuceneBitArray primary_whitelist = null;
225 LuceneBitArray secondary_whitelist = null;
227 if (search_subset_uris != null && search_subset_uris.Count > 0) {
228 primary_whitelist = new LuceneBitArray (primary_searcher);
229 if (secondary_searcher != null)
230 secondary_whitelist = new LuceneBitArray (secondary_searcher);
232 foreach (Uri uri in search_subset_uris) {
233 primary_whitelist.AddUri (uri);
234 if (secondary_whitelist != null)
235 secondary_whitelist.AddUri (uri);
237 primary_whitelist.FlushUris ();
238 if (secondary_whitelist != null)
239 secondary_whitelist.FlushUris ();
243 // Build blacklists from our prohibited parts.
245 LuceneBitArray primary_blacklist = null;
246 LuceneBitArray secondary_blacklist = null;
248 if (primary_prohibited_part_query != null) {
249 primary_blacklist = new LuceneBitArray (primary_searcher,
250 primary_prohibited_part_query);
252 if (secondary_searcher != null) {
253 secondary_blacklist = new LuceneBitArray (secondary_searcher);
254 if (secondary_prohibited_part_query != null)
255 secondary_blacklist.Or (secondary_prohibited_part_query);
// Map the secondary blacklist back onto the primary index's ids.
256 primary_blacklist.Join (secondary_blacklist);
261 // Combine our whitelist and blacklist into just a whitelist.
263 if (primary_blacklist != null) {
264 if (primary_whitelist == null) {
265 primary_blacklist.Not ();
266 primary_whitelist = primary_blacklist;
267 } else {
268 primary_whitelist.AndNot (primary_blacklist);
272 if (secondary_blacklist != null) {
273 if (secondary_whitelist == null) {
274 secondary_blacklist.Not ();
275 secondary_whitelist = secondary_blacklist;
276 } else {
277 secondary_whitelist.AndNot (secondary_blacklist);
281 BetterBitArray primary_matches = null;
// This null check is redundant -- we already returned above when
// primary_required_part_queries was null -- but it is harmless.
283 if (primary_required_part_queries != null) {
285 if (secondary_searcher != null)
286 primary_matches = DoRequiredQueries_TwoIndex (primary_searcher,
287 secondary_searcher,
288 primary_required_part_queries,
289 secondary_required_part_queries,
290 primary_whitelist,
291 secondary_whitelist);
292 else
293 primary_matches = DoRequiredQueries (primary_searcher,
294 primary_required_part_queries,
295 primary_whitelist);
299 sw.Stop ();
300 if (Debug)
301 Logger.Log.Debug ("###### Finished low-level queries in {0}", sw);
302 sw.Reset ();
303 sw.Start ();
305 // Only generate results if we got some matches
306 if (primary_matches != null && primary_matches.ContainsTrue ()) {
307 GenerateQueryResults (primary_reader,
308 primary_searcher,
309 secondary_searcher,
310 primary_matches,
311 result,
312 term_list,
313 query.MaxHits,
314 uri_filter,
315 new HitFilter (all_hit_filters.HitFilter));
319 // Finally, we clean up after ourselves.
// NOTE(review): this cleanup is not inside a finally block, so an
// exception above would leak the readers/searchers -- confirm
// against the full file whether callers guard against this.
322 primary_searcher.Close ();
323 if (secondary_searcher != null)
324 secondary_searcher.Close ();
325 ReleaseReader (primary_reader);
326 if (secondary_reader != null)
327 ReleaseReader (secondary_reader);
330 sw.Stop ();
331 if (Debug)
332 Logger.Log.Debug ("###### Processed query in {0}", sw);
336 ////////////////////////////////////////////////////////////////
339 // Special logic for handling our set of required queries
// This is the easy case: we just combine all of the queries
// into one big BooleanQuery.
private static BetterBitArray DoRequiredQueries (LNS.IndexSearcher primary_searcher,
						 ArrayList primary_queries,
						 BetterBitArray primary_whitelist)
{
	// Every required part must match, so each sub-query becomes a
	// required (true) clause of a single boolean query.
	LNS.BooleanQuery combined_query = new LNS.BooleanQuery ();
	foreach (LNS.Query query in primary_queries)
		combined_query.Add (query, true, false);

	LuceneBitArray matches = new LuceneBitArray (primary_searcher, combined_query);

	// Restrict to the whitelist when one was supplied.
	if (primary_whitelist != null)
		matches.And (primary_whitelist);

	return matches;
}
361 // This code attempts to execute N required queries in the
362 // most efficient order to minimize the amount of time spent
363 // joining between the two indexes. It returns a joined bit
364 // array of matches against the primary index.
// Per-part match state used when querying across both indexes.
// Sortable so the cheapest (smallest) match set can be joined first.
private class MatchInfo : IComparable {

	public LuceneBitArray PrimaryMatches = null;
	public LuceneBitArray SecondaryMatches = null;
	public int UpperBound = 0;

	// Fully join the secondary matches into the primary matches.
	public void Join ()
	{
		PrimaryMatches.Join (SecondaryMatches);
	}

	// Narrow this match set by an already-joined one (pass null to
	// skip the narrowing) and recompute the upper bound on the
	// number of possible matches.
	public void RestrictBy (MatchInfo joined)
	{
		if (joined != null) {
			this.PrimaryMatches.And (joined.PrimaryMatches);
			this.SecondaryMatches.And (joined.SecondaryMatches);
		}

		UpperBound = 0;
		UpperBound += PrimaryMatches.TrueCount;
		UpperBound += SecondaryMatches.TrueCount;
	}

	// Orders MatchInfos by ascending upper bound.
	public int CompareTo (object obj)
	{
		MatchInfo other = (MatchInfo) obj;
		return this.UpperBound - other.UpperBound;
	}
}
// Any whitelists that are passed in must be fully joined, or
// query results will be incorrect.
private static BetterBitArray DoRequiredQueries_TwoIndex (LNS.IndexSearcher primary_searcher,
							  LNS.IndexSearcher secondary_searcher,
							  ArrayList primary_queries,
							  ArrayList secondary_queries,
							  BetterBitArray primary_whitelist,
							  BetterBitArray secondary_whitelist)
{
	ArrayList match_info_list = new ArrayList ();

	// First, run every low-level query and wrap each result pair
	// in a MatchInfo.
	for (int i = 0; i < primary_queries.Count; ++i) {
		LNS.Query pq, sq;
		pq = primary_queries [i] as LNS.Query;
		sq = secondary_queries [i] as LNS.Query;

		LuceneBitArray p_matches = null, s_matches = null;

		p_matches = new LuceneBitArray (primary_searcher);
		if (pq != null) {
			p_matches.Or (pq);
			if (primary_whitelist != null)
				p_matches.And (primary_whitelist);
		}

		s_matches = new LuceneBitArray (secondary_searcher);
		if (sq != null) {
			s_matches.Or (sq);
			if (secondary_whitelist != null)
				s_matches.And (secondary_whitelist);
		}

		MatchInfo info = new MatchInfo ();
		info.PrimaryMatches = p_matches;
		info.SecondaryMatches = s_matches;
		info.RestrictBy (null); // a hack to initialize the UpperBound
		match_info_list.Add (info);
	}

	// Process the sets smallest-first to minimize the expense of
	// the cross-index joins.
	while (match_info_list.Count > 1) {

		// FIXME: We don't really need to sort here, it would
		// be sufficient to just find the minimal element.
		match_info_list.Sort ();
		MatchInfo smallest = match_info_list [0] as MatchInfo;
		match_info_list.RemoveAt (0);

		// Short-circuit when the smallest set of matches is empty.
		if (smallest.UpperBound == 0)
			return smallest.PrimaryMatches; // this must be an empty array.

		smallest.Join ();

		foreach (MatchInfo info in match_info_list)
			info.RestrictBy (smallest);
	}

	// For the final pair, a full join is unnecessary: projecting the
	// secondary matches onto the primary index is sufficient.
	MatchInfo last = match_info_list [0] as MatchInfo;
	last.SecondaryMatches.ProjectOnto (last.PrimaryMatches);

	return last.PrimaryMatches;
}
469 ////////////////////////////////////////////////////////////////
// Accumulates a tf*idf score onto each Hit in hits_by_id (keyed by
// Lucene document id) for every term in term_list.
private static void ScoreHits (Hashtable hits_by_id,
			       IndexReader reader,
			       ICollection term_list)
{
	Stopwatch sw = new Stopwatch ();
	sw.Start ();

	LNS.Similarity similarity = LNS.Similarity.GetDefault ();

	foreach (Term term in term_list) {

		// Inverse document frequency of this term across the index.
		double idf = similarity.Idf (reader.DocFreq (term), reader.MaxDoc ());

		// Once every hit has been touched for this term, we can
		// stop walking the postings early.
		int hit_count = hits_by_id.Count;

		TermDocs term_docs = reader.TermDocs (term);
		while (term_docs.Next () && hit_count > 0) {

			int id = term_docs.Doc ();

			Hit hit = hits_by_id [id] as Hit;
			if (hit != null) {
				double tf = similarity.Tf (term_docs.Freq ());
				hit.Score += tf * idf;
				--hit_count;
			}
		}
	}

	sw.Stop ();
}
511 ////////////////////////////////////////////////////////////////
// Pairs a Lucene Document with its document id in the index.
private class DocAndId {
	public Document Doc;
	public int Id;
}
// Given a set of hits, broadcast some set out as our query
// results.
// NOTE(review): the extraction of this file dropped the closing-brace
// lines of several scopes below (the embedded line numbering jumps),
// and the method's end lies past the visible chunk; code is kept
// byte-for-byte as found.
523 private static void GenerateQueryResults (IndexReader primary_reader,
524 LNS.IndexSearcher primary_searcher,
525 LNS.IndexSearcher secondary_searcher,
526 BetterBitArray primary_matches,
527 IQueryResult result,
528 ICollection query_term_list,
529 int max_results,
530 UriFilter uri_filter,
531 HitFilter hit_filter)
// Exactly one of top_docs / all_docs is used, depending on whether
// the match count exceeds max_results.
533 TopScores top_docs = null;
534 ArrayList all_docs = null;
536 if (Debug)
537 Logger.Log.Debug (">>> Initially handed {0} matches", primary_matches.TrueCount);
539 if (primary_matches.TrueCount <= max_results) {
540 if (Debug)
541 Logger.Log.Debug (">>> Initial count is within our limit of {0}", max_results);
542 all_docs = new ArrayList ();
543 } else {
544 if (Debug)
545 Logger.Log.Debug (">>> Number of hits is capped at {0}", max_results);
546 top_docs = new TopScores (max_results);
// Stopwatches: total plus one per phase (a = first pass, b = hit
// assembly, c = final pass); percentages are logged at the bottom.
549 Stopwatch total, a, b, c;
550 total = new Stopwatch ();
551 a = new Stopwatch ();
552 b = new Stopwatch ();
553 c = new Stopwatch ();
555 total.Start ();
556 a.Start ();
558 // Pull in the primary documents.
559 // We walk across them backwards, since newer
560 // documents are more likely to be at the end of
561 // the index.
562 int j = primary_matches.Count;
563 while (true) {
564 int i;
565 i = primary_matches.GetPreviousTrueIndex (j);
566 if (i < 0)
567 break;
568 j = i-1; // This way we can't forget to adjust i
570 Document doc;
571 doc = primary_searcher.Doc (i);
573 // Check the timestamp --- if we have already reached our
574 // limit, we might be able to reject it immediately.
575 string timestamp_str;
576 long timestamp_num = 0;
578 timestamp_str = doc.Get ("Timestamp");
579 if (timestamp_str == null) {
580 Logger.Log.Warn ("No timestamp on {0}!", GetUriFromDocument (doc));
581 } else {
// NOTE(review): this re-fetches the field instead of parsing the
// timestamp_str just read above -- redundant but harmless.
582 timestamp_num = Int64.Parse (doc.Get ("Timestamp"));
583 if (top_docs != null && ! top_docs.WillAccept (timestamp_num))
584 continue;
587 // If we have a UriFilter, apply it.
588 if (uri_filter != null) {
589 Uri uri;
590 uri = GetUriFromDocument (doc);
591 if (! uri_filter (uri))
592 continue;
595 DocAndId doc_and_id = new DocAndId ();
596 doc_and_id.Doc = doc;
597 doc_and_id.Id = i;
599 // Add the document to the appropriate data structure.
600 // We use the timestamp_num as the score, so high
601 // scores correspond to more-recent timestamps.
602 if (all_docs != null)
603 all_docs.Add (doc_and_id);
604 else
605 top_docs.Add (timestamp_num, doc_and_id);
608 a.Stop ();
610 b.Start ();
612 ICollection final_list_of_docs;
613 if (all_docs != null)
614 final_list_of_docs = all_docs;
615 else
616 final_list_of_docs = top_docs.TopScoringObjects;
618 ArrayList final_list_of_hits;
619 final_list_of_hits = new ArrayList (final_list_of_docs.Count);
621 // This is used only for scoring
622 Hashtable hits_by_id = null;
623 hits_by_id = new Hashtable ();
625 // If we aren't using the secondary index, the next step is
626 // very straightforward.
627 if (secondary_searcher == null) {
629 foreach (DocAndId doc_and_id in final_list_of_docs) {
630 Hit hit;
631 hit = DocumentToHit (doc_and_id.Doc);
632 hits_by_id [doc_and_id.Id] = hit;
633 final_list_of_hits.Add (hit);
636 } else {
638 if (Debug)
639 Logger.Log.Debug (">>> Performing cross-index Hit reunification");
641 Hashtable hits_by_uri;
642 hits_by_uri = UriFu.NewHashtable ();
644 LuceneBitArray secondary_matches;
645 secondary_matches = new LuceneBitArray (secondary_searcher);
647 foreach (DocAndId doc_and_id in final_list_of_docs) {
648 Hit hit;
649 hit = DocumentToHit (doc_and_id.Doc);
650 hits_by_id [doc_and_id.Id] = hit;
651 hits_by_uri [hit.Uri] = hit;
652 secondary_matches.AddUri (hit.Uri);
655 secondary_matches.FlushUris ();
657 // Attach all of our secondary properties
658 // to the hits
659 j = 0;
660 while (true) {
661 int i;
662 i = secondary_matches.GetNextTrueIndex (j);
663 if (i >= secondary_matches.Count)
664 break;
665 j = i+1;
667 Document secondary_doc;
668 secondary_doc = secondary_searcher.Doc (i);
670 Uri uri;
671 uri = GetUriFromDocument (secondary_doc);
673 Hit hit;
674 hit = hits_by_uri [uri] as Hit;
// NOTE(review): hit can be null here if the uri lookup misses;
// presumably AddPropertiesToHit tolerates that -- confirm.
676 AddPropertiesToHit (hit, secondary_doc, false);
678 final_list_of_hits.Add (hit);
// Fold tf*idf term scores onto the collected hits (see ScoreHits).
682 ScoreHits (hits_by_id, primary_reader, query_term_list);
684 b.Stop ();
686 // If we used the TopScores object, we got our original
687 // list of documents sorted for us. If not, sort the
688 // final list.
689 if (top_docs == null)
690 final_list_of_hits.Sort ();
692 c.Start ();
694 // If we have a hit_filter, use it now.
695 if (hit_filter != null) {
696 for (int i = 0; i < final_list_of_hits.Count; ++i) {
697 Hit hit;
698 hit = final_list_of_hits [i] as Hit;
699 if (! hit_filter (hit)) {
700 if (Debug)
701 Logger.Log.Debug ("Filtered out {0}", hit.Uri);
// Filtered hits are nulled in place (not removed) so the loop
// index stays valid; nulls are skipped below.
702 final_list_of_hits [i] = null;
707 // Before we broadcast a hit, we strip out any
708 // properties in the PrivateNamespace.  We
709 // manipulate the property ArrayList directly,
710 // which is pretty gross... but this is safe,
711 // since removing items will not change the sort
712 // order.
713 foreach (Hit hit in final_list_of_hits) {
714 if (hit == null)
715 continue;
716 int i = 0;
717 while (i < hit.Properties.Count) {
718 Property prop = hit.Properties [i] as Property;
719 if (prop.Key.StartsWith (PrivateNamespace))
720 hit.Properties.RemoveAt (i);
721 else
722 ++i;
726 result.Add (final_list_of_hits);
728 c.Stop ();
729 total.Stop ();
731 if (Debug) {
732 Logger.Log.Debug (">>> GenerateQueryResults time statistics:");
733 Logger.Log.Debug (">>> First pass {0} ({1:0.0}%)", a, 100 * a.ElapsedTime / total.ElapsedTime);
734 Logger.Log.Debug (">>> Hit assembly {0} ({1:0.0}%)", b, 100 * b.ElapsedTime / total.ElapsedTime);
735 Logger.Log.Debug (">>> Final pass {0} ({1:0.0}%)", c, 100 * c.ElapsedTime / total.ElapsedTime);
736 Logger.Log.Debug (">>> TOTAL {0}", total);