Compute lucene-style scores for our hits.
[beagle.git] / beagled / LuceneQueryingDriver.cs
blob63eea4632f84ee8f01b7e444f0209c4179baa361
1 //
2 // LuceneQueryingDriver.cs
3 //
4 // Copyright (C) 2004-2005 Novell, Inc.
5 //
7 //
8 // Permission is hereby granted, free of charge, to any person obtaining a
9 // copy of this software and associated documentation files (the "Software"),
10 // to deal in the Software without restriction, including without limitation
11 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 // and/or sell copies of the Software, and to permit persons to whom the
13 // Software is furnished to do so, subject to the following conditions:
15 // The above copyright notice and this permission notice shall be included in
16 // all copies or substantial portions of the Software.
18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 // DEALINGS IN THE SOFTWARE.
27 using System;
28 using System.Collections;
29 using System.Diagnostics;
30 using System.Globalization;
31 using System.IO;
32 using System.Text;
33 using System.Threading;
34 using System.Xml;
35 using System.Xml.Serialization;
37 using Lucene.Net.Analysis;
38 using Lucene.Net.Analysis.Standard;
39 using Lucene.Net.Documents;
40 using Lucene.Net.Index;
41 using Lucene.Net.QueryParsers;
42 using LNS = Lucene.Net.Search;
44 using Beagle.Util;
46 namespace Beagle.Daemon {
48 public class LuceneQueryingDriver : LuceneCommon {
50 static public bool Debug = false;
52 public const string PrivateNamespace = "_private:";
54 public delegate bool UriFilter (Uri uri);
55 public delegate double RelevancyMultiplier (Hit hit);
57 public LuceneQueryingDriver (string index_name, int minor_version, bool read_only)
58 : base (index_name, minor_version)
60 // FIXME: Maybe the LuceneQueryingDriver should never try to create the index?
61 if (Exists ())
62 Open (read_only);
63 // FIXME: Do something sane if we're in read-only mode and want to create an index.
64 else if (!read_only)
65 Create ();
68 ////////////////////////////////////////////////////////////////
70 private class NotHitFilter_Closure {
71 HitFilter original;
73 public NotHitFilter_Closure (HitFilter original)
75 this.original = original;
78 public bool Filter (Hit hit)
80 return ! original (hit);
84 public HitFilter NotHitFilter (HitFilter filter)
86 NotHitFilter_Closure closure;
87 closure = new NotHitFilter_Closure (filter);
88 return new HitFilter (closure.Filter);
91 ////////////////////////////////////////////////////////////////
93 // Returns the lowest matching score before the results are
94 // truncated.
95 public void DoQuery (Query query,
96 IQueryResult result,
97 ICollection search_subset_uris, // should be internal uris
98 UriFilter uri_filter,
99 HitFilter hit_filter)
101 Stopwatch sw;
102 sw = new Stopwatch ();
103 sw.Start ();
105 // Assemble all of the parts into a bunch of Lucene queries
107 ArrayList primary_required_part_queries = null;
108 ArrayList secondary_required_part_queries = null;
110 LNS.BooleanQuery primary_prohibited_part_query = null;
111 LNS.BooleanQuery secondary_prohibited_part_query = null;
113 ArrayList all_hit_filters = new ArrayList ();
114 all_hit_filters.Add (hit_filter);
116 ArrayList term_list = new ArrayList ();
118 foreach (QueryPart part in query.Parts) {
119 LNS.Query primary_part_query;
120 LNS.Query secondary_part_query;
121 HitFilter part_hit_filter;
122 QueryPartToQuery (part,
123 false, // we want both primary and secondary queries
124 part.Logic == QueryPartLogic.Required ? term_list : null,
125 out primary_part_query,
126 out secondary_part_query,
127 out part_hit_filter);
129 switch (part.Logic) {
131 case QueryPartLogic.Required:
132 if (primary_required_part_queries == null) {
133 primary_required_part_queries = new ArrayList ();
134 secondary_required_part_queries = new ArrayList ();
136 primary_required_part_queries.Add (primary_part_query);
137 secondary_required_part_queries.Add (secondary_part_query);
139 if (part_hit_filter != null)
140 all_hit_filters.Add (part_hit_filter);
142 break;
144 case QueryPartLogic.Prohibited:
145 if (primary_prohibited_part_query == null)
146 primary_prohibited_part_query = new LNS.BooleanQuery ();
147 primary_prohibited_part_query.Add (primary_part_query, false, false);
149 if (secondary_part_query != null) {
150 if (secondary_prohibited_part_query == null)
151 secondary_prohibited_part_query = new LNS.BooleanQuery ();
152 secondary_prohibited_part_query.Add (secondary_part_query, false, false);
155 if (part_hit_filter != null)
156 all_hit_filters.Add (NotHitFilter (part_hit_filter));
158 break;
162 // If we have no required parts, give up.
163 if (primary_required_part_queries == null)
164 return;
167 // Now that we have all of these nice queries, let's execute them!
170 // Create the searchers that we will need.
172 IndexReader primary_reader;
173 LNS.IndexSearcher primary_searcher;
174 IndexReader secondary_reader = null;
175 LNS.IndexSearcher secondary_searcher = null;
177 primary_reader = IndexReader.Open (PrimaryStore);
178 primary_searcher = new LNS.IndexSearcher (primary_reader);
180 if (SecondaryStore != null) {
181 secondary_reader = IndexReader.Open (SecondaryStore);
182 if (secondary_reader.NumDocs () == 0) {
183 secondary_reader.Close ();
184 secondary_reader = null;
188 if (secondary_reader != null)
189 secondary_searcher = new LNS.IndexSearcher (secondary_reader);
192 // Possibly create our whitelists from the search subset.
194 LuceneBitArray primary_whitelist = null;
195 LuceneBitArray secondary_whitelist = null;
197 if (search_subset_uris != null && search_subset_uris.Count > 0) {
198 primary_whitelist = new LuceneBitArray (primary_searcher);
199 if (secondary_searcher != null)
200 secondary_whitelist = new LuceneBitArray (secondary_searcher);
202 foreach (Uri uri in search_subset_uris) {
203 primary_whitelist.AddUri (uri);
204 if (secondary_whitelist != null)
205 secondary_whitelist.AddUri (uri);
207 primary_whitelist.FlushUris ();
208 if (secondary_whitelist != null)
209 secondary_whitelist.FlushUris ();
213 // Build blacklists from our prohibited parts.
215 LuceneBitArray primary_blacklist = null;
216 LuceneBitArray secondary_blacklist = null;
218 if (primary_prohibited_part_query != null) {
219 primary_blacklist = new LuceneBitArray (primary_searcher,
220 primary_prohibited_part_query);
222 secondary_blacklist = new LuceneBitArray (secondary_searcher);
223 if (secondary_prohibited_part_query != null)
224 secondary_blacklist.Or (secondary_prohibited_part_query);
225 primary_blacklist.Join (secondary_blacklist);
229 // Combine our whitelist and blacklist into just a whitelist.
231 if (primary_blacklist != null) {
232 if (primary_whitelist == null) {
233 primary_blacklist.Not ();
234 primary_whitelist = primary_blacklist;
235 } else {
236 primary_whitelist.AndNot (primary_blacklist);
240 if (secondary_blacklist != null) {
241 if (secondary_whitelist == null) {
242 secondary_blacklist.Not ();
243 secondary_whitelist = secondary_blacklist;
244 } else {
245 secondary_whitelist.AndNot (secondary_blacklist);
249 BetterBitArray primary_matches = null;
251 if (primary_required_part_queries != null) {
253 if (secondary_searcher != null)
254 primary_matches = DoRequiredQueries_TwoIndex (primary_searcher,
255 secondary_searcher,
256 primary_required_part_queries,
257 secondary_required_part_queries,
258 primary_whitelist,
259 secondary_whitelist);
260 else
261 primary_matches = DoRequiredQueries (primary_searcher,
262 primary_required_part_queries,
263 primary_whitelist);
267 sw.Stop ();
268 if (Debug)
269 Logger.Log.Debug ("###### Finished low-level queries in {0}", sw);
270 sw.Reset ();
271 sw.Start ();
273 // If we didn't get any matches, give up.
275 if (primary_matches == null || ! primary_matches.ContainsTrue ())
276 return;
278 GenerateQueryResults (primary_reader,
279 primary_searcher,
280 secondary_searcher,
281 primary_matches,
282 result,
283 term_list,
284 query.MaxHits,
285 DateTime.MinValue,
286 DateTime.MaxValue,
287 uri_filter,
288 hit_filter);
290 // Finally, we clean up after ourselves.
293 primary_reader.Close ();
294 if (secondary_reader != null)
295 secondary_reader.Close ();
296 primary_searcher.Close ();
297 if (secondary_searcher != null)
298 secondary_searcher.Close ();
301 sw.Stop ();
302 if (Debug)
303 Logger.Log.Debug ("###### Processed query in {0}", sw);
307 ////////////////////////////////////////////////////////////////
310 // Special logic for handling our set of required queries
313 // This is the easy case: we just combine all of the queries
314 // into one big BooleanQuery.
315 private static BetterBitArray DoRequiredQueries (LNS.IndexSearcher primary_searcher,
316 ArrayList primary_queries,
317 BetterBitArray primary_whitelist)
319 LNS.BooleanQuery combined_query;
320 combined_query = new LNS.BooleanQuery ();
321 foreach (LNS.Query query in primary_queries)
322 combined_query.Add (query, true, false);
324 LuceneBitArray matches;
325 matches = new LuceneBitArray (primary_searcher, combined_query);
326 if (primary_whitelist != null)
327 matches.And (primary_whitelist);
329 return matches;
332 // This code attempts to execute N required queries in the
333 // most efficient order to minimize the amount of time spent
334 // joining between the two indexes. It returns a joined bit
335 // array of matches against the primary index.
337 private class MatchInfo : IComparable {
339 public LuceneBitArray PrimaryMatches = null;
340 public LuceneBitArray SecondaryMatches = null;
341 public int UpperBound = 0;
343 public void Join ()
345 PrimaryMatches.Join (SecondaryMatches);
348 public void RestrictBy (MatchInfo joined)
350 if (joined != null) {
351 this.PrimaryMatches.And (joined.PrimaryMatches);
352 this.SecondaryMatches.And (joined.SecondaryMatches);
355 UpperBound = 0;
356 UpperBound += PrimaryMatches.TrueCount;
357 UpperBound += SecondaryMatches.TrueCount;
360 public int CompareTo (object obj)
362 MatchInfo other = (MatchInfo) obj;
363 return this.UpperBound - other.UpperBound;
367 // Any whitelists that are passed in must be fully joined, or
368 // query results will be incorrect.
369 private static BetterBitArray DoRequiredQueries_TwoIndex (LNS.IndexSearcher primary_searcher,
370 LNS.IndexSearcher secondary_searcher,
371 ArrayList primary_queries,
372 ArrayList secondary_queries,
373 BetterBitArray primary_whitelist,
374 BetterBitArray secondary_whitelist)
376 ArrayList match_info_list;
377 match_info_list = new ArrayList ();
379 // First, do all of the low-level queries
380 // and store them in our MatchInfo
381 for (int i = 0; i < primary_queries.Count; ++i) {
382 LNS.Query pq, sq;
383 pq = primary_queries [i] as LNS.Query;
384 sq = secondary_queries [i] as LNS.Query;
386 LuceneBitArray p_matches = null, s_matches = null;
387 p_matches = new LuceneBitArray (primary_searcher);
388 if (pq != null) {
389 p_matches.Or (pq);
390 if (primary_whitelist != null)
391 p_matches.And (primary_whitelist);
394 s_matches = new LuceneBitArray (secondary_searcher);
395 if (sq != null) {
396 s_matches.Or (sq);
397 if (secondary_whitelist != null)
398 s_matches.And (secondary_whitelist);
401 MatchInfo info;
402 info = new MatchInfo ();
403 info.PrimaryMatches = p_matches;
404 info.SecondaryMatches = s_matches;
405 info.RestrictBy (null); // a hack to initialize the UpperBound
406 match_info_list.Add (info);
409 // We want to be smart about the order we do this in,
410 // to minimize the expense of the Join.
411 while (match_info_list.Count > 1) {
413 // FIXME: We don't really need to sort here, it would
414 // be sufficient to just find the minimal element.
415 match_info_list.Sort ();
416 MatchInfo smallest;
417 smallest = match_info_list [0] as MatchInfo;
418 match_info_list.RemoveAt (0);
420 // We can short-circuit if our smallest set of
421 // matches is empty.
422 if (smallest.UpperBound == 0)
423 return smallest.PrimaryMatches; // this must be an empty array.
425 smallest.Join ();
427 foreach (MatchInfo info in match_info_list)
428 info.RestrictBy (smallest);
431 // For the final pair, we don't need to do a full join:
432 // mapping the secondary onto the primary is sufficient
433 MatchInfo last;
434 last = match_info_list [0] as MatchInfo;
435 last.SecondaryMatches.ProjectOnto (last.PrimaryMatches);
437 return last.PrimaryMatches;
440 ////////////////////////////////////////////////////////////////
442 static private void ScoreHits (Hashtable hits_by_id,
443 IndexReader reader,
444 ICollection term_list)
446 Stopwatch sw;
447 sw = new Stopwatch ();
448 sw.Start ();
450 LNS.Similarity similarity;
451 similarity = LNS.Similarity.GetDefault ();
453 foreach (Term term in term_list) {
455 double idf;
456 idf = similarity.Idf (reader.DocFreq (term), reader.MaxDoc ());
458 int hit_count;
459 hit_count = hits_by_id.Count;
461 TermDocs term_docs;
462 term_docs = reader.TermDocs (term);
463 while (term_docs.Next () && hit_count > 0) {
465 int id;
466 id = term_docs.Doc ();
468 Hit hit;
469 hit = hits_by_id [id] as Hit;
470 if (hit != null) {
471 double tf;
472 tf = similarity.Tf (term_docs.Freq ());
473 hit.Score += tf * idf;
474 --hit_count;
479 sw.Stop ();
480 Logger.Log.Debug ("Scored {0} hits in {1}", hits_by_id.Count, sw);
483 ////////////////////////////////////////////////////////////////
485 private class DocAndId {
486 public Document Doc;
487 public int Id;
491 // Given a set of hits, broadcast some set out as our query
492 // results.
495 private static void GenerateQueryResults (IndexReader primary_reader,
496 LNS.IndexSearcher primary_searcher,
497 LNS.IndexSearcher secondary_searcher,
498 BetterBitArray primary_matches,
499 IQueryResult result,
500 ICollection query_term_list,
501 int max_results,
502 DateTime min_date,
503 DateTime max_date,
504 UriFilter uri_filter,
505 HitFilter hit_filter)
507 TopScores top_docs = null;
508 ArrayList all_docs = null;
510 long min_date_num, max_date_num;
511 min_date_num = Int64.Parse (StringFu.DateTimeToString (min_date));
512 max_date_num = Int64.Parse (StringFu.DateTimeToString (max_date));
514 if (max_date_num < min_date_num)
515 return;
517 if (Debug)
518 Logger.Log.Debug (">>> Initially handed {0} matches", primary_matches.TrueCount);
520 if (primary_matches.TrueCount <= max_results) {
521 if (Debug)
522 Logger.Log.Debug (">>> Initial count is within our limit of {0}", max_results);
523 all_docs = new ArrayList ();
524 } else {
525 if (Debug)
526 Logger.Log.Debug (">>> Number of hits is capped at {0}", max_results);
527 top_docs = new TopScores (max_results);
530 Stopwatch total, a, b, c;
531 total = new Stopwatch ();
532 a = new Stopwatch ();
533 b = new Stopwatch ();
534 c = new Stopwatch ();
536 total.Start ();
537 a.Start ();
539 // Pull in the primary documents.
540 // We walk across them backwards, since newer
541 // documents are more likely to be at the end of
542 // the index.
543 int j = primary_matches.Count;
544 while (true) {
545 int i;
546 i = primary_matches.GetPreviousTrueIndex (j);
547 if (i < 0)
548 break;
549 j = i-1; // This way we can't forget to adjust i
551 Document doc;
552 doc = primary_searcher.Doc (i);
554 // Check the timestamp to make sure it is in range
555 long timestamp_num;
556 timestamp_num = Int64.Parse (doc.Get ("Timestamp"));
557 if (timestamp_num < min_date_num || max_date_num < timestamp_num)
558 continue;
560 if (top_docs != null && ! top_docs.WillAccept (timestamp_num))
561 continue;
563 // If we have a UriFilter, apply it.
564 if (uri_filter != null) {
565 Uri uri;
566 uri = GetUriFromDocument (doc);
567 if (! uri_filter (uri))
568 continue;
571 DocAndId doc_and_id = new DocAndId ();
572 doc_and_id.Doc = doc;
573 doc_and_id.Id = i;
575 // Add the document to the appropriate data structure.
576 // We use the timestamp_num as the score, so high
577 // scores correspond to more-recent timestamps.
578 if (all_docs != null)
579 all_docs.Add (doc_and_id);
580 else
581 top_docs.Add (timestamp_num, doc_and_id);
584 a.Stop ();
586 b.Start ();
588 ICollection final_list_of_docs;
589 if (all_docs != null)
590 final_list_of_docs = all_docs;
591 else
592 final_list_of_docs = top_docs.TopScoringObjects;
594 ArrayList final_list_of_hits;
595 final_list_of_hits = new ArrayList (final_list_of_docs.Count);
597 // This is used only for scoring
598 Hashtable hits_by_id = null;
599 hits_by_id = new Hashtable ();
601 // If we aren't using the secondary index, the next step is
602 // very straightforward.
603 if (secondary_searcher == null) {
605 foreach (DocAndId doc_and_id in final_list_of_docs) {
606 Hit hit;
607 hit = DocumentToHit (doc_and_id.Doc);
608 hits_by_id [doc_and_id.Id] = hit;
609 final_list_of_hits.Add (hit);
612 } else {
614 if (Debug)
615 Logger.Log.Debug (">>> Performing cross-index Hit reunification");
617 Hashtable hits_by_uri;
618 hits_by_uri = UriFu.NewHashtable ();
620 LuceneBitArray secondary_matches;
621 secondary_matches = new LuceneBitArray (secondary_searcher);
623 foreach (DocAndId doc_and_id in final_list_of_docs) {
624 Hit hit;
625 hit = DocumentToHit (doc_and_id.Doc);
626 hits_by_id [doc_and_id.Id] = hit;
627 hits_by_uri [hit.Uri] = hit;
628 secondary_matches.AddUri (hit.Uri);
631 secondary_matches.FlushUris ();
633 // Attach all of our secondary properties
634 // to the hits
635 j = 0;
636 while (true) {
637 int i;
638 i = secondary_matches.GetNextTrueIndex (j);
639 if (i >= secondary_matches.Count)
640 break;
641 j = i+1;
643 Document secondary_doc;
644 secondary_doc = secondary_searcher.Doc (i);
646 Uri uri;
647 uri = GetUriFromDocument (secondary_doc);
649 Hit hit;
650 hit = hits_by_uri [uri] as Hit;
652 AddPropertiesToHit (hit, secondary_doc, false);
654 final_list_of_hits.Add (hit);
658 ScoreHits (hits_by_id, primary_reader, query_term_list);
660 b.Stop ();
662 // If we used the TopScores object, we got our original
663 // list of documents sorted for us. If not, sort the
664 // final list.
665 if (top_docs == null)
666 final_list_of_hits.Sort ();
668 c.Start ();
670 // If we have a hit_filter, use it now.
671 if (hit_filter != null) {
672 for (int i = 0; i < final_list_of_hits.Count; ++i) {
673 Hit hit;
674 hit = final_list_of_hits [i] as Hit;
675 if (! hit_filter (hit)) {
676 if (Debug)
677 Logger.Log.Debug ("Filtered out {0}", hit.Uri);
678 final_list_of_hits [i] = null;
683 // Before we broadcast a hit, we strip out any
684 // properties in the PrivateNamespace. We
685 // manipulate the property ArrayList directory,
686 // which is pretty gross... but this is safe,
687 // since removing items will not change the sort
688 // order.
689 foreach (Hit hit in final_list_of_hits) {
690 if (hit == null)
691 continue;
692 int i = 0;
693 while (i < hit.Properties.Count) {
694 Property prop = hit.Properties [i] as Property;
695 if (prop.Key.StartsWith (PrivateNamespace))
696 hit.Properties.RemoveAt (i);
697 else
698 ++i;
702 result.Add (final_list_of_hits);
704 c.Stop ();
705 total.Stop ();
707 if (Debug) {
708 Logger.Log.Debug (">>> GenerateQueryResults time statistics:");
709 Logger.Log.Debug (">>> First pass {0} ({1:0.0}%)", a, 100 * a.ElapsedTime / total.ElapsedTime);
710 Logger.Log.Debug (">>> Hit assembly {0} ({1:0.0}%)", b, 100 * b.ElapsedTime / total.ElapsedTime);
711 Logger.Log.Debug (">>> Final pass {0} ({1:0.0}%)", c, 100 * c.ElapsedTime / total.ElapsedTime);
712 Logger.Log.Debug (">>> TOTAL {0}", total);