Thumbnail file hits. Based on a patch from D Bera
[beagle.git] / beagled / LuceneQueryingDriver.cs
blobb71805412da8031eb744d56446dbb099a50e2e01
1 //
2 // LuceneQueryingDriver.cs
3 //
4 // Copyright (C) 2004-2005 Novell, Inc.
5 //
7 //
8 // Permission is hereby granted, free of charge, to any person obtaining a
9 // copy of this software and associated documentation files (the "Software"),
10 // to deal in the Software without restriction, including without limitation
11 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 // and/or sell copies of the Software, and to permit persons to whom the
13 // Software is furnished to do so, subject to the following conditions:
15 // The above copyright notice and this permission notice shall be included in
16 // all copies or substantial portions of the Software.
18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 // DEALINGS IN THE SOFTWARE.
27 using System;
28 using System.Collections;
29 using System.Diagnostics;
30 using System.Globalization;
31 using System.IO;
32 using System.Text;
33 using System.Threading;
34 using System.Xml;
35 using System.Xml.Serialization;
37 using Lucene.Net.Analysis;
38 using Lucene.Net.Analysis.Standard;
39 using Lucene.Net.Documents;
40 using Lucene.Net.Index;
41 using Lucene.Net.QueryParsers;
42 using LNS = Lucene.Net.Search;
44 using Beagle.Util;
46 namespace Beagle.Daemon {
48 public class LuceneQueryingDriver : LuceneCommon {
50 static public bool Debug = true;
52 public const string PrivateNamespace = "_private:";
54 public delegate bool UriFilter (Uri uri);
55 public delegate double RelevancyMultiplier (Hit hit);
57 public LuceneQueryingDriver (string index_name, int minor_version, bool read_only)
58 : base (index_name, minor_version)
60 // FIXME: Maybe the LuceneQueryingDriver should never try to create the index?
61 if (Exists ())
62 Open (read_only);
63 // FIXME: Do something sane if we're in read-only mode and want to create an index.
64 else if (!read_only)
65 Create ();
68 ////////////////////////////////////////////////////////////////
70 private class NotHitFilter_Closure {
71 HitFilter original;
73 public NotHitFilter_Closure (HitFilter original)
75 this.original = original;
78 public bool Filter (Hit hit)
80 return ! original (hit);
84 public HitFilter NotHitFilter (HitFilter filter)
86 NotHitFilter_Closure closure;
87 closure = new NotHitFilter_Closure (filter);
88 return new HitFilter (closure.Filter);
91 ////////////////////////////////////////////////////////////////
93 // Returns the lowest matching score before the results are
94 // truncated.
95 public void DoQuery (Query query,
96 IQueryResult result,
97 ICollection search_subset_uris, // should be internal uris
98 UriFilter uri_filter,
99 HitFilter hit_filter)
101 Stopwatch sw;
102 sw = new Stopwatch ();
103 sw.Start ();
105 // Assemble all of the parts into a bunch of Lucene queries
107 ArrayList primary_required_part_queries = null;
108 ArrayList secondary_required_part_queries = null;
110 LNS.BooleanQuery primary_prohibited_part_query = null;
111 LNS.BooleanQuery secondary_prohibited_part_query = null;
113 ArrayList all_hit_filters = new ArrayList ();
114 all_hit_filters.Add (hit_filter);
116 foreach (QueryPart part in query.Parts) {
117 LNS.Query primary_part_query;
118 LNS.Query secondary_part_query;
119 HitFilter part_hit_filter;
120 QueryPartToQuery (part,
121 false, // we want both primary and secondary queries
122 out primary_part_query,
123 out secondary_part_query,
124 out part_hit_filter);
126 switch (part.Logic) {
128 case QueryPartLogic.Required:
129 if (primary_required_part_queries == null) {
130 primary_required_part_queries = new ArrayList ();
131 secondary_required_part_queries = new ArrayList ();
133 primary_required_part_queries.Add (primary_part_query);
134 secondary_required_part_queries.Add (secondary_part_query);
136 if (part_hit_filter != null)
137 all_hit_filters.Add (part_hit_filter);
139 break;
141 case QueryPartLogic.Prohibited:
142 if (primary_prohibited_part_query == null)
143 primary_prohibited_part_query = new LNS.BooleanQuery ();
144 primary_prohibited_part_query.Add (primary_part_query, false, false);
146 if (secondary_part_query != null) {
147 if (secondary_prohibited_part_query == null)
148 secondary_prohibited_part_query = new LNS.BooleanQuery ();
149 secondary_prohibited_part_query.Add (secondary_part_query, false, false);
152 if (part_hit_filter != null)
153 all_hit_filters.Add (NotHitFilter (part_hit_filter));
155 break;
159 // If we have no required parts, give up.
160 if (primary_required_part_queries == null) {
161 Logger.Log.Debug ("No required parts, I give up!");
162 return;
165 Logger.Log.Debug ("Assembled queries!");
168 // Now that we have all of these nice queries, let's execute them!
171 // Create the searchers that we will need.
173 LNS.IndexSearcher primary_searcher;
174 IndexReader secondary_reader = null;
175 LNS.IndexSearcher secondary_searcher = null;
177 primary_searcher = new LNS.IndexSearcher (PrimaryStore);
179 if (SecondaryStore != null) {
180 secondary_reader = IndexReader.Open (SecondaryStore);
181 if (secondary_reader.NumDocs () == 0) {
182 secondary_reader.Close ();
183 secondary_reader = null;
187 if (secondary_reader != null)
188 secondary_searcher = new LNS.IndexSearcher (secondary_reader);
191 // Possibly create our whitelists from the search subset.
193 LuceneBitArray primary_whitelist = null;
194 LuceneBitArray secondary_whitelist = null;
196 if (search_subset_uris != null && search_subset_uris.Count > 0) {
197 primary_whitelist = new LuceneBitArray (primary_searcher);
198 if (secondary_searcher != null)
199 secondary_whitelist = new LuceneBitArray (secondary_searcher);
201 foreach (Uri uri in search_subset_uris) {
202 primary_whitelist.AddUri (uri);
203 if (secondary_whitelist != null)
204 secondary_whitelist.AddUri (uri);
206 primary_whitelist.FlushUris ();
207 if (secondary_whitelist != null)
208 secondary_whitelist.FlushUris ();
212 // Build blacklists from our prohibited parts.
214 LuceneBitArray primary_blacklist = null;
215 LuceneBitArray secondary_blacklist = null;
217 if (primary_prohibited_part_query != null) {
218 primary_blacklist = new LuceneBitArray (primary_searcher,
219 primary_prohibited_part_query);
221 secondary_blacklist = new LuceneBitArray (secondary_searcher);
222 if (secondary_prohibited_part_query != null)
223 secondary_blacklist.Or (secondary_prohibited_part_query);
224 primary_blacklist.Join (secondary_blacklist);
228 // Combine our whitelist and blacklist into just a whitelist.
230 if (primary_blacklist != null) {
231 if (primary_whitelist == null) {
232 primary_blacklist.Not ();
233 primary_whitelist = primary_blacklist;
234 } else {
235 primary_whitelist.AndNot (primary_blacklist);
239 if (secondary_blacklist != null) {
240 if (secondary_whitelist == null) {
241 secondary_blacklist.Not ();
242 secondary_whitelist = secondary_blacklist;
243 } else {
244 secondary_whitelist.AndNot (secondary_blacklist);
248 BetterBitArray primary_matches = null;
250 if (primary_required_part_queries != null) {
252 if (secondary_searcher != null)
253 primary_matches = DoRequiredQueries_TwoIndex (primary_searcher,
254 secondary_searcher,
255 primary_required_part_queries,
256 secondary_required_part_queries,
257 primary_whitelist,
258 secondary_whitelist);
259 else
260 primary_matches = DoRequiredQueries (primary_searcher,
261 primary_required_part_queries,
262 primary_whitelist);
266 sw.Stop ();
267 if (Debug)
268 Logger.Log.Debug ("###### Finished low-level queries in {0}", sw);
269 sw.Reset ();
270 sw.Start ();
272 // If we didn't get any matches, give up.
274 if (primary_matches == null || ! primary_matches.ContainsTrue ())
275 return;
277 GenerateQueryResults (primary_searcher,
278 secondary_searcher,
279 primary_matches,
280 result,
281 query.MaxHits,
282 DateTime.MinValue,
283 DateTime.MaxValue,
284 uri_filter,
285 hit_filter);
287 // Finally, we clean up after ourselves.
290 if (secondary_reader != null)
291 secondary_reader.Close ();
292 primary_searcher.Close ();
293 if (secondary_searcher != null)
294 secondary_searcher.Close ();
297 sw.Stop ();
298 if (Debug)
299 Logger.Log.Debug ("###### Processed query in {0}", sw);
303 ////////////////////////////////////////////////////////////////
306 // Special logic for handling our set of required queries
309 // This is the easy case: we just combine all of the queries
310 // into one big BooleanQuery.
311 private static BetterBitArray DoRequiredQueries (LNS.IndexSearcher primary_searcher,
312 ArrayList primary_queries,
313 BetterBitArray primary_whitelist)
315 LNS.BooleanQuery combined_query;
316 combined_query = new LNS.BooleanQuery ();
317 foreach (LNS.Query query in primary_queries)
318 combined_query.Add (query, true, false);
320 LuceneBitArray matches;
321 matches = new LuceneBitArray (primary_searcher, combined_query);
322 if (primary_whitelist != null)
323 matches.And (primary_whitelist);
325 return matches;
328 // This code attempts to execute N required queries in the
329 // most efficient order to minimize the amount of time spent
330 // joining between the two indexes. It returns a joined bit
331 // array of matches against the primary index.
333 private class MatchInfo : IComparable {
335 public LuceneBitArray PrimaryMatches = null;
336 public LuceneBitArray SecondaryMatches = null;
337 public int UpperBound = 0;
339 public void Join ()
341 PrimaryMatches.Join (SecondaryMatches);
344 public void RestrictBy (MatchInfo joined)
346 if (joined != null) {
347 this.PrimaryMatches.And (joined.PrimaryMatches);
348 this.SecondaryMatches.And (joined.SecondaryMatches);
351 UpperBound = 0;
352 UpperBound += PrimaryMatches.TrueCount;
353 UpperBound += SecondaryMatches.TrueCount;
356 public int CompareTo (object obj)
358 MatchInfo other = (MatchInfo) obj;
359 return this.UpperBound - other.UpperBound;
363 // Any whitelists that are passed in must be fully joined, or
364 // query results will be incorrect.
365 private static BetterBitArray DoRequiredQueries_TwoIndex (LNS.IndexSearcher primary_searcher,
366 LNS.IndexSearcher secondary_searcher,
367 ArrayList primary_queries,
368 ArrayList secondary_queries,
369 BetterBitArray primary_whitelist,
370 BetterBitArray secondary_whitelist)
372 ArrayList match_info_list;
373 match_info_list = new ArrayList ();
375 // First, do all of the low-level queries
376 // and store them in our MatchInfo
377 for (int i = 0; i < primary_queries.Count; ++i) {
378 LNS.Query pq, sq;
379 pq = primary_queries [i] as LNS.Query;
380 sq = secondary_queries [i] as LNS.Query;
382 LuceneBitArray p_matches = null, s_matches = null;
383 p_matches = new LuceneBitArray (primary_searcher);
384 if (pq != null) {
385 p_matches.Or (pq);
386 if (primary_whitelist != null)
387 p_matches.And (primary_whitelist);
390 s_matches = new LuceneBitArray (secondary_searcher);
391 if (sq != null) {
392 s_matches.Or (sq);
393 if (secondary_whitelist != null)
394 s_matches.And (secondary_whitelist);
397 MatchInfo info;
398 info = new MatchInfo ();
399 info.PrimaryMatches = p_matches;
400 info.SecondaryMatches = s_matches;
401 info.RestrictBy (null); // a hack to initialize the UpperBound
402 match_info_list.Add (info);
405 // We want to be smart about the order we do this in,
406 // to minimize the expense of the Join.
407 while (match_info_list.Count > 1) {
409 // FIXME: We don't really need to sort here, it would
410 // be sufficient to just find the minimal element.
411 match_info_list.Sort ();
412 MatchInfo smallest;
413 smallest = match_info_list [0] as MatchInfo;
414 match_info_list.RemoveAt (0);
416 // We can short-circuit if our smallest set of
417 // matches is empty.
418 if (smallest.UpperBound == 0)
419 return smallest.PrimaryMatches; // this must be an empty array.
421 smallest.Join ();
423 foreach (MatchInfo info in match_info_list)
424 info.RestrictBy (smallest);
427 // For the final pair, we don't need to do a full join:
428 // mapping the secondary onto the primary is sufficient
429 MatchInfo last;
430 last = match_info_list [0] as MatchInfo;
431 last.SecondaryMatches.ProjectOnto (last.PrimaryMatches);
433 return last.PrimaryMatches;
436 ////////////////////////////////////////////////////////////////
439 // Given a set of hits, broadcast some set out as our query
440 // results.
443 private static void GenerateQueryResults (LNS.IndexSearcher primary_searcher,
444 LNS.IndexSearcher secondary_searcher,
445 BetterBitArray primary_matches,
446 IQueryResult result,
447 int max_results,
448 DateTime min_date,
449 DateTime max_date,
450 UriFilter uri_filter,
451 HitFilter hit_filter)
453 TopScores top_docs = null;
454 ArrayList all_docs = null;
456 long min_date_num, max_date_num;
457 min_date_num = Int64.Parse (StringFu.DateTimeToString (min_date));
458 max_date_num = Int64.Parse (StringFu.DateTimeToString (max_date));
460 if (max_date_num < min_date_num)
461 return;
463 if (Debug)
464 Logger.Log.Debug (">>> Initially handed {0} matches", primary_matches.TrueCount);
466 if (primary_matches.TrueCount <= max_results) {
467 if (Debug)
468 Logger.Log.Debug (">>> Initial count is within our limit of {0}", max_results);
469 all_docs = new ArrayList ();
470 } else {
471 if (Debug)
472 Logger.Log.Debug (">>> Number of hits is capped at {0}", max_results);
473 top_docs = new TopScores (max_results);
476 Stopwatch total, a, b, c;
477 total = new Stopwatch ();
478 a = new Stopwatch ();
479 b = new Stopwatch ();
480 c = new Stopwatch ();
482 total.Start ();
483 a.Start ();
485 // Pull in the primary documents.
486 // We walk across them backwards, since newer
487 // documents are more likely to be at the end of
488 // the index.
489 int j = primary_matches.Count;
490 while (true) {
491 int i;
492 i = primary_matches.GetPreviousTrueIndex (j);
493 if (i < 0)
494 break;
495 j = i-1; // This way we can't forget to adjust i
497 Document doc;
498 doc = primary_searcher.Doc (i);
500 // Check the timestamp to make sure it is in range
501 long timestamp_num;
502 timestamp_num = Int64.Parse (doc.Get ("Timestamp"));
503 if (timestamp_num < min_date_num || max_date_num < timestamp_num)
504 continue;
506 if (top_docs != null && ! top_docs.WillAccept (timestamp_num))
507 continue;
509 // If we have a UriFilter, apply it.
510 if (uri_filter != null) {
511 Uri uri;
512 uri = GetUriFromDocument (doc);
513 if (! uri_filter (uri))
514 continue;
517 // Add the document to the appropriate data structure.
518 // We use the timestamp_num as the score, so high
519 // scores correspond to more-recent timestamps.
520 if (all_docs != null)
521 all_docs.Add (doc);
522 else
523 top_docs.Add (timestamp_num, doc);
526 a.Stop ();
528 b.Start ();
530 ICollection final_list_of_docs;
531 if (all_docs != null)
532 final_list_of_docs = all_docs;
533 else
534 final_list_of_docs = top_docs.TopScoringObjects;
536 ArrayList final_list_of_hits;
537 final_list_of_hits = new ArrayList (final_list_of_docs.Count);
539 // If we aren't using the secondary index, the next step is
540 // very straightforward.
541 if (secondary_searcher == null) {
543 foreach (Document doc in final_list_of_docs) {
544 Hit hit;
545 hit = DocumentToHit (doc);
546 final_list_of_hits.Add (hit);
549 } else {
551 if (Debug)
552 Logger.Log.Debug (">>> Performing cross-index Hit reunification");
554 Hashtable hits_by_uri;
555 hits_by_uri = UriFu.NewHashtable ();
557 LuceneBitArray secondary_matches;
558 secondary_matches = new LuceneBitArray (secondary_searcher);
560 foreach (Document primary_doc in final_list_of_docs) {
562 Hit hit;
563 hit = DocumentToHit (primary_doc);
565 hits_by_uri [hit.Uri] = hit;
567 secondary_matches.AddUri (hit.Uri);
570 secondary_matches.FlushUris ();
572 // Attach all of our secondary properties
573 // to the hits
574 j = 0;
575 while (true) {
576 int i;
577 i = secondary_matches.GetNextTrueIndex (j);
578 if (i >= secondary_matches.Count)
579 break;
580 j = i+1;
582 Document secondary_doc;
583 secondary_doc = secondary_searcher.Doc (i);
585 Uri uri;
586 uri = GetUriFromDocument (secondary_doc);
587 Logger.Log.Debug ("Joining {0}", uri);
589 Hit hit;
590 hit = hits_by_uri [uri] as Hit;
592 AddPropertiesToHit (hit, secondary_doc, false);
594 final_list_of_hits.Add (hit);
598 b.Stop ();
600 // If we used the TopScores object, we got our original
601 // list of documents sorted for us. If not, sort the
602 // final list.
603 if (top_docs == null)
604 final_list_of_hits.Sort ();
606 c.Start ();
608 Logger.Log.Debug ("Final list length = {0}", final_list_of_hits.Count);
610 // If we have a hit_filter, use it now.
611 if (hit_filter != null) {
612 for (int i = 0; i < final_list_of_hits.Count; ++i) {
613 Hit hit;
614 hit = final_list_of_hits [i] as Hit;
615 if (! hit_filter (hit)) {
616 Logger.Log.Debug ("Filtered out {0}", hit.Uri);
617 final_list_of_hits [i] = null;
622 // Before we broadcast a hit, we strip out any
623 // properties in the PrivateNamespace. We
624 // manipulate the property ArrayList directory,
625 // which is pretty gross... but this is safe,
626 // since removing items will not change the sort
627 // order.
628 foreach (Hit hit in final_list_of_hits) {
629 if (hit == null)
630 continue;
631 int i = 0;
632 while (i < hit.Properties.Count) {
633 Property prop = hit.Properties [i] as Property;
634 if (prop.Key.StartsWith (PrivateNamespace))
635 hit.Properties.RemoveAt (i);
636 else
637 ++i;
641 result.Add (final_list_of_hits);
643 c.Stop ();
644 total.Stop ();
646 if (Debug) {
647 Logger.Log.Debug (">>> GenerateQueryResults time statistics:");
648 Logger.Log.Debug (">>> First pass {0} ({1:0.0}%)", a, 100 * a.ElapsedTime / total.ElapsedTime);
649 Logger.Log.Debug (">>> Hit assembly {0} ({1:0.0}%)", b, 100 * b.ElapsedTime / total.ElapsedTime);
650 Logger.Log.Debug (">>> Final pass {0} ({1:0.0}%)", c, 100 * c.ElapsedTime / total.ElapsedTime);
651 Logger.Log.Debug (">>> TOTAL {0}", total);