/* multimatch.cc
 *
 * Copyright 1999,2000,2001 BrightStation PLC
 * Copyright 2001,2002 Ananova Ltd
 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2013,2014,2015,2016 Olly Betts
 * Copyright 2003 Orange PCS Ltd
 * Copyright 2003 Sam Liddicott
 * Copyright 2007,2008,2009 Lemur Consulting Ltd
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
 * USA
 */
#include <config.h>

#include "multimatch.h"

#include "autoptr.h"
#include "collapser.h"
#include "debuglog.h"
#include "submatch.h"
#include "localsubmatch.h"
#include "omassert.h"
#include "api/omenquireinternal.h"
#include "realtime.h"

#include "api/emptypostlist.h"
#include "branchpostlist.h"
#include "mergepostlist.h"

#include "backends/document.h"

#include "msetcmp.h"

#include "valuestreamdocument.h"
#include "weight/weightinternal.h"

#include <xapian/error.h>
#include <xapian/matchspy.h>
#include <xapian/version.h> // For XAPIAN_HAS_REMOTE_BACKEND

#ifdef XAPIAN_HAS_REMOTE_BACKEND
#include "remotesubmatch.h"
#include "backends/remote/remote-database.h"
#endif /* XAPIAN_HAS_REMOTE_BACKEND */

#include <algorithm>
#include <cfloat> // For DBL_EPSILON.
#include <climits> // For UINT_MAX.
#include <vector>
#include <map>
#include <set>

#ifdef HAVE_TIMER_CREATE
#include <signal.h>
#include <time.h>
#include "safeunistd.h" // For _POSIX_* feature test macros.

extern "C" {

static void
set_timeout_flag(union sigval sv)
{
    *(reinterpret_cast<volatile bool*>(sv.sival_ptr)) = true;
}

}

// The monotonic clock is the better basis for timeouts, but not always
// available.

#ifndef _POSIX_MONOTONIC_CLOCK
const clockid_t TIMEOUT_CLOCK = CLOCK_REALTIME;
#elif defined __sun
// Solaris defines CLOCK_MONOTONIC, but "man timer_create" doesn't mention it
// and when using it timer_create() fails with "EPERM" (perhaps you need to be
// root to use it?  I can't test that).
//
// Solaris defines _POSIX_MONOTONIC_CLOCK so we need to special case.
const clockid_t TIMEOUT_CLOCK = CLOCK_REALTIME;
#elif defined __CYGWIN__
// https://cygwin.com/cygwin-api/std-notes.html currently (2016-05-13) says:
//
//     clock_nanosleep currently supports only CLOCK_REALTIME and
//     CLOCK_MONOTONIC.  clock_setres, clock_settime, and timer_create
//     currently support only CLOCK_REALTIME.
//
// So CLOCK_MONOTONIC is defined, but not supported by timer_create().
const clockid_t TIMEOUT_CLOCK = CLOCK_REALTIME;
#else
const clockid_t TIMEOUT_CLOCK = CLOCK_MONOTONIC;
#endif

class TimeOut {
    struct sigevent sev;
    timer_t timerid;
    volatile bool expired;

  public:
    explicit TimeOut(double limit) : expired(false) {
        if (limit > 0) {
            sev.sigev_notify = SIGEV_THREAD;
            sev.sigev_notify_function = set_timeout_flag;
            sev.sigev_notify_attributes = NULL;
            sev.sigev_value.sival_ptr =
                static_cast<void*>(const_cast<bool*>(&expired));
            if (usual(timer_create(TIMEOUT_CLOCK, &sev, &timerid) == 0)) {
                struct itimerspec interval;
                interval.it_interval.tv_sec = 0;
                interval.it_interval.tv_nsec = 0;
                RealTime::to_timespec(limit, &interval.it_value);
                if (usual(timer_settime(timerid, 0, &interval, NULL) == 0)) {
                    // Timeout successfully set.
                    return;
                }
                timer_delete(timerid);
            }
        }
        sev.sigev_notify = SIGEV_NONE;
    }

    ~TimeOut() {
        if (sev.sigev_notify != SIGEV_NONE) {
            timer_delete(timerid);
            sev.sigev_notify = SIGEV_NONE;
        }
    }

    bool timed_out() const { return expired; }
};
#else
class TimeOut {
  public:
    explicit TimeOut(double) { }
    bool timed_out() const { return false; }
};
#endif
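
// Illustrative sketch (exposition only, not part of the build): get_mset()
// below constructs a TimeOut from time_limit and polls timed_out() in the
// match loop, along these lines:
//
//     TimeOut timeout(2.5);            // request expiry after 2.5 seconds
//     while (more_candidates()) {      // hypothetical loop condition
//         if (timeout.timed_out()) {
//             // Stop checking extra matches beyond those actually wanted.
//             check_at_least = first + maxitems;
//         }
//         // ... consider the next candidate document ...
//     }
//
// If POSIX timers are unavailable (or the limit is <= 0), timed_out() always
// returns false, so the match simply runs to completion.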

using namespace std;
using Xapian::Internal::intrusive_ptr;

const Xapian::Enquire::Internal::sort_setting REL =
        Xapian::Enquire::Internal::REL;
const Xapian::Enquire::Internal::sort_setting REL_VAL =
        Xapian::Enquire::Internal::REL_VAL;
const Xapian::Enquire::Internal::sort_setting VAL =
        Xapian::Enquire::Internal::VAL;
#if 0 // VAL_REL isn't currently used so avoid compiler warnings.
const Xapian::Enquire::Internal::sort_setting VAL_REL =
        Xapian::Enquire::Internal::VAL_REL;
#endif

/** Split an RSet into several sub rsets, one for each database.
 *
 *  @param rset The RSet to split.
 *  @param number_of_subdbs The number of sub databases which exist.
 *  @param subrsets Vector which the sub rsets will be placed in.
 *                  This should be empty when the function is called.
 */
static void
split_rset_by_db(const Xapian::RSet * rset,
                 Xapian::doccount number_of_subdbs,
                 vector<Xapian::RSet> & subrsets)
{
    LOGCALL_STATIC_VOID(MATCH, "split_rset_by_db", rset | number_of_subdbs | subrsets);
    if (rset && !rset->empty()) {
        if (number_of_subdbs == 1) {
            // The common case of a single database is easy to handle.
            subrsets.push_back(*rset);
        } else {
            // Can't just use vector::resize() here, since that creates N
            // copies of the same RSet!
            subrsets.reserve(number_of_subdbs);
            for (size_t i = 0; i < number_of_subdbs; ++i) {
                subrsets.push_back(Xapian::RSet());
            }

            const set<Xapian::docid> & rsetitems = rset->internal->get_items();
            set<Xapian::docid>::const_iterator j;
            for (j = rsetitems.begin(); j != rsetitems.end(); ++j) {
                Xapian::doccount local_docid = (*j - 1) / number_of_subdbs + 1;
                Xapian::doccount subdatabase = (*j - 1) % number_of_subdbs;
                subrsets[subdatabase].add_document(local_docid);
            }
        }
    } else {
        // NB vector::resize() creates N copies of the same empty RSet.
        subrsets.resize(number_of_subdbs);
    }
    Assert(subrsets.size() == number_of_subdbs);
}
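
// Worked example of the docid mapping above (exposition only): Xapian
// interleaves documents across shards, so with number_of_subdbs == 3 the
// global docid 7 maps to subdatabase (7 - 1) % 3 == 0 and local docid
// (7 - 1) / 3 + 1 == 3, i.e. the third document of the first shard.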

/** Prepare some SubMatches.
 *
 *  This calls the prepare_match() method on each SubMatch object, causing them
 *  to lookup various statistics.
 *
 *  This method is rather complicated in order to handle remote matches
 *  efficiently.  Instead of simply calling "prepare_match()" on each submatch
 *  and waiting for it to return, it first calls "prepare_match(true)" on each
 *  submatch.  If any of these calls return false, indicating that the required
 *  information has not yet been received from the server, the method will try
 *  those which returned false again, passing "false" as a parameter to
 *  indicate that they should block until the information is ready.
 *
 *  This should improve performance in the case of mixed local-and-remote
 *  searches - the local searchers will all fetch their statistics from disk
 *  without waiting for the remote searchers, so as soon as the remote searcher
 *  statistics arrive, we can move on to the next step.
 */
static void
prepare_sub_matches(vector<intrusive_ptr<SubMatch> > & leaves,
                    Xapian::Weight::Internal & stats)
{
    LOGCALL_STATIC_VOID(MATCH, "prepare_sub_matches", leaves | stats);
    // We use a vector<bool> to track which SubMatches we've already prepared.
    vector<bool> prepared;
    prepared.resize(leaves.size(), false);
    size_t unprepared = leaves.size();
    bool nowait = true;
    while (unprepared) {
        for (size_t leaf = 0; leaf < leaves.size(); ++leaf) {
            if (prepared[leaf]) continue;
            SubMatch * submatch = leaves[leaf].get();
            if (!submatch || submatch->prepare_match(nowait, stats)) {
                prepared[leaf] = true;
                --unprepared;
            }
        }
        // Use blocking IO on subsequent passes, so that we don't go into
        // a tight loop.
        nowait = false;
    }
}
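
// To illustrate the two-pass approach (exposition only): with two local
// shards and one remote shard, the first pass (nowait == true) prepares both
// local shards immediately and skips the remote one if its statistics
// haven't arrived yet; the second pass (nowait == false) then blocks on just
// the remote shard, so local disk reads and the network round trip overlap.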

/// Class which applies several match spies in turn.
class MultipleMatchSpy : public Xapian::MatchSpy {
  private:
    /// List of match spies to call, in order.
    const std::vector<Xapian::Internal::opt_intrusive_ptr<Xapian::MatchSpy>> & spies;

  public:
    explicit
    MultipleMatchSpy(const std::vector<Xapian::Internal::opt_intrusive_ptr<Xapian::MatchSpy>> & spies_)
        : spies(spies_) {}

    /** Implementation of virtual operator().
     *
     *  This implementation calls all the spies in turn.
     */
    void operator()(const Xapian::Document &doc, double wt);
};

void
MultipleMatchSpy::operator()(const Xapian::Document &doc, double wt) {
    LOGCALL_VOID(MATCH, "MultipleMatchSpy::operator()", doc | wt);
    for (auto i : spies) {
        (*i)(doc, wt);
    }
}
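
// Usage sketch (exposition only): if matchspies holds, say, a
// Xapian::ValueCountMatchSpy plus a custom spy, MultipleMatchSpy lets the
// match loop below treat them as a single spy - one call to
// multispy(doc, wt) forwards to each registered spy in registration order.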

////////////////////////////////////
// Initialisation and cleaning up //
////////////////////////////////////

MultiMatch::MultiMatch(const Xapian::Database &db_,
                       const Xapian::Query & query_,
                       Xapian::termcount qlen,
                       const Xapian::RSet * omrset,
                       Xapian::doccount collapse_max_,
                       Xapian::valueno collapse_key_,
                       int percent_cutoff_, double weight_cutoff_,
                       Xapian::Enquire::docid_order order_,
                       Xapian::valueno sort_key_,
                       Xapian::Enquire::Internal::sort_setting sort_by_,
                       bool sort_value_forward_,
                       double time_limit_,
                       Xapian::Weight::Internal & stats,
                       const Xapian::Weight * weight_,
                       const vector<Xapian::Internal::opt_intrusive_ptr<Xapian::MatchSpy>> & matchspies_,
                       bool have_sorter, bool have_mdecider)
        : db(db_), query(query_),
          collapse_max(collapse_max_), collapse_key(collapse_key_),
          percent_cutoff(percent_cutoff_), weight_cutoff(weight_cutoff_),
          order(order_),
          sort_key(sort_key_), sort_by(sort_by_),
          sort_value_forward(sort_value_forward_),
          time_limit(time_limit_),
          weight(weight_),
          is_remote(db.internal.size()),
          matchspies(matchspies_)
{
    LOGCALL_CTOR(MATCH, "MultiMatch", db_ | query_ | qlen | omrset | collapse_max_ | collapse_key_ | percent_cutoff_ | weight_cutoff_ | int(order_) | sort_key_ | int(sort_by_) | sort_value_forward_ | time_limit_ | stats | weight_ | matchspies_ | have_sorter | have_mdecider);

    if (query.empty()) return;

    Xapian::doccount number_of_subdbs = db.internal.size();
    vector<Xapian::RSet> subrsets;
    split_rset_by_db(omrset, number_of_subdbs, subrsets);

    for (size_t i = 0; i != number_of_subdbs; ++i) {
        Xapian::Database::Internal *subdb = db.internal[i].get();
        Assert(subdb);
        intrusive_ptr<SubMatch> smatch;

        // There is currently only one special case, for network databases.
#ifdef XAPIAN_HAS_REMOTE_BACKEND
        if (subdb->get_backend_info(NULL) == BACKEND_REMOTE) {
            RemoteDatabase *rem_db = static_cast<RemoteDatabase*>(subdb);
            if (have_sorter) {
                throw Xapian::UnimplementedError("Xapian::KeyMaker not supported for the remote backend");
            }
            if (have_mdecider) {
                throw Xapian::UnimplementedError("Xapian::MatchDecider not supported for the remote backend");
            }
            // FIXME: Remote handling for time_limit with multiple
            // databases may need some work.
            rem_db->set_query(query, qlen, collapse_max, collapse_key,
                              order, sort_key, sort_by, sort_value_forward,
                              time_limit,
                              percent_cutoff, weight_cutoff, weight,
                              subrsets[i], matchspies);
            bool decreasing_relevance =
                (sort_by == REL || sort_by == REL_VAL);
            smatch = new RemoteSubMatch(rem_db, decreasing_relevance, matchspies);
            is_remote[i] = true;
        } else {
            smatch = new LocalSubMatch(subdb, query, qlen, subrsets[i], weight,
                                       i);
            subdb->readahead_for_query(query);
        }
#else
        // Avoid unused parameter warnings.
        (void)have_sorter;
        (void)have_mdecider;
        smatch = new LocalSubMatch(subdb, query, qlen, subrsets[i], weight, i);
#endif /* XAPIAN_HAS_REMOTE_BACKEND */
        leaves.push_back(smatch);
    }

    stats.set_query(query);
    prepare_sub_matches(leaves, stats);
    stats.set_bounds_from_db(db);
}

double
MultiMatch::getorrecalc_maxweight(PostList *pl)
{
    LOGCALL(MATCH, double, "MultiMatch::getorrecalc_maxweight", pl);
    double wt;
    if (recalculate_w_max) {
        LOGLINE(MATCH, "recalculating max weight");
        wt = pl->recalc_maxweight();
        recalculate_w_max = false;
    } else {
        wt = pl->get_maxweight();
        LOGLINE(MATCH, "pl = (" << pl->get_description() << ")");
        // FIXME: This fails for hoistnotbug1 under multi_remoteprog_glass:
        // AssertionError: matcher/multimatch.cc:370: within_DBL_EPSILON(wt, pl->recalc_maxweight()) : values were 2.7075940282079162813 and 2.6966211268553141878
        //
        // AssertEqDoubleParanoid(wt, pl->recalc_maxweight());
        //
        // Not sure why - adding code to call recalc_maxweight() doesn't seem
        // to help.  But the max weight not being as low as it could be is
        // only a potential missed optimisation opportunity, not a correctness
        // issue.
    }
    LOGLINE(MATCH, "max possible doc weight = " << wt);
    RETURN(wt);
}

void
MultiMatch::get_mset(Xapian::doccount first, Xapian::doccount maxitems,
                     Xapian::doccount check_at_least,
                     Xapian::MSet & mset,
                     Xapian::Weight::Internal & stats,
                     const Xapian::MatchDecider *mdecider,
                     const Xapian::KeyMaker *sorter)
{
    LOGCALL_VOID(MATCH, "MultiMatch::get_mset", first | maxitems | check_at_least | Literal("mset") | stats | Literal("mdecider") | Literal("sorter"));
    AssertRel(check_at_least,>=,first + maxitems);

    if (query.empty()) {
        mset = Xapian::MSet();
        mset.internal->firstitem = first;
        return;
    }

    Assert(!leaves.empty());

    TimeOut timeout(time_limit);

#ifdef XAPIAN_HAS_REMOTE_BACKEND
    // If there's only one database and it's remote, we can just unserialise
    // its MSet and return that.
    if (leaves.size() == 1 && is_remote[0]) {
        RemoteSubMatch * rem_match;
        rem_match = static_cast<RemoteSubMatch*>(leaves[0].get());
        rem_match->start_match(first, maxitems, check_at_least, stats);
        rem_match->get_mset(mset);
        return;
    }
#endif

    // Start matchers.
    for (size_t i = 0; i != leaves.size(); ++i) {
        auto& leaf = leaves[i];
#ifdef XAPIAN_HAS_REMOTE_BACKEND
        if (is_remote[i]) {
            Xapian::doccount remote_maxitems = first + maxitems;
            if (collapse_max != 0) {
                // If collapsing we need to fetch all check_at_least items in
                // order to satisfy the requirement that if there are <=
                // check_at_least results then the estimated number of matches
                // is exact.
                remote_maxitems = check_at_least;
            }
            leaf->start_match(0, remote_maxitems, check_at_least, stats);
            continue;
        }
#endif
        leaf->start_match(0, first + maxitems, check_at_least, stats);
    }

    // Get postlists and term info
    vector<PostList *> postlists;
    Xapian::termcount total_subqs = 0;
    // Keep a count of matches which we know exist, but we won't see.  This
    // occurs when a submatch is remote, and returns a lower bound on the
    // number of matching documents which is higher than the number of
    // documents it returns (because it wasn't asked for more documents).
    Xapian::doccount definite_matches_not_seen = 0;
#ifdef XAPIAN_HAS_REMOTE_BACKEND
    // Track these for calculating uncollapsed_upper_bound for the local
    // submatches.
    size_t n_remotes = 0;
    Xapian::doccount remote_uncollapsed_upper_bound = 0;
#endif
    try {
        for (size_t i = 0; i != leaves.size(); ++i) {
            // Pick the highest total subqueries answer amongst the
            // subdatabases, as the query to postlist conversion doesn't
            // recurse into positional queries for shards that don't have
            // positional data when at least one other shard does.
            Xapian::termcount total_subqs_i = 0;
            PostList* pl = leaves[i]->get_postlist(this, &total_subqs_i, stats);
            total_subqs = max(total_subqs, total_subqs_i);
#ifdef XAPIAN_HAS_REMOTE_BACKEND
            if (is_remote[i]) {
                ++n_remotes;
                RemoteSubMatch* rem_match =
                    static_cast<RemoteSubMatch*>(leaves[i].get());
                remote_uncollapsed_upper_bound +=
                    rem_match->get_uncollapsed_upper_bound();
                if (collapse_max == 0 &&
                    pl->get_termfreq_min() > first + maxitems) {
                    LOGLINE(MATCH, "Found " <<
                            pl->get_termfreq_min() - (first + maxitems)
                            << " definite matches in remote submatch "
                            "which aren't passed to local match");
                    definite_matches_not_seen += pl->get_termfreq_min();
                    definite_matches_not_seen -= first + maxitems;
                }
            }
#endif
            postlists.push_back(pl);
        }
    } catch (...) {
        for (auto pl : postlists) delete pl;
        throw;
    }

    Assert(!postlists.empty());

    ValueStreamDocument vsdoc(db);
    ++vsdoc._refs;
    Xapian::Document doc(&vsdoc);

    // Get a single combined postlist
    AutoPtr<PostList> pl;
    if (postlists.size() == 1) {
        pl.reset(postlists.front());
    } else {
        pl.reset(new MergePostList(postlists, this, vsdoc));
    }

    LOGLINE(MATCH, "pl = (" << pl->get_description() << ")");

    // Empty result set
    Xapian::doccount docs_matched = 0;
    double greatest_wt = 0;
    Xapian::termcount greatest_wt_subqs_matched = 0;
#ifdef XAPIAN_HAS_REMOTE_BACKEND
    unsigned greatest_wt_subqs_db_num = UINT_MAX;
#endif
    vector<Xapian::Internal::MSetItem> items;

    // maximum weight a document could possibly have
    const double max_possible = pl->recalc_maxweight();

    LOGLINE(MATCH, "pl = (" << pl->get_description() << ")");
    recalculate_w_max = false;

    Xapian::doccount matches_upper_bound = pl->get_termfreq_max();
    Xapian::doccount matches_lower_bound = 0;
    Xapian::doccount matches_estimated = pl->get_termfreq_est();
    if (mdecider == NULL) {
        // If we have a match decider, the lower bound must be
        // set to 0 as we could discard all hits.  Otherwise set it to the
        // minimum number of entries which the postlist could return.
        matches_lower_bound = pl->get_termfreq_min();
    }

    // Prepare the matchspy
    Xapian::MatchSpy *matchspy = NULL;
    MultipleMatchSpy multispy(matchspies);
    if (!matchspies.empty()) {
        if (matchspies.size() == 1) {
            matchspy = matchspies[0].get();
        } else {
            matchspy = &multispy;
        }
    }

    // Check if any results have been asked for (might just be wanting
    // maxweight).
    if (check_at_least == 0) {
        pl.reset(NULL);
        Xapian::doccount uncollapsed_lower_bound = matches_lower_bound;
        if (collapse_max) {
            // Lower bound must be set to no more than collapse_max, since it's
            // possible that all matching documents have the same collapse_key
            // value and so are collapsed together.
            if (matches_lower_bound > collapse_max)
                matches_lower_bound = collapse_max;
        }
        mset.internal = new Xapian::MSet::Internal(
                first,
                matches_upper_bound,
                matches_lower_bound,
                matches_estimated,
                matches_upper_bound,
                uncollapsed_lower_bound,
                matches_estimated,
                max_possible, greatest_wt, items,
                0);
        return;
    }

    // Number of documents considered by a decider.
    Xapian::doccount decider_considered = 0;
    // Number of documents denied by the decider.
    Xapian::doccount decider_denied = 0;

    // Set max number of results that we want - this is used to decide
    // when to throw away unwanted items.
    Xapian::doccount max_msize = first + maxitems;
    items.reserve(max_msize + 1);

    // Tracks the minimum item currently eligible for the MSet - we compare
    // candidate items against this.
    Xapian::Internal::MSetItem min_item(0.0, 0);

    // Minimum weight an item must have to be worth considering.
    double min_weight = weight_cutoff;

    // Factor to multiply maximum weight seen by to get the cutoff weight.
    double percent_cutoff_factor = percent_cutoff / 100.0;
    // Corresponding correction to that in omenquire.cc to account for excess
    // precision on x86.
    percent_cutoff_factor -= DBL_EPSILON;
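    // Worked example (exposition only): with percent_cutoff = 40, the factor
    // is 0.4 (less DBL_EPSILON), so once a document weighing 10.0 has been
    // seen, any candidate weighing under 10.0 * 0.4 = 4.0 can be discarded
    // without further processing.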

    // Object to handle collapsing.
    Collapser collapser(collapse_key, collapse_max);

    /// Comparison functor for sorting MSet
    bool sort_forward = (order != Xapian::Enquire::DESCENDING);
    MSetCmp mcmp(get_msetcmp_function(sort_by, sort_forward, sort_value_forward));

    // Perform query

    // We form the mset in two stages.  In the first we fill up our working
    // mset.  Adding a new document does not remove another.
    //
    // In the second, we consider documents which rank higher than the current
    // lowest ranking document in the mset.  Each document added expels the
    // current lowest ranking document.
    //
    // If a percentage cutoff is in effect, it can cause the matcher to drop
    // back to the first stage from the second.

    // Is the mset a valid heap?
    bool is_heap = false;

    while (true) {
        bool pushback;

        if (rare(recalculate_w_max)) {
            if (min_weight > 0.0) {
                if (rare(getorrecalc_maxweight(pl.get()) < min_weight)) {
                    LOGLINE(MATCH, "*** TERMINATING EARLY (1)");
                    break;
                }
            }
        }

        PostList * pl_copy = pl.get();
        if (rare(next_handling_prune(pl_copy, min_weight, this))) {
            (void)pl.release();
            pl.reset(pl_copy);
            LOGLINE(MATCH, "*** REPLACING ROOT");

            if (min_weight > 0.0) {
                // No need for a full recalc (unless we've got to do one
                // because of a prune elsewhere) - we're just switching to a
                // subtree.
                if (rare(getorrecalc_maxweight(pl.get()) < min_weight)) {
                    LOGLINE(MATCH, "*** TERMINATING EARLY (2)");
                    break;
                }
            }
        }

        if (rare(pl->at_end())) {
            LOGLINE(MATCH, "Reached end of potential matches");
            break;
        }

        // Only calculate the weight if we need it for mcmp, or there's a
        // percentage or weight cutoff in effect.  Otherwise we calculate it
        // below if we haven't already rejected this candidate.
        double wt = 0.0;
        bool calculated_weight = false;
        if (sort_by != VAL || min_weight > 0.0) {
            wt = pl->get_weight();
            if (wt < min_weight) {
                LOGLINE(MATCH, "Rejecting potential match due to insufficient weight");
                continue;
            }
            calculated_weight = true;
        }

        Xapian::docid did = pl->get_docid();
        vsdoc.set_document(did);
        LOGLINE(MATCH, "Candidate document id " << did << " wt " << wt);
        Xapian::Internal::MSetItem new_item(wt, did);
        if (check_at_least > first + maxitems && timeout.timed_out()) {
            check_at_least = first + maxitems;
        }

        if (sort_by != REL) {
            const string * ptr = pl->get_sort_key();
            if (ptr) {
                new_item.sort_key = *ptr;
            } else if (sorter) {
                new_item.sort_key = (*sorter)(doc);
            } else {
                new_item.sort_key = vsdoc.get_value(sort_key);
            }

            // We're sorting by value (in part at least), so compare the item
            // against the lowest currently in the proto-mset.  If sort_by is
            // VAL, then new_item.wt won't yet be set, but that doesn't
            // matter since it's not used by the sort function.
            if (!mcmp(new_item, min_item)) {
                if (mdecider == NULL && !collapser) {
                    // Document was definitely suitable for mset - no more
                    // processing needed.
                    LOGLINE(MATCH, "Making note of match item which sorts lower than min_item");
                    ++docs_matched;
                    if (!calculated_weight) wt = pl->get_weight();
                    if (matchspy) {
                        matchspy->operator()(doc, wt);
                    }
                    if (wt > greatest_wt) goto new_greatest_weight;
                    continue;
                }
                if (docs_matched >= check_at_least) {
                    // We've seen enough items - we can drop this one.
                    LOGLINE(MATCH, "Dropping candidate which sorts lower than min_item");
                    // FIXME: hmm, match decider might have rejected this...
                    if (!calculated_weight) wt = pl->get_weight();
                    if (wt > greatest_wt) goto new_greatest_weight;
                    continue;
                }

                // We can't drop the item, because we need to test whether the
                // mdecider would accept it and/or test whether it would be
                // collapsed.
                LOGLINE(MATCH, "Keeping candidate which sorts lower than min_item for further investigation");
            }
        }

        // Use the match spy and/or decision functors (if specified).
        if (matchspy != NULL || mdecider != NULL) {
            const unsigned int multiplier = db.internal.size();
            Assert(multiplier != 0);
            Xapian::doccount n = (did - 1) % multiplier; // which actual database
            // If the results are from a remote database, then the functor will
            // already have been applied there so we can skip this step.
            if (!is_remote[n]) {
                ++decider_considered;
                if (mdecider && !mdecider->operator()(doc)) {
                    ++decider_denied;
                    continue;
                }
                if (matchspy) {
                    if (!calculated_weight) {
                        wt = pl->get_weight();
                        new_item.wt = wt;
                        calculated_weight = true;
                    }
                    matchspy->operator()(doc, wt);
                }
            }
        }

        if (!calculated_weight) {
            // we didn't calculate the weight above, but now we will need it
            wt = pl->get_weight();
            new_item.wt = wt;
        }

        pushback = true;

        // Perform collapsing on key if requested.
        if (collapser) {
            collapse_result res;
            res = collapser.process(new_item, pl.get(), vsdoc, mcmp);
            if (res == REJECTED) {
                // If we're sorting by relevance primarily, then we throw away
                // the lower weighted document anyway.
                if (sort_by != REL && sort_by != REL_VAL) {
                    if (wt > greatest_wt) goto new_greatest_weight;
                }
                continue;
            }

            if (res == REPLACED) {
                // There was a previous item in the collapse tab so
                // the MSet can't be empty.
                Assert(!items.empty());

                const Xapian::Internal::MSetItem & old_item =
                    collapser.old_item;
                // This is one of the best collapse_max potential MSet entries
                // with this key which we've seen so far.  Check if the
                // entry with this key which it displaced might still be in the
                // proto-MSet.  If it might be, we need to check through for
                // it.
                double old_wt = old_item.wt;
                if (old_wt >= min_weight && mcmp(old_item, min_item)) {
                    // Scan through (unsorted) MSet looking for entry.
                    // FIXME: more efficient way than just scanning?
                    Xapian::docid olddid = old_item.did;
                    vector<Xapian::Internal::MSetItem>::iterator i;
                    for (i = items.begin(); i != items.end(); ++i) {
                        if (i->did == olddid) {
                            LOGLINE(MATCH, "collapse: removing " <<
                                    olddid << ": " <<
                                    new_item.collapse_key);
                            // We can replace an arbitrary element in O(log N)
                            // but have to do it by hand (in this case the new
                            // elt is bigger, so we just swap down the tree).
                            // FIXME: implement this, and clean up is_heap
                            // handling
                            *i = new_item;
                            pushback = false;
                            is_heap = false;
                            break;
                        }
                    }
                }
            }
        }

        // OK, actually add the item to the mset.
        if (pushback) {
            ++docs_matched;
            if (items.size() >= max_msize) {
                items.push_back(new_item);
                if (!is_heap) {
                    is_heap = true;
                    make_heap(items.begin(), items.end(), mcmp);
                } else {
                    push_heap<vector<Xapian::Internal::MSetItem>::iterator,
                              MSetCmp>(items.begin(), items.end(), mcmp);
                }
                pop_heap<vector<Xapian::Internal::MSetItem>::iterator,
                         MSetCmp>(items.begin(), items.end(), mcmp);
                items.pop_back();

                if (!items.empty()) {
                    // In the get_mset(0, 0, X) (X > 0) case just don't update min_item.
                    min_item = items.front();
                }
                if (sort_by == REL || sort_by == REL_VAL) {
                    if (docs_matched >= check_at_least) {
                        if (sort_by == REL) {
                            // We're done if this is a forward boolean match
                            // with only one database (bodgetastic, FIXME
                            // better if we can!)
                            if (rare(max_possible == 0 && sort_forward)) {
                                // In the multi database case, MergePostList
                                // currently processes each database
                                // sequentially (which actually may well be
                                // more efficient) so the docids in general
                                // won't arrive in order.
                                if (leaves.size() == 1) break;
                            }
                        }
                        if (min_item.wt > min_weight) {
                            LOGLINE(MATCH, "Setting min_weight to " <<
                                    min_item.wt << " from " << min_weight);
                            min_weight = min_item.wt;
                        }
                    }
                }
                if (rare(getorrecalc_maxweight(pl.get()) < min_weight)) {
                    LOGLINE(MATCH, "*** TERMINATING EARLY (3)");
                    break;
                }
            } else {
                items.push_back(new_item);
                is_heap = false;
                if (sort_by == REL && items.size() == max_msize) {
                    if (docs_matched >= check_at_least) {
                        // We're done if this is a forward boolean match
                        // with only one database (bodgetastic, FIXME
                        // better if we can!)
                        if (rare(max_possible == 0 && sort_forward)) {
                            // In the multi database case, MergePostList
                            // currently processes each database
                            // sequentially (which actually may well be
                            // more efficient) so the docids in general
                            // won't arrive in order.
                            if (leaves.size() == 1) break;
                        }
                    }
                }
            }
        }
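
        // Sketch of the proto-mset maintenance above (exposition only):
        // suppose max_msize == 3 and items holds weights {9, 7, 5} as a heap
        // ordered so the *worst* item per mcmp is at the front.  A new
        // candidate of weight 8 is pushed (giving 4 items), then
        // pop_heap()/pop_back() expels the worst (5), leaving {9, 8, 7} and
        // raising min_item - an O(log n) replacement per candidate rather
        // than a full re-sort.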

        // Keep a track of the greatest weight we've seen.
        if (wt > greatest_wt) {
new_greatest_weight:
            greatest_wt = wt;
#ifdef XAPIAN_HAS_REMOTE_BACKEND
            const unsigned int multiplier = db.internal.size();
            unsigned int db_num = (did - 1) % multiplier;
            if (is_remote[db_num]) {
                // Note that the greatest weighted document came from a remote
                // database, and which one.
                greatest_wt_subqs_db_num = db_num;
            } else
#endif
            {
                greatest_wt_subqs_matched = pl->count_matching_subqs();
#ifdef XAPIAN_HAS_REMOTE_BACKEND
                greatest_wt_subqs_db_num = UINT_MAX;
#endif
            }

            if (percent_cutoff) {
                double w = wt * percent_cutoff_factor;
                if (w > min_weight) {
                    min_weight = w;
                    if (!is_heap) {
                        is_heap = true;
                        make_heap<vector<Xapian::Internal::MSetItem>::iterator,
                                  MSetCmp>(items.begin(), items.end(), mcmp);
                    }
                    while (!items.empty() && items.front().wt < min_weight) {
                        pop_heap<vector<Xapian::Internal::MSetItem>::iterator,
                                 MSetCmp>(items.begin(), items.end(), mcmp);
                        Assert(items.back().wt < min_weight);
                        items.pop_back();
                    }
#ifdef XAPIAN_ASSERTIONS_PARANOID
                    vector<Xapian::Internal::MSetItem>::const_iterator i;
                    for (i = items.begin(); i != items.end(); ++i) {
                        Assert(i->wt >= min_weight);
                    }
#endif
                }
            }
        }
    }

    // done with posting list tree
    pl.reset(NULL);

    double percent_scale = 0;
    if (!items.empty() && greatest_wt > 0) {
#ifdef XAPIAN_HAS_REMOTE_BACKEND
        if (greatest_wt_subqs_db_num != UINT_MAX) {
            const unsigned int n = greatest_wt_subqs_db_num;
            RemoteSubMatch * rem_match;
            rem_match = static_cast<RemoteSubMatch*>(leaves[n].get());
            percent_scale = rem_match->get_percent_factor() / 100.0;
        } else
#endif
        {
            percent_scale = greatest_wt_subqs_matched / double(total_subqs);
            percent_scale /= greatest_wt;
        }
        Assert(percent_scale > 0);
        if (percent_cutoff) {
            // FIXME: better to sort and binary chop maybe?
            // Or we could just use a linear scan here instead.

            // trim the mset to the correct answer...
            double min_wt = percent_cutoff_factor / percent_scale;
            if (!is_heap) {
                is_heap = true;
                make_heap<vector<Xapian::Internal::MSetItem>::iterator,
                          MSetCmp>(items.begin(), items.end(), mcmp);
            }
            while (!items.empty() && items.front().wt < min_wt) {
                pop_heap<vector<Xapian::Internal::MSetItem>::iterator,
                         MSetCmp>(items.begin(), items.end(), mcmp);
                Assert(items.back().wt < min_wt);
                items.pop_back();
            }
#ifdef XAPIAN_ASSERTIONS_PARANOID
            vector<Xapian::Internal::MSetItem>::const_iterator j;
            for (j = items.begin(); j != items.end(); ++j) {
                Assert(j->wt >= min_wt);
            }
#endif
        }
    }

    LOGLINE(MATCH,
            "docs_matched = " << docs_matched <<
            ", definite_matches_not_seen = " << definite_matches_not_seen <<
            ", matches_lower_bound = " << matches_lower_bound <<
            ", matches_estimated = " << matches_estimated <<
            ", matches_upper_bound = " << matches_upper_bound);

    // Adjust docs_matched to take account of documents which matched remotely
    // but weren't sent across.
    docs_matched += definite_matches_not_seen;

    Xapian::doccount uncollapsed_lower_bound = matches_lower_bound;
    Xapian::doccount uncollapsed_upper_bound = matches_upper_bound;
    Xapian::doccount uncollapsed_estimated = matches_estimated;
#ifdef XAPIAN_HAS_REMOTE_BACKEND
    if (collapser && n_remotes) {
        // We need to adjust uncollapsed_upper_bound if there are multiple
        // shards and some or all are remote.  The lower bound and estimate
        // will be valid, though potentially could be better, but this
        // doesn't seem worth addressing in 1.4.x - the code on master
        // handles this correctly via merging MSet objects.
        if (n_remotes == leaves.size()) {
            // All shards are remote so we just use the sum of
            // uncollapsed_upper_bound over the remotes.
            uncollapsed_upper_bound = remote_uncollapsed_upper_bound;
        } else {
            // Sum and clamp to the number of documents.  This is crude but
            // at least gives a valid answer.
            Xapian::doccount num_docs = db.get_doccount();
            uncollapsed_upper_bound += remote_uncollapsed_upper_bound;
            if (uncollapsed_upper_bound < remote_uncollapsed_upper_bound ||
                uncollapsed_upper_bound > num_docs) {
                uncollapsed_upper_bound = num_docs;
            }
        }
    }
#endif
    if (items.size() < max_msize) {
        // We have fewer items in the mset than we tried to get for it, so we
        // must have all the matches in it.
        LOGLINE(MATCH, "items.size() = " << items.size() <<
                ", max_msize = " << max_msize << ", setting bounds equal");
        Assert(definite_matches_not_seen == 0);
        Assert(percent_cutoff || docs_matched == items.size());
        matches_lower_bound = matches_upper_bound = matches_estimated
            = items.size();
        if (collapser && matches_lower_bound > uncollapsed_lower_bound)
            uncollapsed_lower_bound = matches_lower_bound;
    } else if (!collapser && docs_matched < check_at_least) {
        // We have seen fewer matches than we checked for, so we must have seen
        // all the matches.
        LOGLINE(MATCH, "Setting bounds equal");
        matches_lower_bound = matches_upper_bound = matches_estimated
            = docs_matched;
        if (collapser && matches_lower_bound > uncollapsed_lower_bound)
            uncollapsed_lower_bound = matches_lower_bound;
    } else {
        AssertRel(matches_estimated,>=,matches_lower_bound);
        AssertRel(matches_estimated,<=,matches_upper_bound);
1001 AssertRel(matches_estimated,<=,matches_upper_bound);
1003 // We can end up scaling the estimate more than once, so collect
1004 // the scale factors and apply them in one go to avoid rounding
1005 // more than once.
1006 double estimate_scale = 1.0;
1007 double unique_rate = 1.0;
1009 if (collapser) {
1010 LOGLINE(MATCH, "Adjusting bounds due to collapse_key");
1011 matches_lower_bound = collapser.get_matches_lower_bound();
1012 if (matches_lower_bound > uncollapsed_lower_bound)
1013 uncollapsed_lower_bound = matches_lower_bound;
1015 // The estimate for the number of hits can be modified by
1016 // multiplying it by the rate at which we've been finding
1017 // unique documents.
1018 Xapian::doccount docs_considered = collapser.get_docs_considered();
1019 Xapian::doccount dups_ignored = collapser.get_dups_ignored();
1020 if (docs_considered > 0) {
1021 double unique = double(docs_considered - dups_ignored);
1022 unique_rate = unique / double(docs_considered);
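
            // Worked example (exposition only): if the collapser considered
            // 200 documents and ignored 50 as duplicates, unique_rate is
            // 150.0 / 200.0 = 0.75, so an estimate of 1000 matches is later
            // scaled down to roughly 750.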

            // We can safely reduce the upper bound by the number of duplicates
            // we've ignored.
            matches_upper_bound -= dups_ignored;

            LOGLINE(MATCH, "matches_lower_bound=" << matches_lower_bound <<
                    ", matches_estimated=" << matches_estimated <<
                    "*" << estimate_scale << "*" << unique_rate <<
                    ", matches_upper_bound=" << matches_upper_bound);
        }

        if (mdecider) {
            if (!percent_cutoff) {
                if (!collapser) {
                    // We're not collapsing or doing a percentage cutoff, so
                    // docs_matched is a lower bound on the total number of
                    // matches.
                    matches_lower_bound = max(docs_matched, matches_lower_bound);
                }
            }

            // The estimate for the number of hits can be modified by
            // multiplying it by the rate at which the match decider has
            // been accepting documents.
            if (decider_considered > 0) {
                double accept = double(decider_considered - decider_denied);
                double accept_rate = accept / double(decider_considered);
                estimate_scale *= accept_rate;
            }

            // If a document is denied by a match decider, it is not possible
            // for it to be found to be a duplicate, so it is safe to also
            // reduce the upper bound by the number of documents denied by a
            // match decider.
            matches_upper_bound -= decider_denied;
            if (collapser) uncollapsed_upper_bound -= decider_denied;
        }

        if (percent_cutoff) {
            estimate_scale *= (1.0 - percent_cutoff_factor);
            // another approach:
            // Xapian::doccount new_est = items.size() * (1 - percent_cutoff_factor) / (1 - min_weight / greatest_wt);
            // and another: items.size() + (1 - greatest_wt * percent_cutoff_factor / min_weight) * (matches_estimated - items.size());

            // Very likely an underestimate, but we can't really do better
            // without checking further matches...  Only possibility would be
            // to track how many docs made the min weight test but didn't make
            // the candidate set since the last greatest_wt change, which we
            // could use if the top documents matched all the prob terms.
            matches_lower_bound = items.size();
            if (collapser) uncollapsed_lower_bound = matches_lower_bound;

            // matches_upper_bound could be reduced by the number of documents
            // which fail the min weight test (FIXME)

            LOGLINE(MATCH, "Adjusted bounds due to percent_cutoff (" <<
                    percent_cutoff << "): now have matches_estimated=" <<
                    matches_estimated << ", matches_lower_bound=" <<
                    matches_lower_bound);
        }

        if (collapser && estimate_scale != 1.0) {
            uncollapsed_estimated =
                Xapian::doccount(uncollapsed_estimated * estimate_scale + 0.5);
        }

        estimate_scale *= unique_rate;

        if (estimate_scale != 1.0) {
            matches_estimated =
                Xapian::doccount(matches_estimated * estimate_scale + 0.5);
            if (matches_estimated < matches_lower_bound)
                matches_estimated = matches_lower_bound;
        }

        if (collapser || mdecider) {
            LOGLINE(MATCH, "Clamping estimate between bounds: "
                    "matches_lower_bound = " << matches_lower_bound <<
                    ", matches_estimated = " << matches_estimated <<
                    ", matches_upper_bound = " << matches_upper_bound);
            AssertRel(matches_lower_bound,<=,matches_upper_bound);
            matches_estimated = max(matches_estimated, matches_lower_bound);
            matches_estimated = min(matches_estimated, matches_upper_bound);
        } else if (!percent_cutoff) {
            AssertRel(docs_matched,<=,matches_upper_bound);
            if (docs_matched > matches_lower_bound)
                matches_lower_bound = docs_matched;
            if (docs_matched > matches_estimated)
                matches_estimated = docs_matched;
        }

        if (collapser && !mdecider && !percent_cutoff) {
            AssertRel(docs_matched,<=,uncollapsed_upper_bound);
            if (docs_matched > uncollapsed_lower_bound)
                uncollapsed_lower_bound = docs_matched;
        }
    }

    if (collapser) {
        AssertRel(uncollapsed_lower_bound,<=,uncollapsed_upper_bound);
        if (uncollapsed_estimated < uncollapsed_lower_bound) {
            uncollapsed_estimated = uncollapsed_lower_bound;
        } else if (uncollapsed_estimated > uncollapsed_upper_bound) {
            uncollapsed_estimated = uncollapsed_upper_bound;
        }
    } else {
        // We're not collapsing, so the bounds must be the same.
        uncollapsed_lower_bound = matches_lower_bound;
        uncollapsed_upper_bound = matches_upper_bound;
        uncollapsed_estimated = matches_estimated;
    }

    LOGLINE(MATCH, items.size() << " items in potential mset");

    if (first > 0) {
        // Remove unwanted leading entries
        if (items.size() <= first) {
            items.clear();
        } else {
            LOGLINE(MATCH, "finding " << first << "th");
            // We perform nth_element() on reverse iterators so that the
            // unwanted elements end up at the end of items, which means
            // that the call to erase() to remove them doesn't have to copy
            // any elements.
            vector<Xapian::Internal::MSetItem>::reverse_iterator nth;
            nth = items.rbegin() + first;
            nth_element(items.rbegin(), nth, items.rend(), mcmp);
            // Erase the trailing "first" elements
            items.erase(items.begin() + items.size() - first, items.end());
        }
    }

    LOGLINE(MATCH, "sorting " << items.size() << " entries");

    // We could use std::sort_heap if is_heap is true, but profiling
    // suggests that's actually slower.  Cast to void to suppress
    // compiler warnings that the last set value of is_heap is never
    // used.
    (void)is_heap;

    // Need a stable sort, but this is provided by comparison operator
    sort(items.begin(), items.end(), mcmp);

    if (!items.empty()) {
        LOGLINE(MATCH, "min weight in mset = " << items.back().wt);
        LOGLINE(MATCH, "max weight in mset = " << items[0].wt);
    }

    AssertRel(matches_estimated,>=,matches_lower_bound);
    AssertRel(matches_estimated,<=,matches_upper_bound);

    AssertRel(uncollapsed_estimated,>=,uncollapsed_lower_bound);
    AssertRel(uncollapsed_estimated,<=,uncollapsed_upper_bound);

    // We may need to qualify any collapse_count to see if the highest weight
    // collapsed item would have qualified percent_cutoff.
    // We WILL need to restore collapse_count to the mset by taking from
    // collapse_tab; this is what comes of copying around whole objects
    // instead of taking references: we find it hard to update collapse_count
    // of an item that has already been pushed back, as we don't know where it
    // is any more.  If we keep or find references we won't need to mess with
    // is_heap so much maybe?
    if (!items.empty() && collapser && !collapser.empty()) {
        // Nicked this formula from above.
        double min_wt = 0.0;
        if (percent_scale > 0.0)
            min_wt = percent_cutoff_factor / percent_scale;
        Xapian::doccount entries = collapser.entries();
        vector<Xapian::Internal::MSetItem>::iterator i;
        for (i = items.begin(); i != items.end(); ++i) {
            // Skip entries without a collapse key.
            if (i->collapse_key.empty()) continue;

            // Set collapse_count appropriately.
            i->collapse_count = collapser.get_collapse_count(i->collapse_key, percent_cutoff, min_wt);
            if (--entries == 0) {
                // Stop once we've processed all items with collapse keys.
                break;
            }
        }
    }

    mset.internal = new Xapian::MSet::Internal(
            first,
            matches_upper_bound,
            matches_lower_bound,
            matches_estimated,
            uncollapsed_upper_bound,
            uncollapsed_lower_bound,
            uncollapsed_estimated,
            max_possible, greatest_wt, items,
            percent_scale * 100.0);
}