 * Copyright 1999,2000,2001 BrightStation PLC
 * Copyright 2001,2002 Ananova Ltd
 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2013,2014,2015,2016 Olly Betts
 * Copyright 2003 Orange PCS Ltd
 * Copyright 2003 Sam Liddicott
 * Copyright 2007,2008,2009 Lemur Consulting Ltd
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
28 #include "multimatch.h"
31 #include "collapser.h"
34 #include "localsubmatch.h"
36 #include "api/omenquireinternal.h"
39 #include "api/emptypostlist.h"
40 #include "branchpostlist.h"
41 #include "mergepostlist.h"
43 #include "backends/document.h"
47 #include "valuestreamdocument.h"
48 #include "weight/weightinternal.h"
50 #include <xapian/error.h>
51 #include <xapian/matchspy.h>
52 #include <xapian/version.h> // For XAPIAN_HAS_REMOTE_BACKEND
54 #ifdef XAPIAN_HAS_REMOTE_BACKEND
55 #include "remotesubmatch.h"
56 #include "backends/remote/remote-database.h"
57 #endif /* XAPIAN_HAS_REMOTE_BACKEND */
60 #include <cfloat> // For DBL_EPSILON.
61 #include <climits> // For UINT_MAX.
#ifdef HAVE_TIMER_CREATE
#include "safeunistd.h" // For _POSIX_* feature test macros.

static void
set_timeout_flag(union sigval sv)
{
    *(reinterpret_cast<volatile bool*>(sv.sival_ptr)) = true;
}
// The monotonic clock is the better basis for timeouts, but it isn't always
// supported by timer_create().
#ifndef _POSIX_MONOTONIC_CLOCK
const clockid_t TIMEOUT_CLOCK = CLOCK_REALTIME;
#elif defined __sun
// Solaris defines CLOCK_MONOTONIC, but "man timer_create" doesn't mention it
// and when using it timer_create() fails with "EPERM" (perhaps you need to be
// root to use it? I can't test that).
//
// Solaris defines _POSIX_MONOTONIC_CLOCK so we need to special-case it.
const clockid_t TIMEOUT_CLOCK = CLOCK_REALTIME;
#elif defined __CYGWIN__
// https://cygwin.com/cygwin-api/std-notes.html currently (2016-05-13) says:
//
//     clock_nanosleep currently supports only CLOCK_REALTIME and
//     CLOCK_MONOTONIC.  clock_setres, clock_settime, and timer_create
//     currently support only CLOCK_REALTIME.
//
// So CLOCK_MONOTONIC is defined, but not supported by timer_create().
const clockid_t TIMEOUT_CLOCK = CLOCK_REALTIME;
#else
const clockid_t TIMEOUT_CLOCK = CLOCK_MONOTONIC;
#endif
class TimeOut {
    struct sigevent sev;
    timer_t timerid;
    volatile bool expired;

  public:
    explicit TimeOut(double limit) : expired(false) {
        if (limit > 0) {
            sev.sigev_notify = SIGEV_THREAD;
            sev.sigev_notify_function = set_timeout_flag;
            sev.sigev_notify_attributes = NULL;
            sev.sigev_value.sival_ptr =
                static_cast<void*>(const_cast<bool*>(&expired));
            if (usual(timer_create(TIMEOUT_CLOCK, &sev, &timerid) == 0)) {
                struct itimerspec interval;
                interval.it_interval.tv_sec = 0;
                interval.it_interval.tv_nsec = 0;
                RealTime::to_timespec(limit, &interval.it_value);
                if (usual(timer_settime(timerid, 0, &interval, NULL) == 0)) {
                    // Timeout successfully set.
                    return;
                }
                timer_delete(timerid);
            }
        }
        sev.sigev_notify = SIGEV_NONE;
    }

    ~TimeOut() {
        if (sev.sigev_notify != SIGEV_NONE) {
            timer_delete(timerid);
            sev.sigev_notify = SIGEV_NONE;
        }
    }

    bool timed_out() const { return expired; }
};
#else
class TimeOut {
  public:
    explicit TimeOut(double) { }
    bool timed_out() const { return false; }
};
#endif
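
// Illustrative usage sketch (this mirrors what MultiMatch::get_mset() below
// does; the loop shown here is schematic rather than the real matcher loop):
//
//     TimeOut timeout(time_limit);
//     while (/* pulling candidate documents */) {
//         if (check_at_least > first + maxitems && timeout.timed_out())
//             check_at_least = first + maxitems;
//         ...
//     }
//
// Once the timer fires, the matcher stops trying to satisfy check_at_least
// and only gathers the first + maxitems results actually asked for.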
using namespace std;
using Xapian::Internal::intrusive_ptr;

const Xapian::Enquire::Internal::sort_setting REL =
        Xapian::Enquire::Internal::REL;
const Xapian::Enquire::Internal::sort_setting REL_VAL =
        Xapian::Enquire::Internal::REL_VAL;
const Xapian::Enquire::Internal::sort_setting VAL =
        Xapian::Enquire::Internal::VAL;
#if 0 // VAL_REL isn't currently used so avoid compiler warnings.
const Xapian::Enquire::Internal::sort_setting VAL_REL =
        Xapian::Enquire::Internal::VAL_REL;
#endif
/** Split an RSet into several sub rsets, one for each database.
 *
 *  @param rset The RSet to split.
 *  @param number_of_subdbs The number of sub databases which exist.
 *  @param subrsets Vector of RSets in which the sub rsets will be placed.
 *                  This should be empty when the function is called.
 */
static void
split_rset_by_db(const Xapian::RSet * rset,
                 Xapian::doccount number_of_subdbs,
                 vector<Xapian::RSet> & subrsets)
{
    LOGCALL_STATIC_VOID(MATCH, "split_rset_by_db", rset | number_of_subdbs | subrsets);
    if (rset && !rset->empty()) {
        if (number_of_subdbs == 1) {
            // The common case of a single database is easy to handle.
            subrsets.push_back(*rset);
        } else {
            // Can't just use vector::resize() here, since that creates N
            // copies of the same RSet!
            subrsets.reserve(number_of_subdbs);
            for (size_t i = 0; i < number_of_subdbs; ++i) {
                subrsets.push_back(Xapian::RSet());
            }

            const set<Xapian::docid> & rsetitems = rset->internal->get_items();
            set<Xapian::docid>::const_iterator j;
            for (j = rsetitems.begin(); j != rsetitems.end(); ++j) {
                Xapian::doccount local_docid = (*j - 1) / number_of_subdbs + 1;
                Xapian::doccount subdatabase = (*j - 1) % number_of_subdbs;
                subrsets[subdatabase].add_document(local_docid);
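                // Worked example (hypothetical numbers): with 3 subdatabases,
                // documents are interleaved so subdatabase 0 holds global
                // docids 1, 4, 7, ...  Global docid 7 therefore maps to
                // subdatabase (7 - 1) % 3 == 0 and local docid
                // (7 - 1) / 3 + 1 == 3, i.e. it is added to subrsets[0] as
                // document 3.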
            }
        }
    } else {
        // NB vector::resize() creates N copies of the same empty RSet.
        subrsets.resize(number_of_subdbs);
    }
    Assert(subrsets.size() == number_of_subdbs);
}
/** Prepare some SubMatches.
 *
 *  This calls the prepare_match() method on each SubMatch object, causing
 *  them to look up various statistics.
 *
 *  This method is rather complicated in order to handle remote matches
 *  efficiently.  Instead of simply calling "prepare_match()" on each submatch
 *  and waiting for it to return, it first calls "prepare_match(true)" on each
 *  submatch.  If any of these calls return false, indicating that the required
 *  information has not yet been received from the server, the method will try
 *  those which returned false again, passing "false" as a parameter to
 *  indicate that they should block until the information is ready.
 *
 *  This should improve performance in the case of mixed local-and-remote
 *  searches - the local searchers will all fetch their statistics from disk
 *  without waiting for the remote searchers, so as soon as the remote searcher
 *  statistics arrive, we can move on to the next step.
 */
static void
prepare_sub_matches(vector<intrusive_ptr<SubMatch> > & leaves,
                    Xapian::Weight::Internal & stats)
{
    LOGCALL_STATIC_VOID(MATCH, "prepare_sub_matches", leaves | stats);
    // We use a vector<bool> to track which SubMatches we've already prepared.
    vector<bool> prepared;
    prepared.resize(leaves.size(), false);
    size_t unprepared = leaves.size();
    bool nowait = true;
    while (unprepared) {
        for (size_t leaf = 0; leaf < leaves.size(); ++leaf) {
            if (prepared[leaf]) continue;
            SubMatch * submatch = leaves[leaf].get();
            if (!submatch || submatch->prepare_match(nowait, stats)) {
                prepared[leaf] = true;
                --unprepared;
            }
        }
        // Use blocking IO on subsequent passes, so that we don't go into
        // a tight loop.
        nowait = false;
    }
}
/// Class which applies several match spies in turn.
class MultipleMatchSpy : public Xapian::MatchSpy {
  private:
    /// List of match spies to call, in order.
    const std::vector<Xapian::Internal::opt_intrusive_ptr<Xapian::MatchSpy>> & spies;

  public:
    explicit MultipleMatchSpy(const std::vector<Xapian::Internal::opt_intrusive_ptr<Xapian::MatchSpy>> & spies_)
        : spies(spies_) {}

    /** Implementation of virtual operator().
     *
     *  This implementation calls all the spies in turn.
     */
    void operator()(const Xapian::Document &doc, double wt);
};

void
MultipleMatchSpy::operator()(const Xapian::Document &doc, double wt) {
    LOGCALL_VOID(MATCH, "MultipleMatchSpy::operator()", doc | wt);
    for (auto i : spies) {
        (*i)(doc, wt);
    }
}
////////////////////////////////////
// Initialisation and cleaning up //
////////////////////////////////////

MultiMatch::MultiMatch(const Xapian::Database &db_,
                       const Xapian::Query & query_,
                       Xapian::termcount qlen,
                       const Xapian::RSet * omrset,
                       Xapian::doccount collapse_max_,
                       Xapian::valueno collapse_key_,
                       int percent_cutoff_, double weight_cutoff_,
                       Xapian::Enquire::docid_order order_,
                       Xapian::valueno sort_key_,
                       Xapian::Enquire::Internal::sort_setting sort_by_,
                       bool sort_value_forward_,
                       double time_limit_,
                       Xapian::Weight::Internal & stats,
                       const Xapian::Weight * weight_,
                       const vector<Xapian::Internal::opt_intrusive_ptr<Xapian::MatchSpy>> & matchspies_,
                       bool have_sorter, bool have_mdecider)
        : db(db_), query(query_),
          collapse_max(collapse_max_), collapse_key(collapse_key_),
          percent_cutoff(percent_cutoff_), weight_cutoff(weight_cutoff_),
          order(order_),
          sort_key(sort_key_), sort_by(sort_by_),
          sort_value_forward(sort_value_forward_),
          time_limit(time_limit_),
          weight(weight_),
          is_remote(db.internal.size()),
          matchspies(matchspies_)
{
    LOGCALL_CTOR(MATCH, "MultiMatch", db_ | query_ | qlen | omrset | collapse_max_ | collapse_key_ | percent_cutoff_ | weight_cutoff_ | int(order_) | sort_key_ | int(sort_by_) | sort_value_forward_ | time_limit_ | stats | weight_ | matchspies_ | have_sorter | have_mdecider);

    if (query.empty()) return;

    Xapian::doccount number_of_subdbs = db.internal.size();
    vector<Xapian::RSet> subrsets;
    split_rset_by_db(omrset, number_of_subdbs, subrsets);

    for (size_t i = 0; i != number_of_subdbs; ++i) {
        Xapian::Database::Internal *subdb = db.internal[i].get();
        intrusive_ptr<SubMatch> smatch;

        // There is currently only one special case, for network databases.
#ifdef XAPIAN_HAS_REMOTE_BACKEND
        if (subdb->get_backend_info(NULL) == BACKEND_REMOTE) {
            RemoteDatabase *rem_db = static_cast<RemoteDatabase*>(subdb);
            if (have_sorter) {
                throw Xapian::UnimplementedError("Xapian::KeyMaker not supported for the remote backend");
            }
            if (have_mdecider) {
                throw Xapian::UnimplementedError("Xapian::MatchDecider not supported for the remote backend");
            }
            // FIXME: Remote handling for time_limit with multiple
            // databases may need some work.
            rem_db->set_query(query, qlen, collapse_max, collapse_key,
                              order, sort_key, sort_by, sort_value_forward,
                              time_limit,
                              percent_cutoff, weight_cutoff, weight,
                              subrsets[i], matchspies);
            bool decreasing_relevance =
                (sort_by == REL || sort_by == REL_VAL);
            smatch = new RemoteSubMatch(rem_db, decreasing_relevance, matchspies);
            is_remote[i] = true;
        } else {
            smatch = new LocalSubMatch(subdb, query, qlen, subrsets[i], weight,
                                       i);
            subdb->readahead_for_query(query);
        }
#else
        // Avoid unused parameter warnings.
        (void)have_sorter;
        (void)have_mdecider;
        smatch = new LocalSubMatch(subdb, query, qlen, subrsets[i], weight, i);
#endif /* XAPIAN_HAS_REMOTE_BACKEND */
        leaves.push_back(smatch);
    }

    stats.set_query(query);
    prepare_sub_matches(leaves, stats);
    stats.set_bounds_from_db(db);
}
double
MultiMatch::getorrecalc_maxweight(PostList *pl)
{
    LOGCALL(MATCH, double, "MultiMatch::getorrecalc_maxweight", pl);
    double wt;
    if (recalculate_w_max) {
        LOGLINE(MATCH, "recalculating max weight");
        wt = pl->recalc_maxweight();
        recalculate_w_max = false;
    } else {
        wt = pl->get_maxweight();
        LOGLINE(MATCH, "pl = (" << pl->get_description() << ")");
        // FIXME: This fails for hoistnotbug1 under multi_remoteprog_glass:
        // AssertionError: matcher/multimatch.cc:370: within_DBL_EPSILON(wt, pl->recalc_maxweight()) : values were 2.7075940282079162813 and 2.6966211268553141878
        //
        // AssertEqDoubleParanoid(wt, pl->recalc_maxweight());
        //
        // Not sure why - adding code to call recalc_maxweight() doesn't seem
        // to help.  But the max weight not being as low as it could be is
        // only a potential missed optimisation opportunity, not a correctness
        // issue.
    }
    LOGLINE(MATCH, "max possible doc weight = " << wt);
    RETURN(wt);
}
void
MultiMatch::get_mset(Xapian::doccount first, Xapian::doccount maxitems,
                     Xapian::doccount check_at_least,
                     Xapian::MSet & mset,
                     Xapian::Weight::Internal & stats,
                     const Xapian::MatchDecider *mdecider,
                     const Xapian::KeyMaker *sorter)
{
    LOGCALL_VOID(MATCH, "MultiMatch::get_mset", first | maxitems | check_at_least | Literal("mset") | stats | Literal("mdecider") | Literal("sorter"));
    AssertRel(check_at_least,>=,first + maxitems);

    if (query.empty()) {
        mset = Xapian::MSet();
        mset.internal->firstitem = first;
        return;
    }

    Assert(!leaves.empty());

    TimeOut timeout(time_limit);

#ifdef XAPIAN_HAS_REMOTE_BACKEND
    // If there's only one database and it's remote, we can just unserialise
    // its MSet and return that.
    if (leaves.size() == 1 && is_remote[0]) {
        RemoteSubMatch * rem_match;
        rem_match = static_cast<RemoteSubMatch*>(leaves[0].get());
        rem_match->start_match(first, maxitems, check_at_least, stats);
        rem_match->get_mset(mset);
        return;
    }
#endif

    for (size_t i = 0; i != leaves.size(); ++i) {
        auto& leaf = leaves[i];
#ifdef XAPIAN_HAS_REMOTE_BACKEND
        if (is_remote[i]) {
            Xapian::doccount remote_maxitems = first + maxitems;
            if (collapse_max != 0) {
                // If collapsing we need to fetch all check_at_least items in
                // order to satisfy the requirement that if there are <=
                // check_at_least results then the estimated number of matches
                // is exact.
                remote_maxitems = check_at_least;
            }
            leaf->start_match(0, remote_maxitems, check_at_least, stats);
            continue;
        }
#endif
        leaf->start_match(0, first + maxitems, check_at_least, stats);
    }

    // Get postlists and term info
    vector<PostList*> postlists;
    Xapian::termcount total_subqs = 0;
    // Keep a count of matches which we know exist, but we won't see.  This
    // occurs when a submatch is remote, and returns a lower bound on the
    // number of matching documents which is higher than the number of
    // documents it returns (because it wasn't asked for more documents).
    Xapian::doccount definite_matches_not_seen = 0;
#ifdef XAPIAN_HAS_REMOTE_BACKEND
    // Track these for calculating uncollapsed_upper_bound for the local case.
    size_t n_remotes = 0;
    Xapian::doccount remote_uncollapsed_upper_bound = 0;
#endif
    try {
        for (size_t i = 0; i != leaves.size(); ++i) {
            // Pick the highest total subqueries answer amongst the
            // subdatabases, as the query to postlist conversion doesn't
            // recurse into positional queries for shards that don't have
            // positional data when at least one other shard does.
            Xapian::termcount total_subqs_i = 0;
            PostList * pl = leaves[i]->get_postlist(this, &total_subqs_i, stats);
            total_subqs = max(total_subqs, total_subqs_i);
#ifdef XAPIAN_HAS_REMOTE_BACKEND
            if (is_remote[i]) {
                ++n_remotes;
                RemoteSubMatch * rem_match =
                    static_cast<RemoteSubMatch*>(leaves[i].get());
                remote_uncollapsed_upper_bound +=
                    rem_match->get_uncollapsed_upper_bound();
                if (collapse_max == 0 &&
                    pl->get_termfreq_min() > first + maxitems) {
                    LOGLINE(MATCH, "Found " <<
                            pl->get_termfreq_min() - (first + maxitems)
                            << " definite matches in remote submatch "
                            "which aren't passed to local match");
                    definite_matches_not_seen += pl->get_termfreq_min();
                    definite_matches_not_seen -= first + maxitems;
                }
            }
#endif
            postlists.push_back(pl);
        }
    } catch (...) {
        for (auto pl : postlists) delete pl;
        throw;
    }
    Assert(!postlists.empty());

    ValueStreamDocument vsdoc(db);
    Xapian::Document doc(&vsdoc);

    // Get a single combined postlist
    AutoPtr<PostList> pl;
    if (postlists.size() == 1) {
        pl.reset(postlists.front());
    } else {
        pl.reset(new MergePostList(postlists, this, vsdoc));
    }

    LOGLINE(MATCH, "pl = (" << pl->get_description() << ")");

    Xapian::doccount docs_matched = 0;
    double greatest_wt = 0;
    Xapian::termcount greatest_wt_subqs_matched = 0;
#ifdef XAPIAN_HAS_REMOTE_BACKEND
    unsigned greatest_wt_subqs_db_num = UINT_MAX;
#endif
    vector<Xapian::Internal::MSetItem> items;

    // maximum weight a document could possibly have
    const double max_possible = pl->recalc_maxweight();

    LOGLINE(MATCH, "pl = (" << pl->get_description() << ")");
    recalculate_w_max = false;

    Xapian::doccount matches_upper_bound = pl->get_termfreq_max();
    Xapian::doccount matches_lower_bound = 0;
    Xapian::doccount matches_estimated = pl->get_termfreq_est();

    if (mdecider == NULL) {
        // If we have a match decider, the lower bound must be
        // set to 0 as we could discard all hits.  Otherwise set it to the
        // minimum number of entries which the postlist could return.
        matches_lower_bound = pl->get_termfreq_min();
    }

    // Prepare the matchspy
    Xapian::MatchSpy *matchspy = NULL;
    MultipleMatchSpy multispy(matchspies);
    if (!matchspies.empty()) {
        if (matchspies.size() == 1) {
            matchspy = matchspies[0].get();
        } else {
            matchspy = &multispy;
        }
    }

    // Check if any results have been asked for (might just be wanting
    // maxweight).
    if (check_at_least == 0) {
        Xapian::doccount uncollapsed_lower_bound = matches_lower_bound;
        if (collapse_max) {
            // Lower bound must be set to no more than collapse_max, since it's
            // possible that all matching documents have the same collapse_key
            // value and so are collapsed together.
            if (matches_lower_bound > collapse_max)
                matches_lower_bound = collapse_max;
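            // Concrete (hypothetical) example: with collapse_max == 1 it is
            // possible that every match shares a single collapse key value,
            // which would leave exactly one result, so a lower bound above 1
            // couldn't be guaranteed.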
        }

        mset.internal = new Xapian::MSet::Internal(
                                           first,
                                           matches_upper_bound,
                                           matches_lower_bound,
                                           matches_estimated,
                                           matches_upper_bound,
                                           uncollapsed_lower_bound,
                                           matches_estimated,
                                           max_possible, greatest_wt, items,
                                           0);
        return;
    }
    // Number of documents considered by a decider.
    Xapian::doccount decider_considered = 0;
    // Number of documents denied by the decider.
    Xapian::doccount decider_denied = 0;

    // Set max number of results that we want - this is used to decide
    // when to throw away unwanted items.
    Xapian::doccount max_msize = first + maxitems;
    items.reserve(max_msize + 1);

    // Tracks the minimum item currently eligible for the MSet - we compare
    // candidate items against this.
    Xapian::Internal::MSetItem min_item(0.0, 0);

    // Minimum weight an item must have to be worth considering.
    double min_weight = weight_cutoff;

    // Factor to multiply maximum weight seen by to get the cutoff weight.
    double percent_cutoff_factor = percent_cutoff / 100.0;
    // Corresponding correction to that in omenquire.cc to account for excess
    // precision.
    percent_cutoff_factor -= DBL_EPSILON;
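    // Worked example (hypothetical numbers): percent_cutoff = 40 gives a
    // factor of just under 0.4, so once a document with weight 10.0 has been
    // seen, any candidate weighing less than about 4.0 can never reach 40%
    // and may be dropped (min_weight is raised to greatest_wt * this factor
    // in the percent_cutoff handling further down).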
    // Object to handle collapsing.
    Collapser collapser(collapse_key, collapse_max);

    /// Comparison functor for sorting MSet
    bool sort_forward = (order != Xapian::Enquire::DESCENDING);
    MSetCmp mcmp(get_msetcmp_function(sort_by, sort_forward, sort_value_forward));

    // We form the mset in two stages.  In the first we fill up our working
    // mset.  Adding a new document does not remove another.
    //
    // In the second, we consider documents which rank higher than the current
    // lowest ranking document in the mset.  Each document added expels the
    // current lowest ranking document.
    //
    // If a percentage cutoff is in effect, it can cause the matcher to return
    // to the first stage from the second.
    //
    // Is the mset a valid heap?
    bool is_heap = false;
    while (true) {
        bool pushback = true;

        if (rare(recalculate_w_max)) {
            if (min_weight > 0.0) {
                if (rare(getorrecalc_maxweight(pl.get()) < min_weight)) {
                    LOGLINE(MATCH, "*** TERMINATING EARLY (1)");
                    break;
                }
            }
        }

        PostList * pl_copy = pl.get();
        if (rare(next_handling_prune(pl_copy, min_weight, this))) {
            // The root of the postlist tree has been replaced (and the old
            // root deleted), so hand the new root to the smart pointer.
            pl.release();
            pl.reset(pl_copy);
            LOGLINE(MATCH, "*** REPLACING ROOT");

            if (min_weight > 0.0) {
                // No need for a full recalc (unless we've got to do one
                // because of a prune elsewhere) - we're just switching to a
                // subtree.
                if (rare(getorrecalc_maxweight(pl.get()) < min_weight)) {
                    LOGLINE(MATCH, "*** TERMINATING EARLY (2)");
                    break;
                }
            }
        }

        if (rare(pl->at_end())) {
            LOGLINE(MATCH, "Reached end of potential matches");
            break;
        }

        // Only calculate the weight if we need it for mcmp, or there's a
        // percentage or weight cutoff in effect.  Otherwise we calculate it
        // below if we haven't already rejected this candidate.
        double wt = 0.0;
        bool calculated_weight = false;
        if (sort_by != VAL || min_weight > 0.0) {
            wt = pl->get_weight();
            if (wt < min_weight) {
                LOGLINE(MATCH, "Rejecting potential match due to insufficient weight");
                continue;
            }
            calculated_weight = true;
        }

        Xapian::docid did = pl->get_docid();
        vsdoc.set_document(did);
        LOGLINE(MATCH, "Candidate document id " << did << " wt " << wt);
        Xapian::Internal::MSetItem new_item(wt, did);
        if (check_at_least > first + maxitems && timeout.timed_out()) {
            check_at_least = first + maxitems;
        }

        if (sort_by != REL) {
            const string * ptr = pl->get_sort_key();
            if (ptr) {
                new_item.sort_key = *ptr;
            } else if (sorter) {
                new_item.sort_key = (*sorter)(doc);
            } else {
                new_item.sort_key = vsdoc.get_value(sort_key);
            }

            // We're sorting by value (in part at least), so compare the item
            // against the lowest currently in the proto-mset.  If sort_by is
            // VAL, then new_item.wt won't yet be set, but that doesn't
            // matter since it's not used by the sort function.
            if (!mcmp(new_item, min_item)) {
                if (mdecider == NULL && !collapser) {
                    // Document was definitely suitable for mset - no more
                    // processing needed.
                    LOGLINE(MATCH, "Making note of match item which sorts lower than min_item");
                    ++docs_matched;
                    if (!calculated_weight) wt = pl->get_weight();
                    if (matchspy) {
                        matchspy->operator()(doc, wt);
                    }
                    if (wt > greatest_wt) goto new_greatest_weight;
                    continue;
                }
                if (docs_matched >= check_at_least) {
                    // We've seen enough items - we can drop this one.
                    LOGLINE(MATCH, "Dropping candidate which sorts lower than min_item");
                    // FIXME: hmm, match decider might have rejected this...
                    if (!calculated_weight) wt = pl->get_weight();
                    if (wt > greatest_wt) goto new_greatest_weight;
                    continue;
                }

                // We can't drop the item, because we need to test whether the
                // mdecider would accept it and/or test whether it would be
                // collapsed.
                LOGLINE(MATCH, "Keeping candidate which sorts lower than min_item for further investigation");
            }
        }

        // Use the match spy and/or decision functors (if specified).
        if (matchspy != NULL || mdecider != NULL) {
            const unsigned int multiplier = db.internal.size();
            Assert(multiplier != 0);
            Xapian::doccount n = (did - 1) % multiplier; // which actual database
            // If the results are from a remote database, then the functor will
            // already have been applied there so we can skip this step.
            if (!is_remote[n]) {
                ++decider_considered;
                if (mdecider && !mdecider->operator()(doc)) {
                    ++decider_denied;
                    continue;
                }
                if (matchspy) {
                    if (!calculated_weight) {
                        wt = pl->get_weight();
                        new_item.wt = wt;
                        calculated_weight = true;
                    }
                    matchspy->operator()(doc, wt);
                }
            }
        }

        if (!calculated_weight) {
            // we didn't calculate the weight above, but now we will need it
            wt = pl->get_weight();
            new_item.wt = wt;
        }

        // Perform collapsing on key if requested.
        if (collapser) {
            collapse_result res;
            res = collapser.process(new_item, pl.get(), vsdoc, mcmp);
            if (res == REJECTED) {
                // If we're sorting by relevance primarily, then we throw away
                // the lower weighted document anyway.
                if (sort_by != REL && sort_by != REL_VAL) {
                    if (wt > greatest_wt) goto new_greatest_weight;
                }
                continue;
            }

            if (res == REPLACED) {
                // There was a previous item in the collapse tab so
                // the MSet can't be empty.
                Assert(!items.empty());

                const Xapian::Internal::MSetItem & old_item =
                    collapser.old_item;
                // This is one of the best collapse_max potential MSet entries
                // with this key which we've seen so far.  Check if the
                // entry with this key which it displaced might still be in the
                // proto-MSet.  If it might be, we need to check through for
                // it.
                double old_wt = old_item.wt;
                if (old_wt >= min_weight && mcmp(old_item, min_item)) {
                    // Scan through (unsorted) MSet looking for entry.
                    // FIXME: more efficient way than just scanning?
                    Xapian::docid olddid = old_item.did;
                    vector<Xapian::Internal::MSetItem>::iterator i;
                    for (i = items.begin(); i != items.end(); ++i) {
                        if (i->did == olddid) {
                            LOGLINE(MATCH, "collapse: removing " <<
                                    olddid << ": " <<
                                    new_item.collapse_key);
                            // We can replace an arbitrary element in O(log N)
                            // but have to do it by hand (in this case the new
                            // elt is bigger, so we just swap down the tree).
                            // FIXME: implement this, and clean up is_heap
                            // handling
                            *i = new_item;
                            pushback = false;
                            is_heap = false;
                            break;
                        }
                    }
                }
            }
        }

        // OK, actually add the item to the mset.
        if (pushback) {
            ++docs_matched;
            if (items.size() >= max_msize) {
                items.push_back(new_item);
                if (!is_heap) {
                    is_heap = true;
                    make_heap(items.begin(), items.end(), mcmp);
                } else {
                    push_heap<vector<Xapian::Internal::MSetItem>::iterator,
                              MSetCmp>(items.begin(), items.end(), mcmp);
                }
                pop_heap<vector<Xapian::Internal::MSetItem>::iterator,
                         MSetCmp>(items.begin(), items.end(), mcmp);
                items.pop_back();

                if (!items.empty()) {
                    // In the get_mset(0, 0, X) (X > 0) case just don't update min_item.
                    min_item = items.front();
                }
                if (sort_by == REL || sort_by == REL_VAL) {
                    if (docs_matched >= check_at_least) {
                        if (sort_by == REL) {
                            // We're done if this is a forward boolean match
                            // with only one database (bodgetastic, FIXME
                            // better if we can!)
                            if (rare(max_possible == 0 && sort_forward)) {
                                // In the multi database case, MergePostList
                                // currently processes each database
                                // sequentially (which actually may well be
                                // more efficient) so the docids in general
                                // won't arrive in order.
                                if (leaves.size() == 1) break;
                            }
                        }
                        if (min_item.wt > min_weight) {
                            LOGLINE(MATCH, "Setting min_weight to " <<
                                    min_item.wt << " from " << min_weight);
                            min_weight = min_item.wt;
                        }
                    }
                }
                if (rare(getorrecalc_maxweight(pl.get()) < min_weight)) {
                    LOGLINE(MATCH, "*** TERMINATING EARLY (3)");
                    break;
                }
            } else {
                items.push_back(new_item);
                is_heap = false;
                if (sort_by == REL && items.size() == max_msize) {
                    if (docs_matched >= check_at_least) {
                        // We're done if this is a forward boolean match
                        // with only one database (bodgetastic, FIXME
                        // better if we can!)
                        if (rare(max_possible == 0 && sort_forward)) {
                            // In the multi database case, MergePostList
                            // currently processes each database
                            // sequentially (which actually may well be
                            // more efficient) so the docids in general
                            // won't arrive in order.
                            if (leaves.size() == 1) break;
                        }
                    }
                }
            }
        }

        // Keep a track of the greatest weight we've seen.
        if (wt > greatest_wt) {
new_greatest_weight:
            greatest_wt = wt;
#ifdef XAPIAN_HAS_REMOTE_BACKEND
            const unsigned int multiplier = db.internal.size();
            unsigned int db_num = (did - 1) % multiplier;
            if (is_remote[db_num]) {
                // Note that the greatest weighted document came from a remote
                // database, and which one.
                greatest_wt_subqs_db_num = db_num;
            } else
#endif
            {
                greatest_wt_subqs_matched = pl->count_matching_subqs();
#ifdef XAPIAN_HAS_REMOTE_BACKEND
                greatest_wt_subqs_db_num = UINT_MAX;
#endif
            }
            if (percent_cutoff) {
                double w = wt * percent_cutoff_factor;
                if (w > min_weight) {
                    min_weight = w;
                    if (!is_heap) {
                        is_heap = true;
                        make_heap<vector<Xapian::Internal::MSetItem>::iterator,
                                  MSetCmp>(items.begin(), items.end(), mcmp);
                    }
                    while (!items.empty() && items.front().wt < min_weight) {
                        pop_heap<vector<Xapian::Internal::MSetItem>::iterator,
                                 MSetCmp>(items.begin(), items.end(), mcmp);
                        Assert(items.back().wt < min_weight);
                        items.pop_back();
                    }
#ifdef XAPIAN_ASSERTIONS_PARANOID
                    vector<Xapian::Internal::MSetItem>::const_iterator i;
                    for (i = items.begin(); i != items.end(); ++i) {
                        Assert(i->wt >= min_weight);
                    }
#endif
                }
            }
        }
    }
    // done with posting list tree

    double percent_scale = 0;
    if (!items.empty() && greatest_wt > 0) {
#ifdef XAPIAN_HAS_REMOTE_BACKEND
        if (greatest_wt_subqs_db_num != UINT_MAX) {
            const unsigned int n = greatest_wt_subqs_db_num;
            RemoteSubMatch * rem_match;
            rem_match = static_cast<RemoteSubMatch*>(leaves[n].get());
            percent_scale = rem_match->get_percent_factor() / 100.0;
        } else
#endif
        {
            percent_scale = greatest_wt_subqs_matched / double(total_subqs);
            percent_scale /= greatest_wt;
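            // Worked example (hypothetical numbers): if the top-weighted
            // document matched 3 of total_subqs == 4 subqueries and scored
            // greatest_wt == 6.0, then percent_scale is (3.0 / 4) / 6.0 =
            // 0.125, so a document scoring 6.0 is reported as 75% and one
            // scoring 4.0 as 50%.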
        }
        Assert(percent_scale > 0);
        if (percent_cutoff) {
            // FIXME: better to sort and binary chop maybe?
            // Or we could just use a linear scan here instead.
            //
            // trim the mset to the correct answer...
            double min_wt = percent_cutoff_factor / percent_scale;
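            // For instance (hypothetical numbers), with percent_cutoff = 40
            // (a factor of roughly 0.4) and percent_scale = 0.125, min_wt is
            // about 0.4 / 0.125 = 3.2, i.e. the raw weight corresponding to a
            // 40% score; anything below it is trimmed from the proto-mset.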
            if (!is_heap) {
                is_heap = true;
                make_heap<vector<Xapian::Internal::MSetItem>::iterator,
                          MSetCmp>(items.begin(), items.end(), mcmp);
            }
            while (!items.empty() && items.front().wt < min_wt) {
                pop_heap<vector<Xapian::Internal::MSetItem>::iterator,
                         MSetCmp>(items.begin(), items.end(), mcmp);
                Assert(items.back().wt < min_wt);
                items.pop_back();
            }
#ifdef XAPIAN_ASSERTIONS_PARANOID
            vector<Xapian::Internal::MSetItem>::const_iterator j;
            for (j = items.begin(); j != items.end(); ++j) {
                Assert(j->wt >= min_wt);
            }
#endif
        }
    }

    LOGLINE(MATCH,
            "docs_matched = " << docs_matched <<
            ", definite_matches_not_seen = " << definite_matches_not_seen <<
            ", matches_lower_bound = " << matches_lower_bound <<
            ", matches_estimated = " << matches_estimated <<
            ", matches_upper_bound = " << matches_upper_bound);
    // Adjust docs_matched to take account of documents which matched remotely
    // but weren't sent across.
    docs_matched += definite_matches_not_seen;

    Xapian::doccount uncollapsed_lower_bound = matches_lower_bound;
    Xapian::doccount uncollapsed_upper_bound = matches_upper_bound;
    Xapian::doccount uncollapsed_estimated = matches_estimated;
#ifdef XAPIAN_HAS_REMOTE_BACKEND
    if (collapser && n_remotes) {
        // We need to adjust uncollapsed_upper_bound if there are multiple
        // shards and some or all are remote.  The lower bound and estimate
        // will be valid, though potentially could be better, but this
        // doesn't seem worth addressing in 1.4.x - the code on master
        // handles this correctly via merging MSet objects.
        if (n_remotes == leaves.size()) {
            // All shards are remote so we just use the sum of
            // uncollapsed_upper_bound over the remotes.
            uncollapsed_upper_bound = remote_uncollapsed_upper_bound;
        } else {
            // Sum and clamp to the number of documents.  This is crude but
            // at least gives a valid answer.
            Xapian::doccount num_docs = db.get_doccount();
            uncollapsed_upper_bound += remote_uncollapsed_upper_bound;
            if (uncollapsed_upper_bound < remote_uncollapsed_upper_bound ||
                uncollapsed_upper_bound > num_docs) {
                uncollapsed_upper_bound = num_docs;
            }
        }
    }
#endif
    if (items.size() < max_msize) {
        // We have fewer items in the mset than we tried to get for it, so we
        // must have all the matches in it.
        LOGLINE(MATCH, "items.size() = " << items.size() <<
                ", max_msize = " << max_msize << ", setting bounds equal");
        Assert(definite_matches_not_seen == 0);
        Assert(percent_cutoff || docs_matched == items.size());
        matches_lower_bound = matches_upper_bound = matches_estimated
            = items.size();
        if (collapser && matches_lower_bound > uncollapsed_lower_bound)
            uncollapsed_lower_bound = matches_lower_bound;
    } else if (!collapser && docs_matched < check_at_least) {
        // We have seen fewer matches than we checked for, so we must have seen
        // all of them.
        LOGLINE(MATCH, "Setting bounds equal");
        matches_lower_bound = matches_upper_bound = matches_estimated
            = docs_matched;
        if (collapser && matches_lower_bound > uncollapsed_lower_bound)
            uncollapsed_lower_bound = matches_lower_bound;
    }

    AssertRel(matches_estimated,>=,matches_lower_bound);
    AssertRel(matches_estimated,<=,matches_upper_bound);

    // We can end up scaling the estimate more than once, so collect
    // the scale factors and apply them in one go to avoid rounding
    // errors.
    double estimate_scale = 1.0;
    double unique_rate = 1.0;
    if (collapser) {
        LOGLINE(MATCH, "Adjusting bounds due to collapse_key");
        matches_lower_bound = collapser.get_matches_lower_bound();
        if (matches_lower_bound > uncollapsed_lower_bound)
            uncollapsed_lower_bound = matches_lower_bound;

        // The estimate for the number of hits can be modified by
        // multiplying it by the rate at which we've been finding
        // unique documents.
        Xapian::doccount docs_considered = collapser.get_docs_considered();
        Xapian::doccount dups_ignored = collapser.get_dups_ignored();
        if (docs_considered > 0) {
            double unique = double(docs_considered - dups_ignored);
            unique_rate = unique / double(docs_considered);
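            // Worked example (hypothetical numbers): if the collapser looked
            // at 50 documents and ignored 10 as duplicates, unique_rate is
            // (50 - 10) / 50 = 0.8, and the estimate below gets scaled down
            // by that factor.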
        }

        // We can safely reduce the upper bound by the number of duplicates
        // we've ignored.
        matches_upper_bound -= dups_ignored;

        LOGLINE(MATCH, "matches_lower_bound=" << matches_lower_bound <<
                ", matches_estimated=" << matches_estimated <<
                "*" << estimate_scale << "*" << unique_rate <<
                ", matches_upper_bound=" << matches_upper_bound);
    } else {
        if (!percent_cutoff) {
            // We're not collapsing or doing a percentage cutoff, so
            // docs_matched is a lower bound on the total number of
            // matches.
            matches_lower_bound = max(docs_matched, matches_lower_bound);
        }
    }

    if (mdecider) {
        // The estimate for the number of hits can be modified by
        // multiplying it by the rate at which the match decider has
        // been accepting documents.
        if (decider_considered > 0) {
            double accept = double(decider_considered - decider_denied);
            double accept_rate = accept / double(decider_considered);
            estimate_scale *= accept_rate;
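            // Worked example (hypothetical numbers): if the decider was
            // offered 200 documents and rejected 50 of them, accept_rate is
            // (200 - 50) / 200 = 0.75, so the estimated number of matches is
            // scaled down by 25%.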
        }

        // If a document is denied by a match decider, it is not possible
        // for it to be found to be a duplicate, so it is safe to also reduce
        // the upper bound by the number of documents denied by a match
        // decider.
        matches_upper_bound -= decider_denied;
        if (collapser) uncollapsed_upper_bound -= decider_denied;
    }

    if (percent_cutoff) {
        estimate_scale *= (1.0 - percent_cutoff_factor);
        // another approach:
        // Xapian::doccount new_est = items.size() * (1 - percent_cutoff_factor) / (1 - min_weight / greatest_wt);
        // and another: items.size() + (1 - greatest_wt * percent_cutoff_factor / min_weight) * (matches_estimated - items.size());

        // Very likely an underestimate, but we can't really do better
        // without checking further matches...  Only possibility would be
        // to track how many docs made the min weight test but didn't make
        // the candidate set since the last greatest_wt change, which we
        // could use if the top documents matched all the prob terms.
        matches_lower_bound = items.size();
        if (collapser) uncollapsed_lower_bound = matches_lower_bound;

        // matches_upper_bound could be reduced by the number of documents
        // which fail the min weight test (FIXME)

        LOGLINE(MATCH, "Adjusted bounds due to percent_cutoff (" <<
                percent_cutoff << "): now have matches_estimated=" <<
                matches_estimated << ", matches_lower_bound=" <<
                matches_lower_bound);
    }

    if (collapser && estimate_scale != 1.0) {
        uncollapsed_estimated =
            Xapian::doccount(uncollapsed_estimated * estimate_scale + 0.5);
    }

    estimate_scale *= unique_rate;

    if (estimate_scale != 1.0) {
        matches_estimated =
            Xapian::doccount(matches_estimated * estimate_scale + 0.5);
        if (matches_estimated < matches_lower_bound)
            matches_estimated = matches_lower_bound;
    }

    if (collapser || mdecider) {
        LOGLINE(MATCH, "Clamping estimate between bounds: "
                "matches_lower_bound = " << matches_lower_bound <<
                ", matches_estimated = " << matches_estimated <<
                ", matches_upper_bound = " << matches_upper_bound);
        AssertRel(matches_lower_bound,<=,matches_upper_bound);
        matches_estimated = max(matches_estimated, matches_lower_bound);
        matches_estimated = min(matches_estimated, matches_upper_bound);
    } else if (!percent_cutoff) {
        AssertRel(docs_matched,<=,matches_upper_bound);
        if (docs_matched > matches_lower_bound)
            matches_lower_bound = docs_matched;
        if (docs_matched > matches_estimated)
            matches_estimated = docs_matched;
    }

    if (collapser && !mdecider && !percent_cutoff) {
        AssertRel(docs_matched,<=,uncollapsed_upper_bound);
        if (docs_matched > uncollapsed_lower_bound)
            uncollapsed_lower_bound = docs_matched;
    }

    if (collapser) {
        AssertRel(uncollapsed_lower_bound,<=,uncollapsed_upper_bound);
        if (uncollapsed_estimated < uncollapsed_lower_bound) {
            uncollapsed_estimated = uncollapsed_lower_bound;
        } else if (uncollapsed_estimated > uncollapsed_upper_bound) {
            uncollapsed_estimated = uncollapsed_upper_bound;
        }
    } else {
        // We're not collapsing, so the bounds must be the same.
        uncollapsed_lower_bound = matches_lower_bound;
        uncollapsed_upper_bound = matches_upper_bound;
        uncollapsed_estimated = matches_estimated;
    }

    LOGLINE(MATCH, items.size() << " items in potential mset");
    // Remove unwanted leading entries
    if (items.size() <= first) {
        items.clear();
    } else {
        LOGLINE(MATCH, "finding " << first << "th");
        // We perform nth_element() on reverse iterators so that the
        // unwanted elements end up at the end of items, which means
        // that the call to erase() to remove them doesn't have to copy
        // any elements.
        vector<Xapian::Internal::MSetItem>::reverse_iterator nth;
        nth = items.rbegin() + first;
        nth_element(items.rbegin(), nth, items.rend(), mcmp);
        // Erase the trailing "first" elements
        items.erase(items.begin() + items.size() - first, items.end());
    }

    LOGLINE(MATCH, "sorting " << items.size() << " entries");

    // We could use std::sort_heap if is_heap is true, but profiling
    // suggests that's actually slower.  Cast to void to suppress
    // compiler warnings that the last set value of is_heap is never
    // used.
    (void)is_heap;

    // Need a stable sort, but this is provided by comparison operator
    sort(items.begin(), items.end(), mcmp);

    if (!items.empty()) {
        LOGLINE(MATCH, "min weight in mset = " << items.back().wt);
        LOGLINE(MATCH, "max weight in mset = " << items[0].wt);
    }

    AssertRel(matches_estimated,>=,matches_lower_bound);
    AssertRel(matches_estimated,<=,matches_upper_bound);

    AssertRel(uncollapsed_estimated,>=,uncollapsed_lower_bound);
    AssertRel(uncollapsed_estimated,<=,uncollapsed_upper_bound);
    // We may need to qualify any collapse_count to see if the highest weight
    // collapsed item would have qualified percent_cutoff
    // We WILL need to restore collapse_count to the mset by taking from
    // collapse_tab; this is what comes of copying around whole objects
    // instead of taking references, we find it hard to update collapse_count
    // of an item that has already been pushed-back as we don't know where it
    // is any more.  If we keep or find references we won't need to mess with
    // is_heap so much maybe?
    if (!items.empty() && collapser && !collapser.empty()) {
        // Nicked this formula from above.
        double min_wt = 0.0;
        if (percent_scale > 0.0)
            min_wt = percent_cutoff_factor / percent_scale;
        Xapian::doccount entries = collapser.entries();
        vector<Xapian::Internal::MSetItem>::iterator i;
        for (i = items.begin(); i != items.end(); ++i) {
            // Skip entries without a collapse key.
            if (i->collapse_key.empty()) continue;

            // Set collapse_count appropriately.
            i->collapse_count = collapser.get_collapse_count(i->collapse_key, percent_cutoff, min_wt);
            if (--entries == 0) {
                // Stop once we've processed all items with collapse keys.
                break;
            }
        }
    }
    mset.internal = new Xapian::MSet::Internal(
                                       first,
                                       matches_upper_bound,
                                       matches_lower_bound,
                                       matches_estimated,
                                       uncollapsed_upper_bound,
                                       uncollapsed_lower_bound,
                                       uncollapsed_estimated,
                                       max_possible, greatest_wt, items,
                                       percent_scale * 100.0);
}