2 * @brief SubMatch class for a local database.
4 /* Copyright (C) 2006,2007,2009,2010,2011,2013,2014,2015,2016,2018,2020 Olly Betts
5 * Copyright (C) 2007,2008,2009 Lemur Consulting Ltd
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
24 #include "localsubmatch.h"
26 #include "backends/database.h"
28 #include "api/emptypostlist.h"
29 #include "extraweightpostlist.h"
30 #include "api/leafpostlist.h"
32 #include "queryoptimiser.h"
33 #include "synonympostlist.h"
34 #include "api/termlist.h"
35 #include "weight/weightinternal.h"
37 #include "xapian/error.h"
45 /** Xapian::Weight subclass which adds laziness.
47 * For terms from a wildcard when remote databases are involved, we need to
48 * delay calling init_() on the weight object until the stats for the terms
49 * from the wildcard have been collated.
// LazyWeight: a Xapian::Weight subclass used as a stand-in for a real weight
// object whose initialisation is being deferred.
//
// Of the member functions defined later in this file, only name() and
// get_maxpart() do real work; clone(), init(), serialise(), unserialise(),
// get_sumpart(), get_sumextra() and get_maxextra() throw
// Xapian::InvalidOperationError.
//
// NOTE(review): this view of the declaration is incomplete (access
// specifiers, some members and the end of the constructor are not visible),
// so the comments below only describe what can be seen here.
51 class LazyWeight
: public Xapian::Weight
{
// The wrapped weight object; handed to resolve_lazy_termweight() by
// get_maxpart() and queried for its name by name().
54 Xapian::Weight
* real_wt
;
// Statistics object pointer, forwarded to resolve_lazy_termweight().
56 Xapian::Weight::Internal
* stats
;
// Forwarded to resolve_lazy_termweight() (presumably the query length -
// confirm against the constructor's caller).
58 Xapian::termcount qlen
;
// Forwarded to resolve_lazy_termweight() (presumably the within-query
// frequency of the term - confirm against the constructor's caller).
60 Xapian::termcount wqf
;
// Xapian::Weight interface; both definitions in this file throw.
64 LazyWeight
* clone() const;
66 void init(double factor_
);
// Constructor taking the postlist, the wrapped weight, the stats, and the
// qlen/wqf values. NOTE(review): the rest of the parameter list and the
// body are not visible here.
69 LazyWeight(LeafPostList
* pl_
,
70 Xapian::Weight
* real_wt_
,
71 Xapian::Weight::Internal
* stats_
,
72 Xapian::termcount qlen_
,
73 Xapian::termcount wqf__
,
// Returns a description starting "LazyWeight(" plus the wrapped weight's name.
83 std::string
name() const;
// Serialisation entry points; both definitions in this file throw.
85 std::string
serialise() const;
86 LazyWeight
* unserialise(const std::string
& serialised
) const;
// Definition in this file throws.
88 double get_sumpart(Xapian::termcount wdf
,
89 Xapian::termcount doclen
,
90 Xapian::termcount uniqterms
) const;
// The one weight query that is supported: delegates to
// pl->resolve_lazy_termweight().
91 double get_maxpart() const;
// Definitions in this file throw.
93 double get_sumextra(Xapian::termcount doclen
,
94 Xapian::termcount uniqterms
) const;
95 double get_maxextra() const;
// Cloning a LazyWeight is not supported: the visible body throws
// Xapian::InvalidOperationError naming this method.
99 LazyWeight::clone() const
101 throw Xapian::InvalidOperationError("LazyWeight::clone()");
// init() is not supported on a LazyWeight: the visible body throws
// Xapian::InvalidOperationError naming this method.
//
// @param factor_  No use of this parameter is visible in this view.
105 LazyWeight::init(double factor_
)
108 throw Xapian::InvalidOperationError("LazyWeight::init()");
// Build a description of this object: the literal prefix "LazyWeight("
// followed by the name() of the wrapped weight object.
//
// NOTE(review): the end of this function (anything appended after the
// wrapped name, and the return) is not visible in this view.
112 LazyWeight::name() const
114 string desc
= "LazyWeight(";
115 desc
+= real_wt
->name();
// Serialising a LazyWeight is not supported: the visible body throws
// Xapian::InvalidOperationError naming this method.
121 LazyWeight::serialise() const
123 throw Xapian::InvalidOperationError("LazyWeight::serialise()");
// Unserialising into a LazyWeight is not supported: the visible body throws
// Xapian::InvalidOperationError naming this method. The string parameter is
// left unnamed because it is not used.
127 LazyWeight::unserialise(const string
&) const
129 throw Xapian::InvalidOperationError("LazyWeight::unserialise()");
// get_sumpart() is not supported on a LazyWeight: the visible body throws
// Xapian::InvalidOperationError naming this method.
//
// @param wdf, doclen, uniqterms  No use of these parameters is visible in
//                                this view.
133 LazyWeight::get_sumpart(Xapian::termcount wdf
,
134 Xapian::termcount doclen
,
135 Xapian::termcount uniqterms
) const
140 throw Xapian::InvalidOperationError("LazyWeight::get_sumpart()");
// get_sumextra() is not supported on a LazyWeight: the visible body throws
// Xapian::InvalidOperationError naming this method.
//
// @param doclen, uniqterms  No use of these parameters is visible in this
//                           view.
144 LazyWeight::get_sumextra(Xapian::termcount doclen
,
145 Xapian::termcount uniqterms
) const
149 throw Xapian::InvalidOperationError("LazyWeight::get_sumextra()");
// The one weight query a LazyWeight answers.
//
// Hands the stored weight object, stats, qlen, wqf and factor to the
// postlist's resolve_lazy_termweight() and returns whatever that call
// returns.
153 LazyWeight::get_maxpart() const
155 // This gets called first for the case we care about.
156 return pl
->resolve_lazy_termweight(real_wt
, stats
, qlen
, wqf
, factor
);
// get_maxextra() is not supported on a LazyWeight: the visible body throws
// Xapian::InvalidOperationError naming this method.
160 LazyWeight::get_maxextra() const
162 throw Xapian::InvalidOperationError("LazyWeight::get_maxextra()");
// First phase of a match for a local database: add this database's
// statistics into the shared totals.
//
// @param nowait       Only seen being logged in this view.
// @param total_stats  Stats object that accumulate_stats() is called on,
//                     passing this submatch's database (*db) and rset.
//
// NOTE(review): LOGCALL declares a bool return type, but the return
// statement is not visible in this view.
166 LocalSubMatch::prepare_match(bool nowait
,
167 Xapian::Weight::Internal
& total_stats
)
169 LOGCALL(MATCH
, bool, "LocalSubMatch::prepare_match", nowait
| total_stats
);
172 total_stats
.accumulate_stats(*db
, rset
);
// Start the match: remember where the collated statistics live so that they
// can be used later when the query tree is built.
//
// @param first, maxitems  Only seen being logged in this view.
// @param check_at_least   Explicitly cast to void below, i.e. not otherwise
//                         used in the visible code.
// @param total_stats      Its address is stored in the `stats` member, so
//                         the caller's object needs to stay valid for as
//                         long as `stats` is used.
177 LocalSubMatch::start_match(Xapian::doccount first
,
178 Xapian::doccount maxitems
,
179 Xapian::doccount check_at_least
,
180 Xapian::Weight::Internal
& total_stats
)
182 LOGCALL_VOID(MATCH
, "LocalSubMatch::start_match", first
| maxitems
| check_at_least
| total_stats
);
185 (void)check_at_least
;
186 // Store a pointer to the total stats to use when building the Query tree.
187 stats
= &total_stats
;
// Build the postlist tree for this submatch's query.
//
// @param matcher          Passed to the QueryOptimiser and, if needed, to
//                         the ExtraWeightPostList.
// @param total_subqs_ptr  Out-parameter: receives opt.get_total_subqs()
//                         once the tree has been built.
// (third parameter)       Unnamed Xapian::Weight::Internal reference; not
//                         used here - the `stats` member is used instead.
//
// Returns a new EmptyPostList when the query is empty or the database has no
// documents. Otherwise builds the tree via query.internal->postlist() and,
// if the weighting scheme reports a non-zero get_maxextra(), wraps it in an
// ExtraWeightPostList.
//
// NOTE(review): the final return of `pl` is not visible in this view.
191 LocalSubMatch::get_postlist(MultiMatch
* matcher
,
192 Xapian::termcount
* total_subqs_ptr
,
193 Xapian::Weight::Internal
&)
195 LOGCALL(MATCH
, PostList
*, "LocalSubMatch::get_postlist", matcher
| total_subqs_ptr
| Literal("[total_subqs]"));
197 if (query
.empty() || db
->get_doccount() == 0)
198 RETURN(new EmptyPostList
); // MatchNothing
200 // Build the postlist tree for the query. This calls
201 // LocalSubMatch::open_post_list() for each term in the query.
204 QueryOptimiser
opt(*db
, *this, matcher
, shard_index
);
// A factor of 0.0 is used when the weighting scheme reports itself as a
// boolean weight, 1.0 otherwise.
205 double factor
= wt_factory
->is_bool_weight_() ? 0.0 : 1.0;
206 pl
= query
.internal
->postlist(&opt
, factor
);
207 *total_subqs_ptr
= opt
.get_total_subqs();
// Clone the weighting scheme into an AutoPtr so that it is cleaned up
// unless ownership is released to the ExtraWeightPostList below.
210 AutoPtr
<Xapian::Weight
> extra_wt(wt_factory
->clone());
211 // Only uses term-independent stats.
212 extra_wt
->init_(*stats
, qlen
);
213 if (extra_wt
->get_maxextra() != 0.0) {
214 // There's a term-independent weight contribution, so we combine the
215 // postlist tree with an ExtraWeightPostList which adds in this contribution.
217 pl
= new ExtraWeightPostList(pl
, extra_wt
.release(), matcher
);
// Wrap an OR-style postlist in a SynonymPostList and give it a weight object
// initialised from estimated term frequencies.
//
// @param or_pl    The postlist to wrap; also the source of the frequency
//                 estimates.
// @param matcher  Passed through to the SynonymPostList constructor.
// NOTE(review): the remaining parameters (`factor` and `wdf_disjoint` are
// logged below) are not visible in this view of the signature.
//
// Returns the released SynonymPostList pointer. When or_pl reports a maximum
// term frequency of 0 it is treated as empty; what is returned in that case
// is not visible here.
224 LocalSubMatch::make_synonym_postlist(PostList
* or_pl
, MultiMatch
* matcher
,
228 LOGCALL(MATCH
, PostList
*, "LocalSubMatch::make_synonym_postlist", or_pl
| matcher
| factor
| wdf_disjoint
);
229 if (rare(or_pl
->get_termfreq_max() == 0)) {
230 // or_pl is an EmptyPostList or equivalent.
233 LOGVALUE(MATCH
, or_pl
->get_termfreq_est());
// Lower bound on document length for this database, handed to the
// SynonymPostList constructor.
234 Xapian::termcount len_lb
= db
->get_doclength_lower_bound();
// Both the postlist and the cloned weight are held in AutoPtrs until
// ownership is handed over with release().
235 AutoPtr
<SynonymPostList
> res(new SynonymPostList(or_pl
, matcher
, len_lb
,
237 AutoPtr
<Xapian::Weight
> wt(wt_factory
->clone());
240 // Avoid calling get_termfreq_est_using_stats() if the database is empty
241 // so we don't need to special case that repeatedly when implementing it.
242 // FIXME: it would be nicer to handle an empty database higher up, though
243 // we need to catch the case where all the non-empty subdatabases have
244 // failed, so we can't just push this right up to the start of get_mset().
245 if (usual(stats
->collection_size
!= 0)) {
246 freqs
= or_pl
->get_termfreq_est_using_stats(*stats
);
// Initialise the weight with the estimated termfreq, reltermfreq and
// collfreq. NOTE(review): whether this call sits inside or after the
// collection_size check is not visible in this view.
248 wt
->init_(*stats
, qlen
, factor
,
249 freqs
.termfreq
, freqs
.reltermfreq
, freqs
.collfreq
);
251 res
->set_weight(wt
.release());
252 RETURN(res
.release());
256 LocalSubMatch::open_post_list(const string
& term
,
257 Xapian::termcount wqf
,
261 QueryOptimiser
* qopt
,
264 LOGCALL(MATCH
, LeafPostList
*, "LocalSubMatch::open_post_list", term
| wqf
| factor
| need_positions
| qopt
| lazy_weight
);
266 bool weighted
= (factor
!= 0.0 && !term
.empty());
268 LeafPostList
* pl
= NULL
;
269 if (!term
.empty() && !need_positions
) {
270 if ((!weighted
&& !in_synonym
) ||
271 !wt_factory
->get_sumpart_needs_wdf_()) {
272 Xapian::doccount sub_tf
;
273 db
->get_freqs(term
, &sub_tf
, NULL
);
274 if (sub_tf
== qopt
->db_size
) {
275 // If we're not going to use the wdf or term positions, and the
276 // term indexes all documents, we can replace it with the
277 // MatchAll postlist, which is especially efficient if there
278 // are no gaps in the docids.
279 pl
= db
->open_post_list(string());
280 // Set the term name so the postlist looks up the correct term
281 // frequencies - this is necessary if the weighting scheme
282 // needs collection frequency or reltermfreq (termfreq would be
283 // correct anyway since it's just the collection size in this
291 const LeafPostList
* hint
= qopt
->get_hint_postlist();
293 pl
= hint
->open_nearby_postlist(term
);
295 pl
= db
->open_post_list(term
);
296 qopt
->set_hint_postlist(pl
);
300 auto res
= stats
->termfreqs
.emplace(term
, TermFreqs());
302 // Term came from a wildcard, but the same term may be elsewhere
303 // in the query so only accumulate its TermFreqs if emplace()
304 // created a new element.
306 &res
.first
->second
.termfreq
,
307 &res
.first
->second
.collfreq
);
312 Xapian::Weight
* wt
= wt_factory
->clone();
314 wt
->init_(*stats
, qlen
, term
, wqf
, factor
, pl
);
315 if (pl
->get_termfreq() > 0)
316 stats
->set_max_part(term
, wt
->get_maxpart());
318 // Delay initialising the actual weight object, so that we can
319 // gather stats for the terms lazily expanded from a wildcard
320 // (needed for the remote database case).
321 wt
= new LazyWeight(pl
, wt
, stats
, qlen
, wqf
, factor
);
323 pl
->set_termweight(wt
);