Update NEWS from git log
[xapian.git] / xapian-core / matcher / multiandpostlist.cc
blob4f0d9d162b205c2d89304d5fe63eec858c53ec5c
1 /** @file
2 * @brief N-way AND postlist
3 */
4 /* Copyright (C) 2007,2009,2011,2012,2015,2017 Olly Betts
5 * Copyright (C) 2009 Lemur Consulting Ltd
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2 of the
10 * License, or (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
22 #include <config.h>
24 #include "multiandpostlist.h"
25 #include "omassert.h"
26 #include "debuglog.h"
28 using namespace std;
30 void
31 MultiAndPostList::allocate_plist_and_max_wt()
33 plist = new PostList * [n_kids];
34 try {
35 max_wt = new double [n_kids]();
36 } catch (...) {
37 delete [] plist;
38 plist = NULL;
39 throw;
43 MultiAndPostList::~MultiAndPostList()
45 if (plist) {
46 for (size_t i = 0; i < n_kids; ++i) {
47 delete plist[i];
49 delete [] plist;
51 delete [] max_wt;
54 Xapian::doccount
55 MultiAndPostList::get_termfreq_min() const
57 // The number of matching documents is minimised when we have the minimum
58 // number of matching documents from each sub-postlist, and these are
59 // maximally disjoint.
60 Xapian::doccount sum = plist[0]->get_termfreq_min();
61 if (sum) {
62 for (size_t i = 1; i < n_kids; ++i) {
63 Xapian::doccount sum_old = sum;
64 sum += plist[i]->get_termfreq_min();
65 // If sum < sum_old, the calculation overflowed and the true sum
66 // must be > db_size. Since we added a value <= db_size,
67 // subtracting db_size must un-overflow us.
68 if (sum >= sum_old && sum <= db_size) {
69 // It's possible there's no overlap.
70 return 0;
72 sum -= db_size;
74 AssertRelParanoid(sum,<=,MultiAndPostList::get_termfreq_est());
76 return sum;
79 Xapian::doccount
80 MultiAndPostList::get_termfreq_max() const
82 // We can't match more documents than the least of our sub-postlists.
83 Xapian::doccount result = plist[0]->get_termfreq_max();
84 for (size_t i = 1; i < n_kids; ++i) {
85 Xapian::doccount tf = plist[i]->get_termfreq_max();
86 if (tf < result) result = tf;
88 return result;
91 Xapian::doccount
92 MultiAndPostList::get_termfreq_est() const
94 LOGCALL(MATCH, Xapian::doccount, "MultiAndPostList::get_termfreq_est", NO_ARGS);
95 // We shortcut an empty shard and avoid creating a postlist tree for it.
96 Assert(db_size);
97 // We calculate the estimate assuming independence. With this assumption,
98 // the estimate is the product of the estimates for the sub-postlists
99 // divided by db_size (n_kids - 1) times.
100 double result = plist[0]->get_termfreq_est();
101 for (size_t i = 1; i < n_kids; ++i) {
102 result = (result * plist[i]->get_termfreq_est()) / db_size;
104 return static_cast<Xapian::doccount>(result + 0.5);
107 TermFreqs
108 MultiAndPostList::get_termfreq_est_using_stats(
109 const Xapian::Weight::Internal & stats) const
111 LOGCALL(MATCH, TermFreqs, "MultiAndPostList::get_termfreq_est_using_stats", stats);
112 // We calculate the estimate assuming independence. With this assumption,
113 // the estimate is the product of the estimates for the sub-postlists
114 // divided by db_size (n_kids - 1) times.
115 TermFreqs freqs(plist[0]->get_termfreq_est_using_stats(stats));
117 double freqest = double(freqs.termfreq);
118 double relfreqest = double(freqs.reltermfreq);
119 double collfreqest = double(freqs.collfreq);
121 // Our caller should have ensured this.
122 Assert(stats.collection_size);
124 for (size_t i = 1; i < n_kids; ++i) {
125 freqs = plist[i]->get_termfreq_est_using_stats(stats);
127 // If the collection is empty, freqest should be 0 already, so leave
128 // it alone.
129 freqest = (freqest * freqs.termfreq) / stats.collection_size;
130 if (usual(stats.total_length != 0)) {
131 collfreqest = (collfreqest * freqs.collfreq) / stats.total_length;
134 // If the rset is empty, relfreqest should be 0 already, so leave
135 // it alone.
136 if (stats.rset_size != 0)
137 relfreqest = (relfreqest * freqs.reltermfreq) / stats.rset_size;
140 RETURN(TermFreqs(static_cast<Xapian::doccount>(freqest + 0.5),
141 static_cast<Xapian::doccount>(relfreqest + 0.5),
142 static_cast<Xapian::termcount>(collfreqest + 0.5)));
145 double
146 MultiAndPostList::get_maxweight() const
148 return max_total;
151 Xapian::docid
152 MultiAndPostList::get_docid() const
154 return did;
157 Xapian::termcount
158 MultiAndPostList::get_doclength() const
160 Assert(did);
161 Xapian::termcount doclength = plist[0]->get_doclength();
162 for (size_t i = 1; i < n_kids; ++i) {
163 AssertEq(doclength, plist[i]->get_doclength());
165 return doclength;
168 Xapian::termcount
169 MultiAndPostList::get_unique_terms() const
171 Assert(did);
172 Xapian::termcount unique_terms = plist[0]->get_unique_terms();
173 for (size_t i = 1; i < n_kids; ++i) {
174 AssertEq(unique_terms, plist[i]->get_unique_terms());
176 return unique_terms;
179 double
180 MultiAndPostList::get_weight() const
182 Assert(did);
183 double result = 0;
184 for (size_t i = 0; i < n_kids; ++i) {
185 result += plist[i]->get_weight();
187 return result;
190 bool
191 MultiAndPostList::at_end() const
193 return (did == 0);
196 double
197 MultiAndPostList::recalc_maxweight()
199 max_total = 0.0;
200 for (size_t i = 0; i < n_kids; ++i) {
201 double new_max = plist[i]->recalc_maxweight();
202 max_wt[i] = new_max;
203 max_total += new_max;
205 return max_total;
208 PostList *
209 MultiAndPostList::find_next_match(double w_min)
211 advanced_plist0:
212 if (plist[0]->at_end()) {
213 did = 0;
214 return NULL;
216 did = plist[0]->get_docid();
217 for (size_t i = 1; i < n_kids; ++i) {
218 bool valid;
219 check_helper(i, did, w_min, valid);
220 if (!valid) {
221 next_helper(0, w_min);
222 goto advanced_plist0;
224 if (plist[i]->at_end()) {
225 did = 0;
226 return NULL;
228 Xapian::docid new_did = plist[i]->get_docid();
229 if (new_did != did) {
230 skip_to_helper(0, new_did, w_min);
231 goto advanced_plist0;
234 return NULL;
237 PostList *
238 MultiAndPostList::next(double w_min)
240 next_helper(0, w_min);
241 return find_next_match(w_min);
244 PostList *
245 MultiAndPostList::skip_to(Xapian::docid did_min, double w_min)
247 skip_to_helper(0, did_min, w_min);
248 return find_next_match(w_min);
251 std::string
252 MultiAndPostList::get_description() const
254 string desc("(");
255 desc += plist[0]->get_description();
256 for (size_t i = 1; i < n_kids; ++i) {
257 desc += " AND ";
258 desc += plist[i]->get_description();
260 desc += ')';
261 return desc;
264 Xapian::termcount
265 MultiAndPostList::get_wdf() const
267 Xapian::termcount totwdf = 0;
268 for (size_t i = 0; i < n_kids; ++i) {
269 totwdf += plist[i]->get_wdf();
271 return totwdf;
274 Xapian::termcount
275 MultiAndPostList::count_matching_subqs() const
277 Xapian::termcount total = 0;
278 for (size_t i = 0; i < n_kids; ++i) {
279 total += plist[i]->count_matching_subqs();
281 return total;
284 void
285 MultiAndPostList::gather_position_lists(OrPositionList* orposlist)
287 for (size_t i = 0; i < n_kids; ++i) {
288 plist[i]->gather_position_lists(orposlist);