scriptindex: Fix weird error cases
[xapian.git] / xapian-core / matcher / nearpostlist.cc
blob26e3532270b58283ae058f3f45dc6692907a968e
1 /** @file
2 * @brief Return docs containing terms within a specified window.
3 */
4 /* Copyright (C) 2006,2007,2009,2010,2011,2014,2015,2017 Olly Betts
5 * Copyright (C) 2007 Lemur Consulting Ltd
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
22 #include <config.h>
24 #include "nearpostlist.h"
26 #include "debuglog.h"
27 #include "backends/positionlist.h"
28 #include "omassert.h"
29 #include "str.h"
31 #include <algorithm>
32 #include <vector>
34 using namespace std;
36 NearPostList::NearPostList(PostList *source_,
37 Xapian::termpos window_,
38 const vector<PostList*>::const_iterator &terms_begin,
39 const vector<PostList*>::const_iterator &terms_end)
40 : SelectPostList(source_), window(window_), terms(terms_begin, terms_end)
42 size_t n = terms.size();
43 Assert(n > 1);
44 poslists = new PositionList*[n];
47 NearPostList::~NearPostList()
49 delete [] poslists;
52 struct TermCmp {
53 bool operator()(const PostList * a, const PostList * b) const {
54 return a->get_wdf() < b->get_wdf();
58 struct Cmp {
59 bool operator()(const PositionList * a, const PositionList * b) const {
60 return a->get_position() > b->get_position();
64 bool
65 NearPostList::test_doc()
67 LOGCALL(MATCH, bool, "NearPostList::test_doc", NO_ARGS);
69 // Sort to put least frequent terms first, to try to minimise the number of
70 // position lists we need to read if there are no matches.
72 // We use wdf as a proxy for the length of the position lists, since we'd
73 // need to read each position list to find its length and we're trying to
74 // avoid having to read them all if we can.
75 sort(terms.begin(), terms.end(), TermCmp());
77 poslists[0] = terms[0]->read_position_list();
78 if (!poslists[0]->next())
79 RETURN(false);
81 Xapian::termpos last = poslists[0]->get_position();
82 PositionList ** end = poslists + 1;
84 while (true) {
85 if (last - poslists[0]->get_position() < window) {
86 if (size_t(end - poslists) != terms.size()) {
87 // We haven't started all the position lists yet, so start the
88 // next one.
89 PositionList * posl =
90 terms[end - poslists]->read_position_list();
91 if (last < window) {
92 if (!posl->next())
93 RETURN(false);
94 } else {
95 if (!posl->skip_to(last - window + 1))
96 RETURN(false);
98 Xapian::termpos pos = posl->get_position();
99 if (pos > last) last = pos;
100 *end++ = posl;
101 push_heap<PositionList **, Cmp>(poslists, end, Cmp());
102 continue;
105 // We have all the terms within the specified window, but some may
106 // be at the same position (in particular if the same term is
107 // listed twice). So we work through the terms in ascending
108 // position order, and each time we find one with a duplicate
109 // position, we advance it - if that pushes us out of the window,
110 // we return to the outer loop, otherwise we reinsert it into the
111 // heap at its new position and continue to look for duplicates
112 // we need to adjust.
113 PositionList ** i = end;
114 pop_heap<PositionList **, Cmp>(poslists, i, Cmp());
115 Xapian::termpos pos = (*--i)->get_position();
116 while (true) {
117 pop_heap<PositionList **, Cmp>(poslists, i, Cmp());
118 if ((*--i)->get_position() == pos) {
119 if (!(*i)->next())
120 RETURN(false);
121 Xapian::termpos newpos = (*i)->get_position();
122 if (newpos - end[-1]->get_position() >= window) {
123 // No longer fits in the window.
124 last = newpos;
125 break;
127 push_heap<PositionList **, Cmp>(poslists, ++i, Cmp());
128 continue;
130 pos = (*i)->get_position();
131 if (i == poslists) {
132 Assert(pos - end[-1]->get_position() < window);
133 RETURN(true);
137 make_heap<PositionList **, Cmp>(poslists, end, Cmp());
138 continue;
140 pop_heap<PositionList **, Cmp>(poslists, end, Cmp());
141 if (!end[-1]->skip_to(last - window + 1))
142 break;
143 last = max(last, end[-1]->get_position());
144 push_heap<PositionList **, Cmp>(poslists, end, Cmp());
147 RETURN(false);
150 Xapian::termcount
151 NearPostList::get_wdf() const
153 // Calculate an estimate for the wdf of a near postlist.
155 // The natural definition of the wdf for a near postlist is the number of
156 // times the terms occur in a group within the windowsize in the document -
157 // in other words, the number of times a routine like do_test() could find
158 // a match, if it kept going after finding the first one. However,
159 // calculating this would be expensive (in CPU time at least - the position
160 // lists are probably already read from disk), and the benefit is dubious
161 // (given that the estimate here is probably only going to be used for
162 // calculating the weight for synonym postlists, and it's not clear that
163 // the above definition is exactly appropriate in this situation, anyway).
165 // Instead, we could do an estimate of this value, based on the lengths
166 // of the position lists. Rather than having to open the position lists,
167 // we could use the wdfs, which will be the same value unless the wdfs have
168 // been artificially inflated - in which case we probably want to use the
169 // inflated value anyway (it will have been inflated to represent the fact
170 // that this term is more important than others, so we should make use of
171 // this hint).
173 // A reasonable way to calculate an estimate would be to consider the
174 // document to be split into W (overlapping) windows, each 1 term apart,
175 // and of the same length as the windowsize used for the phrase. Then,
176 // calculate the probability that each term is found in this window (as
177 // Prob = wdf / doclen), multiply these probabilities to get the
178 // probability that they're all found in the window, and then multiply by
179 // the windowsize again to get an estimate.
181 // However, this requires the doclen, which won't always be available (ie,
182 // if the weighting scheme doesn't use it, we won't have read it from
183 // disk).
185 // Rather than force an access to disk to get the doclen, we use an even
186 // rougher estimate: the minimum wdf in the sub-lists. This is actually
187 // the upper bound for the wdf (since the phrase can only occur the same
188 // number of times as the least frequent of its constituent terms).
190 // If this estimate proves to give bad results, we can revisit this and try
191 // a better approximation.
192 vector<PostList *>::const_iterator i = terms.begin();
193 Xapian::termcount wdf = (*i)->get_wdf();
194 while (++i != terms.end()) {
195 wdf = min(wdf, (*i)->get_wdf());
197 return wdf;
200 Xapian::doccount
201 NearPostList::get_termfreq_est() const
203 // It's hard to estimate how many times the postlist will match as it
204 // depends a lot on the terms and window, but usually it will occur
205 // significantly less often than the individual terms.
206 return source->get_termfreq_est() / 2;
209 TermFreqs
210 NearPostList::get_termfreq_est_using_stats(
211 const Xapian::Weight::Internal & stats) const
213 LOGCALL(MATCH, TermFreqs, "NearPostList::get_termfreq_est_using_stats", stats);
214 // No idea how to estimate this - do the same as get_termfreq_est() for
215 // now.
216 TermFreqs result(source->get_termfreq_est_using_stats(stats));
217 result.termfreq /= 2;
218 result.reltermfreq /= 2;
219 result.collfreq /= 2;
220 RETURN(result);
223 string
224 NearPostList::get_description() const
226 string m = "(Near ";
227 m += str(window);
228 m += ' ';
229 m += source->get_description();
230 m += ")";
231 return m;