2 * @brief Return docs containing terms within a specified window.
4 /* Copyright (C) 2006,2007,2009,2010,2011,2014,2015,2017 Olly Betts
5 * Copyright (C) 2007 Lemur Consulting Ltd
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
24 #include "nearpostlist.h"
27 #include "backends/positionlist.h"
36 NearPostList::NearPostList(PostList
*source_
,
37 Xapian::termpos window_
,
38 const vector
<PostList
*>::const_iterator
&terms_begin
,
39 const vector
<PostList
*>::const_iterator
&terms_end
)
40 : SelectPostList(source_
), window(window_
), terms(terms_begin
, terms_end
)
42 size_t n
= terms
.size();
44 poslists
= new PositionList
*[n
];
47 NearPostList::~NearPostList()
53 bool operator()(const PostList
* a
, const PostList
* b
) const {
54 return a
->get_wdf() < b
->get_wdf();
59 bool operator()(const PositionList
* a
, const PositionList
* b
) const {
60 return a
->get_position() > b
->get_position();
65 NearPostList::test_doc()
67 LOGCALL(MATCH
, bool, "NearPostList::test_doc", NO_ARGS
);
69 // Sort to put least frequent terms first, to try to minimise the number of
70 // position lists we need to read if there are no matches.
72 // We use wdf as a proxy for the length of the position lists, since we'd
73 // need to read each position list to find its length and we're trying to
74 // avoid having to read them all if we can.
75 sort(terms
.begin(), terms
.end(), TermCmp());
77 poslists
[0] = terms
[0]->read_position_list();
78 if (!poslists
[0]->next())
81 Xapian::termpos last
= poslists
[0]->get_position();
82 PositionList
** end
= poslists
+ 1;
85 if (last
- poslists
[0]->get_position() < window
) {
86 if (size_t(end
- poslists
) != terms
.size()) {
87 // We haven't started all the position lists yet, so start the
90 terms
[end
- poslists
]->read_position_list();
95 if (!posl
->skip_to(last
- window
+ 1))
98 Xapian::termpos pos
= posl
->get_position();
99 if (pos
> last
) last
= pos
;
101 push_heap
<PositionList
**, Cmp
>(poslists
, end
, Cmp());
105 // We have all the terms within the specified window, but some may
106 // be at the same position (in particular if the same term is
107 // listed twice). So we work through the terms in ascending
108 // position order, and each time we find one with a duplicate
109 // position, we advance it - if that pushes us out of the window,
110 // we return to the outer loop, otherwise we reinsert it into the
111 // heap at its new position and continue to look for duplicates
112 // we need to adjust.
113 PositionList
** i
= end
;
114 pop_heap
<PositionList
**, Cmp
>(poslists
, i
, Cmp());
115 Xapian::termpos pos
= (*--i
)->get_position();
117 pop_heap
<PositionList
**, Cmp
>(poslists
, i
, Cmp());
118 if ((*--i
)->get_position() == pos
) {
121 Xapian::termpos newpos
= (*i
)->get_position();
122 if (newpos
- end
[-1]->get_position() >= window
) {
123 // No longer fits in the window.
127 push_heap
<PositionList
**, Cmp
>(poslists
, ++i
, Cmp());
130 pos
= (*i
)->get_position();
132 Assert(pos
- end
[-1]->get_position() < window
);
137 make_heap
<PositionList
**, Cmp
>(poslists
, end
, Cmp());
140 pop_heap
<PositionList
**, Cmp
>(poslists
, end
, Cmp());
141 if (!end
[-1]->skip_to(last
- window
+ 1))
143 last
= max(last
, end
[-1]->get_position());
144 push_heap
<PositionList
**, Cmp
>(poslists
, end
, Cmp());
151 NearPostList::get_wdf() const
153 // Calculate an estimate for the wdf of a near postlist.
155 // The natural definition of the wdf for a near postlist is the number of
156 // times the terms occur in a group within the windowsize in the document -
157 // in other words, the number of times a routine like do_test() could find
158 // a match, if it kept going after finding the first one. However,
159 // calculating this would be expensive (in CPU time at least - the position
160 // lists are probably already read from disk), and the benefit is dubious
161 // (given that the estimate here is probably only going to be used for
162 // calculating the weight for synonym postlists, and it's not clear that
163 // the above definition is exactly appropriate in this situation, anyway).
165 // Instead, we could do an estimate of this value, based on the lengths
166 // of the position lists. Rather than having to open the position lists,
167 // we could use the wdfs, which will be the same value unless the wdfs have
168 // been artificially inflated - in which case we probably want to use the
169 // inflated value anyway (it will have been inflated to represent the fact
170 // that this term is more important than others, so we should make use of
173 // A reasonable way to calculate an estimate would be to consider the
174 // document to be split into W (overlapping) windows, each 1 term apart,
175 // and of the same length as the windowsize used for the phrase. Then,
176 // calculate the probability that each term is found in this window (as
177 // Prob = wdf / doclen), multiply these probabilities to get the
178 // probability that they're all found in the window, and then multiply by
179 // the windowsize again to get an estimate.
181 // However, this requires the doclen, which won't always be available (ie,
182 // if the weighting scheme doesn't use it, we won't have read it from
185 // Rather than force an access to disk to get the doclen, we use an even
186 // rougher estimate: the minimum wdf in the sub-lists. This is actually
187 // the upper bound for the wdf (since the phrase can only occur the same
188 // number of times as the least frequent of its constituent terms).
190 // If this estimate proves to give bad results, we can revisit this and try
191 // a better approximation.
192 vector
<PostList
*>::const_iterator i
= terms
.begin();
193 Xapian::termcount wdf
= (*i
)->get_wdf();
194 while (++i
!= terms
.end()) {
195 wdf
= min(wdf
, (*i
)->get_wdf());
201 NearPostList::get_termfreq_est() const
203 // It's hard to estimate how many times the postlist will match as it
204 // depends a lot on the terms and window, but usually it will occur
205 // significantly less often than the individual terms.
206 return source
->get_termfreq_est() / 2;
210 NearPostList::get_termfreq_est_using_stats(
211 const Xapian::Weight::Internal
& stats
) const
213 LOGCALL(MATCH
, TermFreqs
, "NearPostList::get_termfreq_est_using_stats", stats
);
214 // No idea how to estimate this - do the same as get_termfreq_est() for
216 TermFreqs
result(source
->get_termfreq_est_using_stats(stats
));
217 result
.termfreq
/= 2;
218 result
.reltermfreq
/= 2;
219 result
.collfreq
/= 2;
224 NearPostList::get_description() const
229 m
+= source
->get_description();