xapian-core/matcher/nearpostlist.cc

   1 /** @file
   2  * @brief Return docs containing terms within a specified window.
   3  */
   4 /* Copyright (C) 2006,2007,2009,2010,2011,2014,2015,2017 Olly Betts
   5  * Copyright (C) 2007 Lemur Consulting Ltd
   6  *
   7  * This program is free software; you can redistribute it and/or modify
   8  * it under the terms of the GNU General Public License as published by
   9  * the Free Software Foundation; either version 2 of the License, or
  10  * (at your option) any later version.
  11  *
  12  * This program is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15  * GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with this program; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
  20  */
  21
  22 #include <config.h>
  23
  24 #include "nearpostlist.h"
  25
  26 #include "debuglog.h"
  27 #include "backends/positionlist.h"
  28 #include "omassert.h"
  29 #include "str.h"
  30
  31 #include <algorithm>
  32 #include <vector>
  33
  34 using namespace std;
  35
  36 NearPostList::NearPostList(PostList *source_,
  37                            Xapian::termpos window_,
  38                            const vector<PostList*>::const_iterator &terms_begin,
  39                            const vector<PostList*>::const_iterator &terms_end)
  40     : SelectPostList(source_), window(window_), terms(terms_begin, terms_end)
  41 {
  42     size_t n = terms.size();
  43     Assert(n > 1);
  44     poslists = new PositionList*[n];
  45 }
  46
  47 NearPostList::~NearPostList()
  48 {
  49     delete [] poslists;
  50 }
  51
  52 struct TermCmp {
  53     bool operator()(const PostList * a, const PostList * b) const {
  54         return a->get_wdf() < b->get_wdf();
  55     }
  56 };
  57
  58 struct Cmp {
  59     bool operator()(const PositionList * a, const PositionList * b) const {
  60         return a->get_position() > b->get_position();
  61     }
  62 };
  63
  64 bool
  65 NearPostList::test_doc()
  66 {
  67     LOGCALL(MATCH, bool, "NearPostList::test_doc", NO_ARGS);
  68
  69     // Sort to put least frequent terms first, to try to minimise the number of
  70     // position lists we need to read if there are no matches.
  71     //
  72     // We use wdf as a proxy for the length of the position lists, since we'd
  73     // need to read each position list to find its length and we're trying to
  74     // avoid having to read them all if we can.
  75     sort(terms.begin(), terms.end(), TermCmp());
  76
  77     poslists[0] = terms[0]->read_position_list();
  78     if (!poslists[0]->next())
  79         RETURN(false);
  80
  81     Xapian::termpos last = poslists[0]->get_position();
  82     PositionList ** end = poslists + 1;
  83
  84     while (true) {
  85         if (last - poslists[0]->get_position() < window) {
  86             if (size_t(end - poslists) != terms.size()) {
  87                 // We haven't started all the position lists yet, so start the
  88                 // next one.
  89                 PositionList * posl =
  90                     terms[end - poslists]->read_position_list();
  91                 if (last < window) {
  92                     if (!posl->next())
  93                         RETURN(false);
  94                 } else {
  95                     if (!posl->skip_to(last - window + 1))
  96                         RETURN(false);
  97                 }
  98                 Xapian::termpos pos = posl->get_position();
  99                 if (pos > last) last = pos;
 100                 *end++ = posl;
 101                 push_heap<PositionList **, Cmp>(poslists, end, Cmp());
 102                 continue;
 103             }
 104
 105             // We have all the terms within the specified window, but some may
 106             // be at the same position (in particular if the same term is
 107             // listed twice).  So we work through the terms in ascending
 108             // position order, and each time we find one with a duplicate
 109             // position, we advance it - if that pushes us out of the window,
 110             // we return to the outer loop, otherwise we reinsert it into the
 111             // heap at its new position and continue to look for duplicates
 112             // we need to adjust.
 113             PositionList ** i = end;
 114             pop_heap<PositionList **, Cmp>(poslists, i, Cmp());
 115             Xapian::termpos pos = (*--i)->get_position();
 116             while (true) {
 117                 pop_heap<PositionList **, Cmp>(poslists, i, Cmp());
 118                 if ((*--i)->get_position() == pos) {
 119                     if (!(*i)->next())
 120                         RETURN(false);
 121                     Xapian::termpos newpos = (*i)->get_position();
 122                     if (newpos - end[-1]->get_position() >= window) {
 123                         // No longer fits in the window.
 124                         last = newpos;
 125                         break;
 126                     }
 127                     push_heap<PositionList **, Cmp>(poslists, ++i, Cmp());
 128                     continue;
 129                 }
 130                 pos = (*i)->get_position();
 131                 if (i == poslists) {
 132                     Assert(pos - end[-1]->get_position() < window);
 133                     RETURN(true);
 134                 }
 135             }
 136
 137             make_heap<PositionList **, Cmp>(poslists, end, Cmp());
 138             continue;
 139         }
 140         pop_heap<PositionList **, Cmp>(poslists, end, Cmp());
 141         if (!end[-1]->skip_to(last - window + 1))
 142             break;
 143         last = max(last, end[-1]->get_position());
 144         push_heap<PositionList **, Cmp>(poslists, end, Cmp());
 145     }
 146
 147     RETURN(false);
 148 }
 149
 150 Xapian::termcount
 151 NearPostList::get_wdf() const
 152 {
 153     // Calculate an estimate for the wdf of a near postlist.
 154     //
 155     // The natural definition of the wdf for a near postlist is the number of
 156     // times the terms occur in a group within the windowsize in the document -
 157     // in other words, the number of times a routine like do_test() could find
 158     // a match, if it kept going after finding the first one.  However,
 159     // calculating this would be expensive (in CPU time at least - the position
 160     // lists are probably already read from disk), and the benefit is dubious
 161     // (given that the estimate here is probably only going to be used for
 162     // calculating the weight for synonym postlists, and it's not clear that
 163     // the above definition is exactly appropriate in this situation, anyway).
 164     //
 165     // Instead, we could do an estimate of this value, based on the lengths
 166     // of the position lists.  Rather than having to open the position lists,
 167     // we could use the wdfs, which will be the same value unless the wdfs have
 168     // been artificially inflated - in which case we probably want to use the
 169     // inflated value anyway (it will have been inflated to represent the fact
 170     // that this term is more important than others, so we should make use of
 171     // this hint).
 172     //
 173     // A reasonable way to calculate an estimate would be to consider the
 174     // document to be split into W (overlapping) windows, each 1 term apart,
 175     // and of the same length as the windowsize used for the phrase.  Then,
 176     // calculate the probability that each term is found in this window (as
 177     // Prob = wdf / doclen), multiply these probabilities to get the
 178     // probability that they're all found in the window, and then multiply by
 179     // the windowsize again to get an estimate.
 180     //
 181     // However, this requires the doclen, which won't always be available (ie,
 182     // if the weighting scheme doesn't use it, we won't have read it from
 183     // disk).
 184     //
 185     // Rather than force an access to disk to get the doclen, we use an even
 186     // rougher estimate: the minimum wdf in the sub-lists.  This is actually
 187     // the upper bound for the wdf (since the phrase can only occur the same
 188     // number of times as the least frequent of its constituent terms).
 189     //
 190     // If this estimate proves to give bad results, we can revisit this and try
 191     // a better approximation.
 192     vector<PostList *>::const_iterator i = terms.begin();
 193     Xapian::termcount wdf = (*i)->get_wdf();
 194     while (++i != terms.end()) {
 195         wdf = min(wdf, (*i)->get_wdf());
 196     }
 197     return wdf;
 198 }
 199
 200 Xapian::doccount
 201 NearPostList::get_termfreq_est() const
 202 {
 203     // It's hard to estimate how many times the postlist will match as it
 204     // depends a lot on the terms and window, but usually it will occur
 205     // significantly less often than the individual terms.
 206     return source->get_termfreq_est() / 2;
 207 }
 208
 209 TermFreqs
 210 NearPostList::get_termfreq_est_using_stats(
 211         const Xapian::Weight::Internal & stats) const
 212 {
 213     LOGCALL(MATCH, TermFreqs, "NearPostList::get_termfreq_est_using_stats", stats);
 214     // No idea how to estimate this - do the same as get_termfreq_est() for
 215     // now.
 216     TermFreqs result(source->get_termfreq_est_using_stats(stats));
 217     result.termfreq /= 2;
 218     result.reltermfreq /= 2;
 219     result.collfreq /= 2;
 220     RETURN(result);
 221 }
 222
 223 string
 224 NearPostList::get_description() const
 225 {
 226     string m = "(Near ";
 227     m += str(window);
 228     m += ' ';
 229     m += source->get_description();
 230     m += ")";
 231     return m;
 232 }