Improve fix for invalid Btree level count
[xapian.git] / xapian-core / weight / pl2plusweight.cc
blob0aaca80ecb6079717967f1bb5ea1a47948d806ac
1 /** @file
2 * @brief Xapian::PL2PlusWeight class - the PL2+ weighting scheme of the DFR framework.
3 */
4 /* Copyright (C) 2013 Aarsh Shah
5 * Copyright (C) 2013,2014,2016,2017 Olly Betts
6 * Copyright (C) 2016 Vivek Pal
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
13 * This program is distributed in the hope that it will be useful
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
23 #include <config.h>
25 #include "xapian/weight.h"
26 #include "common/log2.h"
28 #include "serialise-double.h"
30 #include "xapian/error.h"
32 #include <algorithm>
34 using namespace std;
36 namespace Xapian {
38 PL2PlusWeight::PL2PlusWeight(double c, double delta)
39 : param_c(c), param_delta(delta)
41 if (param_c <= 0)
42 throw Xapian::InvalidArgumentError("Parameter c is invalid");
43 if (param_delta <= 0)
44 throw Xapian::InvalidArgumentError("Parameter delta is invalid");
45 need_stat(AVERAGE_LENGTH);
46 need_stat(DOC_LENGTH);
47 need_stat(DOC_LENGTH_MIN);
48 need_stat(DOC_LENGTH_MAX);
49 need_stat(COLLECTION_SIZE);
50 need_stat(COLLECTION_FREQ);
51 need_stat(WDF);
52 need_stat(WDF_MAX);
53 need_stat(WQF);
56 PL2PlusWeight *
57 PL2PlusWeight::clone() const
59 return new PL2PlusWeight(param_c, param_delta);
62 void
63 PL2PlusWeight::init(double factor_)
65 if (factor_ == 0.0) {
66 // This object is for the term-independent contribution, and that's
67 // always zero for this scheme.
68 return;
71 factor = factor_;
73 if (get_wdf_upper_bound() == 0) {
74 // The "extra" weight object is cloned, init() called and then
75 // get_maxextra() is called and we discover that we don't need it.
76 // So we need to handle that case (which will give us 0 from
77 // get_wdf_upper_bound() here).
78 upper_bound = 0;
79 return;
82 factor *= get_wqf();
84 cl = param_c * get_average_length();
86 double base_change(1.0 / log(2.0));
87 mean = double(get_collection_freq()) / get_collection_size();
88 P1 = mean * base_change + 0.5 * log2(2.0 * M_PI);
89 P2 = log2(mean) + base_change;
91 double wdfn_lower = log2(1 + cl / get_doclength_upper_bound());
92 double divisior = max(get_wdf_upper_bound(), get_doclength_lower_bound());
93 double wdfn_upper = get_wdf_upper_bound() * log2(1 + cl / divisior);
95 double P_delta = P1 + (param_delta + 0.5) * log2(param_delta) - P2 * param_delta;
96 dw = P_delta / (param_delta + 1.0);
98 // Calculate an upper bound on the weights which get_sumpart() can return.
100 // We consider the equation for P as the sum of two parts which we
101 // maximise individually:
103 // (a) (wdfn + 0.5) / (wdfn + 1) * log2(wdfn)
104 // (b) (P1 - P2 * wdfn) / (wdfn + 1)
106 // To maximise (a), the fractional part is always positive (since wdfn>0)
107 // and is maximised by maximising wdfn - clearer when rewritten as:
108 // (1 - 0.5 / (wdfn + 1))
110 // The log part of (a) is clearly also maximised by maximising wdfn,
111 // so we want to evaluate (a) at wdfn=wdfn_upper.
112 double P_max2a = (wdfn_upper + 0.5) * log2(wdfn_upper) / (wdfn_upper + 1.0);
113 // To maximise (b) substitute x=wdfn+1 (so x>1) and we get:
115 // (P1 + P2)/x - P2
117 // Differentiating wrt x gives:
119 // -(P1 + P2)/x²
121 // So there are no local minima or maxima, and the function is continuous
122 // in the range of interest, so the sign of this differential tells us
123 // whether we want to maximise or minimise wdfn, and since x>1, we can
124 // just consider the sign of: (P1 + P2)
126 // Commonly P1 + P2 > 0, in which case we evaluate P at wdfn=wdfn_upper
127 // giving us a bound that can't be bettered if wdfn_upper is tight.
128 double wdfn_optb = P1 + P2 > 0 ? wdfn_upper : wdfn_lower;
129 double P_max2b = (P1 - P2 * wdfn_optb) / (wdfn_optb + 1.0);
130 upper_bound = factor * (P_max2a + P_max2b + dw);
132 if (rare(upper_bound <= 0)) upper_bound = 0;
135 string
136 PL2PlusWeight::name() const
138 return "Xapian::PL2PlusWeight";
141 string
142 PL2PlusWeight::serialise() const
144 string result = serialise_double(param_c);
145 result += serialise_double(param_delta);
146 return result;
149 PL2PlusWeight *
150 PL2PlusWeight::unserialise(const string & s) const
152 const char *ptr = s.data();
153 const char *end = ptr + s.size();
154 double c = unserialise_double(&ptr, end);
155 double delta = unserialise_double(&ptr, end);
156 if (rare(ptr != end))
157 throw Xapian::SerialisationError("Extra data in PL2PlusWeight::unserialise()");
158 return new PL2PlusWeight(c, delta);
161 double
162 PL2PlusWeight::get_sumpart(Xapian::termcount wdf, Xapian::termcount len,
163 Xapian::termcount) const
165 if (wdf == 0 || mean < 1) return 0.0;
167 double wdfn = wdf * log2(1 + cl / len);
169 double P = P1 + (wdfn + 0.5) * log2(wdfn) - P2 * wdfn;
171 double wt = (P / (wdfn + 1.0)) + dw;
172 // FIXME: Is a negative wt possible here? It is with vanilla PL2, but for
173 // PL2+ we've added on dw, and bailed out early if mean < 1.
174 if (rare(wt <= 0)) return 0.0;
176 return factor * wt;
179 double
180 PL2PlusWeight::get_maxpart() const
182 return upper_bound;
185 double
186 PL2PlusWeight::get_sumextra(Xapian::termcount, Xapian::termcount) const
188 return 0;
191 double
192 PL2PlusWeight::get_maxextra() const
194 return 0;