Reimplement Language Modelling weights
[xapian.git] / xapian-core / weight / bm25plusweight.cc
blob0b8793d8a69584c6bc27f0c7f3b1c63dd524d030
1 /** @file
2 * @brief Xapian::BM25PlusWeight class - the BM25+ probabilistic formula
3 */
4 /* Copyright (C) 2009,2010,2011,2012,2014,2015,2016 Olly Betts
5 * Copyright (C) 2016 Vivek Pal
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2 of the
10 * License, or (at your option) any later version.
12 * This program is distributed in the hope that it will be useful
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
22 #include <config.h>
24 #include "xapian/weight.h"
25 #include "weightinternal.h"
27 #include "debuglog.h"
28 #include "omassert.h"
29 #include "serialise-double.h"
31 #include "xapian/error.h"
33 #include <algorithm>
34 #include <cmath>
36 using namespace std;
38 namespace Xapian {
40 BM25PlusWeight *
41 BM25PlusWeight::clone() const
43 return new BM25PlusWeight(param_k1, param_k2, param_k3, param_b,
44 param_min_normlen, param_delta);
47 void
48 BM25PlusWeight::init(double factor)
50 Xapian::doccount tf = get_termfreq();
52 if (rare(tf == 0)) {
53 termweight = 0;
54 } else {
55 // BM25+ formula uses IDF = log((total_no_of_docs + 1) / tf)
56 termweight = log(double(get_collection_size() + 1) / tf);
57 termweight *= factor;
58 if (param_k3 != 0) {
59 double wqf_double = get_wqf();
60 termweight *= (param_k3 + 1) * wqf_double / (param_k3 + wqf_double);
64 LOGVALUE(WTCALC, termweight);
66 if (param_k2 == 0 && (param_b == 0 || param_k1 == 0)) {
67 // If k2 is 0, and either param_b or param_k1 is 0 then the document
68 // length doesn't affect the weight.
69 len_factor = 0;
70 } else {
71 len_factor = get_average_length();
72 // len_factor can be zero if all documents are empty (or the database
73 // is empty!)
74 if (len_factor != 0) len_factor = 1 / len_factor;
77 LOGVALUE(WTCALC, len_factor);
80 string
81 BM25PlusWeight::name() const
83 return "Xapian::BM25PlusWeight";
86 string
87 BM25PlusWeight::short_name() const
89 return "bm25plus";
92 string
93 BM25PlusWeight::serialise() const
95 string result = serialise_double(param_k1);
96 result += serialise_double(param_k2);
97 result += serialise_double(param_k3);
98 result += serialise_double(param_b);
99 result += serialise_double(param_min_normlen);
100 result += serialise_double(param_delta);
101 return result;
104 BM25PlusWeight *
105 BM25PlusWeight::unserialise(const string & s) const
107 const char *ptr = s.data();
108 const char *end = ptr + s.size();
109 double k1 = unserialise_double(&ptr, end);
110 double k2 = unserialise_double(&ptr, end);
111 double k3 = unserialise_double(&ptr, end);
112 double b = unserialise_double(&ptr, end);
113 double min_normlen = unserialise_double(&ptr, end);
114 double delta = unserialise_double(&ptr, end);
115 if (rare(ptr != end))
116 throw Xapian::SerialisationError("Extra data in BM25PlusWeight::unserialise()");
117 return new BM25PlusWeight(k1, k2, k3, b, min_normlen, delta);
120 double
121 BM25PlusWeight::get_sumpart(Xapian::termcount wdf, Xapian::termcount len,
122 Xapian::termcount, Xapian::termcount) const
124 LOGCALL(WTCALC, double, "BM25PlusWeight::get_sumpart", wdf | len);
125 Xapian::doclength normlen = max(len * len_factor, param_min_normlen);
127 double wdf_double = wdf;
128 double denom = param_k1 * (normlen * param_b + (1 - param_b)) + wdf_double;
129 AssertRel(denom,>,0);
130 // Parameter delta (δ) is a pseudo tf value to control the scale of the
131 // tf lower bound. δ can be tuned for e.g from 0.0 to 1.5 but BM25+ can
132 // still work effectively across collections with a fixed δ = 1.0
133 RETURN(termweight * ((param_k1 + 1) * wdf_double / denom + param_delta));
136 double
137 BM25PlusWeight::get_maxpart() const
139 LOGCALL(WTCALC, double, "BM25PlusWeight::get_maxpart", NO_ARGS);
140 double denom = param_k1;
141 Xapian::termcount wdf_max = get_wdf_upper_bound();
142 if (param_k1 != 0.0) {
143 if (param_b != 0.0) {
144 // "Upper-bound Approximations for Dynamic Pruning" Craig
145 // Macdonald, Nicola Tonellotto and Iadh Ounis. ACM Transactions on
146 // Information Systems. 29(4), 2011 shows that evaluating at
147 // doclen=wdf_max is a good bound.
149 // However, we can do better if doclen_min > wdf_max since then a
150 // better bound can be found by simply evaluating at
151 // doclen=doclen_min and wdf=wdf_max.
152 Xapian::doclength normlen_lb =
153 max(max(wdf_max, get_doclength_lower_bound()) * len_factor,
154 param_min_normlen);
155 denom *= (normlen_lb * param_b + (1 - param_b));
158 denom += wdf_max;
159 AssertRel(denom,>,0);
160 RETURN(termweight * ((param_k1 + 1) * wdf_max / denom + param_delta));
163 /* The paper which describes BM25+ ignores BM25's document-independent
164 * component (so implicitly k2=0), but we support non-zero k2 too.
166 * The BM25 formula gives:
168 * param_k2 * query_length * (1 - normlen) / (1 + normlen)
170 * To avoid negative sumextra we add the constant (param_k2 * query_length)
171 * to give:
173 * 2 * param_k2 * query_length / (1 + normlen)
175 double
176 BM25PlusWeight::get_sumextra(Xapian::termcount len,
177 Xapian::termcount,
178 Xapian::termcount) const
180 LOGCALL(WTCALC, double, "BM25PlusWeight::get_sumextra", len);
181 double num = (2.0 * param_k2 * get_query_length());
182 RETURN(num / (1.0 + max(len * len_factor, param_min_normlen)));
185 double
186 BM25PlusWeight::get_maxextra() const
188 LOGCALL(WTCALC, double, "BM25PlusWeight::get_maxextra", NO_ARGS);
189 if (param_k2 == 0.0)
190 RETURN(0.0);
191 double num = (2.0 * param_k2 * get_query_length());
192 RETURN(num / (1.0 + max(get_doclength_lower_bound() * len_factor,
193 param_min_normlen)));
196 static inline void
197 parameter_error(const char* message)
199 Xapian::Weight::Internal::parameter_error(message, "bm25plus");
202 BM25PlusWeight *
203 BM25PlusWeight::create_from_parameters(const char * p) const
205 if (*p == '\0')
206 return new Xapian::BM25PlusWeight();
207 double k1 = 1;
208 double k2 = 0;
209 double k3 = 1;
210 double b = 0.5;
211 double min_normlen = 0.5;
212 double delta = 1.0;
213 if (!Xapian::Weight::Internal::double_param(&p, &k1))
214 parameter_error("Parameter 1 (k1) is invalid");
215 if (*p && !Xapian::Weight::Internal::double_param(&p, &k2))
216 parameter_error("Parameter 2 (k2) is invalid");
217 if (*p && !Xapian::Weight::Internal::double_param(&p, &k3))
218 parameter_error("Parameter 3 (k3) is invalid");
219 if (*p && !Xapian::Weight::Internal::double_param(&p, &b))
220 parameter_error("Parameter 4 (b) is invalid");
221 if (*p && !Xapian::Weight::Internal::double_param(&p, &min_normlen))
222 parameter_error("Parameter 5 (min_normlen) is invalid");
223 if (*p && !Xapian::Weight::Internal::double_param(&p, &delta))
224 parameter_error("Parameter 6 (delta) is invalid");
225 if (*p)
226 parameter_error("Extra data after parameter 6");
227 return new Xapian::BM25PlusWeight(k1, k2, k3, b, min_normlen, delta);