Reimplement Language Modelling weights
[xapian.git] / xapian-core / weight / bb2weight.cc
blob88975e9029a6c973e6832015508a58f99df9043b
1 /** @file
2 * @brief Xapian::BB2Weight class - the BB2 weighting scheme of the DFR framework.
3 */
4 /* Copyright (C) 2013,2014 Aarsh Shah
5 * Copyright (C) 2014,2015,2016,2017 Olly Betts
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2 of the
10 * License, or (at your option) any later version.
12 * This program is distributed in the hope that it will be useful
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
22 #include <config.h>
24 #include "xapian/weight.h"
26 #include "weightinternal.h"
28 #include "serialise-double.h"
30 #include "xapian/error.h"
32 #include <cmath>
34 using namespace std;
36 namespace Xapian {
/** Evaluate the Stirling-style term used by the BB2 weight computation.
 *
 *  The value returned is:
 *
 *    (y + 0.5) * (stirling_constant - log2(y)) + difference * stirling_constant
 *
 *  @param difference		Factor multiplied by @a stirling_constant in
 *				the second term.
 *  @param y			Value whose base-2 logarithm is taken.  The
 *				callers in this file clamp their inputs so the
 *				logarithm stays well-defined.
 *  @param stirling_constant	Constant the logarithm is subtracted from and
 *				which scales @a difference.
 *
 *  @return The value of the expression above.
 */
static double
stirling_value(double difference, double y, double stirling_constant)
{
    const double log_gap_term = (y + 0.5) * (stirling_constant - std::log2(y));
    const double difference_term = difference * stirling_constant;
    return log_gap_term + difference_term;
}
43 BB2Weight::BB2Weight(double c) : param_c(c)
45 if (param_c <= 0)
46 throw Xapian::InvalidArgumentError("Parameter c is invalid");
47 need_stat(AVERAGE_LENGTH);
48 need_stat(DOC_LENGTH);
49 need_stat(DOC_LENGTH_MIN);
50 need_stat(DOC_LENGTH_MAX);
51 need_stat(COLLECTION_SIZE);
52 need_stat(COLLECTION_FREQ);
53 need_stat(WDF);
54 need_stat(WDF_MAX);
55 need_stat(WQF);
56 need_stat(TERMFREQ);
59 BB2Weight *
60 BB2Weight::clone() const
62 return new BB2Weight(param_c);
65 void
66 BB2Weight::init(double factor)
68 if (factor == 0.0) {
69 // This object is for the term-independent contribution, and that's
70 // always zero for this scheme.
71 return;
74 double wdfn_upper = get_wdf_upper_bound();
76 if (wdfn_upper == 0) {
77 upper_bound = 0.0;
78 return;
81 c_product_avlen = param_c * get_average_length();
82 double wdfn_lower(1.0);
83 wdfn_lower *= log2(1 + c_product_avlen / get_doclength_upper_bound());
84 wdfn_upper *= log2(1 + c_product_avlen / get_doclength_lower_bound());
86 double F = get_collection_freq();
88 // Clamp wdfn to at most (F - 1) to avoid ill-defined log calculations in
89 // stirling_value().
90 if (rare(wdfn_lower >= F - 1))
91 wdfn_upper = F - 1;
92 if (rare(wdfn_upper >= F - 1))
93 wdfn_upper = F - 1;
95 B_constant = get_wqf() * factor * (F + 1.0) / get_termfreq();
97 // Clamp N to at least 2 to avoid ill-defined log calculations in
98 // stirling_value().
99 double N = rare(get_collection_size() <= 2) ? 2.0 : double(get_collection_size());
101 wt = -1.0 / log(2.0) - log2(N - 1.0);
102 stirling_constant_1 = log2(N + F - 1.0);
103 stirling_constant_2 = log2(F);
105 // Maximize the Stirling value to be used in the upper bound.
106 // Calculate the individual terms keeping the maximization of Stirling value
107 // in mind.
108 double y_min = F - wdfn_upper;
109 double y_max = N + F - wdfn_lower - 2.0;
111 double stirling_max = stirling_value(wdfn_upper + 1.0, y_max,
112 stirling_constant_1) -
113 stirling_value(wdfn_lower, y_min,
114 stirling_constant_2);
116 double B_max = B_constant / (wdfn_lower + 1.0);
117 upper_bound = B_max * (wt + stirling_max);
118 if (rare(upper_bound < 0.0))
119 upper_bound = 0.0;
122 string
123 BB2Weight::name() const
125 return "Xapian::BB2Weight";
128 string
129 BB2Weight::short_name() const
131 return "bb2";
134 string
135 BB2Weight::serialise() const
137 return serialise_double(param_c);
140 BB2Weight *
141 BB2Weight::unserialise(const string & s) const
143 const char *ptr = s.data();
144 const char *end = ptr + s.size();
145 double c = unserialise_double(&ptr, end);
146 if (rare(ptr != end))
147 throw Xapian::SerialisationError("Extra data in BB2Weight::unserialise()");
148 return new BB2Weight(c);
151 double
152 BB2Weight::get_sumpart(Xapian::termcount wdf, Xapian::termcount len,
153 Xapian::termcount, Xapian::termcount) const
155 if (wdf == 0) return 0.0;
157 double wdfn = wdf * log2(1 + c_product_avlen / len);
159 double F = get_collection_freq();
161 // Clamp wdfn to at most (F - 1) to avoid ill-defined log calculations in
162 // stirling_value().
163 if (rare(wdfn >= F - 1))
164 wdfn = F - 1;
166 // Clamp N to at least 2 to avoid ill-defined log calculations in
167 // stirling_value().
168 Xapian::doccount N = get_collection_size();
169 Xapian::doccount N_less_2 = rare(N <= 2) ? 0 : N - 2;
171 double y2 = F - wdfn;
172 double y1 = N_less_2 + y2;
173 double stirling = stirling_value(wdfn + 1.0, y1, stirling_constant_1) -
174 stirling_value(wdfn, y2, stirling_constant_2);
176 double B = B_constant / (wdfn + 1.0);
177 double final_weight = B * (wt + stirling);
178 if (rare(final_weight < 0.0))
179 final_weight = 0.0;
180 return final_weight;
183 double
184 BB2Weight::get_maxpart() const
186 return upper_bound;
189 static inline void
190 parameter_error(const char* message)
192 Xapian::Weight::Internal::parameter_error(message, "bb2");
195 BB2Weight *
196 BB2Weight::create_from_parameters(const char * p) const
198 if (*p == '\0')
199 return new Xapian::BB2Weight();
200 double k = 1.0;
201 if (!Xapian::Weight::Internal::double_param(&p, &k))
202 parameter_error("Parameter is invalid");
203 if (*p)
204 parameter_error("Extra data after parameter");
205 return new Xapian::BB2Weight(k);