Reimplement Language Modelling weights
[xapian.git] / xapian-core / weight / ineb2weight.cc
blob8c623b11d54741dd398f8f66b5bf261e28a1094f
1 /** @file
2 * @brief Xapian::IneB2Weight class - the IneB2 weighting scheme of the DFR framework.
3 */
4 /* Copyright (C) 2013,2014 Aarsh Shah
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation; either version 2 of the
9 * License, or (at your option) any later version.
11 * This program is distributed in the hope that it will be useful
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21 #include <config.h>
23 #include "xapian/weight.h"
25 #include "weightinternal.h"
27 #include "serialise-double.h"
29 #include "xapian/error.h"
31 #include <cmath>
33 using namespace std;
35 namespace Xapian {
37 IneB2Weight::IneB2Weight(double c) : param_c(c) {
38 if (param_c <= 0)
39 throw Xapian::InvalidArgumentError("Parameter c is invalid");
40 need_stat(AVERAGE_LENGTH);
41 need_stat(DOC_LENGTH);
42 need_stat(DOC_LENGTH_MIN);
43 need_stat(COLLECTION_SIZE);
44 need_stat(WDF);
45 need_stat(WDF_MAX);
46 need_stat(WQF);
47 need_stat(COLLECTION_FREQ);
48 need_stat(TERMFREQ);
51 IneB2Weight *
52 IneB2Weight::clone() const
54 return new IneB2Weight(param_c);
57 void
58 IneB2Weight::init(double factor)
60 if (factor == 0.0) {
61 // This object is for the term-independent contribution, and that's
62 // always zero for this scheme.
63 return;
66 double wdfn_upper = get_wdf_upper_bound();
67 if (wdfn_upper == 0) {
68 upper_bound = 0.0;
69 return;
72 wdfn_upper *= log2(1 + (param_c * get_average_length()) /
73 get_doclength_lower_bound());
75 double N = get_collection_size();
76 double F = get_collection_freq();
77 double termfreq = get_termfreq();
79 double max_wdfn_product_B = (F + 1.0) / (termfreq + (termfreq / wdfn_upper));
81 double mean = F / N;
83 double expected_max = N * (1.0 - exp(-mean));
85 double idf_max = log2((N + 1.0) / (expected_max + 0.5));
87 /* Calculate constant values used in get_sumpart(). */
88 wqf_product_idf = get_wqf() * idf_max * factor;
89 c_product_avlen = param_c * get_average_length();
90 B_constant = (F + 1.0) / termfreq;
92 upper_bound = max_wdfn_product_B * idf_max * get_wqf() * factor;
95 string
96 IneB2Weight::name() const
98 return "Xapian::IneB2Weight";
101 string
102 IneB2Weight::short_name() const
104 return "ineb2";
107 string
108 IneB2Weight::serialise() const
110 return serialise_double(param_c);
113 IneB2Weight *
114 IneB2Weight::unserialise(const string & s) const
116 const char *ptr = s.data();
117 const char *end = ptr + s.size();
118 double c = unserialise_double(&ptr, end);
119 if (rare(ptr != end))
120 throw Xapian::SerialisationError("Extra data in IneB2Weight::unserialise()");
121 return new IneB2Weight(c);
124 double
125 IneB2Weight::get_sumpart(Xapian::termcount wdf, Xapian::termcount len,
126 Xapian::termcount, Xapian::termcount) const
128 if (wdf == 0) return 0.0;
129 double wdfn = wdf;
131 wdfn *= log2(1 + c_product_avlen / len);
133 double wdfn_product_B = wdfn * B_constant / (wdfn + 1.0);
135 return (wdfn_product_B * wqf_product_idf);
138 double
139 IneB2Weight::get_maxpart() const
141 return upper_bound;
144 static inline void
145 parameter_error(const char* message)
147 Xapian::Weight::Internal::parameter_error(message, "ineb2");
150 IneB2Weight *
151 IneB2Weight::create_from_parameters(const char * p) const
153 if (*p == '\0')
154 return new Xapian::IneB2Weight();
155 double k = 1.0;
156 if (!Xapian::Weight::Internal::double_param(&p, &k))
157 parameter_error("Parameter is invalid");
158 if (*p)
159 parameter_error("Extra data after parameter");
160 return new Xapian::IneB2Weight(k);