scriptindex: Finish parsing index script after error
[xapian.git] / xapian-core / weight / tfidfweight.cc
blob12d7e5604bb9420eaa94a9b5e63c15a07df01e0b
1 /** @file
2 * @brief Xapian::TfIdfWeight class - The TfIdf weighting scheme
3 */
4 /* Copyright (C) 2013 Aarsh Shah
5 * Copyright (C) 2016 Vivek Pal
6 * Copyright (C) 2016,2017 Olly Betts
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
13 * This program is distributed in the hope that it will be useful
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
23 #include <config.h>
25 #include "xapian/weight.h"
26 #include <cmath>
27 #include <cstring>
29 #include "debuglog.h"
30 #include "omassert.h"
32 #include "xapian/error.h"
34 using namespace std;
36 namespace Xapian {
38 TfIdfWeight::TfIdfWeight(const std::string &normals)
39 : normalizations(normals)
41 if (normalizations.length() != 3 ||
42 !strchr("nbslL", normalizations[0]) ||
43 !strchr("ntpfs", normalizations[1]) ||
44 !strchr("n", normalizations[2]))
45 throw Xapian::InvalidArgumentError("Normalization string is invalid");
46 if (normalizations[1] != 'n') {
47 need_stat(TERMFREQ);
48 need_stat(COLLECTION_SIZE);
50 need_stat(WDF);
51 need_stat(WDF_MAX);
52 need_stat(WQF);
53 if (normalizations[0] == 'L') {
54 need_stat(DOC_LENGTH);
55 need_stat(DOC_LENGTH_MIN);
56 need_stat(DOC_LENGTH_MAX);
57 need_stat(UNIQUE_TERMS);
61 TfIdfWeight *
62 TfIdfWeight::clone() const
64 return new TfIdfWeight(normalizations);
67 void
68 TfIdfWeight::init(double factor_)
70 if (factor_ == 0.0) {
71 // This object is for the term-independent contribution, and that's
72 // always zero for this scheme.
73 return;
76 factor = get_wqf() * factor_;
79 string
80 TfIdfWeight::name() const
82 return "Xapian::TfIdfWeight";
85 string
86 TfIdfWeight::serialise() const
88 return normalizations;
91 TfIdfWeight *
92 TfIdfWeight::unserialise(const string & s) const
94 if (s.length() != 3)
95 throw Xapian::SerialisationError("Extra data in TfIdfWeight::unserialise()");
96 return new TfIdfWeight(s);
99 static double
100 get_wdfn_for_L(Xapian::termcount wdf, Xapian::termcount doclen,
101 Xapian::termcount uniqterms)
103 if (wdf == 0) return 0;
104 double uniqterm_double = uniqterms;
105 double doclen_double = doclen;
106 double wdf_avg = 1;
107 if (doclen_double == 0 || uniqterm_double == 0)
108 wdf_avg = 1;
109 else
110 wdf_avg = doclen_double / uniqterm_double;
111 double num = 1 + log(double(wdf));
112 double den = 1 + log(wdf_avg);
113 return num / den;
116 double
117 TfIdfWeight::get_sumpart(Xapian::termcount wdf, Xapian::termcount doclen,
118 Xapian::termcount uniqterms) const
120 Xapian::doccount termfreq = 1;
121 if (normalizations[1] != 'n') termfreq = get_termfreq();
122 double wt;
123 if (normalizations[0] != 'L') {
124 wt = get_wdfn(wdf, normalizations[0]);
125 } else {
126 wt = get_wdfn_for_L(wdf, doclen, uniqterms);
128 wt *= get_idfn(termfreq, normalizations[1]);
129 return get_wtn(wt, normalizations[2]) * factor;
132 // An upper bound can be calculated simply on the basis of wdf_max as termfreq
133 // and N are constants.
134 double
135 TfIdfWeight::get_maxpart() const
137 Xapian::doccount termfreq = 1;
138 if (normalizations[1] != 'n') termfreq = get_termfreq();
139 Xapian::termcount wdf_max = get_wdf_upper_bound();
140 double wt;
141 if (normalizations[0] != 'L') {
142 wt = get_wdfn(wdf_max, normalizations[0]);
143 } else {
144 Xapian::termcount len_min = get_doclength_lower_bound();
145 wt = get_wdfn_for_L(wdf_max, len_min, len_min);
147 wt *= get_idfn(termfreq, normalizations[1]);
148 return get_wtn(wt, normalizations[2]) * factor;
151 // There is no extra per document component in the TfIdfWeighting scheme.
152 double
153 TfIdfWeight::get_sumextra(Xapian::termcount, Xapian::termcount) const
155 return 0;
158 double
159 TfIdfWeight::get_maxextra() const
161 return 0;
164 // Return normalized wdf, idf and weight depending on the normalization string.
165 double
166 TfIdfWeight::get_wdfn(Xapian::termcount wdf, char c) const
168 switch (c) {
169 case 'b':
170 if (wdf == 0) return 0;
171 return 1.0;
172 case 's':
173 return (wdf * wdf);
174 case 'l':
175 if (wdf == 0) return 0;
176 return (1 + log(double(wdf)));
177 default:
178 AssertEq(c, 'n');
179 return wdf;
183 double
184 TfIdfWeight::get_idfn(Xapian::doccount termfreq, char c) const
186 double N = 1.0;
187 if (c != 'n' && c != 'f') N = get_collection_size();
188 switch (c) {
189 case 'n':
190 return 1.0;
191 case 'p':
192 // All documents are indexed by the term
193 if (N == termfreq) return 0;
194 return log((N - termfreq) / termfreq);
195 case 'f':
196 return (1.0 / termfreq);
197 case 's':
198 return pow(log(N / termfreq), 2.0);
199 default:
200 AssertEq(c, 't');
201 return (log(N / termfreq));
205 double
206 TfIdfWeight::get_wtn(double wt, char c) const
208 (void)c;
209 AssertEq(c, 'n');
210 return wt;