2 * @brief Xapian::BM25Weight class - the BM25 probabilistic formula
4 /* Copyright (C) 2009,2010,2011,2012,2014,2015 Olly Betts
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation; either version 2 of the
9 * License, or (at your option) any later version.
11 * This program is distributed in the hope that it will be useful
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
23 #include "xapian/weight.h"
27 #include "serialise-double.h"
29 #include "xapian/error.h"
/** Allocate a new BM25Weight constructed from this object's parameters.
 *
 *  The visible call passes param_k1, param_k2, param_k3 and param_b, in that
 *  order, to the BM25Weight constructor via `new`.
 *
 *  NOTE(review): the argument list below ends on a trailing comma, so at
 *  least one further argument is not visible here.  Presumably it is
 *  param_min_normlen, since serialise() writes that value fifth and
 *  unserialise() passes five values to the constructor - confirm.
 *  NOTE(review): the result comes from `new`, so the caller presumably takes
 *  ownership of it - confirm against the Weight base class contract.
 */
39 BM25Weight::clone() const
41 return new BM25Weight(param_k1
, param_k2
, param_k3
, param_b
,
/** Precompute the values the per-document weighting methods rely on.
 *
 *  Computes a ratio `tw` from the term frequency, the collection size and
 *  (when get_rset_size() is non-zero) the relevance-set statistics, then
 *  derives `termweight` from log(tw), @a factor, the wqf/k3 component and
 *  (k1 + 1).  Finally sets `len_factor`, which get_sumpart() and
 *  get_sumextra() multiply a document length by.
 *
 *  @param factor  Multiplier applied to log(tw) when computing termweight.
 *
 *  NOTE(review): the declaration of `tw`, and the branch/preprocessor lines
 *  that choose between the alternative computations below, are not visible
 *  in this listing - confirm against the complete source before relying on
 *  the exact control flow.
 */
46 BM25Weight::init(double factor
)
48 Xapian::doccount tf
= get_termfreq();
// Relevance-set statistics are only consulted when an RSet is present.
51 if (get_rset_size() != 0) {
52 Xapian::doccount reltermfreq
= get_reltermfreq();
54 // There can't be more relevant documents indexed by a term than there
55 // are documents indexed by that term.
56 AssertRel(reltermfreq
,<=,tf
);
58 // There can't be more relevant documents indexed by a term than there
59 // are relevant documents.
60 AssertRel(reltermfreq
,<=,get_rset_size());
62 Xapian::doccount reldocs_not_indexed
= get_rset_size() - reltermfreq
;
64 // There can't be more relevant documents not indexed by a term than
65 // there are documents not indexed by that term.
66 AssertRel(reldocs_not_indexed
,<=,get_collection_size() - tf
);
68 Xapian::doccount Q
= get_collection_size() - reldocs_not_indexed
;
70 Xapian::doccount nonreldocs_indexed
= tf
- reltermfreq
;
// Each of the four counts is offset by 0.5 before forming the ratio, so
// neither product is zero when a count is zero.
71 double numerator
= (reltermfreq
+ 0.5) * (Q
- tf
+ 0.5);
72 double denom
= (reldocs_not_indexed
+ 0.5) * (nonreldocs_indexed
+ 0.5);
73 tw
= numerator
/ denom
;
// NOTE(review): presumably the alternative used when there is no RSet
// (documents without the term over documents with it, each offset by 0.5);
// the `else` separating it from the assignment above is not visible here.
75 tw
= (get_collection_size() - tf
+ 0.5) / (tf
+ 0.5);
80 // The "official" formula can give a negative termweight in unusual cases
81 // (without an RSet, when a term indexes more than half the documents in
82 // the database). These negative weights aren't actually helpful, and it
83 // is common for implementations to replace them with a small positive
// value instead.
86 // Truncating to zero doesn't seem a great approach in practice as it
87 // means that some terms in the query can have no effect at all on the
88 // ranking, and that some results can have zero weight, both of which
89 // seem surprising.
91 // Xapian 1.0.x and earlier adjusted the termweight for any term indexing
92 // more than a third of documents, which seems rather "intrusive". That's
93 // what the code currently enabled does, but perhaps it would be better to
94 // do something else. (FIXME)
// NOTE(review): this `tw <= 1.0` computation and the `tw < 2` computation
// further down look like two alternative implementations, of which the
// comment above says only one is enabled; the lines selecting between them
// are not visible here.
96 if (rare(tw
<= 1.0)) {
99 termweight
= log(tw
) * factor
;
// Scale by (k3 + 1) * wqf / (k3 + wqf), using the value from get_wqf().
101 double wqf_double
= get_wqf();
102 termweight
*= (param_k3
+ 1) * wqf_double
/ (param_k3
+ wqf_double
);
// Remap tw < 2 to tw * 0.5 + 1, which exceeds 1 for any positive tw, so
// the logarithm taken next is positive.
106 if (tw
< 2) tw
= tw
* 0.5 + 1;
107 termweight
= log(tw
) * factor
;
// Scale by (k3 + 1) * wqf / (k3 + wqf), using the value from get_wqf().
109 double wqf_double
= get_wqf();
110 termweight
*= (param_k3
+ 1) * wqf_double
/ (param_k3
+ wqf_double
);
// Fold the constant (k1 + 1) factor in once here; get_sumpart() multiplies
// termweight by wdf / denom without reapplying it.
113 termweight
*= (param_k1
+ 1);
115 LOGVALUE(WTCALC
, termweight
);
117 if (param_k2
== 0 && (param_b
== 0 || param_k1
== 0)) {
118 // If k2 is 0, and either param_b or param_k1 is 0 then the document
119 // length doesn't affect the weight.
// NOTE(review): the statement executed in this case, and the `else` that
// presumably guards the assignment below, are not visible here.
122 len_factor
= get_average_length();
123 // len_factor can be zero if all documents are empty (or the database
// itself is empty).
// Store the reciprocal, skipping the division when the average is zero;
// get_sumpart() and get_sumextra() multiply a length by len_factor.
125 if (len_factor
!= 0) len_factor
= 1 / len_factor
;
128 LOGVALUE(WTCALC
, len_factor
);
132 BM25Weight::name() const
134 return "Xapian::BM25Weight";
138 BM25Weight::serialise() const
140 string result
= serialise_double(param_k1
);
141 result
+= serialise_double(param_k2
);
142 result
+= serialise_double(param_k3
);
143 result
+= serialise_double(param_b
);
144 result
+= serialise_double(param_min_normlen
);
149 BM25Weight::unserialise(const string
& s
) const
151 const char *ptr
= s
.data();
152 const char *end
= ptr
+ s
.size();
153 double k1
= unserialise_double(&ptr
, end
);
154 double k2
= unserialise_double(&ptr
, end
);
155 double k3
= unserialise_double(&ptr
, end
);
156 double b
= unserialise_double(&ptr
, end
);
157 double min_normlen
= unserialise_double(&ptr
, end
);
158 if (rare(ptr
!= end
))
159 throw Xapian::SerialisationError("Extra data in BM25Weight::unserialise()");
160 return new BM25Weight(k1
, k2
, k3
, b
, min_normlen
);
164 BM25Weight::get_sumpart(Xapian::termcount wdf
, Xapian::termcount len
,
165 Xapian::termcount
) const
167 LOGCALL(WTCALC
, double, "BM25Weight::get_sumpart", wdf
| len
);
168 Xapian::doclength normlen
= max(len
* len_factor
, param_min_normlen
);
170 double wdf_double
= wdf
;
171 double denom
= param_k1
* (normlen
* param_b
+ (1 - param_b
)) + wdf_double
;
172 AssertRel(denom
,>,0);
173 RETURN(termweight
* (wdf_double
/ denom
));
/** Compute the bound on get_sumpart() described in the comments below.
 *
 *  Evaluates termweight * (wdf_max / denom), where wdf_max comes from
 *  get_wdf_upper_bound() and denom starts at param_k1.  When both param_k1
 *  and param_b are non-zero, denom is scaled by
 *  (normlen_lb * param_b + (1 - param_b)).
 *
 *  NOTE(review): the outer max( call that initialises normlen_lb is cut off
 *  after its first argument in this listing.  Presumably the second argument
 *  is param_min_normlen, mirroring get_sumpart() - confirm.
 *  NOTE(review): get_sumpart()'s denominator also adds wdf; no corresponding
 *  addition to denom is visible here, and without one denom would be 0 when
 *  param_k1 is 0, contradicting the assertion below - check the complete
 *  source for statements between the `denom *=` line and the assertion.
 *
 *  @return termweight * (wdf_max / denom).
 */
177 BM25Weight::get_maxpart() const
179 LOGCALL(WTCALC
, double, "BM25Weight::get_maxpart", NO_ARGS
);
180 double denom
= param_k1
;
181 Xapian::termcount wdf_max
= get_wdf_upper_bound();
// The length-dependent factor only matters when k1 and b are both non-zero.
182 if (param_k1
!= 0.0) {
183 if (param_b
!= 0.0) {
184 // "Upper-bound Approximations for Dynamic Pruning" Craig
185 // Macdonald, Nicola Tonellotto and Iadh Ounis. ACM Transactions on
186 // Information Systems. 29(4), 2011 shows that evaluating at
187 // doclen=wdf_max is a good bound.
189 // However, we can do better if doclen_min > wdf_max since then a
190 // better bound can be found by simply evaluating at
191 // doclen=doclen_min and wdf=wdf_max.
// Use the larger of wdf_max and the document length lower bound, scaled by
// len_factor in the same way get_sumpart() scales a length.
192 Xapian::doclength normlen_lb
=
193 max(max(wdf_max
, get_doclength_lower_bound()) * len_factor
,
195 denom
*= (normlen_lb
* param_b
+ (1 - param_b
));
199 AssertRel(denom
,>,0);
200 RETURN(termweight
* (wdf_max
/ denom
));
/* The BM25 formula gives:
 *
 * param_k2 * query_length * (1 - normlen) / (1 + normlen)
 *
 * To avoid negative sumextra we add the constant (param_k2 * query_length)
 * to give:
 *
 * 2 * param_k2 * query_length / (1 + normlen)
 */
213 BM25Weight::get_sumextra(Xapian::termcount len
, Xapian::termcount
) const
215 LOGCALL(WTCALC
, double, "BM25Weight::get_sumextra", len
);
216 double num
= (2.0 * param_k2
* get_query_length());
217 RETURN(num
/ (1.0 + max(len
* len_factor
, param_min_normlen
)));
221 BM25Weight::get_maxextra() const
223 LOGCALL(WTCALC
, double, "BM25Weight::get_maxextra", NO_ARGS
);
226 double num
= (2.0 * param_k2
* get_query_length());
227 RETURN(num
/ (1.0 + max(get_doclength_lower_bound() * len_factor
,
228 param_min_normlen
)));