2 * @brief Xapian::BM25PlusWeight class - the BM25+ probabilistic formula
4 /* Copyright (C) 2009,2010,2011,2012,2014,2015,2016 Olly Betts
5 * Copyright (C) 2016 Vivek Pal
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2 of the
10 * License, or (at your option) any later version.
12 * This program is distributed in the hope that it will be useful
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
24 #include "xapian/weight.h"
28 #include "serialise-double.h"
30 #include "xapian/error.h"
// Make a copy of this weighting object: allocates a new BM25PlusWeight with
// `new`, forwarding the six stored parameters (k1, k2, k3, b, min_normlen,
// delta) unchanged.
// NOTE(review): the result is a raw pointer from `new`, so the caller
// presumably takes ownership - confirm against the Weight API.
40 BM25PlusWeight::clone() const
42 return new BM25PlusWeight(param_k1
, param_k2
, param_k3
, param_b
,
43 param_min_normlen
, param_delta
);
// Precompute the per-term values that the scoring methods reuse:
//   termweight - the IDF-style component log((collection_size + 1) / tf),
//                scaled by the within-query-frequency term
//                (k3 + 1) * wqf / (k3 + wqf);
//   len_factor - the reciprocal of the average document length (left at 0
//                when the average length is 0), so that
//                `len * len_factor` elsewhere gives a normalised length.
47 BM25PlusWeight::init(double factor
)
// Number of documents the term occurs in; used as the IDF divisor below.
49 Xapian::doccount tf
= get_termfreq();
54 // BM25+ formula uses IDF = log((total_no_of_docs + 1) / tf)
55 termweight
= log(double(get_collection_size() + 1) / tf
);
// Within-query frequency, held as a double so the k3 scaling below is done
// in floating point.
58 double wqf_double
= get_wqf();
59 termweight
*= (param_k3
+ 1) * wqf_double
/ (param_k3
+ wqf_double
);
63 LOGVALUE(WTCALC
, termweight
);
65 if (param_k2
== 0 && (param_b
== 0 || param_k1
== 0)) {
66 // If k2 is 0, and either param_b or param_k1 is 0 then the document
67 // length doesn't affect the weight.
// Start from the average document length; it is inverted below.
70 len_factor
= get_average_length();
71 // len_factor can be zero if all documents are empty (or the database
// Only take the reciprocal when it is non-zero, so a zero average length
// never causes a division by zero.
73 if (len_factor
!= 0) len_factor
= 1 / len_factor
;
76 LOGVALUE(WTCALC
, len_factor
);
// Return the fixed identifying name of this weighting scheme,
// "Xapian::BM25PlusWeight".
80 BM25PlusWeight::name() const
82 return "Xapian::BM25PlusWeight";
// Encode this object's parameters into a string by concatenating the
// serialise_double() encoding of each one, in the order
// k1, k2, k3, b, min_normlen, delta.  unserialise() reads the values back in
// exactly this order, so the two must be kept in step.
86 BM25PlusWeight::serialise() const
88 string result
= serialise_double(param_k1
);
89 result
+= serialise_double(param_k2
);
90 result
+= serialise_double(param_k3
);
91 result
+= serialise_double(param_b
);
92 result
+= serialise_double(param_min_normlen
);
93 result
+= serialise_double(param_delta
);
// Rebuild a BM25PlusWeight from a string in the layout written by
// serialise(): six doubles in the order k1, k2, k3, b, min_normlen, delta.
//
// @param s  The serialised parameter data.
// @return   A new BM25PlusWeight (allocated with `new`) constructed from the
//           decoded values.
// @throws Xapian::SerialisationError if any bytes remain after the sixth
//           value has been decoded.
98 BM25PlusWeight::unserialise(const string
& s
) const
// [ptr, end) delimits the bytes still to be decoded.
100 const char *ptr
= s
.data();
101 const char *end
= ptr
+ s
.size();
// Each call is handed &ptr; presumably it advances ptr past the value it
// decodes (the ptr != end check below relies on that).
102 double k1
= unserialise_double(&ptr
, end
);
103 double k2
= unserialise_double(&ptr
, end
);
104 double k3
= unserialise_double(&ptr
, end
);
105 double b
= unserialise_double(&ptr
, end
);
106 double min_normlen
= unserialise_double(&ptr
, end
);
107 double delta
= unserialise_double(&ptr
, end
);
// Anything left over means the input is not a valid serialisation of this
// class.  NOTE(review): rare() looks like a branch-prediction hint - confirm.
108 if (rare(ptr
!= end
))
109 throw Xapian::SerialisationError("Extra data in BM25PlusWeight::unserialise()");
110 return new BM25PlusWeight(k1
, k2
, k3
, b
, min_normlen
, delta
);
// Compute this term's weight contribution for one document:
//
//   termweight * ((k1 + 1) * wdf / denom + delta)
//
// where denom = k1 * (normlen * b + (1 - b)) + wdf and
// normlen = max(len * len_factor, min_normlen).
//
// @param wdf  Within-document frequency of the term.
// @param len  Length of the document.
// The third (unnamed) parameter is not used by this scheme.
114 BM25PlusWeight::get_sumpart(Xapian::termcount wdf
, Xapian::termcount len
,
115 Xapian::termcount
) const
117 LOGCALL(WTCALC
, double, "BM25PlusWeight::get_sumpart", wdf
| len
);
// Normalised document length, clamped from below by param_min_normlen.
118 Xapian::doclength normlen
= max(len
* len_factor
, param_min_normlen
);
// Work in double so the division below is floating point.
120 double wdf_double
= wdf
;
121 double denom
= param_k1
* (normlen
* param_b
+ (1 - param_b
)) + wdf_double
;
// denom is used as a divisor below, so it must be strictly positive.
122 AssertRel(denom
,>,0);
123 // Parameter delta (δ) is a pseudo tf value to control the scale of the
124 // tf lower bound. δ can be tuned for e.g from 0.0 to 1.5 but BM25+ can
125 // still work effectively across collections with a fixed δ = 1.0
126 RETURN(termweight
* ((param_k1
+ 1) * wdf_double
/ denom
+ param_delta
));
// Compute a bound on the value get_sumpart() can return, by evaluating the
// same expression
//
//   termweight * ((k1 + 1) * wdf / denom + delta)
//
// at wdf = get_wdf_upper_bound(), with the length-dependent part of denom
// built from a lower bound on the normalised document length.
130 BM25PlusWeight::get_maxpart() const
132 LOGCALL(WTCALC
, double, "BM25PlusWeight::get_maxpart", NO_ARGS
);
// denom starts as k1; the length-dependent factor is multiplied in below
// only when both k1 and b are non-zero.
133 double denom
= param_k1
;
134 Xapian::termcount wdf_max
= get_wdf_upper_bound();
135 if (param_k1
!= 0.0) {
136 if (param_b
!= 0.0) {
137 // "Upper-bound Approximations for Dynamic Pruning" Craig
138 // Macdonald, Nicola Tonellotto and Iadh Ounis. ACM Transactions on
139 // Information Systems. 29(4), 2011 shows that evaluating at
140 // doclen=wdf_max is a good bound.
142 // However, we can do better if doclen_min > wdf_max since then a
143 // better bound can be found by simply evaluating at
144 // doclen=doclen_min and wdf=wdf_max.
145 Xapian::doclength normlen_lb
=
146 max(max(wdf_max
, get_doclength_lower_bound()) * len_factor
,
148 denom
*= (normlen_lb
* param_b
+ (1 - param_b
));
// denom is used as a divisor below, so it must be strictly positive.
152 AssertRel(denom
,>,0);
153 RETURN(termweight
* ((param_k1
+ 1) * wdf_max
/ denom
+ param_delta
));
156 /* The paper which describes BM25+ ignores BM25's document-independent
157 * component (so implicitly k2=0), but we support non-zero k2 too.
159 * The BM25 formula gives:
161 * param_k2 * query_length * (1 - normlen) / (1 + normlen)
163 * To avoid negative sumextra we add the constant (param_k2 * query_length), giving:
166 * 2 * param_k2 * query_length / (1 + normlen)
// Compute the term-independent extra weight for one document:
//
//   2 * k2 * query_length / (1 + normlen)
//
// where normlen = max(len * len_factor, min_normlen).
//
// @param len  Length of the document.
// The second (unnamed) parameter is not used by this scheme.
169 BM25PlusWeight::get_sumextra(Xapian::termcount len
, Xapian::termcount
) const
171 LOGCALL(WTCALC
, double, "BM25PlusWeight::get_sumextra", len
);
// Numerator depends only on the query, not on the document.
172 double num
= (2.0 * param_k2
* get_query_length());
173 RETURN(num
/ (1.0 + max(len
* len_factor
, param_min_normlen
)));
// Compute a bound on get_sumextra(): the same expression
//
//   2 * k2 * query_length / (1 + normlen)
//
// evaluated with get_doclength_lower_bound() in place of the document
// length.  For a non-negative numerator the expression shrinks as normlen
// grows, so the smallest possible length gives the largest value.
177 BM25PlusWeight::get_maxextra() const
179 LOGCALL(WTCALC
, double, "BM25PlusWeight::get_maxextra", NO_ARGS
);
// Numerator depends only on the query, not on the document.
182 double num
= (2.0 * param_k2
* get_query_length());
183 RETURN(num
/ (1.0 + max(get_doclength_lower_bound() * len_factor
,
184 param_min_normlen
)));