2 * @brief Xapian::BB2Weight class - the BB2 weighting scheme of the DFR framework.
4 /* Copyright (C) 2013,2014 Aarsh Shah
5 * Copyright (C) 2014,2015,2016,2017 Olly Betts
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2 of the
10 * License, or (at your option) any later version.
12 * This program is distributed in the hope that it will be useful
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
24 #include "xapian/weight.h"
26 #include "weightinternal.h"
28 #include "serialise-double.h"
30 #include "xapian/error.h"
/** Evaluate one Stirling-style term of the BB2 formula.
 *
 *  Computes
 *    (y + 0.5) * (stirling_constant - log2(y)) + difference * stirling_constant
 *
 *  @param difference		Multiplier applied to @a stirling_constant in
 *				the second summand.
 *  @param y			Argument of the logarithm; log2(y) is only
 *				finite for y > 0, so callers are expected to
 *				clamp their inputs beforehand.
 *  @param stirling_constant	Constant the caller supplies (the callers in
 *				this file pass a precomputed log2 value).
 *
 *  @return The value of the expression above.
 */
38 static double stirling_value(double difference
, double y
, double stirling_constant
)
40 return ((y
+ 0.5) * (stirling_constant
- log2(y
)) + (difference
* stirling_constant
));
/** Construct a BB2Weight with the given value of parameter c.
 *
 *  Stores @a c in param_c and registers the statistics this scheme needs via
 *  need_stat().
 *
 *  @param c	The BB2 parameter c (used later as a multiplier of the
 *		average document length).
 *
 *  @throws Xapian::InvalidArgumentError  with message "Parameter c is
 *	    invalid".  NOTE(review): the condition guarding the throw is not
 *	    visible in this excerpt - confirm which values of c are rejected.
 */
43 BB2Weight::BB2Weight(double c
) : param_c(c
)
46 throw Xapian::InvalidArgumentError("Parameter c is invalid");
// Statistics requested here: average document length, per-document length
// and its lower/upper bounds, collection size and collection frequency.
// NOTE(review): further need_stat() calls may follow outside this excerpt.
47 need_stat(AVERAGE_LENGTH
);
48 need_stat(DOC_LENGTH
);
49 need_stat(DOC_LENGTH_MIN
);
50 need_stat(DOC_LENGTH_MAX
);
51 need_stat(COLLECTION_SIZE
);
52 need_stat(COLLECTION_FREQ
);
/** Create a new heap-allocated BB2Weight using the same param_c as this
 *  object.
 *
 *  @return Pointer to the object created with `new`; the code here does not
 *	    release it, so ownership passes to the caller.
 */
60 BB2Weight::clone() const
62 return new BB2Weight(param_c
);
/** Precompute the per-term constants used by get_sumpart() and an upper bound
 *  on the weight.
 *
 *  Sets c_product_avlen, B_constant, wt, stirling_constant_1,
 *  stirling_constant_2 and upper_bound.
 *
 *  @param factor	Scaling factor multiplied into B_constant.
 *
 *  NOTE(review): several lines of this function (the early-return bodies and
 *  the assignments performed by the clamping `if`s) are not visible in this
 *  excerpt; the comments below describe only what is shown.
 */
66 BB2Weight::init(double factor
)
69 // This object is for the term-independent contribution, and that's
70 // always zero for this scheme.
// Start from the upper bound on wdf; it is scaled into a normalised wdf
// further down.
74 double wdfn_upper
= get_wdf_upper_bound();
// A zero wdf upper bound is handled as a special case (body not visible
// here).
76 if (wdfn_upper
== 0) {
81 c_product_avlen
= param_c
* get_average_length();
// Normalised wdf bounds: log2(1 + c_product_avlen / len) shrinks as len
// grows (for positive c_product_avlen), so the lower bound pairs a factor
// of 1.0 with the longest document length and the upper bound pairs the wdf
// upper bound with the shortest document length.
82 double wdfn_lower(1.0);
83 wdfn_lower
*= log2(1 + c_product_avlen
/ get_doclength_upper_bound());
84 wdfn_upper
*= log2(1 + c_product_avlen
/ get_doclength_lower_bound());
86 double F
= get_collection_freq();
88 // Clamp wdfn to at most (F - 1) to avoid ill-defined log calculations in
// stirling_value(), which takes log2 of y_min = F - wdfn_upper below.
// NOTE(review): the statements executed by these two checks are not visible
// in this excerpt - confirm each one clamps the variable it tests.
90 if (rare(wdfn_lower
>= F
- 1))
92 if (rare(wdfn_upper
>= F
- 1))
// Part of the B factor that does not depend on the document.
95 B_constant
= get_wqf() * factor
* (F
+ 1.0) / get_termfreq();
97 // Clamp N to at least 2 to avoid ill-defined log calculations in
// the terms below: log2(N - 1.0) is only finite for N > 1.
99 double N
= rare(get_collection_size() <= 2) ? 2.0 : double(get_collection_size());
// -1.0 / log(2.0) equals -log2(e).
101 wt
= -1.0 / log(2.0) - log2(N
- 1.0);
102 stirling_constant_1
= log2(N
+ F
- 1.0);
103 stirling_constant_2
= log2(F
);
105 // Maximize the Stirling value to be used in the upper bound.
106 // Calculate the individual terms keeping the maximization of Stirling value
// in mind: the added term uses wdfn_upper and y_max, the subtracted term
// uses wdfn_lower and y_min.
108 double y_min
= F
- wdfn_upper
;
109 double y_max
= N
+ F
- wdfn_lower
- 2.0;
111 double stirling_max
= stirling_value(wdfn_upper
+ 1.0, y_max
,
112 stirling_constant_1
) -
113 stirling_value(wdfn_lower
, y_min
,
114 stirling_constant_2
);
// B is largest when its divisor (wdfn + 1.0) is smallest, hence wdfn_lower.
116 double B_max
= B_constant
/ (wdfn_lower
+ 1.0);
117 upper_bound
= B_max
* (wt
+ stirling_max
);
// A negative bound gets special handling (statement not visible here).
118 if (rare(upper_bound
< 0.0))
/** Return the full name of this weighting scheme, "Xapian::BB2Weight". */
123 BB2Weight::name() const
125 return "Xapian::BB2Weight";
/** Return the short name of this weighting scheme.
 *
 *  NOTE(review): the body is not visible in this excerpt; presumably it
 *  returns the same "bb2" identifier that parameter_error() passes on -
 *  confirm.
 */
129 BB2Weight::short_name() const
/** Serialise this object's parameter.
 *
 *  @return The result of serialise_double(param_c); param_c is the only
 *	    state written.
 */
135 BB2Weight::serialise() const
137 return serialise_double(param_c
);
/** Rebuild a BB2Weight from the string produced by serialise().
 *
 *  @param s	Serialised form: exactly one double as read by
 *		unserialise_double().
 *
 *  @return A new heap-allocated BB2Weight constructed with the decoded
 *	    value.
 *
 *  @throws Xapian::SerialisationError  if any bytes remain in @a s after the
 *	    double has been decoded.
 */
141 BB2Weight::unserialise(const string
& s
) const
143 const char *ptr
= s
.data();
144 const char *end
= ptr
+ s
.size();
// unserialise_double() is handed &ptr so it can move ptr past the bytes it
// reads; the check below relies on that.
145 double c
= unserialise_double(&ptr
, end
);
// Anything left over means the input was not produced by serialise().
146 if (rare(ptr
!= end
))
147 throw Xapian::SerialisationError("Extra data in BB2Weight::unserialise()");
148 return new BB2Weight(c
);
/** Compute the BB2 weight contribution of this term for one document.
 *
 *  Uses the constants prepared by init() (c_product_avlen, B_constant, wt,
 *  stirling_constant_1, stirling_constant_2).
 *
 *  @param wdf	Within-document frequency of the term; 0 yields 0.0.
 *  @param len	Document length, used as a divisor - assumes len > 0
 *		(TODO confirm callers guarantee this).
 *
 *  The third and fourth parameters are unnamed and unused.
 *
 *  NOTE(review): the clamp assignments and the final return statement are
 *  not visible in this excerpt; the comments describe only what is shown.
 */
152 BB2Weight::get_sumpart(Xapian::termcount wdf
, Xapian::termcount len
,
153 Xapian::termcount
, Xapian::termcount
) const
155 if (wdf
== 0) return 0.0;
// Normalised wdf: wdf scaled by log2(1 + c_product_avlen / len).
157 double wdfn
= wdf
* log2(1 + c_product_avlen
/ len
);
159 double F
= get_collection_freq();
161 // Clamp wdfn to at most (F - 1) to avoid ill-defined log calculations in
// stirling_value(), which takes log2 of y2 = F - wdfn below.
163 if (rare(wdfn
>= F
- 1))
166 // Clamp N to at least 2 to avoid ill-defined log calculations in
// stirling_value(); done here by flooring N - 2 at 0.  Presumably
// Xapian::doccount is unsigned, in which case this also prevents
// wrap-around - confirm.
168 Xapian::doccount N
= get_collection_size();
169 Xapian::doccount N_less_2
= rare(N
<= 2) ? 0 : N
- 2;
171 double y2
= F
- wdfn
;
172 double y1
= N_less_2
+ y2
;
173 double stirling
= stirling_value(wdfn
+ 1.0, y1
, stirling_constant_1
) -
174 stirling_value(wdfn
, y2
, stirling_constant_2
);
176 double B
= B_constant
/ (wdfn
+ 1.0);
177 double final_weight
= B
* (wt
+ stirling
);
// A negative weight gets special handling (statement not visible here).
178 if (rare(final_weight
< 0.0))
/** Return an upper bound on what get_sumpart() can return.
 *
 *  NOTE(review): the body is not visible in this excerpt; presumably it
 *  returns the upper_bound computed by init() - confirm.
 */
184 BB2Weight::get_maxpart() const
/** Report a parameter-parsing problem for this scheme.
 *
 *  Forwards @a message to Xapian::Weight::Internal::parameter_error(),
 *  tagging it with the scheme name "bb2".  What the callee does with it
 *  (presumably throws - confirm) is not visible here.
 *
 *  @param message	Description of the problem.
 */
190 parameter_error(const char* message
)
192 Xapian::Weight::Internal::parameter_error(message
, "bb2");
/** Create a BB2Weight from a textual parameter string.
 *
 *  @param p	Parameter text, expected to hold a single double.
 *
 *  @return A new heap-allocated BB2Weight: default-constructed on one path,
 *	    otherwise constructed from the parsed value.
 *
 *  Problems are reported through parameter_error() with "Parameter is
 *  invalid" (the double could not be parsed) or "Extra data after
 *  parameter".
 *
 *  NOTE(review): the conditions selecting the default-constructed path and
 *  the "Extra data" report, and the declaration of k, are not visible in
 *  this excerpt - presumably an empty string selects the default and
 *  leftover characters after the number trigger the error; confirm.
 */
196 BB2Weight::create_from_parameters(const char * p
) const
199 return new Xapian::BB2Weight();
// double_param() is given &p and &k, so it can advance p and store the
// parsed value in k.
201 if (!Xapian::Weight::Internal::double_param(&p
, &k
))
202 parameter_error("Parameter is invalid");
204 parameter_error("Extra data after parameter");
205 return new Xapian::BB2Weight(k
);