2 * @brief Xapian::BB2Weight class - the BB2 weighting scheme of the DFR framework.
4 /* Copyright (C) 2013,2014 Aarsh Shah
5 * Copyright (C) 2014,2015,2016,2017,2024 Olly Betts
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2 of the
10 * License, or (at your option) any later version.
12 * This program is distributed in the hope that it will be useful
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
24 #include "xapian/weight.h"
26 #include "weightinternal.h"
28 #include "serialise-double.h"
30 #include "xapian/error.h"
/** Evaluate one "Stirling value" term as used by the BB2 weight.
 *
 *  Computes
 *  (y + 0.5) * (stirling_constant - log2(y)) + difference * stirling_constant.
 *
 *  @param difference         Multiplier applied to @a stirling_constant in
 *                            the second term.
 *  @param y                  Value whose base-2 logarithm is taken; it must
 *                            be greater than zero for the result to be
 *                            finite, which is why callers clamp their inputs
 *                            before calling this function.
 *  @param stirling_constant  Base-2 logarithm precomputed by the caller.
 *
 *  @return The value of the expression above.
 */
static double
stirling_value(double difference, double y, double stirling_constant)
{
    return ((y + 0.5) * (stirling_constant - log2(y)) +
            (difference * stirling_constant));
}
// Construct a BB2Weight, storing the parameter c in param_c.
//
// Throws Xapian::InvalidArgumentError with the message
// "Parameter c is invalid".
// NOTE(review): the condition that guards this throw is not visible in this
// view of the file - confirm the accepted range of c against the complete
// source before relying on it.
//
// After validation the constructor registers, via need_stat(), each
// statistic listed below: AVERAGE_LENGTH, DOC_LENGTH, DOC_LENGTH_MIN,
// DOC_LENGTH_MAX, COLLECTION_SIZE and COLLECTION_FREQ.
// NOTE(review): further need_stat() calls may follow outside this view.
43 BB2Weight::BB2Weight(double c
) : param_c(c
)
46 throw Xapian::InvalidArgumentError("Parameter c is invalid");
47 need_stat(AVERAGE_LENGTH
);
48 need_stat(DOC_LENGTH
);
49 need_stat(DOC_LENGTH_MIN
);
50 need_stat(DOC_LENGTH_MAX
);
51 need_stat(COLLECTION_SIZE
);
52 need_stat(COLLECTION_FREQ
);
60 BB2Weight::clone() const
62 return new BB2Weight(param_c
);
// Prepare this object for weighting a term: derives c_product_avlen,
// B_constant, wt, stirling_constant_1, stirling_constant_2 and upper_bound
// from the collection statistics and the supplied factor.  get_sumpart()
// reads all of these except upper_bound.
//
// NOTE(review): this view of the function is lossy.  The statements that
// open and close the early-exit branches, and the statements controlled by
// the clamping ifs, are not visible here - confirm them against the complete
// source before changing anything in this function.
66 BB2Weight::init(double factor
)
69 // This object is for the term-independent contribution, and that's
70 // always zero for this scheme.
// Start from the wdf upper bound reported by get_wdf_upper_bound(); it is
// scaled into a normalised wdf bound further down.
74 double wdfn_upper
= get_wdf_upper_bound();
// NOTE(review): the body of this zero-wdf branch is not visible in this view.
76 if (wdfn_upper
== 0) {
// c * average length is cached because get_sumpart() needs it for every
// document.
81 c_product_avlen
= param_c
* get_average_length();
// Bounds on the normalised wdf: the lower bound pairs wdf = 1 with the
// document length upper bound, the upper bound pairs the wdf upper bound
// with the document length lower bound.
82 double wdfn_lower(1.0);
83 wdfn_lower
*= log2(1 + c_product_avlen
/ get_doclength_upper_bound());
84 wdfn_upper
*= log2(1 + c_product_avlen
/ get_doclength_lower_bound());
86 double F
= get_collection_freq();
88 // Clamp wdfn to at most (F - 1) to avoid ill-defined log calculations in
// NOTE(review): the statements controlled by the two checks below are not
// visible in this view.
90 if (rare(wdfn_lower
>= F
- 1))
92 if (rare(wdfn_upper
>= F
- 1))
// Document-independent part of B; get_sumpart() divides it by (wdfn + 1).
95 B_constant
= get_wqf() * factor
* (F
+ 1.0) / get_termfreq();
97 // Clamp N to at least 2 to avoid ill-defined log calculations in
99 double N
= rare(get_collection_size() <= 2) ? 2.0 : double(get_collection_size());
// wt = -1/ln(2) - log2(N - 1); it depends only on N, so it is computed once
// here and added to the Stirling difference in get_sumpart().
101 wt
= -1.0 / log(2.0) - log2(N
- 1.0);
// The two logarithms passed to stirling_value() as its stirling_constant
// argument.
102 stirling_constant_1
= log2(N
+ F
- 1.0);
103 stirling_constant_2
= log2(F
);
105 // Maximize the Stirling value to be used in the upper bound.
106 // Calculate the individual terms keeping the maximization of Stirling value
// y_min uses the largest wdfn and y_max the smallest, mirroring y2 and y1 in
// get_sumpart() (where y2 = F - wdfn and y1 = N_less_2 + y2).
108 double y_min
= F
- wdfn_upper
;
109 double y_max
= N
+ F
- wdfn_lower
- 2.0;
111 double stirling_max
= stirling_value(wdfn_upper
+ 1.0, y_max
,
112 stirling_constant_1
) -
113 stirling_value(wdfn_lower
, y_min
,
114 stirling_constant_2
);
// Dividing by the smallest (wdfn + 1) gives the largest B when B_constant is
// non-negative.
116 double B_max
= B_constant
/ (wdfn_lower
+ 1.0);
117 upper_bound
= B_max
* (wt
+ stirling_max
);
// NOTE(review): the statement controlled by this negative-bound check is not
// visible in this view.
118 if (rare(upper_bound
< 0.0))
// NOTE(review): only the signature of name() is visible in this view; its
// return type and body are not shown.  Presumably it returns the name of
// this weighting scheme - confirm against the complete source.
123 BB2Weight::name() const
129 BB2Weight::serialise() const
131 return serialise_double(param_c
);
135 BB2Weight::unserialise(const string
& s
) const
137 const char *ptr
= s
.data();
138 const char *end
= ptr
+ s
.size();
139 double c
= unserialise_double(&ptr
, end
);
140 if (rare(ptr
!= end
))
141 throw Xapian::SerialisationError("Extra data in BB2Weight::unserialise()");
142 return new BB2Weight(c
);
// Compute the weight contribution for one document from its wdf and length.
// The third and fourth parameters are unnamed and unused.  Returns 0.0
// straight away when wdf is zero.
//
// Relies on c_product_avlen, B_constant, wt, stirling_constant_1 and
// stirling_constant_2, all of which are assigned in init().
//
// NOTE(review): this view of the function is lossy.  The statements
// controlled by the clamping ifs and the final return are not visible here -
// confirm them against the complete source before changing anything.
146 BB2Weight::get_sumpart(Xapian::termcount wdf
, Xapian::termcount len
,
147 Xapian::termcount
, Xapian::termcount
) const
149 if (wdf
== 0) return 0.0;
// Normalised wdf: wdf * log2(1 + c * average length / document length).
151 double wdfn
= wdf
* log2(1 + c_product_avlen
/ len
);
153 double F
= get_collection_freq();
155 // Clamp wdfn to at most (F - 1) to avoid ill-defined log calculations in
// NOTE(review): the statement controlled by this check is not visible in
// this view.
157 if (rare(wdfn
>= F
- 1))
160 // Clamp N to at least 2 to avoid ill-defined log calculations in
162 Xapian::doccount N
= get_collection_size();
// N - 2, with N <= 2 mapped to 0 so the subtraction is never performed on a
// value smaller than 2.
163 Xapian::doccount N_less_2
= rare(N
<= 2) ? 0 : N
- 2;
// y2 = F - wdfn and y1 = (N - 2) + F - wdfn are the y arguments of the two
// stirling_value() calls below.
165 double y2
= F
- wdfn
;
166 double y1
= N_less_2
+ y2
;
167 double stirling
= stirling_value(wdfn
+ 1.0, y1
, stirling_constant_1
) -
168 stirling_value(wdfn
, y2
, stirling_constant_2
);
// B_constant was computed in init(); only the (wdfn + 1) divisor depends on
// the document.
170 double B
= B_constant
/ (wdfn
+ 1.0);
171 double final_weight
= B
* (wt
+ stirling
);
// NOTE(review): the statement controlled by this negative-weight check, and
// the return of final_weight, are not visible in this view.
172 if (rare(final_weight
< 0.0))
// NOTE(review): only the signature of get_maxpart() is visible in this view;
// its return type and body are not shown.  Presumably it returns the
// upper_bound computed in init() - confirm against the complete source.
178 BB2Weight::get_maxpart() const
185 parameter_error(const char* message
, const char* params
)
187 Xapian::Weight::Internal::parameter_error(message
, "bb2", params
);
// Build a BB2Weight from a textual parameter string.
//
// Visible behaviour: one path returns a default-constructed BB2Weight;
// otherwise a double is parsed into c with
// Xapian::Weight::Internal::double_param(), which is handed a pointer to p
// so it can move it through the string.  A failed parse is reported as
// "Parameter is invalid", and "Extra data after parameter" is reported on a
// further path, both through parameter_error().  The final path returns a
// BB2Weight constructed from c.
//
// NOTE(review): this view of the function is lossy.  The declaration of c
// and the conditions guarding the default-construction return and the
// "Extra data" error are not visible here - confirm them against the
// complete source before changing anything.
191 BB2Weight::create_from_parameters(const char* params
) const
193 const char* p
= params
;
// NOTE(review): the condition selecting this default-parameter return is not
// visible in this view.
195 return new Xapian::BB2Weight();
197 if (!Xapian::Weight::Internal::double_param(&p
, &c
))
198 parameter_error("Parameter is invalid", params
);
// NOTE(review): the condition guarding this error is not visible in this
// view.
200 parameter_error("Extra data after parameter", params
);
201 return new Xapian::BB2Weight(c
);