2 * @brief Xapian::DLHWeight class - The DLH weighting scheme of the DFR framework.
4 /* Copyright (C) 2013, 2014 Aarsh Shah
5 * Copyright (C) 2016,2017,2019 Olly Betts
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2 of the
10 * License, or (at your option) any later version.
12 * This program is distributed in the hope that it will be useful
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
24 #include "xapian/weight.h"
26 #include "xapian/error.h"
36 DLHWeight::clone() const
38 return new DLHWeight();
42 DLHWeight::init(double factor
)
45 // This object is for the term-independent contribution, and that's
46 // always zero for this scheme.
50 double wdf_upper
= get_wdf_upper_bound();
56 const double wdf_lower
= 1.0;
57 double len_upper
= get_doclength_upper_bound();
58 double len_lower
= get_doclength_lower_bound();
60 double F
= get_collection_freq();
62 // Calculate constant values to be used in get_sumpart().
63 log_constant
= get_total_length() / F
;
64 wqf_product_factor
= get_wqf() * factor
;
66 // Calculate values for the upper bound.
68 // w <= l, so if the allowed ranges overlap, max w/l is 1.0.
69 double max_wdf_over_l
= wdf_upper
< len_lower
? wdf_upper
/ len_lower
: 1.0;
71 // First term A: w/(w+.5)*log2(w/l*L) where L=total_len/coll_freq
73 // w/(w+.5) = 1-1/(2w+1) and w >= 1 so max A at w=w_max
74 // log2(w/l*L) maximised when w/l maximised
75 // so max A at w=w_max, l=max(l_min, w_max)
76 // If log < 0 => then A <= 0, so max A <= 0 and we want to minimise the
77 // factor outside the log.
78 double logged_expr
= max_wdf_over_l
* log_constant
;
79 double w_for_A
= logged_expr
> 1.0 ? wdf_upper
: wdf_lower
;
80 double A
= w_for_A
/ (w_for_A
+ 0.5) * log2(logged_expr
);
86 // The log is negative, and w <= l so B <= 0 and its maximum is the value
87 // as close to zero as possible. So smaller (l-w) is better and smaller
90 // This function is ill defined at w=l, but -> 0 as w -> l.
92 // If w=l is valid (i.e. len_lower > wdf_upper) then B = 0.
94 if (len_lower
> wdf_upper
) {
95 // If not, then minimising l-w gives us a candidate (i.e. w=wdf_upper
98 // The function is also 0 at w = 0 (there must be a local mimina at
99 // some value of w between 0 and l), so the other candidate is at
102 // We need to find the optimum value of l in this case, so
103 // differentiate the formula by l:
105 // d/dl: log2(1-w/l) + (l-w)*(1-w/l)/(l*log(2))
106 // = (log(1-w/l) + (1-w/l)²)/log(2)
107 // = (log(x) + x²)/log(2) [x=1-w/l]
109 // which is 0 at approx x=0.65291864
111 // x=1-w/l <=> l*(1-x)=w <=> l=w/(1-x) <=> l ~= 0.34708136*w
113 // but l >= w so we can't attain that (and the log isn't valid there).
115 // Gradient at (without loss of generality) l=2*w is:
116 // (log(0.5) + 0.25)/log(2)
117 // which is < 0 so want to minimise l, i.e. l=len_lower, so the other
118 // candidate is w=wdf_lower and l=len_lower.
120 // So evaluate both candidates and pick the larger:
121 double B1
= (len_lower
- wdf_lower
) * log2(1.0 - wdf_lower
/ len_lower
);
122 double B2
= (len_lower
- wdf_upper
) * log2(1.0 - wdf_upper
/ len_lower
);
126 /* An upper bound of the term used in the third log can be obtained by:
128 * 0.5 * log2(2.0 * M_PI * wdf * (1 - wdf / len))
130 * An upper bound on wdf * (1 - wdf / len) (and hence on the log, since
131 * log is a monotonically increasing function on positive numbers) can
132 * be obtained by plugging in the upper bound of the length and
133 * differentiating the term w.r.t wdf which gives the value of wdf at which
134 * the function attains maximum value - at wdf = len_upper / 2 (or if the
135 * wdf can't be that large, at wdf = wdf_upper): */
136 double wdf_var
= min(wdf_upper
, len_upper
/ 2.0);
137 double max_product
= wdf_var
* (1.0 - wdf_var
/ len_upper
);
139 /* An upper bound can also be obtained by taking the minimum and maximum
140 * wdf value in the formula as shown (which isn't useful now as it's always
141 * >= the bound above, but could be useful if we tracked bounds on wdf/len):
143 double min_wdf_to_len
= wdf_lower
/ len_upper
;
144 double max_product_2
= wdf_upper
* (1.0 - min_wdf_to_len
);
145 /* Take the minimum of the two upper bounds. */
146 max_product
= min(max_product
, max_product_2
);
148 double C
= 0.5 * log2(2.0 * M_PI
* max_product
) / (wdf_lower
+ 0.5);
149 upper_bound
= A
+ B
+ C
;
151 if (rare(upper_bound
< 0.0))
154 upper_bound
*= wqf_product_factor
;
158 DLHWeight::name() const
160 return "Xapian::DLHWeight";
164 DLHWeight::short_name() const
170 DLHWeight::serialise() const
176 DLHWeight::unserialise(const string
& s
) const
178 if (rare(!s
.empty()))
179 throw Xapian::SerialisationError("Extra data in DLHWeight::unserialise()");
180 return new DLHWeight();
184 DLHWeight::get_sumpart(Xapian::termcount wdf
, Xapian::termcount len
,
185 Xapian::termcount
, Xapian::termcount
) const
187 if (wdf
== 0 || wdf
== len
) return 0.0;
189 double wdf_to_len
= double(wdf
) / len
;
190 double one_minus_wdf_to_len
= 1.0 - wdf_to_len
;
192 double wt
= wdf
* log2(wdf_to_len
* log_constant
) +
193 (len
- wdf
) * log2(one_minus_wdf_to_len
) +
194 0.5 * log2(2.0 * M_PI
* wdf
* one_minus_wdf_to_len
);
195 if (rare(wt
<= 0.0)) return 0.0;
197 return wqf_product_factor
* wt
/ (wdf
+ 0.5);
201 DLHWeight::get_maxpart() const
207 DLHWeight::create_from_parameters(const char * p
) const
210 throw InvalidArgumentError("No parameters are required for DLHWeight");
211 return new Xapian::DLHWeight();