2 * @brief Xapian::LMWeight class - the Unigram Language Modelling formula.
4 /* Copyright (C) 2012 Gaurav Arora
5 * Copyright (C) 2016 Olly Betts
6 * Copyright (C) 2016 Vivek Pal
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
13 * This program is distributed in the hope that it will be useful
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
25 #include "xapian/weight.h"
29 #include "serialise-double.h"
31 #include "xapian/error.h"
40 LMWeight::clone() const {
41 return new LMWeight(param_log
, select_smoothing
, param_smoothing1
, param_smoothing2
);
45 LMWeight::init(double factor_
)
47 // weight_collection is really factor.
48 weight_collection
= factor_
;
50 /* Setting default values of the param_log to handle negative value of log.
51 * It is considered to be upperbound of document length.
52 * initializing param_log to upperbound of document_length.
55 if (param_log
== 0.0) {
56 param_log
= get_doclength_upper_bound();
59 /* Since the optimal parameter for Jelinek mercer smoothing
60 * is based on query length, so if query is title query changing
61 * default value of smoothing parameter.
64 if (select_smoothing
== JELINEK_MERCER_SMOOTHING
||
65 select_smoothing
== TWO_STAGE_SMOOTHING
) {
66 if (param_smoothing1
== 0.7) {
67 if (get_query_length() <= 2) {
68 param_smoothing1
= 0.1;
73 /* param_smoothing1 default value should be 2000 in case
74 * DIRICHLET_SMOOTHING is selected. Tweaking param_smoothing1
75 * if user supply his own value for param_smoothing1 value will not be set
76 * to 2000(default value)
78 if (select_smoothing
== DIRICHLET_SMOOTHING
) {
79 if (param_smoothing1
== 0.7) {
80 param_smoothing1
= 2000;
84 /* Setting param_smoothing1 and param_smoothing2 default value to used when
85 * DIRICHLET_PLUS_SMOOTHING is selected.*/
86 if (select_smoothing
== DIRICHLET_PLUS_SMOOTHING
) {
87 if (param_smoothing1
== 0.7) {
88 param_smoothing1
= 2000;
94 LMWeight::name() const
96 return "Xapian::LMWeight";
100 LMWeight::serialise() const
102 string result
= serialise_double(param_log
);
103 result
+= static_cast<unsigned char>(select_smoothing
);
104 result
+= serialise_double(param_smoothing1
);
105 result
+= serialise_double(param_smoothing2
);
110 LMWeight::unserialise(const string
& s
) const
112 const char *ptr
= s
.data();
113 const char *end
= ptr
+ s
.size();
114 double param_log_
= unserialise_double(&ptr
, end
);
115 type_smoothing select_smoothing_
= static_cast<type_smoothing
>(*(ptr
)++);
116 double param_smoothing1_
= unserialise_double(&ptr
, end
);
117 double param_smoothing2_
= unserialise_double(&ptr
, end
);
118 if (rare(ptr
!= end
))
119 throw Xapian::SerialisationError("Extra data in LMWeight::unserialise()");
120 return new LMWeight(param_log_
, select_smoothing_
, param_smoothing1_
, param_smoothing2_
);
124 LMWeight::get_sumpart(Xapian::termcount wdf
, Xapian::termcount len
,
125 Xapian::termcount uniqterm
) const
127 // Within Document Frequency of the term in document being considered.
128 double wdf_double
= wdf
;
129 // Length of the Document in terms of number of terms.
130 double len_double
= len
;
131 // variable to store weight contribution of term in the document scoring for LM.
134 /* In case the within document frequency of term is zero smoothing will
135 * be required and should be return instead of returning zero, as returning
136 * LM score are multiplication of contribution of all terms, due to absence
137 * of single term whole document is scored zero, hence apply collection
138 * frequency smoothing.
140 double wt_coll
= get_collection_freq() / double(get_total_length());
142 // Calculating weights considering different smoothing option available to user.
143 if (select_smoothing
== JELINEK_MERCER_SMOOTHING
) {
144 /* Maximum likelihood of current term, weight contribution of term in
145 * case query term is present in the document.
147 double weight_document
= wdf_double
/ len_double
;
148 weight_sum
= (param_smoothing1
* wt_coll
) +
149 ((1 - param_smoothing1
) * weight_document
);
150 } else if (select_smoothing
== DIRICHLET_SMOOTHING
) {
151 weight_sum
= (wdf_double
+ (param_smoothing1
* wt_coll
)) /
152 (len_double
+ param_smoothing1
);
153 } else if (select_smoothing
== DIRICHLET_PLUS_SMOOTHING
) {
154 /* In the Dir+ weighting formula, sumpart weight contribution is :-
156 * sum of log of (1 + (wdf/(param_smoothing1 * wt_coll))) and
157 * log of (1 + (delta/param_smoothing1 * wt_coll))).
158 * Since, sum of logs is log of product so weight_sum is calculated as product
159 * of terms in log in the Dir+ formula.
161 weight_sum
= (1 + (wdf_double
/ (param_smoothing1
* wt_coll
))) *
162 (1 + (param_smoothing2
/ (param_smoothing1
* wt_coll
)));
163 } else if (select_smoothing
== ABSOLUTE_DISCOUNT_SMOOTHING
) {
164 double uniqterm_double
= uniqterm
;
165 weight_sum
= ((((wdf_double
- param_smoothing1
) > 0) ? (wdf_double
- param_smoothing1
) : 0) / len_double
) + ((param_smoothing1
* wt_coll
* uniqterm_double
) / len_double
);
167 weight_sum
= (((1 - param_smoothing1
) * (wdf_double
+ (param_smoothing2
* wt_coll
)) / (len_double
+ param_smoothing2
)) + (param_smoothing1
* wt_coll
));
170 /* Since LM score is calculated with multiplication, instead of changing
171 * the current implementation log trick have been used to calculate the
172 * product since (sum of log is log of product and since aim is ranking
173 * ranking document by product or log of product won't make a large
174 * difference hence log(product) will be used for ranking.
176 double product
= weight_sum
* param_log
;
177 // weight_collection is really factor.
178 return (product
> 1.0) ? weight_collection
* log(product
) : 0;
182 LMWeight::get_maxpart() const
184 // Variable to store the collection frequency
186 // Store upper bound on wdf in variable wdf_max
187 double wdf_max
= get_wdf_upper_bound();
189 /* In case the within document frequency of term is zero smoothing will
190 * be required and should be return instead of returning zero, as
191 * returning LM score are multiplication of contribution of all terms,
192 * due to absence of single term whole document is scored zero, hence
193 * apply collection frequency smoothing.
195 double wt_coll
= get_collection_freq() / double(get_total_length());
197 // Calculating upper bound considering different smoothing option available to user.
198 if (select_smoothing
== JELINEK_MERCER_SMOOTHING
) {
199 upper_bound
= (param_smoothing1
* wt_coll
) + (1 - param_smoothing1
);
200 } else if (select_smoothing
== DIRICHLET_SMOOTHING
) {
201 upper_bound
= (get_doclength_upper_bound() + (param_smoothing1
* wt_coll
)) / (get_doclength_upper_bound() + param_smoothing1
);
202 } else if (select_smoothing
== DIRICHLET_PLUS_SMOOTHING
) {
203 upper_bound
= (1 + (wdf_max
/ (param_smoothing1
* wt_coll
))) *
204 (1 + (param_smoothing2
/ (param_smoothing1
* wt_coll
)));
205 } else if (select_smoothing
== ABSOLUTE_DISCOUNT_SMOOTHING
) {
206 upper_bound
= param_smoothing1
* wt_coll
+ 1;
208 upper_bound
= (((1 - param_smoothing1
) * (get_doclength_upper_bound() + (param_smoothing2
* wt_coll
)) / (get_doclength_upper_bound() + param_smoothing2
)) + (param_smoothing1
* wt_coll
));
211 /* Since weight are calculated using log trick, using same with the bounds. Refer
212 * comment in get_sumpart for the details.
214 double product
= upper_bound
* param_log
;
215 // weight_collection is really factor.
216 return (product
> 1.0) ? weight_collection
* log(product
) : 1.0;
219 /* The extra weight component in the Dir+ formula is :-
221 * |Q| * log (param_smoothing1 / (|D| + param_smoothing1))
223 * where, |Q| is total query length.
224 * |D| is total document length.
227 LMWeight::get_sumextra(Xapian::termcount len
, Xapian::termcount
) const
229 if (select_smoothing
== DIRICHLET_PLUS_SMOOTHING
) {
230 double extra_weight
= param_smoothing1
/ (len
+ param_smoothing1
);
231 return get_query_length() * log(extra_weight
);
237 LMWeight::get_maxextra() const
239 if (select_smoothing
== DIRICHLET_PLUS_SMOOTHING
) {
240 double extra_weight
= param_smoothing1
/ (get_doclength_lower_bound() + param_smoothing1
);
241 return get_query_length() * log(extra_weight
);