2 * @brief Xapian::TradWeight class - the "traditional" probabilistic formula
4 /* Copyright (C) 2009,2010,2011,2012,2014,2015,2017 Olly Betts
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation; either version 2 of the
9 * License, or (at your option) any later version.
11 * This program is distributed in the hope that it will be useful
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
23 #include "xapian/weight.h"
24 #include "weightinternal.h"
28 #include "serialise-double.h"
30 #include "xapian/error.h"
40 TradWeight::clone() const
42 return new TradWeight(param_k
);
46 TradWeight::init(double factor
)
49 // This object is for the term-independent contribution, and that's
50 // always zero for this scheme.
54 Xapian::doccount tf
= get_termfreq();
57 if (get_rset_size() != 0) {
58 Xapian::doccount reltermfreq
= get_reltermfreq();
60 // There can't be more relevant documents indexed by a term than there
61 // are documents indexed by that term.
62 AssertRel(reltermfreq
,<=,tf
);
64 // There can't be more relevant documents indexed by a term than there
65 // are relevant documents.
66 AssertRel(reltermfreq
,<=,get_rset_size());
68 Xapian::doccount reldocs_not_indexed
= get_rset_size() - reltermfreq
;
70 // There can't be more relevant documents not indexed by a term than
71 // there are documents not indexed by that term.
72 AssertRel(reldocs_not_indexed
,<=,get_collection_size() - tf
);
74 Xapian::doccount Q
= get_collection_size() - reldocs_not_indexed
;
76 Xapian::doccount nonreldocs_indexed
= tf
- reltermfreq
;
77 double numerator
= (reltermfreq
+ 0.5) * (Q
- tf
+ 0.5);
78 double denom
= (reldocs_not_indexed
+ 0.5) * (nonreldocs_indexed
+ 0.5);
79 tw
= numerator
/ denom
;
81 tw
= (get_collection_size() - tf
+ 0.5) / (tf
+ 0.5);
86 // The "official" formula can give a negative termweight in unusual cases
87 // (without an RSet, when a term indexes more than half the documents in
88 // the database). These negative weights aren't actually helpful, and it
89 // is common for implementations to replace them with a small positive
92 // Truncating to zero doesn't seem a great approach in practice as it
93 // means that some terms in the query can have no effect at all on the
94 // ranking, and that some results can have zero weight, both of which
95 // are seem surprising.
97 // Xapian 1.0.x and earlier adjusted the termweight for any term indexing
98 // more than a third of documents, which seems rather "intrusive". That's
99 // what the code currently enabled does, but perhaps it would be better to
100 // do something else. (FIXME)
102 if (rare(tw
<= 1.0)) {
105 termweight
= log(tw
) * factor
;
108 if (tw
< 2) tw
= tw
* 0.5 + 1;
109 termweight
= log(tw
) * factor
;
112 LOGVALUE(WTCALC
, termweight
);
115 // If param_k is 0 then the document length doesn't affect the weight.
118 len_factor
= get_average_length();
119 // len_factor can be zero if all documents are empty (or the database is
121 if (len_factor
!= 0) len_factor
= param_k
/ len_factor
;
124 LOGVALUE(WTCALC
, len_factor
);
128 TradWeight::name() const
130 return "Xapian::TradWeight";
134 TradWeight::short_name() const
140 TradWeight::serialise() const
142 return serialise_double(param_k
);
146 TradWeight::unserialise(const string
& s
) const
148 const char *ptr
= s
.data();
149 const char *end
= ptr
+ s
.size();
150 double k
= unserialise_double(&ptr
, end
);
151 if (rare(ptr
!= end
))
152 throw Xapian::SerialisationError("Extra data in TradWeight::unserialise()");
153 return new TradWeight(k
);
157 TradWeight::get_sumpart(Xapian::termcount wdf
, Xapian::termcount len
,
158 Xapian::termcount
, Xapian::termcount
) const
160 double wdf_double
= wdf
;
161 return termweight
* (wdf_double
/ (len
* len_factor
+ wdf_double
));
165 TradWeight::get_maxpart() const
167 double wdf_max
= get_wdf_upper_bound();
168 Xapian::termcount doclen_lb
= get_doclength_lower_bound();
169 return termweight
* (wdf_max
/ (doclen_lb
* len_factor
+ wdf_max
));
173 parameter_error(const char* message
)
175 Xapian::Weight::Internal::parameter_error(message
, "trad");
179 TradWeight::create_from_parameters(const char * p
) const
182 return new Xapian::TradWeight();
184 if (!Xapian::Weight::Internal::double_param(&p
, &k
))
185 parameter_error("Parameter is invalid");
187 parameter_error("Extra data after parameter");
188 return new Xapian::TradWeight(k
);