2 * @brief Xapian::TfIdfWeight class - The TfIdf weighting scheme
4 /* Copyright (C) 2013 Aarsh Shah
5 * Copyright (C) 2016 Vivek Pal
6 * Copyright (C) 2016,2017 Olly Betts
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
25 #include "xapian/weight.h"
32 #include "xapian/error.h"
// Construct a TfIdfWeight from a normalization string.
//
// normals is copied into the member `normalizations` and then validated:
// it must be exactly three characters long, with character 0 drawn from
// "nbslL", character 1 from "ntpfs" and character 2 equal to 'n'.
// Anything else throws Xapian::InvalidArgumentError.
//
// After validation the statistics required by the chosen normalizations
// are requested through need_stat().
//
// NOTE(review): strchr() also matches the terminating NUL of the set it
// searches, so a three-byte std::string containing embedded '\0'
// characters would pass these checks - confirm whether that matters.
// NOTE(review): this listing omits some numbered lines of the constructor
// (e.g. 47 and 49-52), so additional need_stat() calls may exist that are
// not visible here.
38 TfIdfWeight::TfIdfWeight(const std::string
&normals
)
39 : normalizations(normals
)
41 if (normalizations
.length() != 3 ||
42 !strchr("nbslL", normalizations
[0]) ||
43 !strchr("ntpfs", normalizations
[1]) ||
44 !strchr("n", normalizations
[2]))
45 throw Xapian::InvalidArgumentError("Normalization string is invalid");
// Any idf normalization other than 'n' needs the collection size.
46 if (normalizations
[1] != 'n') {
48 need_stat(COLLECTION_SIZE
);
// The 'L' wdf normalization needs document length statistics and the
// number of unique terms.
53 if (normalizations
[0] == 'L') {
54 need_stat(DOC_LENGTH
);
55 need_stat(DOC_LENGTH_MIN
);
56 need_stat(DOC_LENGTH_MAX
);
57 need_stat(UNIQUE_TERMS
);
// Return a new heap-allocated TfIdfWeight built from this object's
// normalization string.
// NOTE(review): the object is created with `new`; presumably the caller
// takes ownership - confirm against the Xapian::Weight contract.
62 TfIdfWeight::clone() const
64 return new TfIdfWeight(normalizations
);
// Store the scaling factor used by get_sumpart() and get_maxpart():
// `factor` is set to get_wqf() multiplied by factor_.
//
// NOTE(review): the statements surrounding the comment below (listing
// lines 69-70 and 73-75) are not visible here; the comment suggests an
// early exit for the term-independent case - confirm in the full file.
68 TfIdfWeight::init(double factor_
)
71 // This object is for the term-independent contribution, and that's
72 // always zero for this scheme.
76 factor
= get_wqf() * factor_
;
// Return the fixed name string of this weighting scheme,
// "Xapian::TfIdfWeight".
80 TfIdfWeight::name() const
82 return "Xapian::TfIdfWeight";
// Serialise this object: the serialised form is simply the normalization
// string, which is the only value the constructor takes.
86 TfIdfWeight::serialise() const
88 return normalizations
;
// Rebuild a TfIdfWeight from data produced by serialise(): s is passed
// straight to the constructor as the normalization string and the new
// heap-allocated object is returned.
//
// Throws Xapian::SerialisationError for unacceptable input.
// NOTE(review): the condition guarding the throw (listing line 94) is not
// visible here - confirm exactly which inputs are rejected.
92 TfIdfWeight::unserialise(const string
& s
) const
95 throw Xapian::SerialisationError("Extra data in TfIdfWeight::unserialise()");
96 return new TfIdfWeight(s
);
// Compute the normalized wdf used for the 'L' wdf normalization (called
// from get_sumpart() and get_maxpart()).
//
// wdf       - within-document frequency; 0 yields a result of 0.
// doclen    - document length.
// uniqterms - number of unique terms.
//
// The visible arithmetic is:
//   wdf_avg = doclen / uniqterms   (evaluated in double)
//   num     = 1 + log(wdf)
//   den     = 1 + log(wdf_avg)
//
// NOTE(review): the declaration of wdf_avg, the statement executed when
// doclen or uniqterms is zero (listing lines 106, 108-109) and the final
// return (line 113) are not visible here; presumably the result is
// num / den - confirm in the full file.
100 get_wdfn_for_L(Xapian::termcount wdf
, Xapian::termcount doclen
,
101 Xapian::termcount uniqterms
)
103 if (wdf
== 0) return 0;
// Convert to double up front so the division below is not integer division.
104 double uniqterm_double
= uniqterms
;
105 double doclen_double
= doclen
;
// Guard against dividing by zero / taking log(0).
107 if (doclen_double
== 0 || uniqterm_double
== 0)
110 wdf_avg
= doclen_double
/ uniqterm_double
;
111 double num
= 1 + log(double(wdf
));
112 double den
= 1 + log(wdf_avg
);
// Compute the weight contribution for a document from its wdf, length and
// number of unique terms.
//
// Steps visible here:
//   1. termfreq defaults to 1 and is only fetched with get_termfreq() when
//      the idf normalization (character 1) is not 'n'.
//   2. The normalized wdf comes from get_wdfn() for every wdf normalization
//      (character 0) except 'L', which uses get_wdfn_for_L() and is the only
//      path that reads doclen and uniqterms.
//   3. That value is multiplied by get_idfn(), passed through get_wtn()
//      with character 2, and scaled by `factor`.
//
// NOTE(review): the declaration of wt and the `else` joining the two wdf
// branches (listing lines 122, 125) are not visible in this listing.
117 TfIdfWeight::get_sumpart(Xapian::termcount wdf
, Xapian::termcount doclen
,
118 Xapian::termcount uniqterms
) const
120 Xapian::doccount termfreq
= 1;
121 if (normalizations
[1] != 'n') termfreq
= get_termfreq();
123 if (normalizations
[0] != 'L') {
124 wt
= get_wdfn(wdf
, normalizations
[0]);
126 wt
= get_wdfn_for_L(wdf
, doclen
, uniqterms
);
128 wt
*= get_idfn(termfreq
, normalizations
[1]);
129 return get_wtn(wt
, normalizations
[2]) * factor
;
132 // An upper bound can be calculated simply on the basis of wdf_max as termfreq
133 // and N are constants.
//
// Mirrors get_sumpart(), but substitutes get_wdf_upper_bound() for the
// per-document wdf. For the 'L' wdf normalization the document length lower
// bound is passed as both doclen and uniqterms, which makes their ratio 1
// inside get_wdfn_for_L() (when the bound is non-zero), so log(wdf_avg) is 0
// and the denominator there is 1.
// NOTE(review): presumably this is what makes the value an upper bound for
// 'L' - confirm. The declaration of wt and the `else` joining the two
// branches (listing lines 140, 143) are not visible in this listing.
135 TfIdfWeight::get_maxpart() const
137 Xapian::doccount termfreq
= 1;
138 if (normalizations
[1] != 'n') termfreq
= get_termfreq();
139 Xapian::termcount wdf_max
= get_wdf_upper_bound();
141 if (normalizations
[0] != 'L') {
142 wt
= get_wdfn(wdf_max
, normalizations
[0]);
144 Xapian::termcount len_min
= get_doclength_lower_bound();
145 wt
= get_wdfn_for_L(wdf_max
, len_min
, len_min
);
147 wt
*= get_idfn(termfreq
, normalizations
[1]);
148 return get_wtn(wt
, normalizations
[2]) * factor
;
151 // There is no extra per-document component in the TfIdf weighting scheme.
// Both parameters are unnamed, so the body cannot read them.
// NOTE(review): the body (listing lines 154-156) is not visible here;
// presumably it returns 0 - confirm in the full file.
153 TfIdfWeight::get_sumextra(Xapian::termcount
, Xapian::termcount
) const
// NOTE(review): only the signature is visible in this listing; presumably
// this returns the upper bound on get_sumextra(), i.e. 0 - confirm in the
// full file.
159 TfIdfWeight::get_maxextra() const
164 // Return normalized wdf, idf and weight depending on the normalization string.
//
// get_wdfn(): normalize a within-document frequency.
//   wdf - the raw within-document frequency.
//   c   - the wdf normalization character (character 0 of the
//         normalization string, as passed by get_sumpart()/get_maxpart()).
//
// Visible behavior: two of the branches return 0 when wdf is 0, and one of
// those returns 1 + log(wdf) otherwise.
// NOTE(review): the switch/case labels and the remaining branches (listing
// lines 167-169, 171-174, 177-182) are not visible here, so which character
// selects which formula cannot be told from this listing.
166 TfIdfWeight::get_wdfn(Xapian::termcount wdf
, char c
) const
170 if (wdf
== 0) return 0;
// Guard: log(0) is undefined, so a zero wdf is mapped to zero.
175 if (wdf
== 0) return 0;
176 return (1 + log(double(wdf
)));
// Compute the idf component for a term.
//   termfreq - term frequency, as supplied by the caller.
//   c        - the idf normalization character (character 1 of the
//              normalization string, as passed by get_sumpart()/get_maxpart()).
//
// N is loaded from get_collection_size() unless c is 'n' or 'f'.
// Formulas visible in this listing:
//   log((N - termfreq) / termfreq), with 0 returned when N == termfreq
//   1 / termfreq
//   log(N / termfreq) squared
//   log(N / termfreq)
//
// NOTE(review): the declaration of N (listing line 186) and the case labels
// choosing between the formulas are not visible here. If N were an integer
// type the divisions would truncate - confirm it is floating-point.
// NOTE(review): log((N - termfreq) / termfreq) is negative whenever
// termfreq exceeds half of N - confirm that is intended.
184 TfIdfWeight::get_idfn(Xapian::doccount termfreq
, char c
) const
187 if (c
!= 'n' && c
!= 'f') N
= get_collection_size();
192 // All documents are indexed by the term
193 if (N
== termfreq
) return 0;
194 return log((N
- termfreq
) / termfreq
);
196 return (1.0 / termfreq
);
198 return pow(log(N
/ termfreq
), 2.0);
201 return (log(N
/ termfreq
));
206 TfIdfWeight::get_wtn(double wt
, char c
) const