Revert "Fix integer type used by ESet"
[xapian.git] / xapian-core / expand / expandweight.h
blob8c60d3bfa1602e2c668d55e40601ee644cb5ca82
1 /** @file
2 * @brief Collate statistics and calculate the term weights for the ESet.
3 */
4 /* Copyright (C) 2007,2008,2009,2011,2016 Olly Betts
5 * Copyright (C) 2013 Aarsh Shah
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2 of the
10 * License, or (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
22 #ifndef XAPIAN_INCLUDED_EXPANDWEIGHT_H
23 #define XAPIAN_INCLUDED_EXPANDWEIGHT_H
25 #include <xapian/database.h>
27 #include "api/termlist.h"
28 #include "internaltypes.h"
30 #include <string>
31 #include <vector>
33 namespace Xapian {
34 namespace Internal {
36 /// Collates statistics while calculating term weight in an ESet.
37 class ExpandStats {
38 /// Which databases in a multidb are included in termfreq.
39 std::vector<bool> dbs_seen;
41 /// Average document length in the whole database.
42 Xapian::doclength avlen;
44 /// The parameter k to be used for TradWeight query expansion.
45 double expand_k;
47 public:
48 /// Size of the subset of a multidb to which the value in termfreq applies.
49 Xapian::doccount dbsize;
51 /// Term frequency (for a multidb, may be for a subset of the databases).
52 Xapian::doccount termfreq;
54 /// The number of times the term occurs in the rset.
55 Xapian::termcount rcollection_freq;
57 /// The number of documents from the RSet indexed by the current term (r).
58 Xapian::doccount rtermfreq;
60 /// The multiplier to be used in TradWeight query expansion.
61 double multiplier;
63 /// Constructor for expansion schemes which do not require the "expand_k"
64 /// parameter.
65 explicit ExpandStats(Xapian::doclength avlen_)
66 : avlen(avlen_), expand_k(0), dbsize(0), termfreq(0),
67 rcollection_freq(0), rtermfreq(0), multiplier(0) {
70 /// Constructor for expansion schemes which require the "expand_k" parameter.
71 ExpandStats(Xapian::doclength avlen_, double expand_k_)
72 : avlen(avlen_), expand_k(expand_k_), dbsize(0), termfreq(0),
73 rcollection_freq(0), rtermfreq(0), multiplier(0) {
76 void accumulate(size_t shard_index,
77 Xapian::termcount wdf, Xapian::termcount doclen,
78 Xapian::doccount subtf, Xapian::doccount subdbsize)
80 // Boolean terms may have wdf == 0, but treat that as 1 so such terms
81 // get a non-zero weight.
82 if (wdf == 0) wdf = 1;
83 ++rtermfreq;
84 rcollection_freq += wdf;
86 multiplier += (expand_k + 1) * wdf / (expand_k * doclen / avlen + wdf);
88 // If we've not seen this sub-database before, then update dbsize and
89 // termfreq and note that we have seen it.
90 if (shard_index >= dbs_seen.size() || !dbs_seen[shard_index]) {
91 if (shard_index >= dbs_seen.size()) {
92 dbs_seen.resize(shard_index + 1);
94 dbs_seen[shard_index] = true;
95 dbsize += subdbsize;
96 termfreq += subtf;
100 /* Clear the statistics collected in the ExpandStats object before using it
101 * for a new term. */
102 void clear_stats()
104 dbs_seen.clear();
105 dbsize = 0;
106 termfreq = 0;
107 rcollection_freq = 0;
108 rtermfreq = 0;
109 multiplier = 0;
113 /// Class for calculating ESet term weights.
114 class ExpandWeight {
115 /// The combined database.
116 const Xapian::Database db;
118 /// The number of documents in the whole database.
119 Xapian::doccount dbsize;
121 /// Average document length in the whole database.
122 Xapian::doclength avlen;
124 /// The number of documents in the RSet.
125 Xapian::doccount rsize;
127 /// The collection frequency of the term.
128 Xapian::termcount collection_freq;
130 /// The total length of the database.
131 Xapian::totallength collection_len;
133 /** Should we calculate the exact term frequency when generating an ESet?
135 * This only has any effect if we're using a combined database.
137 * If this member is true, the exact term frequency will be obtained from
138 * the Database object. If this member is false, then an approximation is
139 * used to estimate the term frequency based on the term frequencies in
140 * the sub-databases which we see while collating term statistics, and the
141 * relative sizes of the sub-databases.
143 bool use_exact_termfreq;
145 public:
146 /** Constructor.
148 * @param db_ The database.
149 * @param rsize_ The number of documents in the RSet.
150 * @param use_exact_termfreq_ When expanding over a combined database,
151 * should we use the exact termfreq (if false
152 * a cheaper approximation is used).
154 ExpandWeight(const Xapian::Database &db_,
155 Xapian::doccount rsize_,
156 bool use_exact_termfreq_)
157 : db(db_), dbsize(db.get_doccount()), avlen(db.get_avlength()),
158 rsize(rsize_), collection_freq(0),
159 collection_len(avlen * dbsize + .5),
160 use_exact_termfreq(use_exact_termfreq_), stats(avlen) {}
162 /** Constructor.
164 * @param db_ The database.
165 * @param rsize_ The number of documents in the RSet.
166 * @param use_exact_termfreq_ When expanding over a combined database,
167 * should we use the exact termfreq (if false
168 * a cheaper approximation is used).
169 * @param expand_k_ The parameter for TradWeight query expansion.
171 ExpandWeight(const Xapian::Database &db_,
172 Xapian::doccount rsize_,
173 bool use_exact_termfreq_,
174 double expand_k_)
175 : db(db_), dbsize(db.get_doccount()), avlen(db.get_avlength()),
176 rsize(rsize_), collection_freq(0),
177 collection_len(avlen * dbsize + .5),
178 use_exact_termfreq(use_exact_termfreq_), stats(avlen, expand_k_) {}
180 /** Get the term statistics.
181 * @param merger The tree of TermList objects.
182 * @param term The current term name.
184 void collect_stats(TermList * merger, const std::string & term);
186 /// Calculate the weight.
187 virtual double get_weight() const = 0;
189 protected:
190 /// An ExpandStats object to accumulate statistics.
191 ExpandStats stats;
193 /// Return the average length of the database.
194 double get_avlen() const { return avlen; }
196 /// Return the number of documents in the RSet.
197 Xapian::doccount get_rsize() const { return rsize; }
199 /// Return the collection frequency of the term.
200 Xapian::termcount get_collection_freq() const { return collection_freq; }
202 /// Return the length of the collection.
203 Xapian::totallength get_collection_len() const { return collection_len; }
205 /// Return the size of the database.
206 Xapian::doccount get_dbsize() const { return dbsize; }
209 /** This class implements the TradWeight scheme for query expansion.
211 * It is the default scheme for query expansion.
213 class TradEWeight : public ExpandWeight {
214 public:
215 /** Constructor.
217 * @param db_ The database.
218 * @param rsize_ The number of documents in the RSet.
219 * @param use_exact_termfreq_ When expanding over a combined database,
220 * should we use the exact termfreq (if false
221 * a cheaper approximation is used).
222 * @param expand_k_ The parameter for TradWeight query expansion.
224 * All the parameters are passed to the parent ExpandWeight object.
226 TradEWeight(const Xapian::Database &db_,
227 Xapian::doccount rsize_,
228 bool use_exact_termfreq_,
229 double expand_k_)
230 : ExpandWeight(db_, rsize_, use_exact_termfreq_, expand_k_) { }
232 double get_weight() const;
235 /** This class implements the Bo1 scheme for query expansion.
237 * Bo1 is a representative scheme of the Divergence from Randomness Framework
238 * by Gianni Amati.
240 * This is a parameter free weighting scheme for query expansion and it uses
241 * the Bose-Einstein probabilistic distribution.
243 * For more information about the DFR Framework and the Bo1 scheme, please
244 * refer to Gianni Amati's PHD thesis.
246 class Bo1EWeight : public ExpandWeight {
247 public:
248 /** Constructor.
250 * @param db_ The database.
251 * @param rsize_ The number of documents in the RSet.
252 * @param use_exact_termfreq_ When expanding over a combined database,
253 * should we use the exact termfreq (if false
254 * a cheaper approximation is used).
256 * All the parameters are passed to the parent ExpandWeight object.
258 Bo1EWeight(const Xapian::Database &db_,
259 Xapian::doccount rsize_,
260 bool use_exact_termfreq_)
261 : ExpandWeight(db_, rsize_, use_exact_termfreq_) {}
263 double get_weight() const;
269 #endif // XAPIAN_INCLUDED_EXPANDWEIGHT_H