2 * @brief Collate statistics and calculate the term weights for the ESet.
4 /* Copyright (C) 2007,2008,2009,2011,2016 Olly Betts
5 * Copyright (C) 2013 Aarsh Shah
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2 of the
10 * License, or (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
22 #ifndef XAPIAN_INCLUDED_EXPANDWEIGHT_H
23 #define XAPIAN_INCLUDED_EXPANDWEIGHT_H
25 #include <xapian/database.h>
27 #include "api/termlist.h"
28 #include "internaltypes.h"
36 /// Collates statistics while calculating term weight in an ESet.
38 /// Which databases in a multidb are included in termfreq.
39 std::vector
<bool> dbs_seen
;
41 /// Average document length in the whole database.
42 Xapian::doclength avlen
;
44 /// The parameter k to be used for TradWeight query expansion.
48 /// Size of the subset of a multidb to which the value in termfreq applies.
49 Xapian::doccount dbsize
;
51 /// Term frequency (for a multidb, may be for a subset of the databases).
52 Xapian::doccount termfreq
;
54 /// The number of times the term occurs in the rset.
55 Xapian::termcount rcollection_freq
;
57 /// The number of documents from the RSet indexed by the current term (r).
58 Xapian::doccount rtermfreq
;
60 /// The multiplier to be used in TradWeight query expansion.
63 /// Constructor for expansion schemes which do not require the "expand_k"
65 explicit ExpandStats(Xapian::doclength avlen_
)
66 : avlen(avlen_
), expand_k(0), dbsize(0), termfreq(0),
67 rcollection_freq(0), rtermfreq(0), multiplier(0) {
70 /// Constructor for expansion schemes which require the "expand_k" parameter.
71 ExpandStats(Xapian::doclength avlen_
, double expand_k_
)
72 : avlen(avlen_
), expand_k(expand_k_
), dbsize(0), termfreq(0),
73 rcollection_freq(0), rtermfreq(0), multiplier(0) {
76 void accumulate(size_t shard_index
,
77 Xapian::termcount wdf
, Xapian::termcount doclen
,
78 Xapian::doccount subtf
, Xapian::doccount subdbsize
)
80 // Boolean terms may have wdf == 0, but treat that as 1 so such terms
81 // get a non-zero weight.
82 if (wdf
== 0) wdf
= 1;
84 rcollection_freq
+= wdf
;
86 multiplier
+= (expand_k
+ 1) * wdf
/ (expand_k
* doclen
/ avlen
+ wdf
);
88 // If we've not seen this sub-database before, then update dbsize and
89 // termfreq and note that we have seen it.
90 if (shard_index
>= dbs_seen
.size() || !dbs_seen
[shard_index
]) {
91 if (shard_index
>= dbs_seen
.size()) {
92 dbs_seen
.resize(shard_index
+ 1);
94 dbs_seen
[shard_index
] = true;
100 /* Clear the statistics collected in the ExpandStats object before using it
107 rcollection_freq
= 0;
113 /// Class for calculating ESet term weights.
115 /// The combined database.
116 const Xapian::Database db
;
118 /// The number of documents in the whole database.
119 Xapian::doccount dbsize
;
121 /// Average document length in the whole database.
122 Xapian::doclength avlen
;
124 /// The number of documents in the RSet.
125 Xapian::doccount rsize
;
127 /// The collection frequency of the term.
128 Xapian::termcount collection_freq
;
130 /// The total length of the database.
131 Xapian::totallength collection_len
;
133 /** Should we calculate the exact term frequency when generating an ESet?
135 * This only has any effect if we're using a combined database.
137 * If this member is true, the exact term frequency will be obtained from
138 * the Database object. If this member is false, then an approximation is
139 * used to estimate the term frequency based on the term frequencies in
140 * the sub-databases which we see while collating term statistics, and the
141 * relative sizes of the sub-databases.
143 bool use_exact_termfreq
;
148 * @param db_ The database.
149 * @param rsize_ The number of documents in the RSet.
150 * @param use_exact_termfreq_ When expanding over a combined database,
151 * should we use the exact termfreq (if false
152 * a cheaper approximation is used).
154 ExpandWeight(const Xapian::Database
&db_
,
155 Xapian::doccount rsize_
,
156 bool use_exact_termfreq_
)
157 : db(db_
), dbsize(db
.get_doccount()), avlen(db
.get_avlength()),
158 rsize(rsize_
), collection_freq(0),
159 collection_len(avlen
* dbsize
+ .5),
160 use_exact_termfreq(use_exact_termfreq_
), stats(avlen
) {}
164 * @param db_ The database.
165 * @param rsize_ The number of documents in the RSet.
166 * @param use_exact_termfreq_ When expanding over a combined database,
167 * should we use the exact termfreq (if false
168 * a cheaper approximation is used).
169 * @param expand_k_ The parameter for TradWeight query expansion.
171 ExpandWeight(const Xapian::Database
&db_
,
172 Xapian::doccount rsize_
,
173 bool use_exact_termfreq_
,
175 : db(db_
), dbsize(db
.get_doccount()), avlen(db
.get_avlength()),
176 rsize(rsize_
), collection_freq(0),
177 collection_len(avlen
* dbsize
+ .5),
178 use_exact_termfreq(use_exact_termfreq_
), stats(avlen
, expand_k_
) {}
180 /** Get the term statistics.
181 * @param merger The tree of TermList objects.
182 * @param term The current term name.
184 void collect_stats(TermList
* merger
, const std::string
& term
);
186 /// Calculate the weight.
187 virtual double get_weight() const = 0;
190 /// An ExpandStats object to accumulate statistics.
193 /// Return the average length of the database.
194 double get_avlen() const { return avlen
; }
196 /// Return the number of documents in the RSet.
197 Xapian::doccount
get_rsize() const { return rsize
; }
199 /// Return the collection frequency of the term.
200 Xapian::termcount
get_collection_freq() const { return collection_freq
; }
202 /// Return the length of the collection.
203 Xapian::totallength
get_collection_len() const { return collection_len
; }
205 /// Return the size of the database.
206 Xapian::doccount
get_dbsize() const { return dbsize
; }
209 /** This class implements the TradWeight scheme for query expansion.
211 * It is the default scheme for query expansion.
213 class TradEWeight
: public ExpandWeight
{
217 * @param db_ The database.
218 * @param rsize_ The number of documents in the RSet.
219 * @param use_exact_termfreq_ When expanding over a combined database,
220 * should we use the exact termfreq (if false
221 * a cheaper approximation is used).
222 * @param expand_k_ The parameter for TradWeight query expansion.
224 * All the parameters are passed to the parent ExpandWeight object.
226 TradEWeight(const Xapian::Database
&db_
,
227 Xapian::doccount rsize_
,
228 bool use_exact_termfreq_
,
230 : ExpandWeight(db_
, rsize_
, use_exact_termfreq_
, expand_k_
) { }
232 double get_weight() const;
235 /** This class implements the Bo1 scheme for query expansion.
237 * Bo1 is a representative scheme of the Divergence from Randomness Framework
240 * This is a parameter-free weighting scheme for query expansion and it uses
241 * the Bose-Einstein probabilistic distribution.
243 * For more information about the DFR Framework and the Bo1 scheme, please
244 * refer to Gianni Amati's PhD thesis.
246 class Bo1EWeight
: public ExpandWeight
{
250 * @param db_ The database.
251 * @param rsize_ The number of documents in the RSet.
252 * @param use_exact_termfreq_ When expanding over a combined database,
253 * should we use the exact termfreq (if false
254 * a cheaper approximation is used).
256 * All the parameters are passed to the parent ExpandWeight object.
258 Bo1EWeight(const Xapian::Database
&db_
,
259 Xapian::doccount rsize_
,
260 bool use_exact_termfreq_
)
261 : ExpandWeight(db_
, rsize_
, use_exact_termfreq_
) {}
263 double get_weight() const;
269 #endif // XAPIAN_INCLUDED_EXPANDWEIGHT_H