/** @file
 * @brief Xapian::Weight::Internal class, holding database and term statistics.
 */
/* Copyright (C) 2007 Lemur Consulting Ltd
 * Copyright (C) 2009,2010,2011,2013,2014,2015,2020 Olly Betts
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
22 #ifndef XAPIAN_INCLUDED_WEIGHTINTERNAL_H
23 #define XAPIAN_INCLUDED_WEIGHTINTERNAL_H
25 #include "xapian/weight.h"
27 #include "xapian/database.h"
28 #include "xapian/query.h"
30 #include "backends/database.h"
31 #include "internaltypes.h"
37 /// The frequencies for a term.
39 Xapian::doccount termfreq
;
40 Xapian::doccount reltermfreq
;
41 Xapian::termcount collfreq
;
44 TermFreqs() : termfreq(0), reltermfreq(0), collfreq(0), max_part(0.0) {}
45 TermFreqs(Xapian::doccount termfreq_
,
46 Xapian::doccount reltermfreq_
,
47 Xapian::termcount collfreq_
,
48 double max_part_
= 0.0)
49 : termfreq(termfreq_
),
50 reltermfreq(reltermfreq_
),
52 max_part(max_part_
) {}
54 void operator+=(const TermFreqs
& other
) {
55 termfreq
+= other
.termfreq
;
56 reltermfreq
+= other
.reltermfreq
;
57 collfreq
+= other
.collfreq
;
58 max_part
+= other
.max_part
;
61 /// Return a std::string describing this object.
62 std::string
get_description() const;
69 /** Class to hold statistics for a given collection. */
70 class Weight::Internal
{
71 #ifdef XAPIAN_ASSERTIONS
72 /** Number of sub-databases. */
75 /** True if we've finalised the stats.
77 * Used for assertions.
79 mutable bool finalised
= false;
83 /** Total length of all documents in the collection. */
84 Xapian::totallength total_length
= 0;
86 /** Number of documents in the collection. */
87 Xapian::doccount collection_size
= 0;
89 /** Number of relevant documents in the collection. */
90 Xapian::doccount rset_size
= 0;
92 /** Has max_part been set for any term?
94 * If not, we can avoid having to serialise max_part.
96 bool have_max_part
= false;
98 /** Database to get the bounds on doclength and wdf from. */
104 /** Map of term frequencies and relevant term frequencies for the
106 std::map
<std::string
, TermFreqs
> termfreqs
;
/** Add in the supplied statistics from a sub-database.
 *
 *  Used for remote databases, where we pass across a serialised stats
 *  object, unserialise it, and add it to our total.
 */
115 Internal
& operator+=(const Internal
& inc
);
117 void merge(const Weight::Internal
& o
);
/// NOTE(review): only the signature of this method is visible in this view;
/// the body (presumably recording @a query_ for later use) is missing, so
/// its exact behaviour must be confirmed against the complete file.
119 void set_query(const Xapian::Query
&query_
) {
124 /// Accumulate the rtermfreqs for terms in the query.
125 void accumulate_stats(const Xapian::Database::Internal
&sub_db
,
126 const Xapian::RSet
&rset
);
128 /** Get the frequencies for the given term.
130 * termfreq is "n_t", the number of documents in the collection indexed by
133 * reltermfreq is "r_t", the number of relevant documents in the
134 * collection indexed by the given term.
136 * collfreq is the total number of occurrences of the term in all
139 bool get_stats(const std::string
& term
,
140 Xapian::doccount
& termfreq
,
141 Xapian::doccount
& reltermfreq
,
142 Xapian::termcount
& collfreq
) const {
143 #ifdef XAPIAN_ASSERTIONS
146 // We pass an empty std::string for term when calculating the extra
149 termfreq
= collection_size
;
150 collfreq
= collection_size
;
151 reltermfreq
= rset_size
;
155 auto i
= termfreqs
.find(term
);
156 if (i
== termfreqs
.end()) {
157 termfreq
= reltermfreq
= collfreq
= 0;
161 termfreq
= i
->second
.termfreq
;
162 reltermfreq
= i
->second
.reltermfreq
;
163 collfreq
= i
->second
.collfreq
;
167 /// Get just the termfreq.
168 bool get_stats(const std::string
& term
,
169 Xapian::doccount
& termfreq
) const {
170 Xapian::doccount dummy1
;
171 Xapian::termcount dummy2
;
172 return get_stats(term
, termfreq
, dummy1
, dummy2
);
175 /// Get the termweight.
176 bool get_termweight(const std::string
& term
, double & termweight
) const {
177 #ifdef XAPIAN_ASSERTIONS
185 auto i
= termfreqs
.find(term
);
186 if (i
== termfreqs
.end()) {
190 termweight
= i
->second
.max_part
;
194 /** Get the minimum and maximum termweights.
196 * Used by the snippet code.
198 void get_max_termweight(double & min_tw
, double & max_tw
) {
199 auto i
= termfreqs
.begin();
200 while (i
!= termfreqs
.end() && i
->second
.max_part
== 0.0) ++i
;
201 if (rare(i
== termfreqs
.end())) {
202 min_tw
= max_tw
= 0.0;
205 min_tw
= max_tw
= i
->second
.max_part
;
206 while (++i
!= termfreqs
.end()) {
207 double max_part
= i
->second
.max_part
;
208 if (max_part
> max_tw
) {
210 } else if (max_part
< min_tw
&& max_part
!= 0.0) {
216 /// Set max_part for a term.
217 void set_max_part(const std::string
& term
, double max_part
) {
218 have_max_part
= true;
219 Assert(!term
.empty());
220 auto i
= termfreqs
.find(term
);
221 if (i
!= termfreqs
.end())
222 i
->second
.max_part
+= max_part
;
225 Xapian::doclength
get_average_length() const {
226 #ifdef XAPIAN_ASSERTIONS
229 if (rare(collection_size
== 0)) return 0;
230 return Xapian::doclength(total_length
) / collection_size
;
233 /** Set the "bounds" stats from Database @a db_. */
// NOTE(review): only the signature of this method is visible in this view;
// the body (presumably keeping @a db_ for later bounds lookups) is missing,
// so its exact behaviour must be confirmed against the complete file.
234 void set_bounds_from_db(const Xapian::Database
&db_
) {
239 /// Return a std::string describing this object.
240 std::string
get_description() const;
245 #endif // XAPIAN_INCLUDED_WEIGHTINTERNAL_H