/** @file
 * @brief Xapian::Weight::Internal class, holding database and term statistics.
 */
/* Copyright (C) 2007 Lemur Consulting Ltd
 * Copyright (C) 2009,2010,2011,2013,2014,2015,2020,2024 Olly Betts
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
22 #ifndef XAPIAN_INCLUDED_WEIGHTINTERNAL_H
23 #define XAPIAN_INCLUDED_WEIGHTINTERNAL_H
25 #include "xapian/weight.h"
27 #include "xapian/database.h"
28 #include "xapian/error.h"
29 #include "xapian/query.h"
31 #include "backends/databaseinternal.h"
32 #include "internaltypes.h"
34 #include "stringutils.h"
42 #include <string_view>
43 #ifdef HAVE_STD_FROM_CHARS_DOUBLE
52 /// The frequencies for a term.
54 Xapian::doccount termfreq
= 0;
55 Xapian::doccount reltermfreq
= 0;
56 Xapian::termcount collfreq
= 0;
58 double max_part
= 0.0;
61 TermFreqs(Xapian::doccount termfreq_
,
62 Xapian::doccount reltermfreq_
,
63 Xapian::termcount collfreq_
,
64 double max_part_
= 0.0)
65 : termfreq(termfreq_
),
66 reltermfreq(reltermfreq_
),
68 max_part(max_part_
) {}
70 void operator+=(const TermFreqs
& other
) {
71 termfreq
+= other
.termfreq
;
72 reltermfreq
+= other
.reltermfreq
;
73 collfreq
+= other
.collfreq
;
74 // max_part shouldn't be set yet.
75 Assert(max_part
== 0.0);
76 Assert(other
.max_part
== 0.0);
79 void operator*=(double factor
) {
80 termfreq
= Xapian::doccount(termfreq
* factor
+ 0.5);
81 reltermfreq
= Xapian::doccount(reltermfreq
* factor
+ 0.5);
82 collfreq
= Xapian::termcount(collfreq
* factor
+ 0.5);
// Divide by x.
// NOTE(review): the body is not visible in this view; presumably it divides
// each frequency member by x, mirroring operator*= - confirm.
85 void operator/=(unsigned x
) {
91 /// Return a std::string describing this object.
92 std::string
get_description() const;
99 using Xapian::Internal::TermFreqs
;
105 /** Class to hold statistics for a given collection. */
106 class Weight::Internal
{
107 #ifdef XAPIAN_ASSERTIONS
108 /** Number of sub-databases. */
// NOTE(review): the member documented by the comment above, and the #endif
// matching this #ifdef, are not visible in this view.
111 /** True if we've finalised the stats.
113 * Used for assertions.
 */
// Declared mutable, so it can be modified through a const object.
115 mutable bool finalised
= false;
119 /** Total length of all documents in the collection. */
120 Xapian::totallength total_length
= 0;
122 /** Number of documents in the collection. */
123 Xapian::doccount collection_size
= 0;
125 /** Number of relevant documents in the collection. */
126 Xapian::doccount rset_size
= 0;
128 /// A lower bound on the minimum length of any document in the database.
129 Xapian::termcount db_doclength_lower_bound
= 0;
131 /// An upper bound on the maximum length of any document in the database.
132 Xapian::termcount db_doclength_upper_bound
= 0;
134 /// A lower bound on the number of unique terms in any document.
135 Xapian::termcount db_unique_terms_lower_bound
= 0;
137 /// An upper bound on the number of unique terms in any document.
138 Xapian::termcount db_unique_terms_upper_bound
= 0;
140 /** Has max_part been set for any term?
142 * If not, we can avoid having to serialise max_part.
144 bool have_max_part
= false;
149 /** Map of term frequencies and relevant term frequencies for the
151 std::map
<std::string
, TermFreqs
, std::less
<>> termfreqs
;
155 /** Add in the supplied statistics from a sub-database.
157 * Used for remote databases, where we pass across a serialised stats
158 * object, unserialise it, and add it to our total.
160 Internal
& operator+=(const Internal
& inc
);
162 void merge(const Weight::Internal
& o
);
// Takes a query by const reference.
// NOTE(review): the body is not visible in this view, so what is done with
// query_ cannot be confirmed from here.
164 void set_query(const Xapian::Query
&query_
) {
169 /// Accumulate the rtermfreqs for terms in the query.
170 void accumulate_stats(const Xapian::Database::Internal
&sub_db
,
171 const Xapian::RSet
&rset
);
173 /** Get the frequencies for the given term.
 *
 * termfreq is "n_t", the number of documents in the collection indexed by
 * the given term.
 *
 * reltermfreq is "r_t", the number of relevant documents in the
 * collection indexed by the given term.
 *
 * collfreq is the total number of occurrences of the term in all
 * documents.
 *
 * NOTE(review): the return statements are not visible in this view, so the
 * meaning of the bool result is not documented here.
 */
184 bool get_stats(std::string_view term
,
185 Xapian::doccount
& termfreq
,
186 Xapian::doccount
& reltermfreq
,
187 Xapian::termcount
& collfreq
) const {
188 #ifdef XAPIAN_ASSERTIONS
// NOTE(review): the contents of this #ifdef and its #endif are not visible
// in this view.
191 // We pass an empty std::string for term when calculating the extra
// NOTE(review): the rest of the comment above and the condition guarding the
// next three assignments are not visible; presumably this is the empty-term
// case - confirm.
// Collection-wide figures: collection_size is reported for both termfreq and
// collfreq, and rset_size for reltermfreq.
194 termfreq
= collection_size
;
195 collfreq
= collection_size
;
196 reltermfreq
= rset_size
;
// Look the term up in the per-term statistics map.
200 auto i
= termfreqs
.find(term
);
201 if (i
== termfreqs
.end()) {
// No entry for this term: every output is zeroed.
202 termfreq
= reltermfreq
= collfreq
= 0;
// Entry found: copy the stored statistics to the outputs.
206 termfreq
= i
->second
.termfreq
;
207 reltermfreq
= i
->second
.reltermfreq
;
208 collfreq
= i
->second
.collfreq
;
212 /// Get just the termfreq.
213 bool get_stats(std::string_view term
,
214 Xapian::doccount
& termfreq
) const {
215 Xapian::doccount dummy1
;
216 Xapian::termcount dummy2
;
217 return get_stats(term
, termfreq
, dummy1
, dummy2
);
220 /// Get the termweight.
// The weight reported for a term found in termfreqs is its stored max_part.
// NOTE(review): several statements (including all return statements and the
// contents of the #ifdef below) are not visible in this view, so the bool
// result and the value of termweight for unknown terms are not documented.
221 bool get_termweight(std::string_view term
, double& termweight
) const {
222 #ifdef XAPIAN_ASSERTIONS
// Look the term up in the per-term statistics map.
230 auto i
= termfreqs
.find(term
);
231 if (i
== termfreqs
.end()) {
// Entry found: hand back the stored max_part.
235 termweight
= i
->second
.max_part
;
239 /** Get the minimum and maximum termweights.
241 * Used by the snippet code.
243 void get_max_termweight(double & min_tw
, double & max_tw
) {
244 auto i
= termfreqs
.begin();
245 while (i
!= termfreqs
.end() && i
->second
.max_part
== 0.0) ++i
;
246 if (rare(i
== termfreqs
.end())) {
247 min_tw
= max_tw
= 0.0;
250 min_tw
= max_tw
= i
->second
.max_part
;
251 while (++i
!= termfreqs
.end()) {
252 double max_part
= i
->second
.max_part
;
253 if (max_part
> max_tw
) {
255 } else if (max_part
< min_tw
&& max_part
!= 0.0) {
261 /// Set max_part for a term.
262 void set_max_part(const std::string
& term
, double max_part
) {
263 Assert(!term
.empty());
264 auto i
= termfreqs
.find(term
);
265 if (i
!= termfreqs
.end()) {
266 have_max_part
= true;
267 double& val
= i
->second
.max_part
;
268 val
= std::max(val
, max_part
);
// Returns total_length (converted to Xapian::doclength) divided by
// collection_size.
272 Xapian::doclength
get_average_length() const {
273 #ifdef XAPIAN_ASSERTIONS
// NOTE(review): the contents of this #ifdef and its #endif are not visible
// in this view.
276 // We shortcut an empty shard and avoid creating a postlist tree for
277 // it, and all shards must be empty for collection_size to be zero.
// The assertion below guards the division against a zero collection_size.
278 Assert(collection_size
);
279 return Xapian::doclength(total_length
) / collection_size
;
282 /// Return a std::string describing this object.
283 std::string
get_description() const;
// Parse a double from the NUL-terminated string at *p.
//
// Two implementations are selected by HAVE_STD_FROM_CHARS_DOUBLE:
// std::from_chars() when it is defined, otherwise strtod().
//
// NOTE(review): the statements which store the result through ptr_val and
// update *p, the success return, and the #else/#endif lines are not visible
// in this view.
285 static bool double_param(const char ** p
, double * ptr_val
) {
286 #ifdef HAVE_STD_FROM_CHARS_DOUBLE
287 const char* startptr
= *p
;
288 // Unlike strtod(), std::from_chars() doesn't skip leading whitespace.
289 while (C_isspace(*startptr
)) ++startptr
;
// from_chars() needs an explicit end pointer; use the terminating NUL.
290 const char* endptr
= startptr
+ std::strlen(startptr
);
// NOTE(review): the declaration of v used below is not visible in this view.
292 const auto& r
= std::from_chars(startptr
, endptr
, v
);
// Any error code other than the default-constructed std::errc is a failure.
293 if (r
.ec
!= std::errc()) {
// strtod() variant.  NOTE(review): the declaration of end (and any reset of
// errno before the call) is not visible in this view.
301 double v
= strtod(*p
, &end
);
// Fail if no characters were consumed or errno is set.
302 if (*p
== end
|| errno
) return false;
// Read a parameter name from the string at *p into name.
// NOTE(review): most of the body (the declaration of q, the loop around the
// check below, and the success path) is not visible in this view.
309 static bool param_name(const char** p
, std::string
& name
) {
// Stop scanning at the terminating NUL.
312 if (*q
== '\0') break;
// q still equal to *p means nothing was consumed, which is a failure.
315 if (q
== *p
) return false;
// Report a parameter error by throwing InvalidArgumentError.
// NOTE(review): the construction of the message m from msg, scheme and
// params is not visible in this view.
322 static void parameter_error(const char* msg
,
323 const std::string
& scheme
,
324 const char* params
) {
333 throw InvalidArgumentError(m
);
339 #endif // XAPIAN_INCLUDED_WEIGHTINTERNAL_H