[ci] Fix clang-santisers job for GHA change
[xapian.git] / xapian-core / weight / weightinternal.h
blob614d0697b52ae1ecdee2726012c5a5943bac694b
/** @file
 * @brief Xapian::Weight::Internal class, holding database and term statistics.
 */
/* Copyright (C) 2007 Lemur Consulting Ltd
 * Copyright (C) 2009,2010,2011,2013,2014,2015,2020,2024 Olly Betts
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
22 #ifndef XAPIAN_INCLUDED_WEIGHTINTERNAL_H
23 #define XAPIAN_INCLUDED_WEIGHTINTERNAL_H
25 #include "xapian/weight.h"
27 #include "xapian/database.h"
28 #include "xapian/error.h"
29 #include "xapian/query.h"
31 #include "backends/databaseinternal.h"
32 #include "internaltypes.h"
33 #include "omassert.h"
34 #include "stringutils.h"
36 #include <algorithm>
37 #include <cerrno>
38 #include <cstdlib>
39 #include <functional>
40 #include <map>
41 #include <string>
42 #include <string_view>
43 #ifdef HAVE_STD_FROM_CHARS_DOUBLE
44 # include <cstring>
45 # include <charconv>
46 #endif
48 namespace Xapian {
50 namespace Internal {
52 /// The frequencies for a term.
53 struct TermFreqs {
54 Xapian::doccount termfreq = 0;
55 Xapian::doccount reltermfreq = 0;
56 Xapian::termcount collfreq = 0;
58 double max_part = 0.0;
60 TermFreqs() {}
61 TermFreqs(Xapian::doccount termfreq_,
62 Xapian::doccount reltermfreq_,
63 Xapian::termcount collfreq_,
64 double max_part_ = 0.0)
65 : termfreq(termfreq_),
66 reltermfreq(reltermfreq_),
67 collfreq(collfreq_),
68 max_part(max_part_) {}
70 void operator+=(const TermFreqs & other) {
71 termfreq += other.termfreq;
72 reltermfreq += other.reltermfreq;
73 collfreq += other.collfreq;
74 // max_part shouldn't be set yet.
75 Assert(max_part == 0.0);
76 Assert(other.max_part == 0.0);
79 void operator*=(double factor) {
80 termfreq = Xapian::doccount(termfreq * factor + 0.5);
81 reltermfreq = Xapian::doccount(reltermfreq * factor + 0.5);
82 collfreq = Xapian::termcount(collfreq * factor + 0.5);
85 void operator/=(unsigned x) {
86 termfreq /= x;
87 reltermfreq /= x;
88 collfreq /= x;
91 /// Return a std::string describing this object.
92 std::string get_description() const;
99 using Xapian::Internal::TermFreqs;
101 namespace Xapian {
103 class RSet;
105 /** Class to hold statistics for a given collection. */
106 class Weight::Internal {
107 #ifdef XAPIAN_ASSERTIONS
108 /** Number of sub-databases. */
109 size_t subdbs = 0;
111 /** True if we've finalised the stats.
113 * Used for assertions.
115 mutable bool finalised = false;
116 #endif
118 public:
119 /** Total length of all documents in the collection. */
120 Xapian::totallength total_length = 0;
122 /** Number of documents in the collection. */
123 Xapian::doccount collection_size = 0;
125 /** Number of relevant documents in the collection. */
126 Xapian::doccount rset_size = 0;
128 /// A lower bound on the minimum length of any document in the database.
129 Xapian::termcount db_doclength_lower_bound = 0;
131 /// An upper bound on the maximum length of any document in the database.
132 Xapian::termcount db_doclength_upper_bound = 0;
134 /// A lower bound on the number of unique terms in any document.
135 Xapian::termcount db_unique_terms_lower_bound = 0;
137 /// An upper bound on the number of unique terms in any document.
138 Xapian::termcount db_unique_terms_upper_bound = 0;
140 /** Has max_part been set for any term?
142 * If not, we can avoid having to serialise max_part.
144 bool have_max_part = false;
146 /** The query. */
147 Xapian::Query query;
149 /** Map of term frequencies and relevant term frequencies for the
150 * collection. */
151 std::map<std::string, TermFreqs, std::less<>> termfreqs;
153 Internal() { }
155 /** Add in the supplied statistics from a sub-database.
157 * Used for remote databases, where we pass across a serialised stats
158 * object, unserialise it, and add it to our total.
160 Internal & operator+=(const Internal & inc);
162 void merge(const Weight::Internal& o);
164 void set_query(const Xapian::Query &query_) {
165 AssertEq(subdbs, 0);
166 query = query_;
169 /// Accumulate the rtermfreqs for terms in the query.
170 void accumulate_stats(const Xapian::Database::Internal &sub_db,
171 const Xapian::RSet &rset);
173 /** Get the frequencies for the given term.
175 * termfreq is "n_t", the number of documents in the collection indexed by
176 * the given term.
178 * reltermfreq is "r_t", the number of relevant documents in the
179 * collection indexed by the given term.
181 * collfreq is the total number of occurrences of the term in all
182 * documents.
184 bool get_stats(std::string_view term,
185 Xapian::doccount & termfreq,
186 Xapian::doccount & reltermfreq,
187 Xapian::termcount & collfreq) const {
188 #ifdef XAPIAN_ASSERTIONS
189 finalised = true;
190 #endif
191 // We pass an empty std::string for term when calculating the extra
192 // weight.
193 if (term.empty()) {
194 termfreq = collection_size;
195 collfreq = collection_size;
196 reltermfreq = rset_size;
197 return true;
200 auto i = termfreqs.find(term);
201 if (i == termfreqs.end()) {
202 termfreq = reltermfreq = collfreq = 0;
203 return false;
206 termfreq = i->second.termfreq;
207 reltermfreq = i->second.reltermfreq;
208 collfreq = i->second.collfreq;
209 return true;
212 /// Get just the termfreq.
213 bool get_stats(std::string_view term,
214 Xapian::doccount & termfreq) const {
215 Xapian::doccount dummy1;
216 Xapian::termcount dummy2;
217 return get_stats(term, termfreq, dummy1, dummy2);
220 /// Get the termweight.
221 bool get_termweight(std::string_view term, double& termweight) const {
222 #ifdef XAPIAN_ASSERTIONS
223 finalised = true;
224 #endif
225 termweight = 0.0;
226 if (term.empty()) {
227 return false;
230 auto i = termfreqs.find(term);
231 if (i == termfreqs.end()) {
232 return false;
235 termweight = i->second.max_part;
236 return true;
239 /** Get the minimum and maximum termweights.
241 * Used by the snippet code.
243 void get_max_termweight(double & min_tw, double & max_tw) {
244 auto i = termfreqs.begin();
245 while (i != termfreqs.end() && i->second.max_part == 0.0) ++i;
246 if (rare(i == termfreqs.end())) {
247 min_tw = max_tw = 0.0;
248 return;
250 min_tw = max_tw = i->second.max_part;
251 while (++i != termfreqs.end()) {
252 double max_part = i->second.max_part;
253 if (max_part > max_tw) {
254 max_tw = max_part;
255 } else if (max_part < min_tw && max_part != 0.0) {
256 min_tw = max_part;
261 /// Set max_part for a term.
262 void set_max_part(const std::string & term, double max_part) {
263 Assert(!term.empty());
264 auto i = termfreqs.find(term);
265 if (i != termfreqs.end()) {
266 have_max_part = true;
267 double& val = i->second.max_part;
268 val = std::max(val, max_part);
272 Xapian::doclength get_average_length() const {
273 #ifdef XAPIAN_ASSERTIONS
274 finalised = true;
275 #endif
276 // We shortcut an empty shard and avoid creating a postlist tree for
277 // it, and all shards must be empty for collection_size to be zero.
278 Assert(collection_size);
279 return Xapian::doclength(total_length) / collection_size;
282 /// Return a std::string describing this object.
283 std::string get_description() const;
/** Parse a floating point value from the start of a C string.
 *
 *  @param p	   In: address of a pointer to the NUL-terminated text to
 *		   parse.  Out: on success, advanced to just after the
 *		   parsed number; unchanged on failure.
 *  @param ptr_val On success, set to the parsed value; unchanged on
 *		   failure.
 *
 *  @return true if a number was parsed, false otherwise.
 *
 *  When HAVE_STD_FROM_CHARS_DOUBLE is defined, std::from_chars() is used
 *  after explicitly skipping leading whitespace; otherwise strtod() is used
 *  and a non-zero errno after the call is also treated as failure.
 */
static bool double_param(const char ** p, double * ptr_val) {
#ifdef HAVE_STD_FROM_CHARS_DOUBLE
    const char* startptr = *p;
    // Unlike strtod(), std::from_chars() doesn't skip leading whitespace.
    while (C_isspace(*startptr)) ++startptr;
    const char* endptr = startptr + std::strlen(startptr);
    double v;
    const auto& r = std::from_chars(startptr, endptr, v);
    if (r.ec != std::errc()) {
	return false;
    }
    *p = r.ptr;
    *ptr_val = v;
#else
    char *end;
    // Clear errno first so a stale value isn't mistaken for a parse error.
    errno = 0;
    double v = strtod(*p, &end);
    // end == *p means no characters were consumed, i.e. no number found.
    if (*p == end || errno) return false;
    *p = end;
    *ptr_val = v;
#endif
    return true;
}
/** Read a parameter name from the start of a C string.
 *
 *  Characters from *p up to (not including) the first space or the
 *  terminating NUL are appended to @a name (which is not cleared first).
 *
 *  @param p	In: address of a pointer to the NUL-terminated text.  Out:
 *		on success, advanced past the name and past one following
 *		space if there is one; unchanged on failure.
 *  @param name String the name is appended to.
 *
 *  @return false if the text is empty or starts with a space (so no
 *	    characters were read), true otherwise.
 */
static bool param_name(const char** p, std::string& name) {
    const char* q = *p;
    while (*q != ' ') {
	if (*q == '\0') break;
	name += *(q)++;
    }
    if (q == *p) return false;
    // Step over a single separating space, if present.
    if (*q == ' ') q++;
    *p = q;
    return true;
}
321 [[noreturn]]
322 static void parameter_error(const char* msg,
323 const std::string& scheme,
324 const char* params) {
325 std::string m(msg);
326 m += ": '";
327 m += scheme;
328 if (*params) {
329 m += ' ';
330 m += params;
332 m += "'";
333 throw InvalidArgumentError(m);
339 #endif // XAPIAN_INCLUDED_WEIGHTINTERNAL_H