[ci] Fix clang-sanitisers job for GHA change
[xapian.git] / xapian-core / tests / dbcheck.cc
blobe40bed5e2c9b4ad6c0c196b09f26da7306550993
1 /** @file
2 * @brief test database contents and consistency.
3 */
4 /* Copyright 2009 Richard Boulton
5 * Copyright 2010,2015 Olly Betts
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2 of the
10 * License, or (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
20 * USA
23 #include <config.h>
25 #include "dbcheck.h"
27 #include "str.h"
28 #include "testsuite.h"
30 using namespace std;
32 string
33 positions_to_string(Xapian::PositionIterator & it,
34 const Xapian::PositionIterator & end,
35 Xapian::termcount * count)
37 string result;
38 bool need_comma = false;
39 Xapian::termcount c = 0;
40 while (it != end) {
41 if (need_comma)
42 result += ", ";
43 result += str(*it);
44 need_comma = true;
45 ++it;
46 ++c;
48 if (count) {
49 *count = c;
51 return result;
54 string
55 postlist_to_string(const Xapian::Database & db, const string & tname)
57 string result;
58 bool need_comma = false;
60 for (Xapian::PostingIterator p = db.postlist_begin(tname);
61 p != db.postlist_end(tname);
62 ++p) {
63 if (need_comma)
64 result += ", ";
66 Xapian::PositionIterator it(p.positionlist_begin());
67 string posrepr = positions_to_string(it, p.positionlist_end());
68 if (!posrepr.empty()) {
69 posrepr = ", pos=[" + posrepr + "]";
72 result += "(" + str(*p) +
73 ", doclen=" + str(p.get_doclength()) +
74 ", wdf=" + str(p.get_wdf()) +
75 posrepr + ")";
76 need_comma = true;
78 return result;
81 string
82 docterms_to_string(const Xapian::Database & db, Xapian::docid did)
84 string result;
85 bool need_comma = false;
87 for (Xapian::TermIterator t = db.termlist_begin(did);
88 t != db.termlist_end(did);
89 ++t) {
90 Xapian::PositionIterator it(t.positionlist_begin());
91 string posrepr = positions_to_string(it, t.positionlist_end());
92 if (!posrepr.empty()) {
93 posrepr = ", pos=[" + posrepr + "]";
95 if (need_comma)
96 result += ", ";
97 result += "Term(" + *t + ", wdf=" + str(t.get_wdf()) + posrepr;
98 result += ")";
99 need_comma = true;
101 return result;
104 string
105 docstats_to_string(const Xapian::Database & db, Xapian::docid did)
107 string result;
109 result += "len=" + str(db.get_doclength(did));
111 return result;
114 string
115 termstats_to_string(const Xapian::Database & db, const string & term)
117 string result;
119 result += "tf=" + str(db.get_termfreq(term));
120 result += ",cf=" + str(db.get_collection_freq(term));
122 return result;
125 void
126 dbcheck(const Xapian::Database & db,
127 Xapian::doccount expected_doccount,
128 Xapian::docid expected_lastdocid)
130 TEST_EQUAL(db.get_doccount(), expected_doccount);
131 TEST_EQUAL(db.get_lastdocid(), expected_lastdocid);
133 // Note - may not be a very big type, but we're only expecting to use this
134 // for small databases, so should be fine.
135 unsigned long totlen = 0;
137 // A map from term to a representation of the posting list for that term.
138 // We build this up from the documents, and then check it against the
139 // equivalent built up from the posting lists.
140 map<string, string> posting_reprs;
141 map<Xapian::valueno, string> value_reprs;
143 Xapian::termcount doclen_lower_bound = Xapian::termcount(-1);
144 Xapian::termcount doclen_upper_bound = 0;
146 for (Xapian::PostingIterator dociter = db.postlist_begin(string());
147 dociter != db.postlist_end(string());
148 ++dociter) {
149 Xapian::docid did = *dociter;
150 TEST_EQUAL(dociter.get_wdf(), 1);
151 Xapian::Document doc(db.get_document(did));
152 Xapian::termcount doclen(db.get_doclength(did));
153 Xapian::termcount unique_terms(db.get_unique_terms(did));
154 if (doclen < doclen_lower_bound)
155 doclen_lower_bound = doclen;
156 if (doclen > doclen_upper_bound)
157 doclen_upper_bound = doclen;
158 totlen += doclen;
160 Xapian::termcount found_termcount = 0;
161 Xapian::termcount found_unique_terms = 0;
162 Xapian::termcount wdf_sum = 0;
163 Xapian::TermIterator t, t2;
164 for (t = doc.termlist_begin(), t2 = db.termlist_begin(did);
165 t != doc.termlist_end();
166 ++t, ++t2) {
167 TEST(t2 != db.termlist_end(did));
169 ++found_termcount;
170 auto wdf = t.get_wdf();
171 if (wdf) ++found_unique_terms;
172 wdf_sum += wdf;
174 TEST_EQUAL(*t, *t2);
175 TEST_EQUAL(t.get_wdf(), t2.get_wdf());
176 TEST_EQUAL(db.get_termfreq(*t), t.get_termfreq());
177 TEST_EQUAL(db.get_termfreq(*t), t2.get_termfreq());
179 // Check the position lists are equal.
180 Xapian::termcount tc1, tc2;
181 Xapian::PositionIterator it1(t.positionlist_begin());
182 string posrepr = positions_to_string(it1, t.positionlist_end(), &tc1);
183 Xapian::PositionIterator it2(t2.positionlist_begin());
184 string posrepr2 = positions_to_string(it2, t2.positionlist_end(), &tc2);
185 TEST_EQUAL(posrepr, posrepr2);
186 TEST_EQUAL(tc1, tc2);
187 TEST_EQUAL(tc1, t.positionlist_count());
189 // Make a representation of the posting.
190 if (!posrepr.empty()) {
191 posrepr = ",[" + posrepr + "]";
193 string posting_repr = "(" + str(did) + "," +
194 str(t.get_wdf()) + "/" + str(doclen) +
195 posrepr + ")";
197 // Append the representation to the list for the term.
198 map<string, string>::iterator i = posting_reprs.find(*t);
199 if (i == posting_reprs.end()) {
200 posting_reprs[*t] = posting_repr;
201 } else {
202 i->second += "," + posting_repr;
206 Xapian::termcount vcount = 0;
207 for (Xapian::ValueIterator v = doc.values_begin();
208 v != doc.values_end();
209 ++v, ++vcount) {
210 TEST((*v).size() != 0);
211 string value_repr = "(" + str(did) + "," + *v + ")";
213 // Append the values to the value lists.
214 map<Xapian::valueno, string>::iterator i;
215 i = value_reprs.find(v.get_valueno());
216 if (i == value_reprs.end()) {
217 value_reprs[v.get_valueno()] = value_repr;
218 } else {
219 i->second += "," + value_repr;
222 TEST_EQUAL(vcount, doc.values_count());
223 TEST(t2 == db.termlist_end(did));
224 Xapian::termcount expected_termcount = doc.termlist_count();
225 TEST_EQUAL(expected_termcount, found_termcount);
226 // Ideally this would be equal, but currently we don't store the
227 // unique_terms values but calculate them, and scanning the termlist
228 // of each document would be slow, so instead get_unique_terms(did)
229 // returns min(doclen, termcount) at present.
230 TEST_REL(unique_terms, >=, found_unique_terms);
231 TEST_REL(unique_terms, <=, found_termcount);
232 TEST_REL(unique_terms, <=, doclen);
233 TEST_EQUAL(doclen, wdf_sum);
236 TEST_REL(doclen_lower_bound, >=, db.get_doclength_lower_bound());
237 TEST_REL(doclen_upper_bound, <=, db.get_doclength_upper_bound());
239 Xapian::TermIterator t;
240 map<string, string>::const_iterator i;
241 for (t = db.allterms_begin(), i = posting_reprs.begin();
242 t != db.allterms_end();
243 ++t, ++i) {
244 TEST(db.term_exists(*t));
245 TEST(i != posting_reprs.end());
246 TEST_EQUAL(i->first, *t);
248 Xapian::doccount tf_count = 0;
249 Xapian::termcount cf_count = 0;
250 Xapian::termcount wdf_upper_bound = 0;
251 string posting_repr;
252 bool need_comma = false;
253 for (Xapian::PostingIterator p = db.postlist_begin(*t);
254 p != db.postlist_end(*t);
255 ++p) {
256 if (need_comma) {
257 posting_repr += ",";
260 ++tf_count;
261 cf_count += p.get_wdf();
263 Xapian::PositionIterator it(p.positionlist_begin());
264 string posrepr = positions_to_string(it, p.positionlist_end());
265 if (!posrepr.empty()) {
266 posrepr = ",[" + posrepr + "]";
268 posting_repr += "(" + str(*p) + "," +
269 str(p.get_wdf()) + "/" +
270 str(p.get_doclength()) + posrepr + ")";
271 if (wdf_upper_bound < p.get_wdf())
272 wdf_upper_bound = p.get_wdf();
273 need_comma = true;
276 TEST_EQUAL(posting_repr, i->second);
277 TEST_EQUAL(tf_count, t.get_termfreq());
278 TEST_EQUAL(tf_count, db.get_termfreq(*t));
279 TEST_EQUAL(cf_count, db.get_collection_freq(*t));
280 TEST_REL(wdf_upper_bound, <=, db.get_wdf_upper_bound(*t));
282 TEST(i == posting_reprs.end());
284 map<Xapian::valueno, string>::const_iterator j;
285 for (j = value_reprs.begin(); j != value_reprs.end(); ++j) {
286 string value_repr;
287 string value_lower_bound;
288 string value_upper_bound;
289 bool first = true;
290 for (Xapian::ValueIterator v = db.valuestream_begin(j->first);
291 v != db.valuestream_end(j->first); ++v) {
292 if (first) {
293 value_lower_bound = *v;
294 value_upper_bound = *v;
295 first = false;
296 } else {
297 value_repr += ",";
298 if (*v > value_upper_bound) {
299 value_upper_bound = *v;
301 if (*v < value_lower_bound) {
302 value_lower_bound = *v;
305 value_repr += "(" + str(v.get_docid()) + "," + *v + ")";
307 TEST_EQUAL(value_repr, j->second);
308 try {
309 TEST_REL(value_upper_bound, <=, db.get_value_upper_bound(j->first));
310 TEST_REL(value_lower_bound, >=, db.get_value_lower_bound(j->first));
311 } catch (const Xapian::UnimplementedError &) {
312 // Skip the checks if the methods to get the bounds aren't
313 // implemented for this backend.
317 if (expected_doccount == 0) {
318 TEST_EQUAL(0, db.get_avlength());
319 } else {
320 TEST_EQUAL_DOUBLE(double(totlen) / expected_doccount,
321 db.get_avlength());