Update for 1.4.20
[xapian.git] / xapian-core / tests / dbcheck.cc
blob2e59ffb7063bd3fec7bc7f8da4ac279879b72514
/** @file
 * @brief test database contents and consistency.
 */
/* Copyright 2009 Richard Boulton
 * Copyright 2010,2015 Olly Betts
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
23 #include <config.h>
25 #include "dbcheck.h"
27 #include "str.h"
28 #include "testsuite.h"
30 using namespace std;
32 string
33 positions_to_string(Xapian::PositionIterator & it,
34 const Xapian::PositionIterator & end,
35 Xapian::termcount * count)
37 string result;
38 bool need_comma = false;
39 Xapian::termcount c = 0;
40 while (it != end) {
41 if (need_comma)
42 result += ", ";
43 result += str(*it);
44 need_comma = true;
45 ++it;
46 ++c;
48 if (count) {
49 *count = c;
51 return result;
54 string
55 postlist_to_string(const Xapian::Database & db, const string & tname)
57 string result;
58 bool need_comma = false;
60 for (Xapian::PostingIterator p = db.postlist_begin(tname);
61 p != db.postlist_end(tname);
62 ++p) {
63 if (need_comma)
64 result += ", ";
66 Xapian::PositionIterator it(p.positionlist_begin());
67 string posrepr = positions_to_string(it, p.positionlist_end());
68 if (!posrepr.empty()) {
69 posrepr = ", pos=[" + posrepr + "]";
72 result += "(" + str(*p) +
73 ", doclen=" + str(p.get_doclength()) +
74 ", wdf=" + str(p.get_wdf()) +
75 posrepr + ")";
76 need_comma = true;
78 return result;
81 string
82 docterms_to_string(const Xapian::Database & db, Xapian::docid did)
84 string result;
85 bool need_comma = false;
87 for (Xapian::TermIterator t = db.termlist_begin(did);
88 t != db.termlist_end(did);
89 ++t) {
90 Xapian::PositionIterator it(t.positionlist_begin());
91 string posrepr = positions_to_string(it, t.positionlist_end());
92 if (!posrepr.empty()) {
93 posrepr = ", pos=[" + posrepr + "]";
95 if (need_comma)
96 result += ", ";
97 result += "Term(" + *t + ", wdf=" + str(t.get_wdf()) + posrepr;
98 result += ")";
99 need_comma = true;
101 return result;
104 string
105 docstats_to_string(const Xapian::Database & db, Xapian::docid did)
107 string result;
109 result += "len=" + str(db.get_doclength(did));
111 return result;
114 string
115 termstats_to_string(const Xapian::Database & db, const string & term)
117 string result;
119 result += "tf=" + str(db.get_termfreq(term));
120 result += ",cf=" + str(db.get_collection_freq(term));
122 return result;
125 void
126 dbcheck(const Xapian::Database & db,
127 Xapian::doccount expected_doccount,
128 Xapian::docid expected_lastdocid)
130 TEST_EQUAL(db.get_doccount(), expected_doccount);
131 TEST_EQUAL(db.get_lastdocid(), expected_lastdocid);
133 // Note - may not be a very big type, but we're only expecting to use this
134 // for small databases, so should be fine.
135 unsigned long totlen = 0;
137 // A map from term to a representation of the posting list for that term.
138 // We build this up from the documents, and then check it against the
139 // equivalent built up from the posting lists.
140 map<string, string> posting_reprs;
141 map<Xapian::valueno, string> value_reprs;
143 Xapian::termcount doclen_lower_bound = Xapian::termcount(-1);
144 Xapian::termcount doclen_upper_bound = 0;
146 for (Xapian::PostingIterator dociter = db.postlist_begin(string());
147 dociter != db.postlist_end(string());
148 ++dociter) {
149 Xapian::docid did = *dociter;
150 TEST_EQUAL(dociter.get_wdf(), 1);
151 Xapian::Document doc(db.get_document(did));
152 Xapian::termcount doclen(db.get_doclength(did));
153 Xapian::termcount unique_terms(db.get_unique_terms(did));
154 if (doclen < doclen_lower_bound)
155 doclen_lower_bound = doclen;
156 if (doclen > doclen_upper_bound)
157 doclen_upper_bound = doclen;
158 totlen += doclen;
160 Xapian::termcount found_termcount = 0;
161 Xapian::termcount found_unique_terms = 0;
162 Xapian::termcount wdf_sum = 0;
163 Xapian::TermIterator t, t2;
164 for (t = doc.termlist_begin(), t2 = db.termlist_begin(did);
165 t != doc.termlist_end();
166 ++t, ++t2) {
167 TEST(t2 != db.termlist_end(did));
169 ++found_termcount;
170 auto wdf = t.get_wdf();
171 if (wdf) ++found_unique_terms;
172 wdf_sum += wdf;
174 TEST_EQUAL(*t, *t2);
175 TEST_EQUAL(t.get_wdf(), t2.get_wdf());
176 TEST_EQUAL(db.get_termfreq(*t), t.get_termfreq());
177 TEST_EQUAL(db.get_termfreq(*t), t2.get_termfreq());
179 // Check the position lists are equal.
180 Xapian::termcount tc1, tc2;
181 Xapian::PositionIterator it1(t.positionlist_begin());
182 string posrepr = positions_to_string(it1, t.positionlist_end(), &tc1);
183 Xapian::PositionIterator it2(t2.positionlist_begin());
184 string posrepr2 = positions_to_string(it2, t2.positionlist_end(), &tc2);
185 TEST_EQUAL(posrepr, posrepr2);
186 TEST_EQUAL(tc1, tc2);
187 try {
188 TEST_EQUAL(tc1, t.positionlist_count());
189 } catch (const Xapian::UnimplementedError &) {
190 // positionlist_count() isn't implemented for remote databases.
193 // Make a representation of the posting.
194 if (!posrepr.empty()) {
195 posrepr = ",[" + posrepr + "]";
197 string posting_repr = "(" + str(did) + "," +
198 str(t.get_wdf()) + "/" + str(doclen) +
199 posrepr + ")";
201 // Append the representation to the list for the term.
202 map<string, string>::iterator i = posting_reprs.find(*t);
203 if (i == posting_reprs.end()) {
204 posting_reprs[*t] = posting_repr;
205 } else {
206 i->second += "," + posting_repr;
210 Xapian::termcount vcount = 0;
211 for (Xapian::ValueIterator v = doc.values_begin();
212 v != doc.values_end();
213 ++v, ++vcount) {
214 TEST((*v).size() != 0);
215 string value_repr = "(" + str(did) + "," + *v + ")";
217 // Append the values to the value lists.
218 map<Xapian::valueno, string>::iterator i;
219 i = value_reprs.find(v.get_valueno());
220 if (i == value_reprs.end()) {
221 value_reprs[v.get_valueno()] = value_repr;
222 } else {
223 i->second += "," + value_repr;
226 TEST_EQUAL(vcount, doc.values_count());
227 TEST(t2 == db.termlist_end(did));
228 Xapian::termcount expected_termcount = doc.termlist_count();
229 TEST_EQUAL(expected_termcount, found_termcount);
230 // Ideally this would be equal, but currently we don't store the
231 // unique_terms values but calculate them, and scanning the termlist
232 // of each document would be slow, so instead get_unique_terms(did)
233 // returns min(doclen, termcount) at present.
234 TEST_REL(unique_terms, >=, found_unique_terms);
235 TEST_REL(unique_terms, <=, found_termcount);
236 TEST_REL(unique_terms, <=, doclen);
237 TEST_EQUAL(doclen, wdf_sum);
240 TEST_REL(doclen_lower_bound, >=, db.get_doclength_lower_bound());
241 TEST_REL(doclen_upper_bound, <=, db.get_doclength_upper_bound());
243 Xapian::TermIterator t;
244 map<string, string>::const_iterator i;
245 for (t = db.allterms_begin(), i = posting_reprs.begin();
246 t != db.allterms_end();
247 ++t, ++i) {
248 TEST(db.term_exists(*t));
249 TEST(i != posting_reprs.end());
250 TEST_EQUAL(i->first, *t);
252 Xapian::doccount tf_count = 0;
253 Xapian::termcount cf_count = 0;
254 Xapian::termcount wdf_upper_bound = 0;
255 string posting_repr;
256 bool need_comma = false;
257 for (Xapian::PostingIterator p = db.postlist_begin(*t);
258 p != db.postlist_end(*t);
259 ++p) {
260 if (need_comma) {
261 posting_repr += ",";
264 ++tf_count;
265 cf_count += p.get_wdf();
267 Xapian::PositionIterator it(p.positionlist_begin());
268 string posrepr = positions_to_string(it, p.positionlist_end());
269 if (!posrepr.empty()) {
270 posrepr = ",[" + posrepr + "]";
272 posting_repr += "(" + str(*p) + "," +
273 str(p.get_wdf()) + "/" +
274 str(p.get_doclength()) + posrepr + ")";
275 if (wdf_upper_bound < p.get_wdf())
276 wdf_upper_bound = p.get_wdf();
277 need_comma = true;
280 TEST_EQUAL(posting_repr, i->second);
281 TEST_EQUAL(tf_count, t.get_termfreq());
282 TEST_EQUAL(tf_count, db.get_termfreq(*t));
283 TEST_EQUAL(cf_count, db.get_collection_freq(*t));
284 TEST_REL(wdf_upper_bound, <=, db.get_wdf_upper_bound(*t));
286 TEST(i == posting_reprs.end());
288 map<Xapian::valueno, string>::const_iterator j;
289 for (j = value_reprs.begin(); j != value_reprs.end(); ++j) {
290 string value_repr;
291 string value_lower_bound;
292 string value_upper_bound;
293 bool first = true;
294 for (Xapian::ValueIterator v = db.valuestream_begin(j->first);
295 v != db.valuestream_end(j->first); ++v) {
296 if (first) {
297 value_lower_bound = *v;
298 value_upper_bound = *v;
299 first = false;
300 } else {
301 value_repr += ",";
302 if (*v > value_upper_bound) {
303 value_upper_bound = *v;
305 if (*v < value_lower_bound) {
306 value_lower_bound = *v;
309 value_repr += "(" + str(v.get_docid()) + "," + *v + ")";
311 TEST_EQUAL(value_repr, j->second);
312 try {
313 TEST_REL(value_upper_bound, <=, db.get_value_upper_bound(j->first));
314 TEST_REL(value_lower_bound, >=, db.get_value_lower_bound(j->first));
315 } catch (const Xapian::UnimplementedError &) {
316 // Skip the checks if the methods to get the bounds aren't
317 // implemented for this backend.
321 if (expected_doccount == 0) {
322 TEST_EQUAL(0, db.get_avlength());
323 } else {
324 TEST_EQUAL_DOUBLE(double(totlen) / expected_doccount,
325 db.get_avlength());