Fix integer type used by ESet
[xapian.git] / xapian-core / bin / xapian-delve.cc
blobe9360c5061ed9695bd525d793833ff334dc5cd33
1 /** @file
2 * @brief Allow inspection of the contents of a Xapian database
3 */
4 /* Copyright 1999,2000,2001 BrightStation PLC
5 * Copyright 2002 Ananova Ltd
6 * Copyright 2002-2022 Olly Betts
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
21 * USA
24 #include <config.h>
26 #include <xapian.h>
28 #include <algorithm>
29 #include <ios>
30 #include <iostream>
31 #include <vector>
33 #include "gnu_getopt.h"
35 #include <cerrno>
36 #include <cstring>
37 #include <cstdlib>
38 #include "unicode/description_append.h"
40 #include "unicode/description_append.cc"
42 using namespace Xapian;
43 using namespace std;
/// Character written before each list entry; the -1 option changes it to a
/// newline so each entry appears on its own line.
static char separator = ' ';

/// Verbosity level: incremented once for every -v on the command line.
static int verbose = 0;

/// Set by -V given without an argument: show every value of each document.
static bool showvalues = false;

/// Set by -d: show the document data of each document.
static bool showdocdata = false;

/// Set by -z: count documents of length zero when showing database stats.
static bool count_zero_length_docs = false;

/// The ways in which a document value can be decoded for display.
enum value_decode_type {
    VALUE_ESCAPE,
    VALUE_SORTABLE_SERIALISE,
    VALUE_PACKED_INT,
    VALUE_RAW
};

/// How to decode document values (selected by the type letter after -V).
static value_decode_type value_decode = VALUE_ESCAPE;

#define PROG_NAME "delve"
#define PROG_DESC "Inspect the contents of a Xapian database"
63 static void show_usage() {
64 cout << "Usage: " PROG_NAME " [OPTIONS] DATABASE...\n\n"
65 "Options:\n"
66 " -a show all terms in the database\n"
67 " -A <prefix> show all terms in the database with given prefix\n"
68 " -r <recno> for term list(s)\n"
69 " -t <term> for posting list(s)\n"
70 " -t <term> -r <recno> for position list(s)\n"
71 " -s, --stemmer=LANG set the stemming language, the default is 'none'\n"
72 " -1 output one list entry per line\n"
73 " -V[<type>]<valueno> output value valueno for each document referred to\n"
74 " (or each document in the database if no -r options).\n"
75 " <type> can be:\n"
76 " E: escape in a C-like way (default)\n"
77 " I: decode as a packed integer\n"
78 " R: show the raw value (which may contain binary data,\n"
79 " newlines, invalid UTF-8, etc)\n"
80 " S: decode using Xapian::sortable_unserialise()\n"
81 " -V[<type>] output all values for each document referred to.\n"
82 " <type> is as above.\n"
83 " -d output document data for each document referred to\n"
84 " -z for db, count documents with length 0\n"
85 " -v extra info (wdf and len for postlist;\n"
86 " wdf and termfreq for termlist; number of terms for db;\n"
87 " termfreq when showing all terms; value bounds and freq\n"
88 " when showing all values in a slot)\n"
89 " -vv even more info (also show collection freq and wdf\n"
90 " upper bound for terms)\n"
91 " --help display this help and exit\n"
92 " --version output version information and exit" << endl;
95 static void
96 show_db_stats(Database &db)
98 // Display a few database stats.
99 cout << "UUID = " << db.get_uuid() << endl;
100 cout << "number of documents = " << db.get_doccount() << endl;
101 cout << "average document length = " << db.get_avlength() << endl;
102 cout << "document length lower bound = " << db.get_doclength_lower_bound()
103 << endl;
104 cout << "document length upper bound = " << db.get_doclength_upper_bound()
105 << endl;
106 cout << "highest document id ever used = " << db.get_lastdocid() << endl;
107 cout << boolalpha;
108 cout << "has positional information = " << db.has_positions() << endl;
109 cout << "revision = ";
110 if (db.size() > 1) {
111 cout << "N/A (sharded DB)\n";
112 } else {
113 try {
114 cout << db.get_revision() << endl;
115 } catch (const Xapian::InvalidOperationError& e) {
116 cout << e.get_description() << endl;
117 } catch (const Xapian::UnimplementedError& e) {
118 cout << "N/A (" << e.get_msg() << ")\n";
121 cout << "currently open for writing = ";
122 try {
123 cout << db.locked() << endl;
124 } catch (const Xapian::Error& e) {
125 cout << e.get_description() << endl;
128 if (count_zero_length_docs) {
129 Xapian::doccount empty_docs = 0;
130 if (db.get_total_length() == 0) {
131 // All documents are empty.
132 empty_docs = db.get_doccount();
133 } else {
134 Xapian::PostingIterator d = db.postlist_begin(string());
135 while (d != db.postlist_end(string())) {
136 if (d.get_doclength() == 0)
137 ++empty_docs;
138 ++d;
141 cout << "number of zero-length documents = " << empty_docs << endl;
144 if (verbose) {
145 // To find the number of terms, we have to count them!
146 // This will take a few seconds or minutes, so only do it if -v
147 // was specified.
148 termcount terms = 0;
149 TermIterator t = db.allterms_begin();
150 while (t != db.allterms_end()) {
151 ++terms;
152 ++t;
154 cout << "number of distinct terms = " << terms << endl;
158 static void
159 decode_and_show_value(const string& value)
161 switch (value_decode) {
162 case VALUE_ESCAPE: {
163 string esc;
164 description_append(esc, value);
165 cout << esc;
166 break;
168 case VALUE_SORTABLE_SERIALISE:
169 cout << Xapian::sortable_unserialise(value);
170 break;
171 case VALUE_PACKED_INT: {
172 unsigned long long i = 0;
173 for (unsigned char ch : value) {
174 i = (i << 8) | ch;
176 cout << i;
177 break;
179 default: // VALUE_RAW
180 cout << value;
181 break;
185 static void
186 show_values(Database &db, docid docid, char sep)
188 Document doc = db.get_document(docid);
189 ValueIterator v = doc.values_begin();
190 while (v != doc.values_end()) {
191 cout << sep << v.get_valueno() << ':';
192 decode_and_show_value(*v);
193 ++v;
197 static void
198 show_values(Database &db,
199 vector<docid>::const_iterator i,
200 vector<docid>::const_iterator end)
202 while (i != end) {
203 cout << "Values for record #" << *i << ':';
204 show_values(db, *i, separator);
205 cout << endl;
206 ++i;
210 static void
211 show_value(Database &db,
212 vector<docid>::const_iterator i,
213 vector<docid>::const_iterator end,
214 Xapian::valueno slot)
216 while (i != end) {
217 Xapian::docid did = *i;
218 cout << "Value " << slot << " for record #" << did << ": ";
219 decode_and_show_value(db.get_document(did).get_value(slot));
220 cout << endl;
221 ++i;
225 static void
226 show_docdata(Database &db, docid docid, char sep)
228 cout << sep << "[" << db.get_document(docid).get_data() << ']';
231 static void
232 show_docdata(Database &db,
233 vector<docid>::const_iterator i,
234 vector<docid>::const_iterator end)
236 while (i != end) {
237 cout << "Data for record #" << *i << ':' << endl;
238 cout << db.get_document(*i).get_data() << endl;
239 ++i;
243 static void
244 show_termlist(const Database &db, Xapian::docid did,
245 const char * all_pfx = NULL)
247 TermIterator t, tend;
248 if (all_pfx) {
249 t = db.allterms_begin(all_pfx);
250 tend = db.allterms_end(all_pfx);
251 cout << "All terms in database";
252 if (all_pfx[0])
253 cout << " with prefix \"" << all_pfx << "\"";
254 } else {
255 t = db.termlist_begin(did);
256 tend = db.termlist_end(did);
257 cout << "Term List for record #" << did;
259 if (verbose) {
260 cout << " (";
261 if (did != 0)
262 cout << "wdf, ";
263 cout << "termfreq";
264 if (verbose > 1)
265 cout << ", collection freq, wdf upper bound";
266 cout << ')';
268 cout << ':';
270 while (t != tend) {
271 const string & term = *t;
272 cout << separator << term;
273 if (verbose) {
274 if (did != 0)
275 cout << ' ' << t.get_wdf();
276 cout << ' ' << t.get_termfreq();
277 if (verbose > 1) {
278 cout << ' ' << db.get_collection_freq(term)
279 << ' ' << db.get_wdf_upper_bound(term);
282 ++t;
284 cout << endl;
287 static void
288 show_termlists(Database &db,
289 vector<docid>::const_iterator i,
290 vector<docid>::const_iterator end)
292 // Display termlists
293 while (i != end) {
294 show_termlist(db, *i);
295 ++i;
300 main(int argc, char **argv) try {
301 if (argc > 1 && argv[1][0] == '-') {
302 if (strcmp(argv[1], "--help") == 0) {
303 cout << PROG_NAME " - " PROG_DESC "\n\n";
304 show_usage();
305 exit(0);
307 if (strcmp(argv[1], "--version") == 0) {
308 cout << PROG_NAME " - " PACKAGE_STRING << endl;
309 exit(0);
313 const char * all_terms = NULL;
314 vector<docid> recnos;
315 vector<string> terms;
316 vector<string> dbs;
317 Stem stemmer;
319 valueno slot = 0; // Avoid "may be used uninitialised" warnings.
320 bool slot_set = false;
322 int c;
323 while ((c = gnu_getopt(argc, argv, "aA:r:t:s:1vV::dz")) != -1) {
324 switch (c) {
325 case 'a':
326 all_terms = "";
327 break;
328 case 'A':
329 all_terms = optarg;
330 break;
331 case 'r': {
332 char * end;
333 errno = 0;
334 unsigned long n = strtoul(optarg, &end, 10);
335 if (optarg == end || *end) {
336 cout << "Non-numeric document id: " << optarg << endl;
337 exit(1);
339 Xapian::docid did(n);
340 if (errno == ERANGE || n == 0 || did != n) {
341 cout << "Document id out of range: " << optarg << endl;
342 exit(1);
344 recnos.push_back(did);
345 break;
347 case 't':
348 terms.push_back(optarg);
349 break;
350 case 's':
351 stemmer = Stem(optarg);
352 break;
353 case '1':
354 separator = '\n';
355 break;
356 case 'V':
357 if (optarg) {
358 switch (*optarg) {
359 case 'R':
360 value_decode = VALUE_RAW;
361 ++optarg;
362 break;
363 case 'I':
364 value_decode = VALUE_PACKED_INT;
365 ++optarg;
366 break;
367 case 'S':
368 value_decode = VALUE_SORTABLE_SERIALISE;
369 ++optarg;
370 break;
371 case 'E':
372 value_decode = VALUE_ESCAPE;
373 ++optarg;
374 break;
376 char * end;
377 errno = 0;
378 unsigned long n = strtoul(optarg, &end, 10);
379 if (optarg == end || *end) {
380 cout << "Non-numeric value slot: " << optarg << endl;
381 exit(1);
383 slot = Xapian::valueno(n);
384 if (errno == ERANGE || slot != n) {
385 cout << "Value slot out of range: " << optarg << endl;
386 exit(1);
388 slot_set = true;
389 } else {
390 showvalues = true;
392 break;
393 case 'd':
394 showdocdata = true;
395 break;
396 case 'v':
397 ++verbose;
398 break;
399 case 'z':
400 count_zero_length_docs = true;
401 break;
402 default:
403 show_usage();
404 exit(1);
408 while (argv[optind]) dbs.push_back(argv[optind++]);
410 if (dbs.empty()) {
411 show_usage();
412 exit(1);
415 std::sort(recnos.begin(), recnos.end());
417 Database db;
419 vector<string>::const_iterator i;
420 for (i = dbs.begin(); i != dbs.end(); ++i) {
421 try {
422 db.add_database(Database(*i));
423 } catch (const Error &e) {
424 cerr << "Error opening database '" << *i << "': ";
425 cerr << e.get_description() << endl;
426 return 1;
431 if (!all_terms && terms.empty() && recnos.empty() && !slot_set) {
432 // Show some statistics about the database.
433 show_db_stats(db);
434 return 0;
437 if (all_terms) {
438 show_termlist(db, 0, all_terms);
441 if (!recnos.empty()) {
442 if (showvalues) {
443 show_values(db, recnos.begin(), recnos.end());
444 } else if (slot_set) {
445 show_value(db, recnos.begin(), recnos.end(), slot);
448 if (showdocdata) {
449 show_docdata(db, recnos.begin(), recnos.end());
451 } else {
452 if (slot_set) {
453 cout << "Value " << slot;
454 if (verbose) {
455 cout << " (lower bound=";
456 decode_and_show_value(db.get_value_lower_bound(slot));
457 cout << " upper bound=";
458 decode_and_show_value(db.get_value_upper_bound(slot));
459 cout << " freq=" << db.get_value_freq(slot) << ")";
461 cout << " for each document:";
462 ValueIterator it = db.valuestream_begin(slot);
463 while (it != db.valuestream_end(slot)) {
464 cout << separator << it.get_docid() << ':';
465 decode_and_show_value(*it);
466 ++it;
468 cout << endl;
472 if (terms.empty()) {
473 show_termlists(db, recnos.begin(), recnos.end());
474 return 0;
477 vector<string>::const_iterator i;
478 for (i = terms.begin(); i != terms.end(); ++i) {
479 string term = stemmer(*i);
480 PostingIterator p = db.postlist_begin(term);
481 PostingIterator pend = db.postlist_end(term);
482 if (p == pend) {
483 cout << "term '" << term << "' not in database\n";
484 continue;
486 if (recnos.empty()) {
487 // Display posting list
488 cout << "Posting List for term '" << term << "' (termfreq "
489 << db.get_termfreq(term) << ", collfreq "
490 << db.get_collection_freq(term) << ", wdf_max "
491 << db.get_wdf_upper_bound(term) << "):";
492 while (p != pend) {
493 cout << separator << *p;
494 if (verbose) {
495 cout << ' ' << p.get_wdf() << ' ' << p.get_doclength();
497 if (showvalues) show_values(db, *p, ' ');
498 if (showdocdata) show_docdata(db, *p, ' ');
499 ++p;
501 cout << endl;
502 } else {
503 // Display position lists
504 vector<docid>::const_iterator j;
505 for (j = recnos.begin(); j != recnos.end(); ++j) {
506 p.skip_to(*j);
507 if (p == pend || *p != *j) {
508 cout << "term '" << term <<
509 "' doesn't index document #" << *j << endl;
510 } else {
511 cout << "Position List for term '" << term
512 << "', record #" << *j << ':';
513 try {
514 PositionIterator pos = p.positionlist_begin();
515 while (pos != p.positionlist_end()) {
516 cout << separator << *pos;
517 ++pos;
519 cout << endl;
520 } catch (const Error &e) {
521 cerr << "Error: " << e.get_description() << endl;
527 } catch (const Error &e) {
528 cerr << "\nError: " << e.get_description() << endl;
529 return 1;