/** @file
 * @brief Allow inspection of the contents of a Xapian database
 */
/* Copyright 1999,2000,2001 BrightStation PLC
 * Copyright 2002 Ananova Ltd
 * Copyright 2002-2022 Olly Betts
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
#include <config.h>   // PACKAGE_STRING

#include <xapian.h>

#include <algorithm>
#include <ios>
#include <iostream>
#include <string>
#include <vector>

#include "gnu_getopt.h"

#include <cerrno>
#include <cstdlib>
#include <cstring>
#include "unicode/description_append.h"

#include "unicode/description_append.cc"
42 using namespace Xapian
;
45 static char separator
= ' ';
47 static int verbose
= 0;
48 static bool showvalues
= false;
49 static bool showdocdata
= false;
50 static bool count_zero_length_docs
= false;
52 // How to decode document values.
55 VALUE_SORTABLE_SERIALISE
,
58 } value_decode
= VALUE_ESCAPE
;
60 #define PROG_NAME "delve"
61 #define PROG_DESC "Inspect the contents of a Xapian database"
63 static void show_usage() {
64 cout
<< "Usage: " PROG_NAME
" [OPTIONS] DATABASE...\n\n"
66 " -a show all terms in the database\n"
67 " -A <prefix> show all terms in the database with given prefix\n"
68 " -r <recno> for term list(s)\n"
69 " -t <term> for posting list(s)\n"
70 " -t <term> -r <recno> for position list(s)\n"
71 " -s, --stemmer=LANG set the stemming language, the default is 'none'\n"
72 " -1 output one list entry per line\n"
73 " -V[<type>]<valueno> output value valueno for each document referred to\n"
74 " (or each document in the database if no -r options).\n"
76 " E: escape in a C-like way (default)\n"
77 " I: decode as a packed integer\n"
78 " R: show the raw value (which may contain binary data,\n"
79 " newlines, invalid UTF-8, etc)\n"
80 " S: decode using Xapian::sortable_unserialise()\n"
81 " -V[<type>] output all values for each document referred to.\n"
82 " <type> is as above.\n"
83 " -d output document data for each document referred to\n"
84 " -z for db, count documents with length 0\n"
85 " -v extra info (wdf and len for postlist;\n"
86 " wdf and termfreq for termlist; number of terms for db;\n"
87 " termfreq when showing all terms; value bounds and freq\n"
88 " when showing all values in a slot)\n"
89 " -vv even more info (also show collection freq and wdf\n"
90 " upper bound for terms)\n"
91 " --help display this help and exit\n"
92 " --version output version information and exit" << endl
;
96 show_db_stats(Database
&db
)
98 // Display a few database stats.
99 cout
<< "UUID = " << db
.get_uuid() << endl
;
100 cout
<< "number of documents = " << db
.get_doccount() << endl
;
101 cout
<< "average document length = " << db
.get_avlength() << endl
;
102 cout
<< "document length lower bound = " << db
.get_doclength_lower_bound()
104 cout
<< "document length upper bound = " << db
.get_doclength_upper_bound()
106 cout
<< "highest document id ever used = " << db
.get_lastdocid() << endl
;
108 cout
<< "has positional information = " << db
.has_positions() << endl
;
109 cout
<< "revision = ";
111 cout
<< "N/A (sharded DB)\n";
114 cout
<< db
.get_revision() << endl
;
115 } catch (const Xapian::InvalidOperationError
& e
) {
116 cout
<< e
.get_description() << endl
;
117 } catch (const Xapian::UnimplementedError
& e
) {
118 cout
<< "N/A (" << e
.get_msg() << ")\n";
121 cout
<< "currently open for writing = ";
123 cout
<< db
.locked() << endl
;
124 } catch (const Xapian::Error
& e
) {
125 cout
<< e
.get_description() << endl
;
128 if (count_zero_length_docs
) {
129 Xapian::doccount empty_docs
= 0;
130 if (db
.get_total_length() == 0) {
131 // All documents are empty.
132 empty_docs
= db
.get_doccount();
134 Xapian::PostingIterator d
= db
.postlist_begin(string());
135 while (d
!= db
.postlist_end(string())) {
136 if (d
.get_doclength() == 0)
141 cout
<< "number of zero-length documents = " << empty_docs
<< endl
;
145 // To find the number of terms, we have to count them!
146 // This will take a few seconds or minutes, so only do it if -v
149 TermIterator t
= db
.allterms_begin();
150 while (t
!= db
.allterms_end()) {
154 cout
<< "number of distinct terms = " << terms
<< endl
;
159 decode_and_show_value(const string
& value
)
161 switch (value_decode
) {
164 description_append(esc
, value
);
168 case VALUE_SORTABLE_SERIALISE
:
169 cout
<< Xapian::sortable_unserialise(value
);
171 case VALUE_PACKED_INT
: {
172 unsigned long long i
= 0;
173 for (unsigned char ch
: value
) {
179 default: // VALUE_RAW
186 show_values(Database
&db
, docid docid
, char sep
)
188 Document doc
= db
.get_document(docid
);
189 ValueIterator v
= doc
.values_begin();
190 while (v
!= doc
.values_end()) {
191 cout
<< sep
<< v
.get_valueno() << ':';
192 decode_and_show_value(*v
);
198 show_values(Database
&db
,
199 vector
<docid
>::const_iterator i
,
200 vector
<docid
>::const_iterator end
)
203 cout
<< "Values for record #" << *i
<< ':';
204 show_values(db
, *i
, separator
);
211 show_value(Database
&db
,
212 vector
<docid
>::const_iterator i
,
213 vector
<docid
>::const_iterator end
,
214 Xapian::valueno slot
)
217 Xapian::docid did
= *i
;
218 cout
<< "Value " << slot
<< " for record #" << did
<< ": ";
219 decode_and_show_value(db
.get_document(did
).get_value(slot
));
226 show_docdata(Database
&db
, docid docid
, char sep
)
228 cout
<< sep
<< "[" << db
.get_document(docid
).get_data() << ']';
232 show_docdata(Database
&db
,
233 vector
<docid
>::const_iterator i
,
234 vector
<docid
>::const_iterator end
)
237 cout
<< "Data for record #" << *i
<< ':' << endl
;
238 cout
<< db
.get_document(*i
).get_data() << endl
;
244 show_termlist(const Database
&db
, Xapian::docid did
,
245 const char * all_pfx
= NULL
)
247 TermIterator t
, tend
;
249 t
= db
.allterms_begin(all_pfx
);
250 tend
= db
.allterms_end(all_pfx
);
251 cout
<< "All terms in database";
253 cout
<< " with prefix \"" << all_pfx
<< "\"";
255 t
= db
.termlist_begin(did
);
256 tend
= db
.termlist_end(did
);
257 cout
<< "Term List for record #" << did
;
265 cout
<< ", collection freq, wdf upper bound";
271 const string
& term
= *t
;
272 cout
<< separator
<< term
;
275 cout
<< ' ' << t
.get_wdf();
276 cout
<< ' ' << t
.get_termfreq();
278 cout
<< ' ' << db
.get_collection_freq(term
)
279 << ' ' << db
.get_wdf_upper_bound(term
);
288 show_termlists(Database
&db
,
289 vector
<docid
>::const_iterator i
,
290 vector
<docid
>::const_iterator end
)
294 show_termlist(db
, *i
);
300 main(int argc
, char **argv
) try {
301 if (argc
> 1 && argv
[1][0] == '-') {
302 if (strcmp(argv
[1], "--help") == 0) {
303 cout
<< PROG_NAME
" - " PROG_DESC
"\n\n";
307 if (strcmp(argv
[1], "--version") == 0) {
308 cout
<< PROG_NAME
" - " PACKAGE_STRING
<< endl
;
313 const char * all_terms
= NULL
;
314 vector
<docid
> recnos
;
315 vector
<string
> terms
;
319 valueno slot
= 0; // Avoid "may be used uninitialised" warnings.
320 bool slot_set
= false;
323 while ((c
= gnu_getopt(argc
, argv
, "aA:r:t:s:1vV::dz")) != -1) {
334 unsigned long n
= strtoul(optarg
, &end
, 10);
335 if (optarg
== end
|| *end
) {
336 cout
<< "Non-numeric document id: " << optarg
<< endl
;
339 Xapian::docid
did(n
);
340 if (errno
== ERANGE
|| n
== 0 || did
!= n
) {
341 cout
<< "Document id out of range: " << optarg
<< endl
;
344 recnos
.push_back(did
);
348 terms
.push_back(optarg
);
351 stemmer
= Stem(optarg
);
360 value_decode
= VALUE_RAW
;
364 value_decode
= VALUE_PACKED_INT
;
368 value_decode
= VALUE_SORTABLE_SERIALISE
;
372 value_decode
= VALUE_ESCAPE
;
378 unsigned long n
= strtoul(optarg
, &end
, 10);
379 if (optarg
== end
|| *end
) {
380 cout
<< "Non-numeric value slot: " << optarg
<< endl
;
383 slot
= Xapian::valueno(n
);
384 if (errno
== ERANGE
|| slot
!= n
) {
385 cout
<< "Value slot out of range: " << optarg
<< endl
;
400 count_zero_length_docs
= true;
408 while (argv
[optind
]) dbs
.push_back(argv
[optind
++]);
415 std::sort(recnos
.begin(), recnos
.end());
419 vector
<string
>::const_iterator i
;
420 for (i
= dbs
.begin(); i
!= dbs
.end(); ++i
) {
422 db
.add_database(Database(*i
));
423 } catch (const Error
&e
) {
424 cerr
<< "Error opening database '" << *i
<< "': ";
425 cerr
<< e
.get_description() << endl
;
431 if (!all_terms
&& terms
.empty() && recnos
.empty() && !slot_set
) {
432 // Show some statistics about the database.
438 show_termlist(db
, 0, all_terms
);
441 if (!recnos
.empty()) {
443 show_values(db
, recnos
.begin(), recnos
.end());
444 } else if (slot_set
) {
445 show_value(db
, recnos
.begin(), recnos
.end(), slot
);
449 show_docdata(db
, recnos
.begin(), recnos
.end());
453 cout
<< "Value " << slot
;
455 cout
<< " (lower bound=";
456 decode_and_show_value(db
.get_value_lower_bound(slot
));
457 cout
<< " upper bound=";
458 decode_and_show_value(db
.get_value_upper_bound(slot
));
459 cout
<< " freq=" << db
.get_value_freq(slot
) << ")";
461 cout
<< " for each document:";
462 ValueIterator it
= db
.valuestream_begin(slot
);
463 while (it
!= db
.valuestream_end(slot
)) {
464 cout
<< separator
<< it
.get_docid() << ':';
465 decode_and_show_value(*it
);
473 show_termlists(db
, recnos
.begin(), recnos
.end());
477 vector
<string
>::const_iterator i
;
478 for (i
= terms
.begin(); i
!= terms
.end(); ++i
) {
479 string term
= stemmer(*i
);
480 PostingIterator p
= db
.postlist_begin(term
);
481 PostingIterator pend
= db
.postlist_end(term
);
483 cout
<< "term '" << term
<< "' not in database\n";
486 if (recnos
.empty()) {
487 // Display posting list
488 cout
<< "Posting List for term '" << term
<< "' (termfreq "
489 << db
.get_termfreq(term
) << ", collfreq "
490 << db
.get_collection_freq(term
) << ", wdf_max "
491 << db
.get_wdf_upper_bound(term
) << "):";
493 cout
<< separator
<< *p
;
495 cout
<< ' ' << p
.get_wdf() << ' ' << p
.get_doclength();
497 if (showvalues
) show_values(db
, *p
, ' ');
498 if (showdocdata
) show_docdata(db
, *p
, ' ');
503 // Display position lists
504 vector
<docid
>::const_iterator j
;
505 for (j
= recnos
.begin(); j
!= recnos
.end(); ++j
) {
507 if (p
== pend
|| *p
!= *j
) {
508 cout
<< "term '" << term
<<
509 "' doesn't index document #" << *j
<< endl
;
511 cout
<< "Position List for term '" << term
512 << "', record #" << *j
<< ':';
514 PositionIterator pos
= p
.positionlist_begin();
515 while (pos
!= p
.positionlist_end()) {
516 cout
<< separator
<< *pos
;
520 } catch (const Error
&e
) {
521 cerr
<< "Error: " << e
.get_description() << endl
;
527 } catch (const Error
&e
) {
528 cerr
<< "\nError: " << e
.get_description() << endl
;