Fix tg_termpos1 for 64-bit termpos
[xapian.git] / xapian-applications / omega / omindexcheck.cc
blob378e25022b00002ef141c77fbe15837808b61912
1 /** @file
2 * @brief Auxiliary program of omindextest
3 */
4 /* Copyright (C) 2019 Bruno Baruffaldi
5 * Copyright (C) 2020 Parth Kapadia
6 * Copyright (C) 2021,2022,2023 Olly Betts
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
21 * USA
24 #include <config.h>
26 #include <xapian.h>
28 #include <algorithm>
29 #include <iomanip>
30 #include <iostream>
31 #include <string>
32 #include <unordered_map>
33 #include <vector>
35 #include "values.h"
37 using namespace std;
// Outcome of checking a single testcase; also used by main() to track the
// overall outcome of the run.
39 enum test_result { PASS, FAIL };
41 // Macro to mark optional terms.
43 // If there are optional terms in a testcase, either all or none must be
44 // present. This is useful with libextractor where the required plugin
45 // may not be installed, but libextractor will still return generic metadata.
47 // This works by appending '\xff', which we remove before comparing. This
48 // means terms ending with this byte can't be used in testcases, but it's
49 // invalid in UTF-8 so probably not a problematic limitation.
//
// T must be a string literal, since the marker byte is appended by adjacent
// string literal concatenation.
50 #define OPT(T) (T "\xff")
52 struct testcase {
53 vector<string> terms;
55 vector<pair<Xapian::valueno, string>> values;
57 testcase(vector<string> v)
58 : terms(std::move(v)) {}
60 testcase(vector<string> v, vector<pair<Xapian::valueno, string>> v2)
61 : terms(std::move(v)), values(std::move(v2)) {}
// Testcases which have not been exercised yet, keyed by URL.  index_test()
// fills this in, and main() erases each entry once the matching document has
// been checked, so anything left at the end was never exercised.
65 static unordered_map<string, testcase> tests;
// Populate `tests` with the expected terms (and, for some files, expected
// document values) for each test file, keyed by a relative file name such as
// "plaintext/utf8.txt".
//
// Most groups of testcases are only compiled in when the corresponding HAVE_*
// macro is defined.
67 static void
68 index_test()
70 tests.insert({"plaintext/iso88591.txt",
71 {{"à", "d'après", "françois", "idée", "réalisation"}}});
72 tests.insert({"plaintext/utf16be-bom.txt",
73 {{"Zjoey", "Zfood", "Zедой", "Z喬伊不分享食物"}}});
74 tests.insert({"plaintext/utf16le-bom.txt",
75 {{"Zjoey", "Zfood", "Zедой", "Z喬伊不分享食物"}}});
76 tests.insert({"plaintext/utf8.txt",
77 {{"Zjoey", "Zfood", "Zедой", "Z喬伊不分享食物"}}});
78 tests.insert({"plaintext/utf8-bom.txt",
79 {{"Zjoey", "Zfood", "Zедой", "Z喬伊不分享食物"}}});
80 tests.insert({"test-csv.csv",
81 {{"ZFcsv", "Zbreak", "Zwere"}}});
82 tests.insert({"test-html.html",
83 {{"Ajeroen", "ZAoom", "ZSworld", "Shello", "Zchapter"}}});
84 tests.insert({"svg/diagram.svg",
85 {{"Sdiagram", "Timage/svg+xml", "Zstart"}}});
86 tests.insert({"svg/diagram.svgz",
87 {{"Sdiagram", "Timage/svg+xml-compressed", "Zstart"}}});
88 #ifdef HAVE_GMIME
89 tests.insert({"email/html.eml",
90 {{"Aexample", "Ame", "Aorg", "Auser", "Shtml",
91 "Tmessage/rfc822",
92 "XMID:E1p1II7-008OVw-1w@example.org",
93 "XTOada", "XTOexample", "XTOorg", "XTOuser",
94 "html", "message", "test"},
95 {{{VALUE_CREATED, "c\x8a\xb4\xb3"}, // 1670034611
96 {VALUE_SIZE, Xapian::sortable_serialise(450)},
97 {VALUE_MD5, "y.<0RW\xb0\xf4\xd2+\xa8\x09\xde\xff|\x0d"}
98 }}}});
99 tests.insert({"email/text.eml",
100 {{"Aexample", "Ame", "Aorg", "Auser", "Stext", "Tmessage/rfc822",
101 "XMID:E1p1II7-008OVw-1v@example.org",
102 "XTOexample", "XTOorg", "XTOuser",
103 "comment1", "comment2", "keyword1", "keyword2",
104 "message", "plain", "text"},
105 {{{VALUE_CREATED, "c\x8a\xb4\xb3"}, // 1670034611
106 {VALUE_SIZE, Xapian::sortable_serialise(477)},
107 {VALUE_MD5,
108 "C\x7f\x17;;\x87\x91\x5c\x05?\x83\x14\xec\xaa\xad\x94"}
109 }}}});
110 #endif
111 #if defined HAVE_POPPLER
112 tests.insert({"pdf/poppler.pdf",
113 {{"ZFpoppler", "Zsub", "Ztext", "Ztitl", "Zpie"}}});
114 #endif
115 #if defined HAVE_LIBEBOOK
116 tests.insert({"fb2/hello.fb2",
117 {{"Ajeroen", "Aooms", "Zbeauti", "Zchapter", "Zdocument",
118 "Zooms3", "Zsubsect", "Ztoday", "ZFhello", "Zoutlin"}}});
119 tests.insert({"fb2/lang-name.fb2",
120 {{"Adavid", "Atardon", "ZAdavid", "ZFlang", "Zeel",
121 "Zhovercraft", "Zlanguag"}}});
122 tests.insert({"fb2/lang2.fb2",
123 {{"Adavid", "Atardon", "ZAtardon", "Zúhořů", "Zhovercraft",
124 "Zlanguag", "Zmój", "Zof", "Zpełen", "Zwęgorzi",
125 "Zpoduszkowiec", "Zvznášedlo"}}});
126 tests.insert({"lrf/hello.lrf",
127 {{"ZFhello", "Zhello", "Zworld"}}});
128 tests.insert({"pdb/PalmDOC-hello.pdb",
129 {{"Shello", "Sworld", "ZFpalmdoc", "Zworld"}}});
130 tests.insert({"pdb/PeanutPress-hello.pdb",
131 {{"Fpeanutpress", "ZFhello", "Zhello"}}});
132 tests.insert({"pdb/test.pdb",
133 {{"Sdemodemo", "ZFtest", "Zwherearew"}}});
134 #endif
135 #if defined HAVE_LIBETONYEK
136 tests.insert({"iwork/test-keynote.key",
137 {{"ZFkeynot", "Zbold", "Znow", "Zsubtitl"}}});
138 tests.insert({"iwork/test-pages.pages",
139 {{"ZFpage", "Zfull", "Zhovercraft", "Zwęgorzi"}}});
140 #endif
141 #if defined HAVE_TESSERACT
142 tests.insert({"image/Test1.gif",
143 {{"Znoisyimag", "Zocr", "Ztesseract"}}});
144 tests.insert({"image/Test2.pgm",
145 {{"ZFtest2", "Znoisyimag", "Ztesseract"}}});
146 tests.insert({"image/Test3.ppm",
147 {{"ZFtest3", "Zocr", "Ztest"}}});
148 tests.insert({"image/Test4.tiff",
149 {{"Znoisyimag", "Ztesseract", "Zto"}}});
150 tests.insert({"image/Test5.webp",
151 {{"Znoisyimag", "Ztesseract", "Ztest"}}});
152 tests.insert({"image/poster-2.jpg",
153 {{"ZFposter", "Zby", "Zproperti", "Zsurveil", "Zvideo"}}});
154 tests.insert({"image/poster.jpg",
155 {{"Zbicycl", "Zride", "Zroller", "Zskateboard"}}});
156 tests.insert({"image/scan-page.png",
157 {{"Zannual", "Zfed", "Zreturn", "Zwhile"}}});
158 #endif
// The office document testcases are shared between two backends, so they live
// in a macro which is expanded once per available backend below.  PREFIX is
// prepended to each file name ("" or "lok-").  Arguments wrapped in NOTLO()
// are only kept in the HAVE_LIBARCHIVE expansion, and arguments wrapped in
// LO() are only kept in the "lok-" expansion.
159 #define OFFICE_TESTCASES(PREFIX) \
160 /* blank file */ \
161 /* pass the test if no terms are found */ \
162 tests.insert({PREFIX "opendoc/blank.odt", \
163 {{}}}); \
164 /* corrupted file (ODP) */ \
165 /* tests.insert({PREFIX "corrupt_file.odp", {"ZSnatur"}}); */ \
167 /* ODF */ \
168 tests.insert({PREFIX "opendoc/test.odt", \
169 {{"Zедой", "Z喬伊不分享食物"}}}); \
170 tests.insert({PREFIX "opendoc/text.odt", \
171 {{"Stesttitle", NOTLO("Aolly",) "Zsampl", "Zhead", "Ztext", \
172 "Zhello", "Zworld"}}}); \
173 tests.insert({PREFIX "opendoc/text_template.ott", \
174 {{"Zjane", "Zdoe", "Zstructur"}}}); \
175 NOTLO(\
176 tests.insert({PREFIX "opendoc/presentation.odp", \
177 {{"Zfascin", "Zfact", "Zpustak", "Zmahal", "Zmillion", \
178 "Zpeopl", "Zbirthday", "501"}}}); \
180 tests.insert({PREFIX "opendoc/presentation_template.otp", \
181 {{"ZSalizarin", "Zhead", "Zworld", "Ztext"}}}); \
182 tests.insert({PREFIX "opendoc/spreadsheet.ods", \
183 {{"Zhello", "Zworld", "Zsampl", "2"}}}); \
184 tests.insert({PREFIX "opendoc/spreadsheet_template.ots", \
185 {{"Zfood", "Zpasta", "Zpercentag", "40"}}}); \
186 tests.insert({PREFIX "opendoc/draw.odg", \
187 {{"Zparth", "Zkapadia"}}}); \
189 /* Apache OpenOffice */ \
190 tests.insert({PREFIX "staroffice/calc.sxc", \
191 {{"Ztoy", "Zproduct", "Zcost", "Zquantiti", "Zcardboard"}}}); \
192 tests.insert({PREFIX "staroffice/calc_template.stc", \
193 {{NOTLO("ZSpurchas", "ZStemplat",) "Zproduct", "Zquantiti", \
194 "Zsampl"}}}); \
195 tests.insert({PREFIX "staroffice/text.sxw", \
196 {{"Zhello", "Zsampl", "Zopenoffic", "Zwriter"}}}); \
197 tests.insert({PREFIX "staroffice/text_template.stw", \
198 {{"Zhello", "Zworld", "Zsampl", "Zhead", \
199 NOTLO("ZStemplat", "ZStext")}}}); \
200 tests.insert({PREFIX "staroffice/presentation.sxi", \
201 {{"Zhead", "Zhello", "Zopenoffic", "Zimpress"}}}); \
202 tests.insert({PREFIX "staroffice/presentation_template.sti", \
203 {{NOTLO("ZSproject", "ZSresearch",) "Zhead", "Ztext"}}}); \
205 /* Microsoft XML formats */ \
206 tests.insert({PREFIX "msxml/Book.xlsx", \
207 {{"Zmodi", "Zgood", "Zemploye"}}}); \
208 tests.insert({PREFIX "msxml/2sheets.xlsx", \
209 {{NOTLO("0.123456",) LO("0.12346",) \
210 "123.456", "15", "2021", \
211 NOTLO("3.14159265358979",) LO("3.14159",) \
212 "43", "55", "Aolly", "Ssheet", "Stitle", "xmas"}}}); \
213 tests.insert({PREFIX "msxml/Doc.docx", \
214 {{"Zедой", "Z喬伊不分享食物", "ZSbakeri"}}}); \
215 tests.insert({PREFIX "msxml/Nature.pptx", \
216 {{"ZSnatur", "Zbeauti", "Zsampl"}}}); \
217 tests.insert({PREFIX "msxml/vnd.ms-xpsdocument_xpstest.xps", \
218 {{"second", "header", "footer"}}});
219 #if defined HAVE_LIBARCHIVE
220 # define LO(...)
221 # define NOTLO(...) __VA_ARGS__
222 OFFICE_TESTCASES("")
223 # undef NOTLO
224 # undef LO
225 #endif
226 #if defined HAVE_LIBREOFFICEKIT_LIBREOFFICEKIT_HXX
227 # define LO(...) __VA_ARGS__
228 # define NOTLO(...)
229 OFFICE_TESTCASES("lok-")
230 # undef NOTLO
231 # undef LO
232 #endif
233 #if defined HAVE_LIBABW
234 // Title term is not being tested here because some older versions of Libabw
235 // lack a bug fix for the title to be handled properly. (< libabw-0.1.2)
236 tests.insert({"abw/test.abw",
237 {{"ZAparth", "Zabiword", "Zsampl", "Zdocument"}}});
238 tests.insert({"abw/macbeth.zabw",
239 {{"Ashakespeare", "Awilliam", "Smacbeth",
240 "ambition", "macduff", "shall"}}});
241 #else
242 // Indexed using AbiwordParser class, which doesn't currently handle metadata.
243 tests.insert({"abw/test.abw",
244 {{"Zabiword", "Zsampl", "Zdocument"}}});
245 tests.insert({"abw/macbeth.zabw",
246 {{"ambition", "macduff", "shall"}}});
247 #endif
248 tests.insert({"abw/test1.abw",
249 {{"Zедой", "Z喬伊不分享食物"}}});
250 tests.insert({"abw/Friendly-Letter.awt",
251 {{"address", "addressee", "body", "dear", "sincerely"}}});
252 #if defined HAVE_LIBCDR
253 // .cdr versions >= 16 are not included in the tests as they will work
254 // correctly only with libcdr >= 0.1.6
255 tests.insert({"cdr/test1.cdr",
256 {{"Zalgerian", "Zcalibri"}}});
257 tests.insert({"cdr/test2.cdr",
258 {{"Zедой", "Z喬伊不分享食物", "Zdocument"}}});
259 #endif
260 #if defined HAVE_LIBEXTRACTOR
261 // Testcases for libextractor need to allow for the required plugin not
262 // being installed as libextractor still returns generic metadata.
263 tests.insert({"video/file_example_OGG_480_1_7mg.ogg",
264 {{"Eogg", "Tvideo/ogg",
265 OPT("Searth"), OPT("Splanet")}}});
266 tests.insert({"video/file_example_AVI_480_750kB.avi",
267 {{"Eavi", "Tvideo/x-msvideo",
268 OPT("Zcodec"), OPT("Zh264"),
269 OPT("480x270"), OPT("msvideo"), OPT("30"), OPT("fps")}}});
270 tests.insert({"audio/file_example_OOG_1MG.ogg",
271 {{"Eogg", "Taudio/ogg",
272 OPT("Akevin"), OPT("Amacleod"),
273 OPT("Simpact"), OPT("ZSmoderato")}}});
274 tests.insert({"audio/file_example_WAV_1MG.wav",
275 {{"Ewav", "Taudio/x-wav",
276 OPT("Zstereo"), OPT("wav"), OPT("Zms")}}});
277 #endif
278 #if defined HAVE_LIBGEPUB
279 tests.insert({"epub/epub2test.epub",
280 {{"Eepub", "Tapplication/epub+zip", "Aolly",
281 "book", "chapter", "welcome"}}});
282 tests.insert({"epub/epub3test.epub",
283 {{"Eepub", "Tapplication/epub+zip", "Aolly",
284 "book", "chapter", "welcome"}}});
285 #endif
286 #if defined HAVE_LIBMWAW
287 tests.insert({"apple_works/test_word.cwk",
288 {{"Aparth", "Sword", "Zhello", "Zdocument"}}});
289 tests.insert({"apple_works/test_spreadsheet.cwk",
290 {{"Aparth", "Sspreadsheet", "Zpizza", "220"}}});
291 tests.insert({"apple_works/test_draw.cwk",
292 {{"Zdraw", "Zsampl", "Zgraphic"}}});
293 #endif
/** Write @a s to @a stream with awkward bytes escaped.
 *
 *  Printable ASCII bytes (0x20 to 0x7e) other than backslash are written
 *  unchanged.  Every other byte, including backslash itself, is written as
 *  `\x` followed by two lowercase hex digits.
 *
 *  @param s       The bytes to write.
 *  @param stream  The stream to write to.  Its formatting flags and fill
 *                 character are the same on return as they were on entry.
 */
static void
escape(const std::string& s, std::ostream& stream)
{
    // std::hex and std::setfill() are sticky, so without restoring them the
    // caller's later integer output (e.g. a value slot number sent to cerr)
    // would come out in zero-filled hex.
    const std::ios_base::fmtflags saved_flags = stream.flags();
    const char saved_fill = stream.fill();

    for (unsigned char ch : s) {
        if (ch >= 0x20 && ch < 127 && ch != '\\') {
            stream << ch;
        } else {
            stream << "\\x"
                   << std::hex << std::setfill('0') << std::setw(2) << int(ch);
        }
    }

    stream.flags(saved_flags);
    stream.fill(saved_fill);
}
309 static test_result
310 compare_test(testcase& test, const Xapian::Document& doc, const string& file)
312 sort(test.terms.begin(), test.terms.end());
313 Xapian::TermIterator term_iterator = doc.termlist_begin();
314 bool all_required_terms_exist = true;
315 string missing_optional;
316 bool no_optional = true;
317 for (auto& i : test.terms) {
318 if (i.back() == '\xff') {
319 string t(i, 0, i.size() - 1);
320 term_iterator.skip_to(t);
321 // Optional term.
322 if (term_iterator == doc.termlist_end() || *term_iterator != t) {
323 missing_optional += ' ';
324 missing_optional += t;
325 } else {
326 no_optional = false;
328 } else {
329 auto t = i;
330 term_iterator.skip_to(t);
331 if (term_iterator == doc.termlist_end() || *term_iterator != t) {
332 cerr << file << ": error: Term " << t
333 << " should index this file but doesn't\n";
334 all_required_terms_exist = false;
339 bool values_ok = true;
340 for (auto& i : test.values) {
341 const string& v = doc.get_value(i.first);
342 if (v != i.second) {
343 cerr << file << ": error: Value slot " << i.first << " should be ";
344 escape(i.second, cerr);
345 cerr << " not ";
346 escape(v, cerr);
347 cerr << '\n';
348 values_ok = false;
351 if (!values_ok) {
352 return FAIL;
354 if (!missing_optional.empty() && !no_optional) {
355 cerr << file << ": error: Only some of the optional terms index this "
356 "file, missing:" << missing_optional << '\n';
357 } else if (all_required_terms_exist) {
358 // All terms found (including degenerate case where no terms are listed
359 // to check for).
360 return PASS;
362 cerr << "Expected at least these terms:";
363 for (auto& t : test.terms) {
364 if (t.back() == '\xff') {
365 // Optional term.
366 cerr << " OPT(" << t.substr(0, t.size() - 1) << ')';
367 } else {
368 cerr << ' ' << t;
371 cerr << "\nFull list of terms actually present:";
372 for (term_iterator = doc.termlist_begin();
373 term_iterator != doc.termlist_end();
374 ++term_iterator) {
375 cerr << ' ' << *term_iterator;
377 cerr << '\n';
378 return FAIL;
382 main(int argc, char** argv)
384 test_result result = PASS;
385 if (argc <= 1)
386 return 1;
388 Xapian::Database db(argv[1]);
390 index_test();
391 for (auto t = db.allterms_begin("U"); t != db.allterms_end("U"); ++t) {
392 const string& term = *t;
393 string url(term, 2);
394 Xapian::PostingIterator p = db.postlist_begin(term);
395 if (p == db.postlist_end(term)) {
396 // This shouldn't be possible.
397 cerr << "Term " << term << " doesn't index anything?!\n";
398 result = FAIL;
399 continue;
401 Xapian::docid did = *p;
402 Xapian::Document doc = db.get_document(did);
403 auto iter = tests.find(url);
404 if (iter != tests.end()) {
405 test_result individual_result = compare_test(iter->second, doc,
406 url);
407 if (individual_result == FAIL)
408 result = FAIL;
409 tests.erase(iter);
411 if (++p != db.postlist_end(term)) {
412 cerr << "URL term " << term << " indexes more than one document\n";
413 result = FAIL;
417 for (auto t : tests) {
418 cerr << "Testcase for URL " << t.first << " wasn't exercised\n";
419 result = FAIL;
422 return result == FAIL ? 1 : 0;