2 * @brief Auxiliary program of omindextest
4 /* Copyright (C) 2019 Bruno Baruffaldi
5 * Copyright (C) 2020 Parth Kapadia
6 * Copyright (C) 2021,2022,2023 Olly Betts
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
32 #include <unordered_map>
// Outcome of a check.  compare_test()'s result is stored in a test_result, and
// main() maps its overall result to the exit status (FAIL -> 1, otherwise 0).
39 enum test_result
{ PASS
, FAIL
};
41 // Macro to mark optional terms.
43 // If there are optional terms in a testcase, either all or none must be
44 // present. This is useful with libextractor where the required plugin
45 // may not be installed, but libextractor will still return generic metadata.
47 // This works by appending '\xff', which we remove before comparing. This
48 // means terms ending with this byte can't be used in testcases, but it's
49 // invalid in UTF-8 so probably not a problematic limitation.
// For example, OPT("Searth") expands to ("Searth" "\xff"), which adjacent
// string literal concatenation turns into the single literal "Searth\xff".
50 #define OPT(T) (T "\xff")
// Expected document values as (value slot number, expected contents) pairs.
// compare_test() fetches each slot with Document::get_value() and reports
// what the slot "should be" using the second member of the pair.
55 vector
<pair
<Xapian::valueno
, string
>> values
;
// Construct a testcase which only lists expected terms.
//
// v is taken by value and moved into the terms member; values is not
// initialised here so it starts out empty.
57 testcase(vector
<string
> v
)
58 : terms(std::move(v
)) {}
// Construct a testcase which lists both expected terms (v) and expected
// document values (v2, as slot number/contents pairs).
//
// Both vectors are taken by value and moved into the corresponding members.
60 testcase(vector
<string
> v
, vector
<pair
<Xapian::valueno
, string
>> v2
)
61 : terms(std::move(v
)), values(std::move(v2
)) {}
// Table of testcases.  The key is the string main() passes to tests.find()
// (named url there); the entries below use what look like relative paths of
// the test input files as keys.
65 static unordered_map
<string
, testcase
> tests
;
// Testcases available unconditionally: plain text in several encodings
// (ISO-8859-1, UTF-16 BE/LE with BOM, UTF-8 with and without BOM going by the
// file names), then CSV, HTML and plain/compressed SVG inputs.  Each entry
// uses the terms-only form of testcase.
70 tests
.insert({"plaintext/iso88591.txt",
71 {{"à", "d'après", "françois", "idée", "réalisation"}}});
72 tests
.insert({"plaintext/utf16be-bom.txt",
73 {{"Zjoey", "Zfood", "Zедой", "Z喬伊不分享食物"}}});
74 tests
.insert({"plaintext/utf16le-bom.txt",
75 {{"Zjoey", "Zfood", "Zедой", "Z喬伊不分享食物"}}});
76 tests
.insert({"plaintext/utf8.txt",
77 {{"Zjoey", "Zfood", "Zедой", "Z喬伊不分享食物"}}});
78 tests
.insert({"plaintext/utf8-bom.txt",
79 {{"Zjoey", "Zfood", "Zедой", "Z喬伊不分享食物"}}});
80 tests
.insert({"test-csv.csv",
81 {{"ZFcsv", "Zbreak", "Zwere"}}});
82 tests
.insert({"test-html.html",
83 {{"Ajeroen", "ZAoom", "ZSworld", "Shello", "Zchapter"}}});
84 tests
.insert({"svg/diagram.svg",
85 {{"Sdiagram", "Timage/svg+xml", "Zstart"}}});
86 tests
.insert({"svg/diagram.svgz",
87 {{"Sdiagram", "Timage/svg+xml-compressed", "Zstart"}}});
// Email testcases.  These use the two-argument form of testcase: a list of
// expected terms followed by a list of expected document values keyed by the
// VALUE_* slot constants (defined outside this part of the file).
//
// The VALUE_CREATED string is the number given in the trailing comment encoded
// as four big-endian bytes: 0x63 0x8a 0xb4 0xb3 == 1670034611.  The VALUE_SIZE
// expectation is built with Xapian::sortable_serialise().
89 tests
.insert({"email/html.eml",
90 {{"Aexample", "Ame", "Aorg", "Auser", "Shtml",
92 "XMID:E1p1II7-008OVw-1w@example.org",
93 "XTOada", "XTOexample", "XTOorg", "XTOuser",
94 "html", "message", "test"},
95 {{{VALUE_CREATED
, "c\x8a\xb4\xb3"}, // 1670034611
96 {VALUE_SIZE
, Xapian::sortable_serialise(450)},
97 {VALUE_MD5
, "y.<0RW\xb0\xf4\xd2+\xa8\x09\xde\xff|\x0d"}
99 tests
.insert({"email/text.eml",
100 {{"Aexample", "Ame", "Aorg", "Auser", "Stext", "Tmessage/rfc822",
101 "XMID:E1p1II7-008OVw-1v@example.org",
102 "XTOexample", "XTOorg", "XTOuser",
103 "comment1", "comment2", "keyword1", "keyword2",
104 "message", "plain", "text"},
105 {{{VALUE_CREATED
, "c\x8a\xb4\xb3"}, // 1670034611
106 {VALUE_SIZE
, Xapian::sortable_serialise(477)},
108 "C\x7f\x17;;\x87\x91\x5c\x05?\x83\x14\xec\xaa\xad\x94"}
// PDF testcase, only registered when HAVE_POPPLER is defined.
111 #if defined HAVE_POPPLER
112 tests
.insert({"pdf/poppler.pdf",
113 {{"ZFpoppler", "Zsub", "Ztext", "Ztitl", "Zpie"}}});
// E-book testcases (.fb2, .lrf and .pdb inputs), only registered when
// HAVE_LIBEBOOK is defined.
115 #if defined HAVE_LIBEBOOK
116 tests
.insert({"fb2/hello.fb2",
117 {{"Ajeroen", "Aooms", "Zbeauti", "Zchapter", "Zdocument",
118 "Zooms3", "Zsubsect", "Ztoday", "ZFhello", "Zoutlin"}}});
119 tests
.insert({"fb2/lang-name.fb2",
120 {{"Adavid", "Atardon", "ZAdavid", "ZFlang", "Zeel",
121 "Zhovercraft", "Zlanguag"}}});
122 tests
.insert({"fb2/lang2.fb2",
123 {{"Adavid", "Atardon", "ZAtardon", "Zúhořů", "Zhovercraft",
124 "Zlanguag", "Zmój", "Zof", "Zpełen", "Zwęgorzi",
125 "Zpoduszkowiec", "Zvznášedlo"}}});
126 tests
.insert({"lrf/hello.lrf",
127 {{"ZFhello", "Zhello", "Zworld"}}});
128 tests
.insert({"pdb/PalmDOC-hello.pdb",
129 {{"Shello", "Sworld", "ZFpalmdoc", "Zworld"}}});
130 tests
.insert({"pdb/PeanutPress-hello.pdb",
131 {{"Fpeanutpress", "ZFhello", "Zhello"}}});
132 tests
.insert({"pdb/test.pdb",
133 {{"Sdemodemo", "ZFtest", "Zwherearew"}}});
// iWork testcases (a Keynote and a Pages document going by the file names),
// only registered when HAVE_LIBETONYEK is defined.
135 #if defined HAVE_LIBETONYEK
136 tests
.insert({"iwork/test-keynote.key",
137 {{"ZFkeynot", "Zbold", "Znow", "Zsubtitl"}}});
138 tests
.insert({"iwork/test-pages.pages",
139 {{"ZFpage", "Zfull", "Zhovercraft", "Zwęgorzi"}}});
// Image testcases in a range of image formats, only registered when
// HAVE_TESSERACT is defined.  Presumably the expected terms come from text
// recognised in the images by OCR - confirm against the sample files.
141 #if defined HAVE_TESSERACT
142 tests
.insert({"image/Test1.gif",
143 {{"Znoisyimag", "Zocr", "Ztesseract"}}});
144 tests
.insert({"image/Test2.pgm",
145 {{"ZFtest2", "Znoisyimag", "Ztesseract"}}});
146 tests
.insert({"image/Test3.ppm",
147 {{"ZFtest3", "Zocr", "Ztest"}}});
148 tests
.insert({"image/Test4.tiff",
149 {{"Znoisyimag", "Ztesseract", "Zto"}}});
150 tests
.insert({"image/Test5.webp",
151 {{"Znoisyimag", "Ztesseract", "Ztest"}}});
152 tests
.insert({"image/poster-2.jpg",
153 {{"ZFposter", "Zby", "Zproperti", "Zsurveil", "Zvideo"}}});
154 tests
.insert({"image/poster.jpg",
155 {{"Zbicycl", "Zride", "Zroller", "Zskateboard"}}});
156 tests
.insert({"image/scan-page.png",
157 {{"Zannual", "Zfed", "Zreturn", "Zwhile"}}});
// Register the office document testcases (OpenDocument, StarOffice/OpenOffice
// and Microsoft XML formats).
//
// PREFIX is a string literal which is concatenated onto the front of every key,
// so the same list can be registered under different names, e.g.
// OFFICE_TESTCASES("lok-").
//
// NOTLO(...) and LO(...) wrap terms which are only expected in one of two
// configurations; both must be defined as macros at the point where
// OFFICE_TESTCASES is expanded.  Note the trailing comma placed *inside* the
// wrapper's arguments (e.g. NOTLO("Aolly",)) so that the list stays well formed
// when the wrapper expands to nothing.
159 #define OFFICE_TESTCASES(PREFIX) \
161 /* pass the test if no terms are found */ \
162 tests.insert({PREFIX "opendoc/blank.odt", \
164 /* corrupted file (ODP) */ \
165 /* tests.insert({PREFIX "corrupt_file.odp", {"ZSnatur"}}); */ \
168 tests.insert({PREFIX "opendoc/test.odt", \
169 {{"Zедой", "Z喬伊不分享食物"}}}); \
170 tests.insert({PREFIX "opendoc/text.odt", \
171 {{"Stesttitle", NOTLO("Aolly",) "Zsampl", "Zhead", "Ztext", \
172 "Zhello", "Zworld"}}}); \
173 tests.insert({PREFIX "opendoc/text_template.ott", \
174 {{"Zjane", "Zdoe", "Zstructur"}}}); \
176 tests.insert({PREFIX "opendoc/presentation.odp", \
177 {{"Zfascin", "Zfact", "Zpustak", "Zmahal", "Zmillion", \
178 "Zpeopl", "Zbirthday", "501"}}}); \
180 tests.insert({PREFIX "opendoc/presentation_template.otp", \
181 {{"ZSalizarin", "Zhead", "Zworld", "Ztext"}}}); \
182 tests.insert({PREFIX "opendoc/spreadsheet.ods", \
183 {{"Zhello", "Zworld", "Zsampl", "2"}}}); \
184 tests.insert({PREFIX "opendoc/spreadsheet_template.ots", \
185 {{"Zfood", "Zpasta", "Zpercentag", "40"}}}); \
186 tests.insert({PREFIX "opendoc/draw.odg", \
187 {{"Zparth", "Zkapadia"}}}); \
189 /* Apache OpenOffice */ \
190 tests.insert({PREFIX "staroffice/calc.sxc", \
191 {{"Ztoy", "Zproduct", "Zcost", "Zquantiti", "Zcardboard"}}}); \
192 tests.insert({PREFIX "staroffice/calc_template.stc", \
193 {{NOTLO("ZSpurchas", "ZStemplat",) "Zproduct", "Zquantiti", \
195 tests.insert({PREFIX "staroffice/text.sxw", \
196 {{"Zhello", "Zsampl", "Zopenoffic", "Zwriter"}}}); \
197 tests.insert({PREFIX "staroffice/text_template.stw", \
198 {{"Zhello", "Zworld", "Zsampl", "Zhead", \
199 NOTLO("ZStemplat", "ZStext")}}}); \
200 tests.insert({PREFIX "staroffice/presentation.sxi", \
201 {{"Zhead", "Zhello", "Zopenoffic", "Zimpress"}}}); \
202 tests.insert({PREFIX "staroffice/presentation_template.sti", \
203 {{NOTLO("ZSproject", "ZSresearch",) "Zhead", "Ztext"}}}); \
205 /* Microsoft XML formats */ \
206 tests.insert({PREFIX "msxml/Book.xlsx", \
207 {{"Zmodi", "Zgood", "Zemploye"}}}); \
208 tests.insert({PREFIX "msxml/2sheets.xlsx", \
209 {{NOTLO("0.123456",) LO("0.12346",) \
210 "123.456", "15", "2021", \
211 NOTLO("3.14159265358979",) LO("3.14159",) \
212 "43", "55", "Aolly", "Ssheet", "Stitle", "xmas"}}}); \
213 tests.insert({PREFIX "msxml/Doc.docx", \
214 {{"Zедой", "Z喬伊不分享食物", "ZSbakeri"}}}); \
215 tests.insert({PREFIX "msxml/Nature.pptx", \
216 {{"ZSnatur", "Zbeauti", "Zsampl"}}}); \
217 tests.insert({PREFIX "msxml/vnd.ms-xpsdocument_xpstest.xps", \
218 {{"second", "header", "footer"}}});
// Expand OFFICE_TESTCASES once per available backend.
//
// When HAVE_LIBARCHIVE is defined, NOTLO(...) passes its arguments through so
// the terms it wraps are expected.  When the LibreOfficeKit header is
// available, LO(...) passes its arguments through instead and the testcases are
// registered with a "lok-" key prefix.
// NOTE(review): presumably the opposite wrapper is defined to expand to nothing
// in each case - confirm against the complete file.
219 #if defined HAVE_LIBARCHIVE
221 # define NOTLO(...) __VA_ARGS__
226 #if defined HAVE_LIBREOFFICEKIT_LIBREOFFICEKIT_HXX
227 # define LO(...) __VA_ARGS__
229 OFFICE_TESTCASES("lok-")
// AbiWord testcases.
//
// "abw/test.abw" and "abw/macbeth.zabw" each appear twice: the first pair of
// entries applies when HAVE_LIBABW is defined and includes author/title terms,
// while the second pair omits them because the AbiwordParser class doesn't
// handle metadata.  The last two entries expect no metadata terms at all.
233 #if defined HAVE_LIBABW
234 // Title term is not being tested here because some older versions of Libabw
235 // lack a bug fix for the title to be handled properly. (< libabw-0.1.2)
236 tests
.insert({"abw/test.abw",
237 {{"ZAparth", "Zabiword", "Zsampl", "Zdocument"}}});
238 tests
.insert({"abw/macbeth.zabw",
239 {{"Ashakespeare", "Awilliam", "Smacbeth",
240 "ambition", "macduff", "shall"}}});
242 // Indexed using AbiwordParser class, which doesn't currently handle metadata.
243 tests
.insert({"abw/test.abw",
244 {{"Zabiword", "Zsampl", "Zdocument"}}});
245 tests
.insert({"abw/macbeth.zabw",
246 {{"ambition", "macduff", "shall"}}});
248 tests
.insert({"abw/test1.abw",
249 {{"Zедой", "Z喬伊不分享食物"}}});
250 tests
.insert({"abw/Friendly-Letter.awt",
251 {{"address", "addressee", "body", "dear", "sincerely"}}});
// CorelDRAW (.cdr) testcases, only registered when HAVE_LIBCDR is defined.
252 #if defined HAVE_LIBCDR
253 // .cdr versions >= 16 are not included in the tests as they will work
254 // correctly only with libcdr >= 0.1.6
255 tests
.insert({"cdr/test1.cdr",
256 {{"Zalgerian", "Zcalibri"}}});
257 tests
.insert({"cdr/test2.cdr",
258 {{"Zедой", "Z喬伊不分享食物", "Zdocument"}}});
// Audio/video testcases, only registered when HAVE_LIBEXTRACTOR is defined.
// Terms wrapped in OPT() are optional: compare_test() accepts either all of
// them or none of them being present.
260 #if defined HAVE_LIBEXTRACTOR
261 // Testcases for libextractor need to allow for the required plugin not
262 // being installed, as libextractor still returns generic metadata.
263 tests
.insert({"video/file_example_OGG_480_1_7mg.ogg",
264 {{"Eogg", "Tvideo/ogg",
265 OPT("Searth"), OPT("Splanet")}}});
266 tests
.insert({"video/file_example_AVI_480_750kB.avi",
267 {{"Eavi", "Tvideo/x-msvideo",
268 OPT("Zcodec"), OPT("Zh264"),
269 OPT("480x270"), OPT("msvideo"), OPT("30"), OPT("fps")}}});
270 tests
.insert({"audio/file_example_OOG_1MG.ogg",
271 {{"Eogg", "Taudio/ogg",
272 OPT("Akevin"), OPT("Amacleod"),
273 OPT("Simpact"), OPT("ZSmoderato")}}});
274 tests
.insert({"audio/file_example_WAV_1MG.wav",
275 {{"Ewav", "Taudio/x-wav",
276 OPT("Zstereo"), OPT("wav"), OPT("Zms")}}});
// EPUB testcases, only registered when HAVE_LIBGEPUB is defined.  Both sample
// files are expected to produce the same terms.
278 #if defined HAVE_LIBGEPUB
279 tests
.insert({"epub/epub2test.epub",
280 {{"Eepub", "Tapplication/epub+zip", "Aolly",
281 "book", "chapter", "welcome"}}});
282 tests
.insert({"epub/epub3test.epub",
283 {{"Eepub", "Tapplication/epub+zip", "Aolly",
284 "book", "chapter", "welcome"}}});
// AppleWorks (.cwk) testcases, only registered when HAVE_LIBMWAW is defined.
286 #if defined HAVE_LIBMWAW
287 tests
.insert({"apple_works/test_word.cwk",
288 {{"Aparth", "Sword", "Zhello", "Zdocument"}}});
289 tests
.insert({"apple_works/test_spreadsheet.cwk",
290 {{"Aparth", "Sspreadsheet", "Zpizza", "220"}}});
291 tests
.insert({"apple_works/test_draw.cwk",
292 {{"Zdraw", "Zsampl", "Zgraphic"}}});
// Write s to stream in an escaped form suitable for diagnostics.
//
// @param s       String to write; it may contain arbitrary bytes.
// @param stream  Output stream to write to.
//
// Each byte is examined as an unsigned char so that bytes >= 0x80 compare and
// print as values in the range 128..255 rather than as negative numbers.
297 escape(const string
& s
, std::ostream
& stream
)
299 for (unsigned char ch
: s
) {
// Bytes from 0x20 to 126 other than backslash take this branch -
// presumably they are written through unchanged (TODO confirm).
300 if (ch
>= 0x20 && ch
< 127 && ch
!= '\\') {
// Otherwise the byte's value is written as two zero-padded hex digits.
304 << std::hex
<< std::setfill('0') << std::setw(2) << int(ch
);
// Check the terms and values of a document against a testcase, reporting any
// problems to cerr.
//
// @param test  Expectations to check.  Taken by non-const reference because
//              test.terms is sorted in place.
// @param doc   The indexed document to check.
// @param file  Name used to prefix the diagnostics.
//
// Terms ending in '\xff' (as produced by OPT()) are optional: those which are
// absent are collected in missing_optional, and it is an error if some optional
// terms are present while others are missing.
//
// NOTE(review): main() stores the result in a test_result, so presumably this
// returns FAIL if any check fails and PASS otherwise - confirm.
310 compare_test(testcase
& test
, const Xapian::Document
& doc
, const string
& file
)
// Sort the expected terms - presumably so that the successive skip_to() calls
// below only ever need to move forwards through the document's term list
// (TODO confirm).
312 sort(test
.terms
.begin(), test
.terms
.end());
313 Xapian::TermIterator term_iterator
= doc
.termlist_begin();
314 bool all_required_terms_exist
= true;
315 string missing_optional
;
316 bool no_optional
= true;
317 for (auto& i
: test
.terms
) {
// Optional term: strip the trailing '\xff' marker before looking it up.
318 if (i
.back() == '\xff') {
319 string
t(i
, 0, i
.size() - 1);
320 term_iterator
.skip_to(t
);
// The iterator ran off the end or stopped on a different term, so t doesn't
// index this document.
322 if (term_iterator
== doc
.termlist_end() || *term_iterator
!= t
) {
323 missing_optional
+= ' ';
324 missing_optional
+= t
;
330 term_iterator
.skip_to(t
);
331 if (term_iterator
== doc
.termlist_end() || *term_iterator
!= t
) {
332 cerr
<< file
<< ": error: Term " << t
333 << " should index this file but doesn't\n";
334 all_required_terms_exist
= false;
// Check each expected value against what the document stores in that slot.
339 bool values_ok
= true;
340 for (auto& i
: test
.values
) {
341 const string
& v
= doc
.get_value(i
.first
);
343 cerr
<< file
<< ": error: Value slot " << i
.first
<< " should be ";
// Expected values can contain arbitrary bytes, so print them escaped.
344 escape(i
.second
, cerr
);
// An error if some optional terms were found but others were missing.
354 if (!missing_optional
.empty() && !no_optional
) {
355 cerr
<< file
<< ": error: Only some of the optional terms index this "
356 "file, missing:" << missing_optional
<< '\n';
357 } else if (all_required_terms_exist
) {
358 // All terms found (including degenerate case where no terms are listed
// Dump the expected terms and then the document's actual terms to help
// diagnose the failure.
362 cerr
<< "Expected at least these terms:";
363 for (auto& t
: test
.terms
) {
364 if (t
.back() == '\xff') {
// Show optional terms in the OPT(...) form used in the testcase table.
366 cerr
<< " OPT(" << t
.substr(0, t
.size() - 1) << ')';
371 cerr
<< "\nFull list of terms actually present:";
372 for (term_iterator
= doc
.termlist_begin();
373 term_iterator
!= doc
.termlist_end();
375 cerr
<< ' ' << *term_iterator
;
// Check the documents in a database against the table of testcases.
//
// argv[1] is passed to the Xapian::Database constructor.  Every term with
// prefix "U" is visited, the first document it indexes is fetched, and if there
// is a testcase for it (looked up via tests.find(url)) the document is checked
// with compare_test().
//
// Returns 1 if result ended up as FAIL, otherwise 0.
382 main(int argc
, char** argv
)
384 test_result result
= PASS
;
388 Xapian::Database
db(argv
[1]);
391 for (auto t
= db
.allterms_begin("U"); t
!= db
.allterms_end("U"); ++t
) {
392 const string
& term
= *t
;
394 Xapian::PostingIterator p
= db
.postlist_begin(term
);
395 if (p
== db
.postlist_end(term
)) {
396 // This shouldn't be possible.
397 cerr
<< "Term " << term
<< " doesn't index anything?!\n";
401 Xapian::docid did
= *p
;
402 Xapian::Document doc
= db
.get_document(did
);
403 auto iter
= tests
.find(url
);
404 if (iter
!= tests
.end()) {
405 test_result individual_result
= compare_test(iter
->second
, doc
,
407 if (individual_result
== FAIL
)
// Advancing p checks whether this term indexes a second document, which is
// reported since each such term is expected to identify exactly one document.
411 if (++p
!= db
.postlist_end(term
)) {
412 cerr
<< "URL term " << term
<< " indexes more than one document\n";
// Report testcases still in the table.  Presumably entries are removed from
// tests once they've been exercised, so anything left here was never matched
// by a document - TODO confirm.
// NOTE(review): `auto t` copies each map entry; `const auto&` would avoid that.
417 for (auto t
: tests
) {
418 cerr
<< "Testcase for URL " << t
.first
<< " wasn't exercised\n";
422 return result
== FAIL
? 1 : 0;