/** @file
 * @brief Handle indexing a document from a file
 */
/* Copyright 1999,2000,2001 BrightStation PLC
 * Copyright 2001,2005 James Aylett
 * Copyright 2001,2002 Ananova Ltd
 * Copyright 2002-2023 Olly Betts
 * Copyright 2009 Frank J Bruzzaniti
 * Copyright 2012 Mihai Bivol
 * Copyright 2019 Bruno Baruffaldi
 * Copyright 2020 Parth Kapadia
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
 * USA
 */

#include <config.h>

#include "index_file.h"

#include <algorithm>
#include <iostream>
#include <limits>
#include <string>
#include <map>
#include <vector>

#include <sys/types.h>
#include "safeunistd.h"
#include <cassert>
#include <cerrno>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include "safefcntl.h"
#include <ctime>

#include <xapian.h>

#include "abiwordparser.h"
#include "append_filename_arg.h"
#include "atomparser.h"
#include "datetime.h"
#include "diritor.h"
#include "failed.h"
#include "hashterm.h"
#include "htmlparser.h"
#include "md5wrap.h"
#include "mimemap.h"
#include "msxmlparser.h"
#include "opendocmetaparser.h"
#include "opendocparser.h"
#include "pkglibbindir.h"
#include "runfilter.h"
#include "sample.h"
#include "str.h"
#include "stringutils.h"
#include "svgparser.h"
#include "tmpdir.h"
#include "utf8convert.h"
#include "values.h"
#include "worker.h"
#include "xlsxparser.h"
#include "xpsparser.h"

using namespace std;

static Xapian::WritableDatabase db;
static Xapian::TermGenerator indexer;

static Xapian::doccount old_docs_not_seen;
static Xapian::docid old_lastdocid;
static vector<bool> updated;

static bool verbose;
static bool retry_failed;
static bool use_ctime;
static dup_action_type dup_action;
static bool ignore_exclusions;
static bool description_as_sample;
static bool date_terms;

static time_t last_altered_max;
static size_t sample_size;
static size_t title_size;
static size_t max_ext_len;

static empty_body_type empty_body;

static string root;
static string site_term, host_term;

static Failed failed;

map<string, Filter> commands;
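
// Mark a document surviving from a previous run as still present, so that
// index_handle_deletion() below won't delete it as removed.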
static void
mark_as_seen(Xapian::docid did)
{
    if (usual(did < updated.size() && !updated[did])) {
        updated[did] = true;
        --old_docs_not_seen;
    }
}

void
skip(const string& urlterm, const string& context, const string& msg,
     off_t size, time_t last_mod, unsigned flags)
{
    failed.add(urlterm, last_mod, size);

    if (!verbose || (flags & SKIP_SHOW_FILENAME)) {
        if (!verbose && (flags & SKIP_VERBOSE_ONLY)) return;
        cout << context << ": ";
    }

    cout << "Skipping - " << msg << endl;
}

static void
skip_cmd_failed(const string& urlterm, const string& context,
                const char* const cmd[],
                off_t size, time_t last_mod)
{
    string message;
    const char* sep = "['";
    for (auto i = cmd; *i; ++i) {
        message += sep;
        message += *i;
        sep = "', '";
    }
    message += "'] failed";
    skip(urlterm, context, message, size, last_mod);
}

static void
skip_cmd_failed(const string& urlterm, const string& context, const string& cmd,
                off_t size, time_t last_mod)
{
    skip(urlterm, context, "\"" + cmd + "\" failed", size, last_mod);
}

static void
skip_meta_tag(const string& urlterm, const string& context,
              off_t size, time_t last_mod)
{
    skip(urlterm, context, "indexing disallowed by meta tag", size, last_mod);
}

static void
skip_unknown_mimetype(const string& urlterm, const string& context,
                      const string& mimetype, off_t size, time_t last_mod)
{
    skip(urlterm, context, "unknown MIME type '" + mimetype + "'",
         size, last_mod);
}

void
index_add_default_libraries()
{
#if defined HAVE_POPPLER
    Worker* omindex_poppler = new Worker("omindex_poppler");
    index_library("application/pdf", omindex_poppler);
#endif
#if defined HAVE_LIBEBOOK
    Worker* omindex_libebook = new Worker("omindex_libebook");
    index_library("application/vnd.palm", omindex_libebook);
    index_library("application/x-fictionbook+xml", omindex_libebook);
    index_library("application/x-zip-compressed-fb2", omindex_libebook);
    index_library("application/x-sony-bbeb", omindex_libebook);
    index_library("application/x-tcr-ebook", omindex_libebook);
    index_library("application/x-qioo-ebook", omindex_libebook);
#endif
#if defined HAVE_LIBETONYEK
    Worker* omindex_libetonyek = new Worker("omindex_libetonyek");
    index_library("application/vnd.apple.keynote", omindex_libetonyek);
    index_library("application/vnd.apple.pages", omindex_libetonyek);
    index_library("application/vnd.apple.numbers", omindex_libetonyek);
#endif
#if defined HAVE_LIBGEPUB
    Worker* omindex_libgepub = new Worker("omindex_libgepub");
    index_library("application/epub+zip", omindex_libgepub);
#endif
#if defined HAVE_TESSERACT
    Worker* omindex_tesseract = new Worker("omindex_tesseract");
    index_library("image/gif", omindex_tesseract);
    index_library("image/jpeg", omindex_tesseract);
    index_library("image/png", omindex_tesseract);
    index_library("image/webp", omindex_tesseract);
    index_library("image/tiff", omindex_tesseract);
    index_library("image/x-portable-bitmap", omindex_tesseract);
    index_library("image/x-portable-graymap", omindex_tesseract);
    index_library("image/x-portable-anymap", omindex_tesseract);
    index_library("image/x-portable-pixmap", omindex_tesseract);
#endif
#if defined HAVE_GMIME
    Worker* omindex_gmime = new Worker("omindex_gmime");
    index_library("message/rfc822", omindex_gmime);
    index_library("message/news", omindex_gmime);
#endif
#if defined HAVE_LIBARCHIVE
    Worker* omindex_libarchive = new Worker("omindex_libarchive");
    index_library("application/oxps", omindex_libarchive);
    index_library("application/vnd.ms-xpsdocument", omindex_libarchive);
    index_library("application/vnd.oasis.opendocument.text",
                  omindex_libarchive);
    index_library("application/vnd.oasis.opendocument.spreadsheet",
                  omindex_libarchive);
    index_library("application/vnd.oasis.opendocument.presentation",
                  omindex_libarchive);
    index_library("application/vnd.oasis.opendocument.graphics",
                  omindex_libarchive);
    index_library("application/vnd.oasis.opendocument.chart",
                  omindex_libarchive);
    index_library("application/vnd.oasis.opendocument.formula",
                  omindex_libarchive);
    index_library("application/vnd.oasis.opendocument.database",
                  omindex_libarchive);
    index_library("application/vnd.oasis.opendocument.image",
                  omindex_libarchive);
    index_library("application/vnd.oasis.opendocument.text-master",
                  omindex_libarchive);
    index_library("application/vnd.oasis.opendocument.text-template",
                  omindex_libarchive);
    index_library("application/vnd.oasis.opendocument.spreadsheet-template",
                  omindex_libarchive);
    index_library("application/vnd.oasis.opendocument.presentation-template",
                  omindex_libarchive);
    index_library("application/vnd.oasis.opendocument.graphics-template",
                  omindex_libarchive);
    index_library("application/vnd.oasis.opendocument.chart-template",
                  omindex_libarchive);
    index_library("application/vnd.oasis.opendocument.formula-template",
                  omindex_libarchive);
    index_library("application/vnd.oasis.opendocument.image-template",
                  omindex_libarchive);
    index_library("application/vnd.oasis.opendocument.text-web",
                  omindex_libarchive);
    index_library("application/vnd.sun.xml.calc",
                  omindex_libarchive);
    index_library("application/vnd.sun.xml.calc.template",
                  omindex_libarchive);
    index_library("application/vnd.sun.xml.draw",
                  omindex_libarchive);
    index_library("application/vnd.sun.xml.draw.template",
                  omindex_libarchive);
    index_library("application/vnd.sun.xml.impress",
                  omindex_libarchive);
    index_library("application/vnd.sun.xml.impress.template",
                  omindex_libarchive);
    index_library("application/vnd.sun.xml.math",
                  omindex_libarchive);
    index_library("application/vnd.sun.xml.writer",
                  omindex_libarchive);
    index_library("application/vnd.sun.xml.writer.global",
                  omindex_libarchive);
    index_library("application/vnd.sun.xml.writer.template",
                  omindex_libarchive);
    index_library("application/vnd.openxmlformats-officedocument."
                  "wordprocessingml.document", omindex_libarchive);
    index_library("application/vnd.openxmlformats-officedocument."
                  "wordprocessingml.template", omindex_libarchive);
    index_library("application/vnd.openxmlformats-officedocument."
                  "spreadsheetml.sheet", omindex_libarchive);
    index_library("application/vnd.openxmlformats-officedocument."
                  "spreadsheetml.template", omindex_libarchive);
    index_library("application/vnd.openxmlformats-officedocument."
                  "presentationml.presentation", omindex_libarchive);
    index_library("application/vnd.openxmlformats-officedocument."
                  "presentationml.slideshow", omindex_libarchive);
    index_library("application/vnd.openxmlformats-officedocument."
                  "presentationml.template", omindex_libarchive);
#endif
#if defined HAVE_LIBABW
    Worker* omindex_libabw = new Worker("omindex_libabw");
    index_library("application/x-abiword", omindex_libabw);
    index_library("application/x-abiword-compressed", omindex_libabw);
#endif
#if defined HAVE_LIBCDR
    Worker* omindex_libcdr = new Worker("omindex_libcdr");
    index_library("image/x-coreldraw", omindex_libcdr);
#endif
#if defined HAVE_LIBEXTRACTOR
    Worker* omindex_libextractor = new Worker("omindex_libextractor");
    index_library("video/mpeg", omindex_libextractor);
    index_library("video/x-flv", omindex_libextractor);
    index_library("video/x-msvideo", omindex_libextractor);
    index_library("video/x-ms-asf", omindex_libextractor);
    index_library("video/quicktime", omindex_libextractor);
    index_library("video/ogg", omindex_libextractor);
    index_library("audio/flac", omindex_libextractor);
    index_library("audio/mpeg", omindex_libextractor);
    index_library("audio/ogg", omindex_libextractor);
    index_library("audio/x-wav", omindex_libextractor);
    index_library("audio/x-mod", omindex_libextractor);
    index_library("audio/x-s3m", omindex_libextractor);
#endif
#if defined HAVE_LIBMWAW
    Worker* omindex_libmwaw = new Worker("omindex_libmwaw");
    index_library("application/clarisworks", omindex_libmwaw);
    index_library("image/x-pict", omindex_libmwaw);
#endif
}

void
index_add_default_filters()
{
    // Command needs to be run using /bin/sh.
    auto USE_SHELL = Filter::USE_SHELL;
    // Currently none of these commands needs USE_SHELL.
    (void)USE_SHELL;
    // Input should be piped to stdin.
    auto PIPE_IN = Filter::PIPE_IN;
    // Filename can be /dev/stdin (which must be seekable).
    auto SEEK_DEV_STDIN = Filter::SEEK_DEV_STDIN;
    // Filename can be /dev/stdin (which can be a pipe).
    auto PIPE_DEV_STDIN = Filter::PIPE_DEV_STDIN;
    index_command("application/msword",
                  Filter("antiword -mUTF-8.txt -", PIPE_IN));
    index_command("application/vnd.ms-excel",
                  Filter("xls2csv -c' ' -q0 -dutf-8", PIPE_DEV_STDIN));
    index_command("application/vnd.ms-powerpoint",
                  Filter("catppt -dutf-8", PIPE_DEV_STDIN));
    // Looking at the source of wpd2html and wpd2text I think both output
    // UTF-8, but it's hard to be sure without sample Unicode .wpd files
    // as they don't seem to be at all well documented.
    index_command("application/vnd.wordperfect",
                  Filter("wpd2text", SEEK_DEV_STDIN));
    // wps2text produces UTF-8 output from the sample files I've tested.
    index_command("application/vnd.ms-works",
                  Filter("wps2text", SEEK_DEV_STDIN));
    // Output is UTF-8 according to "man djvutxt".  Generally this seems to
    // be true, though some examples from djvu.org generate isolated byte
    // 0x95 in a context which suggests it might be intended to be a bullet
    // (as it is in CP1252).
    index_command("image/vnd.djvu", Filter("djvutxt -", PIPE_IN));
    index_command("text/markdown",
                  Filter("markdown", "text/html", PIPE_IN));
    // The --text option unhelpfully converts all non-ASCII characters to "?"
    // so we use --html instead, which produces HTML entities.  The --nopict
    // option suppresses exporting picture files as pictNNNN.wmf in the
    // current directory.  Note that this option was ignored in some older
    // versions, but it was fixed in unrtf 0.20.4.
    index_command("application/rtf",
                  Filter("unrtf --nopict --html 2>/dev/null", "text/html",
                         PIPE_IN));
    index_command("text/rtf",
                  Filter("unrtf --nopict --html 2>/dev/null", "text/html",
                         PIPE_IN));
    index_command("text/x-rst",
                  Filter("rst2html", "text/html", PIPE_IN));
    index_command("application/x-mspublisher",
                  Filter("pub2xhtml", "text/html", SEEK_DEV_STDIN));
    index_command("application/vnd.ms-outlook",
                  Filter(get_pkglibbindir() + "/outlookmsg2html",
                         "text/html", SEEK_DEV_STDIN));
    index_command("application/vnd.ms-visio.drawing",
                  Filter("vsd2xhtml", "image/svg+xml", SEEK_DEV_STDIN));
    index_command("application/vnd.ms-visio.stencil",
                  Filter("vsd2xhtml", "image/svg+xml", SEEK_DEV_STDIN));
    index_command("application/vnd.ms-visio.template",
                  Filter("vsd2xhtml", "image/svg+xml", SEEK_DEV_STDIN));
    index_command("application/vnd.visio",
                  Filter("vsd2xhtml", "image/svg+xml", SEEK_DEV_STDIN));
    // pod2text's output character set doesn't seem to be documented, but
    // from inspecting the source it looks like it's probably iso-8859-1.
    // We need to pass "--errors=stderr" or else minor POD formatting errors
    // cause a file not to be indexed.
    index_command("text/x-perl",
                  Filter("pod2text --errors=stderr",
                         "text/plain", "iso-8859-1", PIPE_IN));
    // FIXME: -e0 means "UTF-8", but that results in "fi", "ff", "ffi", etc
    // appearing as single ligatures.  For European languages, it's actually
    // better to use -e2 (ISO-8859-1) and then convert, so let's do that for
    // now until we handle Unicode "compatibility decompositions".
    index_command("application/x-dvi",
                  Filter("catdvi -e2 -s", "text/plain", "iso-8859-1",
                         PIPE_IN));
    // Simplistic - ought to look in index.rdf files for filename and
    // character set.
    index_command("application/x-maff",
                  Filter("unzip -p %f '*/*.*htm*'", "text/html", "iso-8859-1",
                         SEEK_DEV_STDIN));
    index_command("application/x-mimearchive",
                  Filter(get_pkglibbindir() + "/mhtml2html", "text/html",
                         PIPE_DEV_STDIN));
    index_command("message/news",
                  Filter(get_pkglibbindir() + "/rfc822tohtml", "text/html",
                         PIPE_DEV_STDIN));
    index_command("message/rfc822",
                  Filter(get_pkglibbindir() + "/rfc822tohtml", "text/html",
                         PIPE_DEV_STDIN));
    index_command("text/vcard",
                  Filter(get_pkglibbindir() + "/vcard2text", PIPE_DEV_STDIN));
    index_command("application/vnd.apple.keynote",
                  Filter("key2text", SEEK_DEV_STDIN));
    index_command("application/vnd.apple.numbers",
                  Filter("numbers2text", SEEK_DEV_STDIN));
    index_command("application/vnd.apple.pages",
                  Filter("pages2text", SEEK_DEV_STDIN));
}

void
index_init(const string& dbpath, const Xapian::Stem& stemmer,
           const string& root_, const string& site_term_,
           const string& host_term_,
           empty_body_type empty_body_, dup_action_type dup_action_,
           size_t sample_size_, size_t title_size_, size_t max_ext_len_,
           bool overwrite, bool retry_failed_,
           bool delete_removed_documents, bool verbose_, bool use_ctime_,
           bool spelling, bool ignore_exclusions_, bool description_as_sample_,
           bool date_terms_)
{
    root = root_;
    site_term = site_term_;
    host_term = host_term_;
    empty_body = empty_body_;
    dup_action = dup_action_;
    sample_size = sample_size_;
    title_size = title_size_;
    max_ext_len = max_ext_len_;
    verbose = verbose_;
    use_ctime = use_ctime_;
    ignore_exclusions = ignore_exclusions_;
    description_as_sample = description_as_sample_;
    date_terms = date_terms_;

    if (!overwrite) {
        db = Xapian::WritableDatabase(dbpath, Xapian::DB_CREATE_OR_OPEN);
        old_docs_not_seen = db.get_doccount();
        // Handle an initially empty database exactly the same way as when
        // overwrite is true.
        if (old_docs_not_seen != 0) {
            old_lastdocid = db.get_lastdocid();
            if (delete_removed_documents) {
                // + 1 so that old_lastdocid is a valid subscript.
                updated.resize(old_lastdocid + 1);
            }
            Xapian::valueno slot = use_ctime ? VALUE_CTIME : VALUE_LASTMOD;
            string ubound = db.get_value_upper_bound(slot);
            if (!ubound.empty())
                last_altered_max = binary_string_to_int(ubound);
        }
    } else {
        db = Xapian::WritableDatabase(dbpath, Xapian::DB_CREATE_OR_OVERWRITE);
    }

    if (spelling) {
        indexer.set_database(db);
        indexer.set_flags(indexer.FLAG_SPELLING);
    }
    indexer.set_stemmer(stemmer);

    runfilter_init();

    failed.init(db);

    if (overwrite) {
        // There are no failures to retry, so setting this flag doesn't
        // change the outcome, but does mean we avoid the overhead of
        // checking for a previous failure.
        retry_failed = true;
    } else if (retry_failed_) {
        failed.clear();
        retry_failed = true;
    } else {
        // If there are no existing failures, setting this flag doesn't
        // change the outcome, but does mean we avoid the overhead of
        // checking for a previous failure.
        retry_failed = failed.empty();
    }
}
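
// A minimal usage sketch (illustrative only - the real call sites live in
// omindex's option handling, and the argument values here are assumptions):
//
//   index_init("/srv/omega/default", Xapian::Stem("english"),
//              root, site_term, host_term,
//              EMPTY_BODY_WARN, DUP_CHECK_LAZILY,
//              /*sample_size*/ 512, /*title_size*/ 128, /*max_ext_len*/ 8,
//              /*overwrite*/ false, /*retry_failed*/ false,
//              /*delete_removed_documents*/ true, /*verbose*/ false,
//              /*use_ctime*/ false, /*spelling*/ true,
//              /*ignore_exclusions*/ false, /*description_as_sample*/ false,
//              /*date_terms*/ true);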

static void
parse_pdfinfo_field(const char* p, const char* end, string& out,
                    const char* field, size_t len)
{
    if (size_t(end - p) > len && memcmp(p, field, len) == 0) {
        p += len;
        while (p != end && *p == ' ')
            ++p;
        if (p != end && (end[-1] != '\r' || --end != p))
            out.assign(p, end - p);
    }
}

#define PARSE_PDFINFO_FIELD(P, END, OUT, FIELD) \
    parse_pdfinfo_field((P), (END), (OUT), FIELD":", CONST_STRLEN(FIELD) + 1)
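
// So, for example, PARSE_PDFINFO_FIELD(start, eol, author, "Author") checks
// for a line starting "Author:", skips any spaces after the colon, and
// assigns the rest of the line (minus any trailing '\r') to `author`.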

static void
parse_pdf_metainfo(const string& pdfinfo, string& author, string& title,
                   string& keywords, string& topic, int& pages)
{
    const char* p = pdfinfo.data();
    const char* end = p + pdfinfo.size();
    while (p != end) {
        const char* start = p;
        p = static_cast<const char*>(memchr(p, '\n', end - p));
        const char* eol;
        if (p) {
            eol = p;
            ++p;
        } else {
            p = eol = end;
        }
        switch (*start) {
            case 'A':
                PARSE_PDFINFO_FIELD(start, eol, author, "Author");
                break;
            case 'K':
                PARSE_PDFINFO_FIELD(start, eol, keywords, "Keywords");
                break;
            case 'P': {
                string s;
                PARSE_PDFINFO_FIELD(start, eol, s, "Pages");
                if (!s.empty())
                    pages = atoi(s.c_str());
                break;
            }
            case 'S':
                PARSE_PDFINFO_FIELD(start, eol, topic, "Subject");
                break;
            case 'T':
                PARSE_PDFINFO_FIELD(start, eol, title, "Title");
                break;
        }
    }
}
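
// For instance (illustrative), feeding parse_pdf_metainfo() typical
// "Field: value" lines as produced by `pdfinfo -enc UTF-8`, such as:
//
//   Title:          Annual Report
//   Author:         J. Smith
//   Pages:          42
//
// sets title, author and pages accordingly; unrecognised fields are ignored.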

static void
get_pdf_metainfo(int fd, string& author, string& title,
                 string& keywords, string& topic, int& pages)
{
    try {
        string pdfinfo;
        static const char* const cmd[] = {
            "pdfinfo", "-enc", "UTF-8", "-", NULL
        };
        // Capture pdfinfo's output so it can be parsed below.
        run_filter(fd, cmd, &pdfinfo);
        parse_pdf_metainfo(pdfinfo, author, title, keywords, topic, pages);
    } catch (const ReadError&) {
        // It's probably best to index the document even if pdfinfo fails.
    }
}

static void
get_pdf_metainfo(const string& file, string& author, string& title,
                 string& keywords, string& topic, int& pages)
{
    try {
        const char* cmd[] = {
            "pdfinfo", "-enc", "UTF-8", NULL, NULL
        };
        cmd[3] = file.c_str();
        parse_pdf_metainfo(stdout_to_string(cmd),
                           author, title, keywords, topic, pages);
    } catch (const ReadError&) {
        // It's probably best to index the document even if pdfinfo fails.
    }
}

static void
generate_sample_from_csv(const string& csv_data, string& sample)
{
    // Add 3 to allow for a 4 byte utf-8 sequence being appended when
    // output is sample_size - 1 bytes long.  Use csv_data.size() if smaller
    // since the user might reasonably set sample_size really high.
    sample.reserve(min(sample_size + 3, csv_data.size()));
    size_t last_word_end = 0;
    bool in_space = true;
    bool in_quotes = false;
    for (Xapian::Utf8Iterator i(csv_data); i != Xapian::Utf8Iterator(); ++i) {
        unsigned ch = *i;

        if (!in_quotes) {
            // If not already in double quotes, '"' starts quoting and
            // ',' starts a new field.
            if (ch == '"') {
                in_quotes = true;
                continue;
            }
            if (ch == ',')
                ch = ' ';
        } else if (ch == '"') {
            // In double quotes, '"' either ends double quotes, or
            // if followed by another '"', means a literal '"'.
            if (++i == Xapian::Utf8Iterator())
                break;
            ch = *i;
            if (ch != '"') {
                in_quotes = false;
                if (ch == ',')
                    ch = ' ';
            }
        }

        if (ch <= ' ' || ch == 0xa0) {
            // FIXME: if all the whitespace characters between two
            // words are 0xa0 (non-breaking space) then perhaps we
            // should output 0xa0.
            if (in_space)
                continue;
            last_word_end = sample.size();
            sample += ' ';
            in_space = true;
        } else {
            Xapian::Unicode::append_utf8(sample, ch);
            in_space = false;
        }

        if (sample.size() >= sample_size) {
            // Need to truncate sample.
            if (last_word_end <= sample_size / 2) {
                // Monster word!  We'll have to just split it.
                sample.replace(sample_size - 3, string::npos, "...", 3);
            } else {
                sample.replace(last_word_end, string::npos, " ...", 4);
            }
            break;
        }
    }
}
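
// An illustrative example (not from the original source): the CSV input
//
//   "name","note"
//   alice,"says ""hi"""
//
// yields the sample text: name note alice says "hi"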

static bool
index_check_existing(const string& urlterm, time_t last_altered,
                     Xapian::docid& did)
{
    switch (dup_action) {
        case DUP_SKIP: {
            Xapian::PostingIterator p = db.postlist_begin(urlterm);
            if (p != db.postlist_end(urlterm)) {
                if (verbose)
                    cout << "already indexed, not updating" << endl;
                did = *p;
                mark_as_seen(did);
                return true;
            }
            break;
        }
        case DUP_CHECK_LAZILY: {
            // If last_altered > last_altered_max, we know for sure that the
            // file is new or updated.
            if (last_altered > last_altered_max) {
                return false;
            }

            Xapian::PostingIterator p = db.postlist_begin(urlterm);
            if (p != db.postlist_end(urlterm)) {
                did = *p;
                Xapian::Document doc = db.get_document(did);
                Xapian::valueno slot = use_ctime ? VALUE_CTIME : VALUE_LASTMOD;
                string value = doc.get_value(slot);
                time_t old_last_altered = binary_string_to_int(value);
                if (last_altered <= old_last_altered) {
                    if (verbose)
                        cout << "already indexed" << endl;
                    // The docid should be in updated - the only valid
                    // exception is if the URL was long and hashed to the
                    // same URL as an existing document indexed in the same
                    // batch.
                    mark_as_seen(did);
                    return true;
                }
            }
            break;
        }
    }
    return false;
}

void
index_remove_failed_entry(const string& urlterm)
{
    failed.del(urlterm);
}

void
index_add_document(const string& urlterm, time_t last_altered,
                   Xapian::docid did, const Xapian::Document& doc)
{
    if (dup_action != DUP_SKIP) {
        // If this document has already been indexed, update the existing
        // entry.
        if (did) {
            // We already found out the document id above.
            db.replace_document(did, doc);
        } else if (last_altered <= last_altered_max) {
            // We checked for the UID term and didn't find it.
            did = db.add_document(doc);
        } else {
            did = db.replace_document(urlterm, doc);
        }
        mark_as_seen(did);
        if (verbose) {
            if (did <= old_lastdocid) {
                cout << "updated" << endl;
            } else {
                cout << "added" << endl;
            }
        }
    } else {
        // If this were a duplicate, we'd have skipped it above.
        db.add_document(doc);
        if (verbose)
            cout << "added" << endl;
    }
}

void
index_mimetype(const string& file, const string& urlterm, const string& url,
               const string& ext,
               string mimetype,
               DirectoryIterator& d,
               string pathterm,
               string record)
{
    string context(file, root.size(), string::npos);

    // FIXME: We could be cleverer here and check mtime too when use_ctime is
    // set - if the ctime has changed but the mtime is unchanged, we can just
    // update the existing Document and avoid having to re-extract text, etc.
    time_t last_altered = use_ctime ? d.get_ctime() : d.get_mtime();

    Xapian::docid did = 0;
    if (index_check_existing(urlterm, last_altered, did))
        return;

    if (!retry_failed) {
        // We only store and check the mtime (last modified) - a change to the
        // metadata won't generally cause a previous failure to now work
        // (FIXME: except permissions).
        time_t failed_last_mod;
        off_t failed_size;
        if (failed.contains(urlterm, failed_last_mod, failed_size)) {
            if (d.get_mtime() <= failed_last_mod &&
                d.get_size() == failed_size) {
                if (verbose)
                    cout << "failed to extract text on earlier run" << endl;
                return;
            }
            // The file has changed, so remove the entry for it.  If it fails
            // again on this attempt, we'll add a new one.
            failed.del(urlterm);
        }
    }

    // If we didn't get the mime type from the extension, call libmagic to
    // get it.
    if (mimetype.empty()) {
        mimetype = d.get_magic_mimetype();
        if (mimetype.empty()) {
            skip(urlterm, file.substr(root.size()),
                 "Unknown extension and unrecognised format",
                 d.get_size(), d.get_mtime(), SKIP_SHOW_FILENAME);
            return;
        }
    }

    if (verbose)
        cout << "Indexing \"" << file.substr(root.size()) << "\" as "
             << mimetype << " ... " << flush;

    // Use `file` as the basis, as we don't want URL encoding in these terms,
    // but need to switch over the initial part so we get `/~olly/foo/bar`
    // rather than `/home/olly/public_html/foo/bar`.
    Xapian::Document newdocument;
    size_t j;
    while ((j = pathterm.rfind('/')) > 1 && j != string::npos) {
        pathterm.resize(j);
        if (pathterm.length() > MAX_SAFE_TERM_LENGTH) {
            string term_hash = hash_long_term(pathterm, MAX_SAFE_TERM_LENGTH);
            newdocument.add_boolean_term(term_hash);
        } else {
            newdocument.add_boolean_term(pathterm);
        }
    }
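
    // For example (hypothetical path): a pathterm of "P/~olly/docs/a.pdf"
    // adds the boolean terms "P/~olly/docs" and "P/~olly" for its ancestor
    // directories.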

    string author, title, sample, keywords, topic, dump;
    string to, cc, bcc, message_id;
    string md5;
    time_t created = time_t(-1);
    int pages = -1;
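
    // Look up the filter to use: first try the exact MIME type, then the
    // "type/*" wildcard, then "*/*", and finally "*".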
    map<string, Filter>::const_iterator cmd_it = commands.find(mimetype);
    if (cmd_it == commands.end()) {
        size_t slash = mimetype.find('/');
        if (slash != string::npos) {
            string wildtype(mimetype, 0, slash + 2);
            wildtype[slash + 1] = '*';
            cmd_it = commands.find(wildtype);
            if (cmd_it == commands.end()) {
                cmd_it = commands.find("*/*");
            }
        }
        if (cmd_it == commands.end()) {
            cmd_it = commands.find("*");
        }
    }
    try {
        if (cmd_it != commands.end() && cmd_it->second.worker) {
            // Use a worker process to extract the content.
            Worker* wrk = cmd_it->second.worker;
            int r = wrk->extract(file, mimetype, dump, title, keywords, author,
                                 to, cc, bcc, message_id, pages, created);
            if (r != 0) {
                string msg = wrk->get_error();
                assert(!msg.empty());
                skip(urlterm, context, msg, d.get_size(), d.get_mtime());
                if (r < 0) {
                    // Hard failure - don't try this filter again for this run.
                    string filter_entry;
                    if (cmd_it != commands.end()) {
                        filter_entry = cmd_it->first;
                    } else {
                        filter_entry = mimetype;
                    }
                    commands[filter_entry] = Filter();
                }
                return;
            }
        } else if (cmd_it != commands.end()) {
            // Easy "run a command and read text or HTML from stdout or a
            // temporary file" cases.
            auto& filter = cmd_it->second;
            string cmd = filter.cmd;
            if (cmd.empty()) {
                skip(urlterm, context, "required filter not installed",
                     d.get_size(), d.get_mtime(), SKIP_VERBOSE_ONLY);
                return;
            }
            if (cmd == "false") {
                // Allow setting 'false' as a filter to mean that a MIME type
                // should be quietly ignored.
                string m = "ignoring MIME type '";
                m += cmd_it->first;
                m += "'";
                skip(urlterm, context, m, d.get_size(), d.get_mtime(),
                     SKIP_VERBOSE_ONLY);
                return;
            }
            bool use_shell = filter.use_shell();
            bool input_on_stdin = filter.input_on_stdin();
            bool substituted = false;
            string tmpout;
            size_t pcent = 0;
            while (true) {
                pcent = cmd.find('%', pcent);
                if (pcent >= cmd.size() - 1)
                    break;
                switch (cmd[pcent + 1]) {
                    case '%': // %% -> %.
                        cmd.erase(++pcent, 1);
                        break;
                    case 'f': { // %f -> escaped filename.
                        substituted = true;
                        if (filter.dev_stdin()) {
                            cmd.replace(pcent, 2, "/dev/stdin",
                                        CONST_STRLEN("/dev/stdin"));
                            break;
                        }
                        string tail(cmd, pcent + 2);
                        cmd.resize(pcent);
                        // Suppress the space append_filename_argument()
                        // usually adds before the argument - the command
                        // string either includes one, or won't expect one
                        // (e.g. --input=%f).
                        append_filename_argument(cmd, file, false);
                        pcent = cmd.size();
                        cmd += tail;
                        break;
                    }
                    case 't': { // %t -> temporary output file.
                        if (tmpout.empty()) {
                            // Use a temporary file with a suitable extension
                            // in case the command cares, and for more helpful
                            // error messages from the command.
                            if (filter.output_type == "text/html") {
                                tmpout = get_tmpfile("tmp.html");
                            } else if (filter.output_type == "image/svg+xml") {
                                tmpout = get_tmpfile("tmp.svg");
                            } else {
                                tmpout = get_tmpfile("tmp.txt");
                            }
                        }
                        substituted = true;
                        string tail(cmd, pcent + 2);
                        cmd.resize(pcent);
                        // Suppress the space append_filename_argument()
                        // usually adds before the argument - the command
                        // string either includes one, or won't expect one
                        // (e.g. --output=%t).
                        append_filename_argument(cmd, tmpout, false);
                        pcent = cmd.size();
                        cmd += tail;
                        break;
                    }
                    default:
                        // Leave anything else alone for now.
                        pcent += 2;
                        break;
                }
            }
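
            // For example (hypothetical filter command): given
            //   "sometool --input=%f --output=%t"
            // %f is replaced by the escaped input filename and %t by a
            // temporary output file, which is read into `dump` below.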
            if (!substituted && cmd != "true") {
                if (input_on_stdin) {
                    if (filter.dev_stdin()) {
                        cmd += " /dev/stdin";
                    }
                } else {
                    // If no %f, append the filename to the command.
                    append_filename_argument(cmd, file);
                }
            }
            try {
                if (!tmpout.empty()) {
                    // Output in temporary file.
                    if (input_on_stdin) {
                        run_filter(d.get_fd(), cmd, use_shell);
                    } else {
                        run_filter(cmd, use_shell);
                    }
                    if (!load_file(tmpout, dump, NOCACHE)) {
                        throw ReadError("Couldn't read output file");
                    }
                    unlink(tmpout.c_str());
                } else if (cmd == "true") {
                    // Ignore the file's contents, just index metadata from
                    // the filing system.
                } else {
                    // Output on stdout.
                    if (input_on_stdin) {
                        run_filter(d.get_fd(), cmd, use_shell, &dump);
                    } else {
                        run_filter(cmd, use_shell, &dump);
                    }
                }
                const string& charset = filter.output_charset;
                if (filter.output_type == "text/html") {
                    HtmlParser p;
                    p.ignore_metarobots();
                    p.description_as_sample = description_as_sample;
                    try {
                        p.parse(dump, charset, false);
                    } catch (const string& newcharset) {
                        p.reset();
                        p.ignore_metarobots();
                        p.description_as_sample = description_as_sample;
                        p.parse(dump, newcharset, true);
                    } catch (const ReadError&) {
                        skip_cmd_failed(urlterm, context, cmd,
                                        d.get_size(), d.get_mtime());
                        return;
                    }
                    dump = p.dump;
                    title = p.title;
                    keywords = p.keywords;
                    topic = p.topic;
                    sample = p.sample;
                    author = p.author;
                    created = p.created;
                } else if (filter.output_type == "image/svg+xml") {
                    SvgParser svgparser;
                    svgparser.parse(dump);
                    dump = svgparser.dump;
                    title = svgparser.title;
                    keywords = svgparser.keywords;
                    // FIXME: topic = svgparser.topic;
                    author = svgparser.author;
                } else if (!charset.empty()) {
                    convert_to_utf8(dump, charset);
                }
            } catch (const ReadError&) {
                skip_cmd_failed(urlterm, context, cmd,
                                d.get_size(), d.get_mtime());
                return;
            }
        } else if (mimetype == "text/html" || mimetype == "text/x-php") {
            const string& text = d.file_to_string();
            HtmlParser p;
            if (ignore_exclusions) p.ignore_metarobots();
            p.description_as_sample = description_as_sample;
            try {
                // Default HTML character set is latin 1, though not
                // specifying one is deprecated these days.
                p.parse(text, "iso-8859-1", false);
            } catch (const string& newcharset) {
                p.reset();
                if (ignore_exclusions) p.ignore_metarobots();
                p.description_as_sample = description_as_sample;
                p.parse(text, newcharset, true);
            }
            if (!p.indexing_allowed) {
                skip_meta_tag(urlterm, context,
                              d.get_size(), d.get_mtime());
                return;
            }
            dump = p.dump;
            title = p.title;
            keywords = p.keywords;
            topic = p.topic;
            sample = p.sample;
            author = p.author;
            created = p.created;
            md5_string(text, md5);
        } else if (mimetype == "text/plain") {
            // Currently we assume that text files are UTF-8 unless they
            // have a byte-order mark.
            dump = d.file_to_string();
            md5_string(dump, md5);

            // Look for Byte-Order Mark (BOM).
            if (startswith(dump, "\xfe\xff") || startswith(dump, "\xff\xfe")) {
                // UTF-16 in big-endian/little-endian order - we just convert
                // it as "UTF-16" and let the conversion handle the BOM as
                // that way we avoid the copying overhead of erasing 2 bytes
                // from the start of dump.
                convert_to_utf8(dump, "UTF-16");
            } else if (startswith(dump, "\xef\xbb\xbf")) {
                // UTF-8 with stupid Windows not-the-byte-order mark.
                dump.erase(0, 3);
            } else {
                // FIXME: What charset is the file?  Look at contents?
            }
        } else if (mimetype == "application/pdf") {
            const char* const cmd[] = {
                "pdftotext", "-enc", "UTF-8", "-", "-", NULL
            };
            try {
                run_filter(d.get_fd(), cmd, &dump);
            } catch (const ReadError&) {
                skip_cmd_failed(urlterm, context, cmd,
                                d.get_size(), d.get_mtime());
                return;
            }
            get_pdf_metainfo(d.get_fd(), author, title, keywords, topic,
                             pages);
        } else if (mimetype == "application/postscript") {
            // There simply doesn't seem to be a Unicode capable PostScript
            // to text converter (e.g. pstotext always outputs ISO-8859-1).
            // The only solution seems to be to convert via PDF using ps2pdf
            // and then pdftotext.  This gives plausible looking UTF-8 output
            // for some Chinese PostScript files I found using Google.  It
            // also has the benefit of allowing us to extract meta
            // information from PostScript files.
            string tmpfile = get_tmpfile("tmp.pdf");
            if (tmpfile.empty()) {
                // FIXME: should this be fatal?  Or disable indexing
                // postscript?
                string msg = "Couldn't create temporary directory (";
                msg += strerror(errno);
                msg += ")";
                skip(urlterm, context, msg,
                     d.get_size(), d.get_mtime());
                return;
            }
            const char* cmd[] = {
                "ps2pdf", "-", NULL, NULL
            };
            cmd[2] = tmpfile.c_str();
            try {
                run_filter(d.get_fd(), cmd);
                const char* cmd2[] = {
                    "pdftotext", "-enc", "UTF-8", NULL, "-", NULL
                };
                cmd2[3] = tmpfile.c_str();
                run_filter(cmd2, &dump);
            } catch (const ReadError&) {
                skip_cmd_failed(urlterm, context, cmd,
                                d.get_size(), d.get_mtime());
                unlink(tmpfile.c_str());
                return;
            } catch (...) {
                unlink(tmpfile.c_str());
                throw;
            }
            try {
                get_pdf_metainfo(tmpfile, author, title, keywords, topic,
                                 pages);
            } catch (...) {
                unlink(tmpfile.c_str());
                throw;
            }
            unlink(tmpfile.c_str());
        } else if (startswith(mimetype, "application/vnd.sun.xml.") ||
                   startswith(mimetype, "application/vnd.oasis.opendocument."))
        {
            // Inspired by http://mjr.towers.org.uk/comp/sxw2text
            string cmd = "unzip -p";
            append_filename_argument(cmd, file);
            cmd += " content.xml ; unzip -p";
            append_filename_argument(cmd, file);
            cmd += " styles.xml";
            try {
                OpenDocParser parser;
                parser.parse(stdout_to_string(cmd, true));
                dump = parser.dump;
            } catch (const ReadError&) {
                skip_cmd_failed(urlterm, context, cmd,
                                d.get_size(), d.get_mtime());
                return;
            }

            cmd = "unzip -p";
            append_filename_argument(cmd, file);
            cmd += " meta.xml";
            try {
                OpenDocMetaParser metaparser;
                metaparser.parse(stdout_to_string(cmd, false));
                title = metaparser.title;
                keywords = metaparser.keywords;
                // FIXME: topic = metaparser.topic;
                sample = metaparser.sample;
                author = metaparser.author;
                pages = metaparser.pages;
            } catch (const ReadError&) {
                // It's probably best to index the document even if this
                // fails.
            }
        } else if (startswith(mimetype,
                              "application/vnd.openxmlformats-officedocument."))
        {
            const char* args = NULL;
            string tail(mimetype, 46);
            if (startswith(tail, "wordprocessingml.")) {
                // unzip returns exit code 11 if a file to extract wasn't
                // found which we want to ignore, because there may be no
                // headers or no footers.
                args = " word/document.xml"
                       " 'word/header*.xml'"
                       " 'word/footer*.xml'"
                       " 2>/dev/null";
            } else if (startswith(tail, "spreadsheetml.")) {
                // Extract the shared string table first, so our parser can
                // grab those ready for parsing the sheets which will
                // reference the shared strings.
                string cmd = "unzip -p";
                append_filename_argument(cmd, file);
                cmd += " xl/styles.xml xl/workbook.xml xl/sharedStrings.xml ; "
                       "unzip -p";
                append_filename_argument(cmd, file);
                cmd += " xl/worksheets/sheet\\*.xml";
                try {
                    XlsxParser parser;
                    parser.parse(stdout_to_string(cmd, true));
                    dump = parser.dump;
                } catch (const ReadError&) {
                    skip_cmd_failed(urlterm, context, cmd,
                                    d.get_size(), d.get_mtime());
                    return;
                }
            } else if (startswith(tail, "presentationml.")) {
                // unzip returns exit code 11 if a file to extract wasn't
                // found which we want to ignore, because there may be no
                // notesSlides or comments.
                args = " 'ppt/slides/slide*.xml'"
                       " 'ppt/notesSlides/notesSlide*.xml'"
                       " 'ppt/comments/comment*.xml'"
                       " 2>/dev/null";
            } else {
                // Don't know how to index this type.
                skip_unknown_mimetype(urlterm, context, mimetype,
                                      d.get_size(), d.get_mtime());
                return;
            }

            if (args) {
                string cmd = "unzip -p";
                append_filename_argument(cmd, file);
                cmd += args;
                try {
                    MSXmlParser xmlparser;
                    // Treat exit status 11 from unzip as success - this is
                    // what we get if one of the listed filenames to extract
                    // doesn't match anything in the zip file.
                    xmlparser.parse(stdout_to_string(cmd, false, 11));
                    dump = xmlparser.dump;
                } catch (const ReadError&) {
                    skip_cmd_failed(urlterm, context, cmd,
                                    d.get_size(), d.get_mtime());
                    return;
                }
            }

            string cmd = "unzip -p";
            append_filename_argument(cmd, file);
            cmd += " docProps/core.xml";
            try {
                OpenDocMetaParser metaparser;
                metaparser.parse(stdout_to_string(cmd, false));
                title = metaparser.title;
                keywords = metaparser.keywords;
                // FIXME: topic = metaparser.topic;
                sample = metaparser.sample;
                author = metaparser.author;
            } catch (const ReadError&) {
                // It's probably best to index the document even if this
                // fails.
            }
        } else if (mimetype == "application/x-abiword") {
            AbiwordParser abiwordparser;
            const string& text = d.file_to_string();
            abiwordparser.parse(text);
            dump = abiwordparser.dump;
            md5_string(text, md5);
        } else if (mimetype == "application/x-abiword-compressed") {
            AbiwordParser abiwordparser;
            abiwordparser.parse(d.gzfile_to_string());
            dump = abiwordparser.dump;
        } else if (mimetype == "application/oxps" ||
                   mimetype == "application/vnd.ms-xpsdocument") {
            const char* cmd[] = {
                "unzip", "-p", NULL, "Documents/*/Pages/*.fpage", NULL
            };
            cmd[2] = file.c_str();
            try {
                XpsParser xpsparser;
                run_filter(cmd, &dump);
                xpsparser.parse(dump);
                dump = xpsparser.dump;
            } catch (const ReadError&) {
                skip_cmd_failed(urlterm, context, cmd,
                                d.get_size(), d.get_mtime());
                return;
            }

            const char* cmd2[] = {
                "unzip", "-p", NULL, "docProps/core.xml", NULL
            };
            cmd2[2] = file.c_str();
            try {
                OpenDocMetaParser metaparser;
                metaparser.parse(stdout_to_string(cmd2));
                title = metaparser.title;
                keywords = metaparser.keywords;
                // FIXME: topic = metaparser.topic;
                sample = metaparser.sample;
                author = metaparser.author;
            } catch (const ReadError&) {
                // Ignore errors as not all XPS files contain this file.
            }
        } else if (mimetype == "text/csv") {
            // Currently we assume that text files are UTF-8 unless they
            // have a byte-order mark.
            dump = d.file_to_string();
            md5_string(dump, md5);

            // Look for Byte-Order Mark (BOM).
            if (startswith(dump, "\xfe\xff") || startswith(dump, "\xff\xfe")) {
                // UTF-16 in big-endian/little-endian order - we just convert
                // it as "UTF-16" and let the conversion handle the BOM as
                // that way we avoid the copying overhead of erasing 2 bytes
                // from the start of dump.
                convert_to_utf8(dump, "UTF-16");
            } else if (startswith(dump, "\xef\xbb\xbf")) {
                // UTF-8 with stupid Windows not-the-byte-order mark.
                dump.erase(0, 3);
            } else {
                // FIXME: What charset is the file?  Look at contents?
            }

            generate_sample_from_csv(dump, sample);
        } else if (mimetype == "image/svg+xml") {
            SvgParser svgparser;
            const string& text = d.file_to_string();
            md5_string(text, md5);
            svgparser.parse(text);
            dump = svgparser.dump;
            title = svgparser.title;
            keywords = svgparser.keywords;
            // FIXME: topic = svgparser.topic;
            author = svgparser.author;
        } else if (mimetype == "image/svg+xml-compressed") {
            SvgParser svgparser;
            const string& text = d.gzfile_to_string();
            svgparser.parse(text);
            dump = svgparser.dump;
            title = svgparser.title;
            keywords = svgparser.keywords;
            // FIXME: topic = svgparser.topic;
            author = svgparser.author;
        } else if (mimetype == "application/vnd.debian.binary-package" ||
                   mimetype == "application/x-debian-package") {
            const char* cmd = "dpkg-deb -f - Description";
            string desc;
            run_filter(d.get_fd(), cmd, false, &desc);
            // First line is short description, which we use as the title.
            string::size_type idx = desc.find('\n');
            title.assign(desc, 0, idx);
            if (idx != string::npos) {
                dump.assign(desc, idx + 1, string::npos);
            }
        } else if (mimetype == "application/x-redhat-package-manager" ||
                   mimetype == "application/x-rpm") {
            string cmd("rpm -q --qf '%{SUMMARY}\\n%{DESCRIPTION}' -p");
            append_filename_argument(cmd, file);
            string desc;
            run_filter(cmd, false, &desc);
            // First line is summary, which we use as the title.
            string::size_type idx = desc.find('\n');
            title.assign(desc, 0, idx);
            if (idx != string::npos) {
                dump.assign(desc, idx + 1, string::npos);
            }
        } else if (mimetype == "application/atom+xml") {
            AtomParser atomparser;
            const string& text = d.file_to_string();
            md5_string(text, md5);
            atomparser.parse(text);
            dump = atomparser.dump;
            title = atomparser.title;
            keywords = atomparser.keywords;
            // FIXME: topic = atomparser.topic;
            author = atomparser.author;
        } else {
            // Don't know how to index this type.
            skip_unknown_mimetype(urlterm, context, mimetype,
                                  d.get_size(), d.get_mtime());
            return;
        }

        // Compute the MD5 of the file if we haven't already.
        if (md5.empty() && !d.md5(md5)) {
            if (errno == ENOENT || errno == ENOTDIR) {
                skip(urlterm, context, "File removed during indexing",
                     d.get_size(), d.get_mtime(),
                     SKIP_VERBOSE_ONLY | SKIP_SHOW_FILENAME);
            } else {
                skip(urlterm, context,
                     "failed to read file to calculate MD5 checksum",
                     d.get_size(), d.get_mtime());
            }
            return;
        }

        // Remove any trailing formfeeds, so we don't consider them when
        // deciding if we extracted any text (e.g. pdftotext outputs a
        // formfeed between each page, even for blank pages).
        //
        // If dump contains only formfeeds, then trim_end will be
        // string::npos and ++trim_end will be 0, which is the correct new
        // size.
        string::size_type trim_end = dump.find_last_not_of('\f');
        if (UNSIGNED_OVERFLOW_OK(++trim_end) != dump.size())
            dump.resize(trim_end);

        if (dump.empty()) {
            switch (empty_body) {
                case EMPTY_BODY_INDEX:
                    break;
                case EMPTY_BODY_WARN:
                    cout << "no text extracted from document body, "
                            "but indexing metadata anyway" << endl;
                    break;
                case EMPTY_BODY_SKIP:
                    skip(urlterm, context,
                         "no text extracted from document body",
                         d.get_size(), d.get_mtime());
                    return;
            }
        }

        // Produce a sample.
        if (sample.empty()) {
            sample = generate_sample(dump, sample_size, "...", " ...");
        } else {
            sample = generate_sample(sample, sample_size, "...", " ...");
        }

        // Put the data in the document.
        if (record.empty()) {
            record = "url=";
        } else {
            record += "\nurl=";
        }
        record += url;
        record += "\nsample=";
        record += sample;
        if (!title.empty()) {
            record += "\ncaption=";
            record += generate_sample(title, title_size, "...", " ...");
        }
        if (!author.empty()) {
            record += "\nauthor=";
            record += author;
        }
        if (!to.empty()) {
            record += "\nto=";
            record += to;
        }
        if (!cc.empty()) {
            record += "\ncc=";
            record += cc;
        }
        if (!bcc.empty()) {
            record += "\nbcc=";
            record += bcc;
        }
        if (!message_id.empty()) {
            record += "\nmsgid=";
            record += message_id;
        }
        record += "\ntype=";
        record += mimetype;
        time_t mtime = d.get_mtime();
        if (mtime != static_cast<time_t>(-1)) {
            record += "\nmodtime=";
            record += str(mtime);
        }
        if (created != static_cast<time_t>(-1)) {
            record += "\ncreated=";
            record += str(created);
        }
        if (pages >= 0) {
            record += "\npages=";
            record += str(pages);
        }
        off_t size = d.get_size();
        record += "\nsize=";
        record += str(size);
        newdocument.set_data(record);

        // Index the title, document text, keywords and topic.
        indexer.set_document(newdocument);
        if (!title.empty()) {
            indexer.index_text(title, 5, "S");
            indexer.increase_termpos(100);
        }
        if (!dump.empty()) {
            indexer.index_text(dump);
        }
        if (!keywords.empty()) {
            indexer.increase_termpos(100);
            indexer.index_text(keywords);
        }
        if (!topic.empty()) {
            indexer.increase_termpos(100);
            indexer.index_text(topic, 1, "B");
        }
        // Index the leafname of the file.
        {
            indexer.increase_termpos(100);
            string leaf = d.leafname();
            string::size_type dot = leaf.find_last_of('.');
            if (dot != string::npos && leaf.size() - dot - 1 <= max_ext_len)
                leaf.resize(dot);
            indexer.index_text(leaf, 1, "F");

            // Also index with underscores and ampersands replaced by spaces.
            bool modified = false;
            string::size_type rep = 0;
            while ((rep = leaf.find_first_of("_&", rep)) != string::npos) {
                leaf[rep++] = ' ';
                modified = true;
            }
            if (modified) {
                indexer.increase_termpos(100);
                indexer.index_text(leaf, 1, "F");
            }
        }

        if (!author.empty()) {
            indexer.increase_termpos(100);
            indexer.index_text(author, 1, "A");
        }

        if (!to.empty()) {
            indexer.increase_termpos(100);
            indexer.index_text(to, 1, "XTO");
        }

        if (!cc.empty()) {
            indexer.increase_termpos(100);
            indexer.index_text(cc, 1, "XCC");
        }

        if (!bcc.empty()) {
            indexer.increase_termpos(100);
            indexer.index_text(bcc, 1, "XBCC");
        }

        if (!message_id.empty()) {
            newdocument.add_boolean_term("XMID:" + message_id);
        }

        // mimeType:
        newdocument.add_boolean_term("T" + mimetype);

        newdocument.add_boolean_term(site_term);

        if (!host_term.empty())
            newdocument.add_boolean_term(host_term);

        if (date_terms) {
            struct tm* tm = localtime(&mtime);
            string date_term = "D";
            date_term += date_to_string(tm->tm_year + 1900,
                                        tm->tm_mon + 1,
                                        tm->tm_mday);
            newdocument.add_boolean_term(date_term); // Date (YYYYMMDD)
            date_term.resize(7);
            date_term[0] = 'M';
            newdocument.add_boolean_term(date_term); // Month (YYYYMM)
            date_term.resize(5);
            date_term[0] = 'Y';
            newdocument.add_boolean_term(date_term); // Year (YYYY)
        }
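
        // For example, an mtime falling on 2023-11-05 (local time) adds the
        // boolean terms "D20231105", "M202311" and "Y2023".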

        newdocument.add_boolean_term(urlterm); // Url

        // Add mtime as a value to allow "sort by date".
        newdocument.add_value(VALUE_LASTMOD,
                              int_to_binary_string(uint32_t(mtime)));
        if (use_ctime) {
            // Add ctime as a value to track modifications.
            time_t ctime = d.get_ctime();
            newdocument.add_value(VALUE_CTIME,
                                  int_to_binary_string(uint32_t(ctime)));
        }

        // Add MD5 as a value to allow duplicate documents to be collapsed
        // together.
        newdocument.add_value(VALUE_MD5, md5);

        // Add the file size as a value to allow "sort by size" and size
        // ranges.
        newdocument.add_value(VALUE_SIZE,
                              Xapian::sortable_serialise(size));

        if (created != static_cast<time_t>(-1)) {
            // Add created time as a value to allow "sort by created date".
            newdocument.add_value(VALUE_CREATED,
                                  int_to_binary_string(uint32_t(created)));
        }

        bool inc_tag_added = false;
        if (d.is_other_readable()) {
            inc_tag_added = true;
            newdocument.add_boolean_term("I*");
        } else if (d.is_group_readable()) {
            const char* group = d.get_group();
            if (group) {
                newdocument.add_boolean_term(string("I#") + group);
            }
        }
        const char* owner = d.get_owner();
        if (owner) {
            newdocument.add_boolean_term(string("O") + owner);
            if (!inc_tag_added && d.is_owner_readable())
                newdocument.add_boolean_term(string("I@") + owner);
        }

        string ext_term("E");
        for (string::const_iterator i = ext.begin(); i != ext.end(); ++i) {
            char ch = *i;
            if (ch >= 'A' && ch <= 'Z')
                ch |= 32;
            ext_term += ch;
        }
        newdocument.add_boolean_term(ext_term);
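        // e.g. a file extension of "PDF" adds the boolean term "Epdf".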

        index_add_document(urlterm, last_altered, did, newdocument);
    } catch (const ReadError&) {
        skip(urlterm, context, string("can't read file: ") + strerror(errno),
             d.get_size(), d.get_mtime());
    } catch (const NoSuchFilter&) {
        string filter_entry;
        if (cmd_it != commands.end()) {
            filter_entry = cmd_it->first;
        } else {
            filter_entry = mimetype;
        }
        string m = "Filter for \"";
        m += filter_entry;
        m += "\" not installed";
        skip(urlterm, context, m, d.get_size(), d.get_mtime());
        commands[filter_entry] = Filter();
    } catch (const FileNotFound&) {
        skip(urlterm, context, "File removed during indexing",
             d.get_size(), d.get_mtime(),
             SKIP_VERBOSE_ONLY | SKIP_SHOW_FILENAME);
    } catch (const std::string& error) {
        skip(urlterm, context, error, d.get_size(), d.get_mtime());
    } catch (const std::bad_alloc&) {
        // Attempt to flag the file as failed and commit changes, though that
        // might fail too if we're low on memory rather than being asked to
        // allocate a ludicrous amount.
        skip(urlterm, context, "Out of memory trying to extract text from file",
             d.get_size(), d.get_mtime(),
             SKIP_SHOW_FILENAME);
        throw CommitAndExit("Caught std::bad_alloc", "");
    }
}

void
index_handle_deletion()
{
    if (updated.empty() || old_docs_not_seen == 0) return;

    if (verbose) {
        cout << "Deleting " << old_docs_not_seen
             << " old documents which weren't found" << endl;
    }
    Xapian::PostingIterator alldocs = db.postlist_begin(string());
    Xapian::docid did = *alldocs;
    while (did < updated.size()) {
        if (!updated[did]) {
            alldocs.skip_to(did);
            if (alldocs == db.postlist_end(string()))
                break;
            if (*alldocs != did) {
                // Document #did didn't exist before we started.
                did = *alldocs;
                continue;
            }
            db.delete_document(did);
            if (--old_docs_not_seen == 0)
                break;
        }
        ++did;
    }
}

void
index_commit()
{
    db.commit();
}

void
index_done()
{
    // If we created a temporary directory then delete it.
    remove_tmpdir();
}