Fix tg_termpos1 for 64-bit termpos
[xapian.git] / xapian-applications / omega / index_file.cc
blob0183d008612d9be56ebb14a4d9d188b1ee244efa
1 /** @file
2 * @brief Handle indexing a document from a file
3 */
4 /* Copyright 1999,2000,2001 BrightStation PLC
5 * Copyright 2001,2005 James Aylett
6 * Copyright 2001,2002 Ananova Ltd
7 * Copyright 2002-2023 Olly Betts
8 * Copyright 2009 Frank J Bruzzaniti
9 * Copyright 2012 Mihai Bivol
10 * Copyright 2019 Bruno Baruffaldi
11 * Copyright 2020 Parth Kapadia
13 * This program is free software; you can redistribute it and/or
14 * modify it under the terms of the GNU General Public License as
15 * published by the Free Software Foundation; either version 2 of the
16 * License, or (at your option) any later version.
18 * This program is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU General Public License for more details.
23 * You should have received a copy of the GNU General Public License
24 * along with this program; if not, write to the Free Software
25 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
26 * USA
29 #include <config.h>
31 #include "index_file.h"
33 #include <algorithm>
34 #include <iostream>
35 #include <limits>
36 #include <string>
37 #include <map>
38 #include <vector>
40 #include <sys/types.h>
41 #include "safeunistd.h"
42 #include <cassert>
43 #include <cerrno>
44 #include <cstdio>
45 #include <cstdlib>
46 #include <cstring>
47 #include "safefcntl.h"
48 #include <ctime>
50 #include <xapian.h>
52 #include "abiwordparser.h"
53 #include "append_filename_arg.h"
54 #include "atomparser.h"
55 #include "datetime.h"
56 #include "diritor.h"
57 #include "failed.h"
58 #include "hashterm.h"
59 #include "htmlparser.h"
60 #include "md5wrap.h"
61 #include "mimemap.h"
62 #include "msxmlparser.h"
63 #include "opendocmetaparser.h"
64 #include "opendocparser.h"
65 #include "pkglibbindir.h"
66 #include "runfilter.h"
67 #include "sample.h"
68 #include "str.h"
69 #include "stringutils.h"
70 #include "svgparser.h"
71 #include "tmpdir.h"
72 #include "utf8convert.h"
73 #include "values.h"
74 #include "worker.h"
75 #include "xlsxparser.h"
76 #include "xpsparser.h"
78 using namespace std;
// Database being written to; opened (or created/overwritten) by index_init().
80 static Xapian::WritableDatabase db;
// Term generator used for indexing text; index_init() sets its stemmer and,
// when spelling is enabled, attaches it to db.
81 static Xapian::TermGenerator indexer;
// Number of documents which were in the database when it was opened and
// haven't been marked as seen yet (decremented by mark_as_seen()).
83 static Xapian::doccount old_docs_not_seen;
// Highest document id in the database when it was opened; ids up to this
// value are reported as "updated" rather than "added".
84 static Xapian::docid old_lastdocid;
// Per-docid "seen during this run" flags; only sized by index_init() when
// delete_removed_documents is set.
85 static vector<bool> updated;
// The following flags are copies of the corresponding index_init() arguments
// (retry_failed may also be forced on by index_init()).
87 static bool verbose;
88 static bool retry_failed;
89 static bool use_ctime;
90 static dup_action_type dup_action;
91 static bool ignore_exclusions;
92 static bool description_as_sample;
93 static bool date_terms;
// Upper bound of the ctime/mtime value slot in the pre-existing database; a
// file altered after this can't match an already-indexed version.
95 static time_t last_altered_max;
// Size limits copied from the index_init() arguments.
96 static size_t sample_size;
97 static size_t title_size;
98 static size_t max_ext_len;
// Copy of the empty_body_ argument to index_init().
100 static empty_body_type empty_body;
// Prefix stripped from filenames when reporting them.
102 static string root;
103 static string site_term, host_term;
// Record of files which couldn't be indexed (initialised from db).
105 static Failed failed;
// Maps a MIME type (or wildcard such as "*/*" or "*") to the Filter used to
// extract text from it.  Not static, so visible to other translation units.
107 map<string, Filter> commands;
109 static void
110 mark_as_seen(Xapian::docid did)
112 if (usual(did < updated.size() && !updated[did])) {
113 updated[did] = true;
114 --old_docs_not_seen;
118 void
119 skip(const string& urlterm, const string& context, const string& msg,
120 off_t size, time_t last_mod, unsigned flags)
122 failed.add(urlterm, last_mod, size);
124 if (!verbose || (flags & SKIP_SHOW_FILENAME)) {
125 if (!verbose && (flags & SKIP_VERBOSE_ONLY)) return;
126 cout << context << ": ";
129 cout << "Skipping - " << msg << endl;
132 static void
133 skip_cmd_failed(const string& urlterm, const string& context, const string& cmd,
134 off_t size, time_t last_mod)
136 skip(urlterm, context, "\"" + cmd + "\" failed", size, last_mod);
139 static void
140 skip_meta_tag(const string& urlterm, const string& context,
141 off_t size, time_t last_mod)
143 skip(urlterm, context, "indexing disallowed by meta tag", size, last_mod);
146 static void
147 skip_unknown_mimetype(const string& urlterm, const string& context,
148 const string& mimetype, off_t size, time_t last_mod)
150 skip(urlterm, context, "unknown MIME type '" + mimetype + "'",
151 size, last_mod);
/** Register the worker-process based extractors which were enabled at build
 *  time.
 *
 *  Each HAVE_* section creates one Worker and registers it with
 *  index_library() for every MIME type listed in that section, in the order
 *  listed.
 */
void
index_add_default_libraries()
{
#if defined HAVE_POPPLER
    {
        Worker* worker = new Worker("omindex_poppler");
        index_library("application/pdf", worker);
    }
#endif
#if defined HAVE_LIBEBOOK
    {
        static const char* const types[] = {
            "application/vnd.palm",
            "application/x-fictionbook+xml",
            "application/x-zip-compressed-fb2",
            "application/x-sony-bbeb",
            "application/x-tcr-ebook",
            "application/x-qioo-ebook",
        };
        Worker* worker = new Worker("omindex_libebook");
        for (const char* type : types)
            index_library(type, worker);
    }
#endif
#if defined HAVE_LIBETONYEK
    {
        static const char* const types[] = {
            "application/vnd.apple.keynote",
            "application/vnd.apple.pages",
            "application/vnd.apple.numbers",
        };
        Worker* worker = new Worker("omindex_libetonyek");
        for (const char* type : types)
            index_library(type, worker);
    }
#endif
#if defined HAVE_LIBGEPUB
    {
        Worker* worker = new Worker("omindex_libgepub");
        index_library("application/epub+zip", worker);
    }
#endif
#if defined HAVE_TESSERACT
    {
        static const char* const types[] = {
            "image/gif",
            "image/jpeg",
            "image/png",
            "image/webp",
            "image/tiff",
            "image/x-portable-bitmap",
            "image/x-portable-graymap",
            "image/x-portable-anymap",
            "image/x-portable-pixmap",
        };
        Worker* worker = new Worker("omindex_tesseract");
        for (const char* type : types)
            index_library(type, worker);
    }
#endif
#if defined HAVE_GMIME
    {
        static const char* const types[] = {
            "message/rfc822",
            "message/news",
        };
        Worker* worker = new Worker("omindex_gmime");
        for (const char* type : types)
            index_library(type, worker);
    }
#endif
#if defined HAVE_LIBARCHIVE
    {
        static const char* const types[] = {
            // XPS.
            "application/oxps",
            "application/vnd.ms-xpsdocument",
            // OpenDocument.
            "application/vnd.oasis.opendocument.text",
            "application/vnd.oasis.opendocument.spreadsheet",
            "application/vnd.oasis.opendocument.presentation",
            "application/vnd.oasis.opendocument.graphics",
            "application/vnd.oasis.opendocument.chart",
            "application/vnd.oasis.opendocument.formula",
            "application/vnd.oasis.opendocument.database",
            "application/vnd.oasis.opendocument.image",
            "application/vnd.oasis.opendocument.text-master",
            "application/vnd.oasis.opendocument.text-template",
            "application/vnd.oasis.opendocument.spreadsheet-template",
            "application/vnd.oasis.opendocument.presentation-template",
            "application/vnd.oasis.opendocument.graphics-template",
            "application/vnd.oasis.opendocument.chart-template",
            "application/vnd.oasis.opendocument.formula-template",
            "application/vnd.oasis.opendocument.image-template",
            "application/vnd.oasis.opendocument.text-web",
            // Sun XML formats.
            "application/vnd.sun.xml.calc",
            "application/vnd.sun.xml.calc.template",
            "application/vnd.sun.xml.draw",
            "application/vnd.sun.xml.draw.template",
            "application/vnd.sun.xml.impress",
            "application/vnd.sun.xml.impress.template",
            "application/vnd.sun.xml.math",
            "application/vnd.sun.xml.writer",
            "application/vnd.sun.xml.writer.global",
            "application/vnd.sun.xml.writer.template",
            // Office Open XML.
            "application/vnd.openxmlformats-officedocument."
            "wordprocessingml.document",
            "application/vnd.openxmlformats-officedocument."
            "wordprocessingml.template",
            "application/vnd.openxmlformats-officedocument."
            "spreadsheetml.sheet",
            "application/vnd.openxmlformats-officedocument."
            "spreadsheetml.template",
            "application/vnd.openxmlformats-officedocument."
            "presentationml.presentation",
            "application/vnd.openxmlformats-officedocument."
            "presentationml.slideshow",
            "application/vnd.openxmlformats-officedocument."
            "presentationml.template",
        };
        Worker* worker = new Worker("omindex_libarchive");
        for (const char* type : types)
            index_library(type, worker);
    }
#endif
#if defined HAVE_LIBABW
    {
        static const char* const types[] = {
            "application/x-abiword",
            "application/x-abiword-compressed",
        };
        Worker* worker = new Worker("omindex_libabw");
        for (const char* type : types)
            index_library(type, worker);
    }
#endif
#if defined HAVE_LIBCDR
    {
        Worker* worker = new Worker("omindex_libcdr");
        index_library("image/x-coreldraw", worker);
    }
#endif
#if defined HAVE_LIBEXTRACTOR
    {
        static const char* const types[] = {
            "video/mpeg",
            "video/x-flv",
            "video/x-msvideo",
            "video/x-ms-asf",
            "video/quicktime",
            "video/ogg",
            "audio/flac",
            "audio/mpeg",
            "audio/ogg",
            "audio/x-wav",
            "audio/x-mod",
            "audio/x-s3m",
        };
        Worker* worker = new Worker("omindex_libextractor");
        for (const char* type : types)
            index_library(type, worker);
    }
#endif
#if defined HAVE_LIBMWAW
    {
        static const char* const types[] = {
            "application/clarisworks",
            "image/x-pict",
        };
        Worker* worker = new Worker("omindex_libmwaw");
        for (const char* type : types)
            index_library(type, worker);
    }
#endif
}
301 void
302 index_add_default_filters()
304 // Command needs to be run using /bin/sh.
305 auto USE_SHELL = Filter::USE_SHELL;
306 // Currently none of these commands needs USE_SHELL.
307 (void)USE_SHELL;
308 // Input should be piped to stdin.
309 auto PIPE_IN = Filter::PIPE_IN;
310 // Filename can be /dev/stdin (which must be seekable).
311 auto SEEK_DEV_STDIN = Filter::SEEK_DEV_STDIN;
312 // Filename can be /dev/stdin (which can be a pipe).
313 auto PIPE_DEV_STDIN = Filter::PIPE_DEV_STDIN;
314 index_command("application/msword",
315 Filter("antiword -mUTF-8.txt -", PIPE_IN));
316 index_command("application/vnd.ms-excel",
317 Filter("xls2csv -c' ' -q0 -dutf-8", PIPE_DEV_STDIN));
318 index_command("application/vnd.ms-powerpoint",
319 Filter("catppt -dutf-8", PIPE_DEV_STDIN));
320 // Looking at the source of wpd2html and wpd2text I think both output
321 // UTF-8, but it's hard to be sure without sample Unicode .wpd files
322 // as they don't seem to be at all well documented.
323 index_command("application/vnd.wordperfect",
324 Filter("wpd2text", SEEK_DEV_STDIN));
325 // wps2text produces UTF-8 output from the sample files I've tested.
326 index_command("application/vnd.ms-works",
327 Filter("wps2text", SEEK_DEV_STDIN));
328 // Output is UTF-8 according to "man djvutxt". Generally this seems to
329 // be true, though some examples from djvu.org generate isolated byte
330 // 0x95 in a context which suggests it might be intended to be a bullet
331 // (as it is in CP1252).
332 index_command("image/vnd.djvu", Filter("djvutxt -", PIPE_IN));
333 index_command("text/markdown",
334 Filter("markdown", "text/html", PIPE_IN));
335 // The --text option unhelpfully converts all non-ASCII characters to "?"
336 // so we use --html instead, which produces HTML entities. The --nopict
337 // option suppresses exporting picture files as pictNNNN.wmf in the current
338 // directory. Note that this option was ignored in some older versions,
339 // but it was fixed in unrtf 0.20.4.
340 index_command("application/rtf",
341 Filter("unrtf --nopict --html 2>/dev/null", "text/html",
342 PIPE_IN));
343 index_command("text/rtf",
344 Filter("unrtf --nopict --html 2>/dev/null", "text/html",
345 PIPE_IN));
346 index_command("text/x-rst",
347 Filter("rst2html", "text/html", PIPE_IN));
348 index_command("application/x-mspublisher",
349 Filter("pub2xhtml", "text/html", SEEK_DEV_STDIN));
350 index_command("application/vnd.ms-outlook",
351 Filter(get_pkglibbindir() + "/outlookmsg2html",
352 "text/html", SEEK_DEV_STDIN));
353 index_command("application/vnd.ms-visio.drawing",
354 Filter("vsd2xhtml", "image/svg+xml", SEEK_DEV_STDIN));
355 index_command("application/vnd.ms-visio.stencil",
356 Filter("vsd2xhtml", "image/svg+xml", SEEK_DEV_STDIN));
357 index_command("application/vnd.ms-visio.template",
358 Filter("vsd2xhtml", "image/svg+xml", SEEK_DEV_STDIN));
359 index_command("application/vnd.visio",
360 Filter("vsd2xhtml", "image/svg+xml", SEEK_DEV_STDIN));
361 // pod2text's output character set doesn't seem to be documented, but from
362 // inspecting the source it looks like it's probably iso-8859-1. We need
363 // to pass "--errors=stderr" or else minor POD formatting errors cause a
364 // file not to be indexed.
365 index_command("text/x-perl",
366 Filter("pod2text --errors=stderr",
367 "text/plain", "iso-8859-1", PIPE_IN));
368 // FIXME: -e0 means "UTF-8", but that results in "fi", "ff", "ffi", etc
369 // appearing as single ligatures. For European languages, it's actually
370 // better to use -e2 (ISO-8859-1) and then convert, so let's do that for
371 // now until we handle Unicode "compatibility decompositions".
372 index_command("application/x-dvi",
373 Filter("catdvi -e2 -s", "text/plain", "iso-8859-1", PIPE_IN));
374 // Simplistic - ought to look in index.rdf files for filename and character
375 // set.
376 index_command("application/x-maff",
377 Filter("unzip -p %f '*/*.*htm*'", "text/html", "iso-8859-1",
378 SEEK_DEV_STDIN));
379 index_command("application/x-mimearchive",
380 Filter(get_pkglibbindir() + "/mhtml2html", "text/html",
381 PIPE_DEV_STDIN));
382 index_command("message/news",
383 Filter(get_pkglibbindir() + "/rfc822tohtml", "text/html",
384 PIPE_DEV_STDIN));
385 index_command("message/rfc822",
386 Filter(get_pkglibbindir() + "/rfc822tohtml", "text/html",
387 PIPE_DEV_STDIN));
388 index_command("text/vcard",
389 Filter(get_pkglibbindir() + "/vcard2text", PIPE_DEV_STDIN));
390 index_command("application/vnd.apple.keynote",
391 Filter("key2text", SEEK_DEV_STDIN));
392 index_command("application/vnd.apple.numbers",
393 Filter("numbers2text", SEEK_DEV_STDIN));
394 index_command("application/vnd.apple.pages",
395 Filter("pages2text", SEEK_DEV_STDIN));
398 void
399 index_init(const string& dbpath, const Xapian::Stem& stemmer,
400 const string& root_, const string& site_term_,
401 const string& host_term_,
402 empty_body_type empty_body_, dup_action_type dup_action_,
403 size_t sample_size_, size_t title_size_, size_t max_ext_len_,
404 bool overwrite, bool retry_failed_,
405 bool delete_removed_documents, bool verbose_, bool use_ctime_,
406 bool spelling, bool ignore_exclusions_, bool description_as_sample_,
407 bool date_terms_)
409 root = root_;
410 site_term = site_term_;
411 host_term = host_term_;
412 empty_body = empty_body_;
413 dup_action = dup_action_;
414 sample_size = sample_size_;
415 title_size = title_size_;
416 max_ext_len = max_ext_len_;
417 verbose = verbose_;
418 use_ctime = use_ctime_;
419 ignore_exclusions = ignore_exclusions_;
420 description_as_sample = description_as_sample_;
421 date_terms = date_terms_;
423 if (!overwrite) {
424 db = Xapian::WritableDatabase(dbpath, Xapian::DB_CREATE_OR_OPEN);
425 old_docs_not_seen = db.get_doccount();
426 // Handle an initially empty database exactly the same way as when
427 // overwrite is true.
428 if (old_docs_not_seen != 0) {
429 old_lastdocid = db.get_lastdocid();
430 if (delete_removed_documents) {
431 // + 1 so that old_lastdocid is a valid subscript.
432 updated.resize(old_lastdocid + 1);
434 Xapian::valueno slot = use_ctime ? VALUE_CTIME : VALUE_LASTMOD;
435 string ubound = db.get_value_upper_bound(slot);
436 if (!ubound.empty())
437 last_altered_max = binary_string_to_int(ubound);
439 } else {
440 db = Xapian::WritableDatabase(dbpath, Xapian::DB_CREATE_OR_OVERWRITE);
443 if (spelling) {
444 indexer.set_database(db);
445 indexer.set_flags(indexer.FLAG_SPELLING);
447 indexer.set_stemmer(stemmer);
449 runfilter_init();
451 failed.init(db);
453 if (overwrite) {
454 // There are no failures to retry, so setting this flag doesn't
455 // change the outcome, but does mean we avoid the overhead of
456 // checking for a previous failure.
457 retry_failed = true;
458 } else if (retry_failed_) {
459 failed.clear();
460 retry_failed = true;
461 } else {
462 // If there are no existing failures, setting this flag doesn't
463 // change the outcome, but does mean we avoid the overhead of
464 // checking for a previous failure.
465 retry_failed = failed.empty();
/** Extract the value from one line of pdfinfo output if it's for @a field.
 *
 *  @param p      Start of the line.
 *  @param end    End of the line (not including any newline).
 *  @param out    Set to the value if the line matches and the value is
 *                non-empty; otherwise left untouched.
 *  @param field  Field name including the trailing ':'.
 *  @param len    Length of @a field in bytes.
 *
 *  Spaces after the field name are skipped and one trailing carriage return
 *  is dropped from the value.
 */
static void
parse_pdfinfo_field(const char* p, const char* end, string& out,
                    const char* field, size_t len)
{
    // The line has to be longer than the field name and start with it.
    if (size_t(end - p) <= len || memcmp(p, field, len) != 0)
        return;

    // Skip the padding between the field name and its value.
    const char* value = p + len;
    while (value != end && *value == ' ')
        ++value;
    if (value == end)
        return;

    // Ignore a trailing CR, and don't store a value which is then empty.
    if (end[-1] == '\r') {
        --end;
        if (value == end)
            return;
    }
    out.assign(value, end - value);
}

// FIELD must be a string literal; the ':' separator is appended to it and
// the length of the combined literal is passed along.
#define PARSE_PDFINFO_FIELD(P, END, OUT, FIELD) \
    parse_pdfinfo_field((P), (END), (OUT), FIELD ":", sizeof(FIELD ":") - 1)

/** Pick the metadata fields of interest out of pdfinfo's output.
 *
 *  @param pdfinfo   Output from pdfinfo, one "Name: value" entry per line.
 *  @param author    Set from an "Author:" line.
 *  @param title     Set from a "Title:" line.
 *  @param keywords  Set from a "Keywords:" line.
 *  @param topic     Set from a "Subject:" line.
 *  @param pages     Set from a "Pages:" line (converted with atoi()).
 *
 *  Outputs for which there's no matching line with a non-empty value are
 *  left unchanged.
 */
static void
parse_pdf_metainfo(const string& pdfinfo, string& author, string& title,
                   string& keywords, string& topic, int& pages)
{
    const char* line = pdfinfo.data();
    const char* const end = line + pdfinfo.size();
    while (line != end) {
        // Find where this line ends and where the next one starts.
        const void* nl = memchr(line, '\n', end - line);
        const char* eol = nl ? static_cast<const char*>(nl) : end;
        const char* next = nl ? eol + 1 : end;

        // Dispatch on the first character to avoid pointless comparisons.
        switch (*line) {
            case 'A':
                PARSE_PDFINFO_FIELD(line, eol, author, "Author");
                break;
            case 'K':
                PARSE_PDFINFO_FIELD(line, eol, keywords, "Keywords");
                break;
            case 'P': {
                string count;
                PARSE_PDFINFO_FIELD(line, eol, count, "Pages");
                if (!count.empty())
                    pages = atoi(count.c_str());
                break;
            }
            case 'S':
                PARSE_PDFINFO_FIELD(line, eol, topic, "Subject");
                break;
            case 'T':
                PARSE_PDFINFO_FIELD(line, eol, title, "Title");
                break;
        }

        line = next;
    }
}
525 static void
526 get_pdf_metainfo(int fd, string& author, string& title,
527 string& keywords, string& topic, int& pages)
529 try {
530 string pdfinfo;
531 run_filter(fd, "pdfinfo -enc UTF-8 -", false, &pdfinfo);
532 parse_pdf_metainfo(pdfinfo, author, title, keywords, topic, pages);
533 } catch (const ReadError&) {
534 // It's probably best to index the document even if pdfinfo fails.
538 static void
539 get_pdf_metainfo(const string& file, string& author, string& title,
540 string& keywords, string& topic, int& pages)
542 try {
543 string cmd = "pdfinfo -enc UTF-8";
544 append_filename_argument(cmd, file);
545 parse_pdf_metainfo(stdout_to_string(cmd, false),
546 author, title, keywords, topic, pages);
547 } catch (const ReadError&) {
548 // It's probably best to index the document even if pdfinfo fails.
552 static void
553 generate_sample_from_csv(const string& csv_data, string& sample)
555 // Add 3 to allow for a 4 byte utf-8 sequence being appended when
556 // output is sample_size - 1 bytes long. Use csv_data.size() if smaller
557 // since the user might reasonably set sample_size really high.
558 sample.reserve(min(sample_size + 3, csv_data.size()));
559 size_t last_word_end = 0;
560 bool in_space = true;
561 bool in_quotes = false;
562 for (Xapian::Utf8Iterator i(csv_data); i != Xapian::Utf8Iterator(); ++i) {
563 unsigned ch = *i;
565 if (!in_quotes) {
566 // If not already in double quotes, '"' starts quoting and
567 // ',' starts a new field.
568 if (ch == '"') {
569 in_quotes = true;
570 continue;
572 if (ch == ',')
573 ch = ' ';
574 } else if (ch == '"') {
575 // In double quotes, '"' either ends double quotes, or
576 // if followed by another '"', means a literal '"'.
577 if (++i == Xapian::Utf8Iterator())
578 break;
579 ch = *i;
580 if (ch != '"') {
581 in_quotes = false;
582 if (ch == ',')
583 ch = ' ';
587 if (ch <= ' ' || ch == 0xa0) {
588 // FIXME: if all the whitespace characters between two
589 // words are 0xa0 (non-breaking space) then perhaps we
590 // should output 0xa0.
591 if (in_space)
592 continue;
593 last_word_end = sample.size();
594 sample += ' ';
595 in_space = true;
596 } else {
597 Xapian::Unicode::append_utf8(sample, ch);
598 in_space = false;
601 if (sample.size() >= sample_size) {
602 // Need to truncate sample.
603 if (last_word_end <= sample_size / 2) {
604 // Monster word! We'll have to just split it.
605 sample.replace(sample_size - 3, string::npos, "...", 3);
606 } else {
607 sample.replace(last_word_end, string::npos, " ...", 4);
609 break;
614 static bool
615 index_check_existing(const string& urlterm, time_t last_altered,
616 Xapian::docid& did)
618 switch (dup_action) {
619 case DUP_SKIP: {
620 Xapian::PostingIterator p = db.postlist_begin(urlterm);
621 if (p != db.postlist_end(urlterm)) {
622 if (verbose)
623 cout << "already indexed, not updating" << endl;
624 did = *p;
625 mark_as_seen(did);
626 return true;
628 break;
630 case DUP_CHECK_LAZILY: {
631 // If last_altered > last_altered_max, we know for sure that the
632 // file is new or updated.
633 if (last_altered > last_altered_max) {
634 return false;
637 Xapian::PostingIterator p = db.postlist_begin(urlterm);
638 if (p != db.postlist_end(urlterm)) {
639 did = *p;
640 Xapian::Document doc = db.get_document(did);
641 Xapian::valueno slot = use_ctime ? VALUE_CTIME : VALUE_LASTMOD;
642 string value = doc.get_value(slot);
643 time_t old_last_altered = binary_string_to_int(value);
644 if (last_altered <= old_last_altered) {
645 if (verbose)
646 cout << "already indexed" << endl;
647 // The docid should be in updated - the only valid
648 // exception is if the URL was long and hashed to the
649 // same URL as an existing document indexed in the same
650 // batch.
651 mark_as_seen(did);
652 return true;
655 break;
658 return false;
661 void
662 index_remove_failed_entry(const string& urlterm)
664 failed.del(urlterm);
667 void
668 index_add_document(const string& urlterm, time_t last_altered,
669 Xapian::docid did, const Xapian::Document& doc)
671 if (dup_action != DUP_SKIP) {
672 // If this document has already been indexed, update the existing
673 // entry.
674 if (did) {
675 // We already found out the document id above.
676 db.replace_document(did, doc);
677 } else if (last_altered <= last_altered_max) {
678 // We checked for the UID term and didn't find it.
679 did = db.add_document(doc);
680 } else {
681 did = db.replace_document(urlterm, doc);
683 mark_as_seen(did);
684 if (verbose) {
685 if (did <= old_lastdocid) {
686 cout << "updated" << endl;
687 } else {
688 cout << "added" << endl;
691 } else {
692 // If this were a duplicate, we'd have skipped it above.
693 db.add_document(doc);
694 if (verbose)
695 cout << "added" << endl;
699 void
700 index_mimetype(const string& file, const string& urlterm, const string& url,
701 const string& ext,
702 string mimetype,
703 DirectoryIterator& d,
704 string pathterm,
705 string record)
707 string context(file, root.size(), string::npos);
709 // FIXME: We could be cleverer here and check mtime too when use_ctime is
710 // set - if the ctime has changed but the mtime is unchanged, we can just
711 // update the existing Document and avoid having to re-extract text, etc.
712 time_t last_altered = use_ctime ? d.get_ctime() : d.get_mtime();
714 Xapian::docid did = 0;
715 if (index_check_existing(urlterm, last_altered, did))
716 return;
718 if (!retry_failed) {
719 // We only store and check the mtime (last modified) - a change to the
720 // metadata won't generally cause a previous failure to now work
721 // (FIXME: except permissions).
722 time_t failed_last_mod;
723 off_t failed_size;
724 if (failed.contains(urlterm, failed_last_mod, failed_size)) {
725 if (d.get_mtime() <= failed_last_mod &&
726 d.get_size() == failed_size) {
727 if (verbose)
728 cout << "failed to extract text on earlier run" << endl;
729 return;
731 // The file has changed, so remove the entry for it. If it fails
732 // again on this attempt, we'll add a new one.
733 failed.del(urlterm);
737 // If we didn't get the mime type from the extension, call libmagic to get
738 // it.
739 if (mimetype.empty()) {
740 mimetype = d.get_magic_mimetype();
741 if (mimetype.empty()) {
742 skip(urlterm, file.substr(root.size()),
743 "Unknown extension and unrecognised format",
744 d.get_size(), d.get_mtime(), SKIP_SHOW_FILENAME);
745 return;
749 if (verbose)
750 cout << "Indexing \"" << file.substr(root.size()) << "\" as "
751 << mimetype << " ... " << flush;
753 // Use `file` as the basis, as we don't want URL encoding in these terms,
754 // but need to switch over the initial part so we get `/~olly/foo/bar` not
755 // `/home/olly/public_html/foo/bar`.
756 Xapian::Document newdocument;
757 size_t j;
758 while ((j = pathterm.rfind('/')) > 1 && j != string::npos) {
759 pathterm.resize(j);
760 if (pathterm.length() > MAX_SAFE_TERM_LENGTH) {
761 string term_hash = hash_long_term(pathterm, MAX_SAFE_TERM_LENGTH);
762 newdocument.add_boolean_term(term_hash);
763 } else {
764 newdocument.add_boolean_term(pathterm);
768 string author, title, sample, keywords, topic, dump;
769 string to, cc, bcc, message_id;
770 string md5;
771 time_t created = time_t(-1);
772 int pages = -1;
774 map<string, Filter>::const_iterator cmd_it = commands.find(mimetype);
775 if (cmd_it == commands.end()) {
776 size_t slash = mimetype.find('/');
777 if (slash != string::npos) {
778 string wildtype(mimetype, 0, slash + 2);
779 wildtype[slash + 1] = '*';
780 cmd_it = commands.find(wildtype);
781 if (cmd_it == commands.end()) {
782 cmd_it = commands.find("*/*");
785 if (cmd_it == commands.end()) {
786 cmd_it = commands.find("*");
789 try {
790 if (cmd_it != commands.end() && cmd_it->second.worker) {
791 // Use a worker process to extract the content.
792 Worker* wrk = cmd_it->second.worker;
793 int r = wrk->extract(file, mimetype, dump, title, keywords, author,
794 to, cc, bcc, message_id, pages, created);
795 if (r != 0) {
796 string msg = wrk->get_error();
797 assert(!msg.empty());
798 skip(urlterm, context, msg, d.get_size(), d.get_mtime());
799 if (r < 0) {
800 // Hard failure - don't try this filter again for this run.
801 string filter_entry;
802 if (cmd_it != commands.end()) {
803 filter_entry = cmd_it->first;
804 } else {
805 filter_entry = mimetype;
807 commands[filter_entry] = Filter();
809 return;
811 } else if (cmd_it != commands.end()) {
812 // Easy "run a command and read text or HTML from stdout or a
813 // temporary file" cases.
814 auto& filter = cmd_it->second;
815 string cmd = filter.cmd;
816 if (cmd.empty()) {
817 skip(urlterm, context, "required filter not installed",
818 d.get_size(), d.get_mtime(), SKIP_VERBOSE_ONLY);
819 return;
821 if (cmd == "false") {
822 // Allow setting 'false' as a filter to mean that a MIME type
823 // should be quietly ignored.
824 string m = "ignoring MIME type '";
825 m += cmd_it->first;
826 m += "'";
827 skip(urlterm, context, m, d.get_size(), d.get_mtime(),
828 SKIP_VERBOSE_ONLY);
829 return;
831 bool use_shell = filter.use_shell();
832 bool input_on_stdin = filter.input_on_stdin();
833 bool substituted = false;
834 string tmpout;
835 size_t pcent = 0;
836 while (true) {
837 pcent = cmd.find('%', pcent);
838 if (pcent >= cmd.size() - 1)
839 break;
840 switch (cmd[pcent + 1]) {
841 case '%': // %% -> %.
842 cmd.erase(++pcent, 1);
843 break;
844 case 'f': { // %f -> escaped filename.
845 substituted = true;
846 if (filter.dev_stdin()) {
847 cmd.replace(pcent, 2, "/dev/stdin",
848 CONST_STRLEN("/dev/stdin"));
849 break;
851 string tail(cmd, pcent + 2);
852 cmd.resize(pcent);
853 // Suppress the space append_filename_argument()
854 // usually adds before the argument - the command
855 // string either includes one, or won't expect one
856 // (e.g. --input=%f).
857 append_filename_argument(cmd, file, false);
858 pcent = cmd.size();
859 cmd += tail;
860 break;
862 case 't': { // %t -> temporary output file.
863 if (tmpout.empty()) {
864 // Use a temporary file with a suitable extension
865 // in case the command cares, and for more helpful
866 // error messages from the command.
867 if (filter.output_type == "text/html") {
868 tmpout = get_tmpfile("tmp.html");
869 } else if (filter.output_type == "image/svg+xml") {
870 tmpout = get_tmpfile("tmp.svg");
871 } else {
872 tmpout = get_tmpfile("tmp.txt");
875 substituted = true;
876 string tail(cmd, pcent + 2);
877 cmd.resize(pcent);
878 // Suppress the space append_filename_argument()
879 // usually adds before the argument - the command
880 // string either includes one, or won't expect one
881 // (e.g. --output=%t).
882 append_filename_argument(cmd, tmpout, false);
883 pcent = cmd.size();
884 cmd += tail;
885 break;
887 default:
888 // Leave anything else alone for now.
889 pcent += 2;
890 break;
893 if (!substituted && cmd != "true") {
894 if (input_on_stdin) {
895 if (filter.dev_stdin()) {
896 cmd += " /dev/stdin";
898 } else {
899 // If no %f, append the filename to the command.
900 append_filename_argument(cmd, file);
903 try {
904 if (!tmpout.empty()) {
905 // Output in temporary file.
906 if (input_on_stdin) {
907 run_filter(d.get_fd(), cmd, use_shell);
908 } else {
909 run_filter(cmd, use_shell);
911 if (!load_file(tmpout, dump, NOCACHE)) {
912 throw ReadError("Couldn't read output file");
914 unlink(tmpout.c_str());
915 } else if (cmd == "true") {
916 // Ignore the file's contents, just index metadata from the
917 // filing system.
918 } else {
919 // Output on stdout.
920 if (input_on_stdin) {
921 run_filter(d.get_fd(), cmd, use_shell, &dump);
922 } else {
923 run_filter(cmd, use_shell, &dump);
926 const string& charset = filter.output_charset;
927 if (filter.output_type == "text/html") {
928 HtmlParser p;
929 p.ignore_metarobots();
930 p.description_as_sample = description_as_sample;
931 try {
932 p.parse(dump, charset, false);
933 } catch (const string& newcharset) {
934 p.reset();
935 p.ignore_metarobots();
936 p.description_as_sample = description_as_sample;
937 p.parse(dump, newcharset, true);
938 } catch (const ReadError&) {
939 skip_cmd_failed(urlterm, context, cmd,
940 d.get_size(), d.get_mtime());
941 return;
943 dump = p.dump;
944 title = p.title;
945 keywords = p.keywords;
946 topic = p.topic;
947 sample = p.sample;
948 author = p.author;
949 created = p.created;
950 } else if (filter.output_type == "image/svg+xml") {
951 SvgParser svgparser;
952 svgparser.parse(dump);
953 dump = svgparser.dump;
954 title = svgparser.title;
955 keywords = svgparser.keywords;
956 // FIXME: topic = svgparser.topic;
957 author = svgparser.author;
958 } else if (!charset.empty()) {
959 convert_to_utf8(dump, charset);
961 } catch (const ReadError&) {
962 skip_cmd_failed(urlterm, context, cmd,
963 d.get_size(), d.get_mtime());
964 return;
966 } else if (mimetype == "text/html" || mimetype == "text/x-php") {
967 const string& text = d.file_to_string();
968 HtmlParser p;
969 if (ignore_exclusions) p.ignore_metarobots();
970 p.description_as_sample = description_as_sample;
971 try {
972 // Default HTML character set is latin 1, though not specifying
973 // one is deprecated these days.
974 p.parse(text, "iso-8859-1", false);
975 } catch (const string& newcharset) {
976 p.reset();
977 if (ignore_exclusions) p.ignore_metarobots();
978 p.description_as_sample = description_as_sample;
979 p.parse(text, newcharset, true);
981 if (!p.indexing_allowed) {
982 skip_meta_tag(urlterm, context,
983 d.get_size(), d.get_mtime());
984 return;
986 dump = p.dump;
987 title = p.title;
988 keywords = p.keywords;
989 topic = p.topic;
990 sample = p.sample;
991 author = p.author;
992 created = p.created;
993 md5_string(text, md5);
994 } else if (mimetype == "text/plain") {
995 // Currently we assume that text files are UTF-8 unless they have a
996 // byte-order mark.
997 dump = d.file_to_string();
998 md5_string(dump, md5);
1000 // Look for Byte-Order Mark (BOM).
1001 if (startswith(dump, "\xfe\xff") || startswith(dump, "\xff\xfe")) {
1002 // UTF-16 in big-endian/little-endian order - we just convert
1003 // it as "UTF-16" and let the conversion handle the BOM as that
1004 // way we avoid the copying overhead of erasing 2 bytes from
1005 // the start of dump.
1006 convert_to_utf8(dump, "UTF-16");
1007 } else if (startswith(dump, "\xef\xbb\xbf")) {
1008 // UTF-8 with stupid Windows not-the-byte-order mark.
1009 dump.erase(0, 3);
1010 } else {
1011 // FIXME: What charset is the file? Look at contents?
1013 } else if (mimetype == "application/pdf") {
1014 const char* cmd = "pdftotext -enc UTF-8 - -";
1015 try {
1016 run_filter(d.get_fd(), cmd, false, &dump);
1017 } catch (const ReadError&) {
1018 skip_cmd_failed(urlterm, context, cmd,
1019 d.get_size(), d.get_mtime());
1020 return;
1022 get_pdf_metainfo(d.get_fd(), author, title, keywords, topic, pages);
1023 } else if (mimetype == "application/postscript") {
1024 // There simply doesn't seem to be a Unicode capable PostScript to
1025 // text converter (e.g. pstotext always outputs ISO-8859-1). The
1026 // only solution seems to be to convert via PDF using ps2pdf and
1027 // then pdftotext. This gives plausible looking UTF-8 output for
1028 // some Chinese PostScript files I found using Google. It also has
1029 // the benefit of allowing us to extract meta information from
1030 // PostScript files.
1031 string tmpfile = get_tmpfile("tmp.pdf");
1032 if (tmpfile.empty()) {
1033 // FIXME: should this be fatal? Or disable indexing postscript?
1034 string msg = "Couldn't create temporary directory (";
1035 msg += strerror(errno);
1036 msg += ")";
1037 skip(urlterm, context, msg,
1038 d.get_size(), d.get_mtime());
1039 return;
1041 string cmd = "ps2pdf -";
1042 append_filename_argument(cmd, tmpfile);
1043 try {
1044 run_filter(d.get_fd(), cmd, false);
1045 cmd = "pdftotext -enc UTF-8";
1046 append_filename_argument(cmd, tmpfile);
1047 cmd += " -";
1048 run_filter(cmd, false, &dump);
1049 } catch (const ReadError&) {
1050 skip_cmd_failed(urlterm, context, cmd,
1051 d.get_size(), d.get_mtime());
1052 unlink(tmpfile.c_str());
1053 return;
1054 } catch (...) {
1055 unlink(tmpfile.c_str());
1056 throw;
1058 try {
1059 get_pdf_metainfo(tmpfile, author, title, keywords, topic,
1060 pages);
1061 } catch (...) {
1062 unlink(tmpfile.c_str());
1063 throw;
1065 unlink(tmpfile.c_str());
1066 } else if (startswith(mimetype, "application/vnd.sun.xml.") ||
1067 startswith(mimetype, "application/vnd.oasis.opendocument."))
1069 // Inspired by http://mjr.towers.org.uk/comp/sxw2text
1070 string cmd = "unzip -p";
1071 append_filename_argument(cmd, file);
1072 cmd += " content.xml ; unzip -p";
1073 append_filename_argument(cmd, file);
1074 cmd += " styles.xml";
1075 try {
1076 OpenDocParser parser;
1077 parser.parse(stdout_to_string(cmd, true));
1078 dump = parser.dump;
1079 } catch (const ReadError&) {
1080 skip_cmd_failed(urlterm, context, cmd,
1081 d.get_size(), d.get_mtime());
1082 return;
1085 cmd = "unzip -p";
1086 append_filename_argument(cmd, file);
1087 cmd += " meta.xml";
1088 try {
1089 OpenDocMetaParser metaparser;
1090 metaparser.parse(stdout_to_string(cmd, false));
1091 title = metaparser.title;
1092 keywords = metaparser.keywords;
1093 // FIXME: topic = metaparser.topic;
1094 sample = metaparser.sample;
1095 author = metaparser.author;
1096 pages = metaparser.pages;
1097 } catch (const ReadError&) {
1098 // It's probably best to index the document even if this fails.
1100 } else if (startswith(mimetype,
1101 "application/vnd.openxmlformats-officedocument."))
1103 const char* args = NULL;
1104 string tail(mimetype, 46);
1105 if (startswith(tail, "wordprocessingml.")) {
1106 // unzip returns exit code 11 if a file to extract wasn't found
1107 // which we want to ignore, because there may be no headers or
1108 // no footers.
1109 args = " word/document.xml"
1110 " 'word/header*.xml'"
1111 " 'word/footer*.xml'"
1112 " 2>/dev/null";
1113 } else if (startswith(tail, "spreadsheetml.")) {
1114 // Extract the shared string table first, so our parser can
1115 // grab those ready for parsing the sheets which will reference
1116 // the shared strings.
1117 string cmd = "unzip -p";
1118 append_filename_argument(cmd, file);
1119 cmd += " xl/styles.xml xl/workbook.xml xl/sharedStrings.xml ; "
1120 "unzip -p";
1121 append_filename_argument(cmd, file);
1122 cmd += " xl/worksheets/sheet\\*.xml";
1123 try {
1124 XlsxParser parser;
1125 parser.parse(stdout_to_string(cmd, true));
1126 dump = parser.dump;
1127 } catch (const ReadError&) {
1128 skip_cmd_failed(urlterm, context, cmd,
1129 d.get_size(), d.get_mtime());
1130 return;
1132 } else if (startswith(tail, "presentationml.")) {
1133 // unzip returns exit code 11 if a file to extract wasn't found
1134 // which we want to ignore, because there may be no notesSlides
1135 // or comments.
1136 args = " 'ppt/slides/slide*.xml'"
1137 " 'ppt/notesSlides/notesSlide*.xml'"
1138 " 'ppt/comments/comment*.xml'"
1139 " 2>/dev/null";
1140 } else {
1141 // Don't know how to index this type.
1142 skip_unknown_mimetype(urlterm, context, mimetype,
1143 d.get_size(), d.get_mtime());
1144 return;
1147 if (args) {
1148 string cmd = "unzip -p";
1149 append_filename_argument(cmd, file);
1150 cmd += args;
1151 try {
1152 MSXmlParser xmlparser;
1153 // Treat exit status 11 from unzip as success - this is
1154 // what we get if one of the listed filenames to extract
1155 // doesn't match anything in the zip file.
1156 xmlparser.parse(stdout_to_string(cmd, false, 11));
1157 dump = xmlparser.dump;
1158 } catch (const ReadError&) {
1159 skip_cmd_failed(urlterm, context, cmd,
1160 d.get_size(), d.get_mtime());
1161 return;
1165 string cmd = "unzip -p";
1166 append_filename_argument(cmd, file);
1167 cmd += " docProps/core.xml";
1168 try {
1169 OpenDocMetaParser metaparser;
1170 metaparser.parse(stdout_to_string(cmd, false));
1171 title = metaparser.title;
1172 keywords = metaparser.keywords;
1173 // FIXME: topic = metaparser.topic;
1174 sample = metaparser.sample;
1175 author = metaparser.author;
1176 } catch (const ReadError&) {
1177 // It's probably best to index the document even if this fails.
1179 } else if (mimetype == "application/x-abiword") {
1180 AbiwordParser abiwordparser;
1181 const string& text = d.file_to_string();
1182 abiwordparser.parse(text);
1183 dump = abiwordparser.dump;
1184 md5_string(text, md5);
1185 } else if (mimetype == "application/x-abiword-compressed") {
1186 AbiwordParser abiwordparser;
1187 abiwordparser.parse(d.gzfile_to_string());
1188 dump = abiwordparser.dump;
1189 } else if (mimetype == "application/oxps" ||
1190 mimetype == "application/vnd.ms-xpsdocument") {
1191 string cmd = "unzip -p";
1192 append_filename_argument(cmd, file);
1193 cmd += " 'Documents/*/Pages/*.fpage'";
1194 try {
1195 XpsParser xpsparser;
1196 run_filter(cmd, false, &dump);
1197 xpsparser.parse(dump);
1198 dump = xpsparser.dump;
1199 } catch (const ReadError&) {
1200 skip_cmd_failed(urlterm, context, cmd,
1201 d.get_size(), d.get_mtime());
1202 return;
1205 cmd = "unzip -p";
1206 append_filename_argument(cmd, file);
1207 cmd += " docProps/core.xml";
1208 try {
1209 OpenDocMetaParser metaparser;
1210 metaparser.parse(stdout_to_string(cmd, false));
1211 title = metaparser.title;
1212 keywords = metaparser.keywords;
1213 // FIXME: topic = metaparser.topic;
1214 sample = metaparser.sample;
1215 author = metaparser.author;
1216 } catch (const ReadError&) {
1217 // Ignore errors as not all XPS files contain this file.
1219 } else if (mimetype == "text/csv") {
1220 // Currently we assume that text files are UTF-8 unless they have a
1221 // byte-order mark.
1222 dump = d.file_to_string();
1223 md5_string(dump, md5);
1225 // Look for Byte-Order Mark (BOM).
1226 if (startswith(dump, "\xfe\xff") || startswith(dump, "\xff\xfe")) {
1227 // UTF-16 in big-endian/little-endian order - we just convert
1228 // it as "UTF-16" and let the conversion handle the BOM as that
1229 // way we avoid the copying overhead of erasing 2 bytes from
1230 // the start of dump.
1231 convert_to_utf8(dump, "UTF-16");
1232 } else if (startswith(dump, "\xef\xbb\xbf")) {
1233 // UTF-8 with stupid Windows not-the-byte-order mark.
1234 dump.erase(0, 3);
1235 } else {
1236 // FIXME: What charset is the file? Look at contents?
1239 generate_sample_from_csv(dump, sample);
1240 } else if (mimetype == "image/svg+xml") {
1241 SvgParser svgparser;
1242 const string& text = d.file_to_string();
1243 md5_string(text, md5);
1244 svgparser.parse(text);
1245 dump = svgparser.dump;
1246 title = svgparser.title;
1247 keywords = svgparser.keywords;
1248 // FIXME: topic = svgparser.topic;
1249 author = svgparser.author;
1250 } else if (mimetype == "image/svg+xml-compressed") {
1251 SvgParser svgparser;
1252 const string& text = d.gzfile_to_string();
1253 svgparser.parse(text);
1254 dump = svgparser.dump;
1255 title = svgparser.title;
1256 keywords = svgparser.keywords;
1257 // FIXME: topic = svgparser.topic;
1258 author = svgparser.author;
1259 } else if (mimetype == "application/vnd.debian.binary-package" ||
1260 mimetype == "application/x-debian-package") {
1261 const char* cmd = "dpkg-deb -f - Description";
1262 string desc;
1263 run_filter(d.get_fd(), cmd, false, &desc);
1264 // First line is short description, which we use as the title.
1265 string::size_type idx = desc.find('\n');
1266 title.assign(desc, 0, idx);
1267 if (idx != string::npos) {
1268 dump.assign(desc, idx + 1, string::npos);
1270 } else if (mimetype == "application/x-redhat-package-manager" ||
1271 mimetype == "application/x-rpm") {
1272 string cmd("rpm -q --qf '%{SUMMARY}\\n%{DESCRIPTION}' -p");
1273 append_filename_argument(cmd, file);
1274 string desc;
1275 run_filter(cmd, false, &desc);
1276 // First line is summary, which we use as the title.
1277 string::size_type idx = desc.find('\n');
1278 title.assign(desc, 0, idx);
1279 if (idx != string::npos) {
1280 dump.assign(desc, idx + 1, string::npos);
1282 } else if (mimetype == "application/atom+xml") {
1283 AtomParser atomparser;
1284 const string& text = d.file_to_string();
1285 md5_string(text, md5);
1286 atomparser.parse(text);
1287 dump = atomparser.dump;
1288 title = atomparser.title;
1289 keywords = atomparser.keywords;
1290 // FIXME: topic = atomparser.topic;
1291 author = atomparser.author;
1292 } else {
1293 // Don't know how to index this type.
1294 skip_unknown_mimetype(urlterm, context, mimetype,
1295 d.get_size(), d.get_mtime());
1296 return;
1299 // Compute the MD5 of the file if we haven't already.
1300 if (md5.empty() && !d.md5(md5)) {
1301 if (errno == ENOENT || errno == ENOTDIR) {
1302 skip(urlterm, context, "File removed during indexing",
1303 d.get_size(), d.get_mtime(),
1304 SKIP_VERBOSE_ONLY | SKIP_SHOW_FILENAME);
1305 } else {
1306 skip(urlterm, context,
1307 "failed to read file to calculate MD5 checksum",
1308 d.get_size(), d.get_mtime());
1310 return;
1313 // Remove any trailing formfeeds, so we don't consider them when
1314 // considering if we extracted any text (e.g. pdftotext outputs a
1315 // formfeed between each page, even for blank pages).
1317 // If dump contains only formfeeds, then trim_end will be string::npos
1318 // and ++trim_end will be 0, which is the correct new size.
1319 string::size_type trim_end = dump.find_last_not_of('\f');
1320 if (UNSIGNED_OVERFLOW_OK(++trim_end) != dump.size())
1321 dump.resize(trim_end);
1323 if (dump.empty()) {
1324 switch (empty_body) {
1325 case EMPTY_BODY_INDEX:
1326 break;
1327 case EMPTY_BODY_WARN:
1328 cout << "no text extracted from document body, "
1329 "but indexing metadata anyway" << endl;
1330 break;
1331 case EMPTY_BODY_SKIP:
1332 skip(urlterm, context,
1333 "no text extracted from document body",
1334 d.get_size(), d.get_mtime());
1335 return;
1339 // Produce a sample
1340 if (sample.empty()) {
1341 sample = generate_sample(dump, sample_size, "...", " ...");
1342 } else {
1343 sample = generate_sample(sample, sample_size, "...", " ...");
1346 // Put the data in the document
1347 if (record.empty()) {
1348 record = "url=";
1349 } else {
1350 record += "\nurl=";
1352 record += url;
1353 record += "\nsample=";
1354 record += sample;
1355 if (!title.empty()) {
1356 record += "\ncaption=";
1357 record += generate_sample(title, title_size, "...", " ...");
1359 if (!author.empty()) {
1360 record += "\nauthor=";
1361 record += author;
1363 if (!to.empty()) {
1364 record += "\nto=";
1365 record += to;
1367 if (!cc.empty()) {
1368 record += "\ncc=";
1369 record += cc;
1371 if (!bcc.empty()) {
1372 record += "\nbcc=";
1373 record += bcc;
1375 if (!message_id.empty()) {
1376 record += "\nmsgid=";
1377 record += message_id;
1379 record += "\ntype=";
1380 record += mimetype;
1381 time_t mtime = d.get_mtime();
1382 if (mtime != static_cast<time_t>(-1)) {
1383 record += "\nmodtime=";
1384 record += str(mtime);
1386 if (created != static_cast<time_t>(-1)) {
1387 record += "\ncreated=";
1388 record += str(created);
1390 if (pages >= 0) {
1391 record += "\npages=";
1392 record += str(pages);
1394 off_t size = d.get_size();
1395 record += "\nsize=";
1396 record += str(size);
1397 newdocument.set_data(record);
1399 // Index the title, document text, keywords and topic.
1400 indexer.set_document(newdocument);
1401 if (!title.empty()) {
1402 indexer.index_text(title, 5, "S");
1403 indexer.increase_termpos(100);
1405 if (!dump.empty()) {
1406 indexer.index_text(dump);
1408 if (!keywords.empty()) {
1409 indexer.increase_termpos(100);
1410 indexer.index_text(keywords);
1412 if (!topic.empty()) {
1413 indexer.increase_termpos(100);
1414 indexer.index_text(topic, 1, "B");
1416 // Index the leafname of the file.
1418 indexer.increase_termpos(100);
1419 string leaf = d.leafname();
1420 string::size_type dot = leaf.find_last_of('.');
1421 if (dot != string::npos && leaf.size() - dot - 1 <= max_ext_len)
1422 leaf.resize(dot);
1423 indexer.index_text(leaf, 1, "F");
1425 // Also index with underscores and ampersands replaced by spaces.
1426 bool modified = false;
1427 string::size_type rep = 0;
1428 while ((rep = leaf.find_first_of("_&", rep)) != string::npos) {
1429 leaf[rep++] = ' ';
1430 modified = true;
1432 if (modified) {
1433 indexer.increase_termpos(100);
1434 indexer.index_text(leaf, 1, "F");
1438 if (!author.empty()) {
1439 indexer.increase_termpos(100);
1440 indexer.index_text(author, 1, "A");
1443 if (!to.empty()) {
1444 indexer.increase_termpos(100);
1445 indexer.index_text(to, 1, "XTO");
1448 if (!cc.empty()) {
1449 indexer.increase_termpos(100);
1450 indexer.index_text(cc, 1, "XCC");
1453 if (!bcc.empty()) {
1454 indexer.increase_termpos(100);
1455 indexer.index_text(bcc, 1, "XBCC");
1458 if (!message_id.empty()) {
1459 newdocument.add_boolean_term("XMID:" + message_id);
1462 // mimeType:
1463 newdocument.add_boolean_term("T" + mimetype);
1465 newdocument.add_boolean_term(site_term);
1467 if (!host_term.empty())
1468 newdocument.add_boolean_term(host_term);
1470 if (date_terms) {
1471 struct tm* tm = localtime(&mtime);
1472 string date_term = "D";
1473 date_term += date_to_string(tm->tm_year + 1900,
1474 tm->tm_mon + 1,
1475 tm->tm_mday);
1476 newdocument.add_boolean_term(date_term); // Date (YYYYMMDD)
1477 date_term.resize(7);
1478 date_term[0] = 'M';
1479 newdocument.add_boolean_term(date_term); // Month (YYYYMM)
1480 date_term.resize(5);
1481 date_term[0] = 'Y';
1482 newdocument.add_boolean_term(date_term); // Year (YYYY)
1485 newdocument.add_boolean_term(urlterm); // Url
1487 // Add mtime as a value to allow "sort by date".
1488 newdocument.add_value(VALUE_LASTMOD,
1489 int_to_binary_string(uint32_t(mtime)));
1490 if (use_ctime) {
1491 // Add ctime as a value to track modifications.
1492 time_t ctime = d.get_ctime();
1493 newdocument.add_value(VALUE_CTIME,
1494 int_to_binary_string(uint32_t(ctime)));
1497 // Add MD5 as a value to allow duplicate documents to be collapsed
1498 // together.
1499 newdocument.add_value(VALUE_MD5, md5);
1501 // Add the file size as a value to allow "sort by size" and size ranges.
1502 newdocument.add_value(VALUE_SIZE,
1503 Xapian::sortable_serialise(size));
1505 if (created != static_cast<time_t>(-1)) {
1506 // Add created time as a value to allow "sort by created date".
1507 newdocument.add_value(VALUE_CREATED,
1508 int_to_binary_string(uint32_t(created)));
1511 bool inc_tag_added = false;
1512 if (d.is_other_readable()) {
1513 inc_tag_added = true;
1514 newdocument.add_boolean_term("I*");
1515 } else if (d.is_group_readable()) {
1516 const char* group = d.get_group();
1517 if (group) {
1518 newdocument.add_boolean_term(string("I#") + group);
1521 const char* owner = d.get_owner();
1522 if (owner) {
1523 newdocument.add_boolean_term(string("O") + owner);
1524 if (!inc_tag_added && d.is_owner_readable())
1525 newdocument.add_boolean_term(string("I@") + owner);
1528 string ext_term("E");
1529 for (string::const_iterator i = ext.begin(); i != ext.end(); ++i) {
1530 char ch = *i;
1531 if (ch >= 'A' && ch <= 'Z')
1532 ch |= 32;
1533 ext_term += ch;
1535 newdocument.add_boolean_term(ext_term);
1537 index_add_document(urlterm, last_altered, did, newdocument);
1538 } catch (const ReadError&) {
1539 skip(urlterm, context, string("can't read file: ") + strerror(errno),
1540 d.get_size(), d.get_mtime());
1541 } catch (const NoSuchFilter&) {
1542 string filter_entry;
1543 if (cmd_it != commands.end()) {
1544 filter_entry = cmd_it->first;
1545 } else {
1546 filter_entry = mimetype;
1548 string m = "Filter for \"";
1549 m += filter_entry;
1550 m += "\" not installed";
1551 skip(urlterm, context, m, d.get_size(), d.get_mtime());
1552 commands[filter_entry] = Filter();
1553 } catch (const FileNotFound&) {
1554 skip(urlterm, context, "File removed during indexing",
1555 d.get_size(), d.get_mtime(),
1556 SKIP_VERBOSE_ONLY | SKIP_SHOW_FILENAME);
1557 } catch (const std::string& error) {
1558 skip(urlterm, context, error, d.get_size(), d.get_mtime());
1559 } catch (const std::bad_alloc&) {
1560 // Attempt to flag the file as failed and commit changes, though that
1561 // might fail too if we're low on memory rather than being asked to
1562 // allocate a ludicrous amount.
1563 skip(urlterm, context, "Out of memory trying to extract text from file",
1564 d.get_size(), d.get_mtime(),
1565 SKIP_SHOW_FILENAME);
1566 throw CommitAndExit("Caught std::bad_alloc", "");
1570 void
1571 index_handle_deletion()
1573 if (updated.empty() || old_docs_not_seen == 0) return;
1575 if (verbose) {
1576 cout << "Deleting " << old_docs_not_seen
1577 << " old documents which weren't found" << endl;
1579 Xapian::PostingIterator alldocs = db.postlist_begin(string());
1580 Xapian::docid did = *alldocs;
1581 while (did < updated.size()) {
1582 if (!updated[did]) {
1583 alldocs.skip_to(did);
1584 if (alldocs == db.postlist_end(string()))
1585 break;
1586 if (*alldocs != did) {
1587 // Document #did didn't exist before we started.
1588 did = *alldocs;
1589 continue;
1591 db.delete_document(did);
1592 if (--old_docs_not_seen == 0)
1593 break;
1595 ++did;
1599 void
1600 index_commit()
1602 db.commit();
1605 void
1606 index_done()
1608 // If we created a temporary directory then delete it.
1609 remove_tmpdir();