xapian-applications/omega/index_file.cc

   1 /** @file
   2  * @brief Handle indexing a document from a file
   3  */
   4 /* Copyright 1999,2000,2001 BrightStation PLC
   5  * Copyright 2001,2005 James Aylett
   6  * Copyright 2001,2002 Ananova Ltd
   7  * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018 Olly Betts
   8  * Copyright 2009 Frank J Bruzzaniti
   9  * Copyright 2012 Mihai Bivol
  10  *
  11  * This program is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU General Public License as
  13  * published by the Free Software Foundation; either version 2 of the
  14  * License, or (at your option) any later version.
  15  *
  16  * This program is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19  * GNU General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU General Public License
  22  * along with this program; if not, write to the Free Software
  23  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
  24  * USA
  25  */
  26
  27 #include <config.h>
  28
  29 #include "index_file.h"
  30
  31 #include <algorithm>
  32 #include <iostream>
  33 #include <limits>
  34 #include <string>
  35 #include <map>
  36 #include <vector>
  37
  38 #include <sys/types.h>
  39 #include "safeunistd.h"
  40 #include <cerrno>
  41 #include <cstdio>
  42 #include <cstdlib>
  43 #include <cstring>
  44 #include "safefcntl.h"
  45 #include <ctime>
  46
  47 #include <xapian.h>
  48
  49 #include "append_filename_arg.h"
  50 #include "atomparse.h"
  51 #include "diritor.h"
  52 #include "failed.h"
  53 #include "md5wrap.h"
  54 #include "metaxmlparse.h"
  55 #include "mimemap.h"
  56 #include "msxmlparse.h"
  57 #include "myhtmlparse.h"
  58 #include "opendocparse.h"
  59 #include "pkglibbindir.h"
  60 #include "runfilter.h"
  61 #include "sample.h"
  62 #include "str.h"
  63 #include "stringutils.h"
  64 #include "svgparse.h"
  65 #include "tmpdir.h"
  66 #include "utf8convert.h"
  67 #include "utils.h"
  68 #include "values.h"
  69 #include "xmlparse.h"
  70 #include "xlsxparse.h"
  71 #include "xpsxmlparse.h"
  72
  73 using namespace std;
  74
  75 static Xapian::WritableDatabase db;
  76 static Xapian::TermGenerator indexer;
  77
  78 static Xapian::doccount old_docs_not_seen;
  79 static Xapian::docid old_lastdocid;
  80 static vector<bool> updated;
  81
  82 static bool verbose;
  83 static bool retry_failed;
  84 static bool use_ctime;
  85 static dup_action_type dup_action;
  86 static bool ignore_exclusions;
  87 static bool description_as_sample;
  88
  89 static time_t last_altered_max;
  90 static size_t sample_size;
  91 static size_t title_size;
  92 static size_t max_ext_len;
  93
  94 static empty_body_type empty_body;
  95
  96 static string root;
  97 static string site_term, host_term;
  98
  99 static Failed failed;
 100
 101 map<string, Filter> commands;
 102
 103 static void
 104 mark_as_seen(Xapian::docid did)
 105 {
 106     if (usual(did < updated.size() && !updated[did])) {
 107         updated[did] = true;
 108         --old_docs_not_seen;
 109     }
 110 }
 111
 112 void
 113 skip(const string & urlterm, const string & context, const string & msg,
 114      off_t size, time_t last_mod, unsigned flags)
 115 {
 116     failed.add(urlterm, last_mod, size);
 117
 118     if (!verbose || (flags & SKIP_SHOW_FILENAME)) {
 119         if (!verbose && (flags & SKIP_VERBOSE_ONLY)) return;
 120         cout << context << ": ";
 121     }
 122
 123     cout << "Skipping - " << msg << endl;
 124 }
 125
 126 static void
 127 skip_cmd_failed(const string & urlterm, const string & context, const string & cmd,
 128                 off_t size, time_t last_mod)
 129 {
 130     skip(urlterm, context, "\"" + cmd + "\" failed", size, last_mod);
 131 }
 132
 133 static void
 134 skip_meta_tag(const string & urlterm, const string & context,
 135               off_t size, time_t last_mod)
 136 {
 137     skip(urlterm, context, "indexing disallowed by meta tag", size, last_mod);
 138 }
 139
 140 static void
 141 skip_unknown_mimetype(const string & urlterm, const string & context,
 142                       const string & mimetype, off_t size, time_t last_mod)
 143 {
 144     skip(urlterm, context, "unknown MIME type '" + mimetype + "'", size, last_mod);
 145 }
 146
 147 void
 148 index_add_default_filters()
 149 {
 150     index_command("application/msword", Filter("antiword -mUTF-8.txt", false));
 151     index_command("application/vnd.ms-excel",
 152                   Filter("xls2csv -c' ' -q0 -dutf-8", false));
 153     index_command("application/vnd.ms-powerpoint",
 154                   Filter("catppt -dutf-8", false));
 155     // Looking at the source of wpd2html and wpd2text I think both output
 156     // UTF-8, but it's hard to be sure without sample Unicode .wpd files
 157     // as they don't seem to be at all well documented.
 158     index_command("application/vnd.wordperfect", Filter("wpd2text", false));
 159     // wps2text produces UTF-8 output from the sample files I've tested.
 160     index_command("application/vnd.ms-works", Filter("wps2text", false));
 161     // Output is UTF-8 according to "man djvutxt".  Generally this seems to
 162     // be true, though some examples from djvu.org generate isolated byte
 163     // 0x95 in a context which suggests it might be intended to be a bullet
 164     // (as it is in CP1252).
 165     index_command("image/vnd.djvu", Filter("djvutxt", false));
 166     index_command("text/markdown", Filter("markdown", "text/html", false));
 167     // The --text option unhelpfully converts all non-ASCII characters to "?"
 168     // so we use --html instead, which produces HTML entities.  The --nopict
 169     // option suppresses exporting picture files as pictNNNN.wmf in the current
 170     // directory.  Note that this option was ignored in some older versions,
 171     // but it was fixed in unrtf 0.20.4.
 172     index_command("application/rtf",
 173                   Filter("unrtf --nopict --html 2>/dev/null", "text/html",
 174                          false));
 175     index_command("text/rtf",
 176                   Filter("unrtf --nopict --html 2>/dev/null", "text/html",
 177                          false));
 178     index_command("text/x-rst", Filter("rst2html", "text/html", false));
 179     index_command("application/x-mspublisher",
 180                   Filter("pub2xhtml", "text/html", false));
 181     index_command("application/vnd.ms-outlook",
 182                   Filter(get_pkglibbindir() + "/outlookmsg2html", "text/html",
 183                          false));
 184     index_command("application/vnd.ms-visio.drawing",
 185                   Filter("vsd2xhtml", "image/svg+xml", false));
 186     index_command("application/vnd.ms-visio.stencil",
 187                   Filter("vsd2xhtml", "image/svg+xml", false));
 188     index_command("application/vnd.ms-visio.template",
 189                   Filter("vsd2xhtml", "image/svg+xml", false));
 190     index_command("application/vnd.visio",
 191                   Filter("vsd2xhtml", "image/svg+xml", false));
 192     // pod2text's output character set doesn't seem to be documented, but from
 193     // inspecting the source it looks like it's probably iso-8859-1.  We need
 194     // to pass "--errors=stderr" or else minor POD formatting errors cause a
 195     // file not to be indexed.
 196     index_command("text/x-perl",
 197                   Filter("pod2text --errors=stderr",
 198                          "text/plain", "iso-8859-1", false));
 199     // FIXME: -e0 means "UTF-8", but that results in "fi", "ff", "ffi", etc
 200     // appearing as single ligatures.  For European languages, it's actually
 201     // better to use -e2 (ISO-8859-1) and then convert, so let's do that for
 202     // now until we handle Unicode "compatibility decompositions".
 203     index_command("application/x-dvi",
 204                   Filter("catdvi -e2 -s", "text/plain", "iso-8859-1", false));
 205     // Simplistic - ought to look in index.rdf files for filename and character
 206     // set.
 207     index_command("application/x-maff",
 208                   Filter("unzip -p %f '*/*.*htm*'", "text/html", "iso-8859-1",
 209                          false));
 210     index_command("application/x-mimearchive",
 211                   Filter(get_pkglibbindir() + "/mhtml2html", "text/html",
 212                          false));
 213     index_command("message/news",
 214                   Filter(get_pkglibbindir() + "/rfc822tohtml", "text/html",
 215                          false));
 216     index_command("message/rfc822",
 217                   Filter(get_pkglibbindir() + "/rfc822tohtml", "text/html",
 218                          false));
 219     index_command("text/vcard",
 220                   Filter(get_pkglibbindir() + "/vcard2text", false));
 221     index_command("application/vnd.apple.keynote",
 222                   Filter("key2text", false));
 223     index_command("application/vnd.apple.numbers",
 224                   Filter("numbers2text", false));
 225     index_command("application/vnd.apple.pages",
 226                   Filter("pages2text", false));
 227 }
 228
 229 void
 230 index_init(const string & dbpath, const Xapian::Stem & stemmer,
 231            const string & root_, const string & site_term_,
 232            const string & host_term_,
 233            empty_body_type empty_body_, dup_action_type dup_action_,
 234            size_t sample_size_, size_t title_size_, size_t max_ext_len_,
 235            bool overwrite, bool retry_failed_,
 236            bool delete_removed_documents, bool verbose_, bool use_ctime_,
 237            bool spelling, bool ignore_exclusions_, bool description_as_sample_)
 238 {
 239     root = root_;
 240     site_term = site_term_;
 241     host_term = host_term_;
 242     empty_body = empty_body_;
 243     dup_action = dup_action_;
 244     sample_size = sample_size_;
 245     title_size = title_size_;
 246     max_ext_len = max_ext_len_;
 247     verbose = verbose_;
 248     use_ctime = use_ctime_;
 249     ignore_exclusions = ignore_exclusions_;
 250     description_as_sample = description_as_sample_;
 251
 252     if (!overwrite) {
 253         db = Xapian::WritableDatabase(dbpath, Xapian::DB_CREATE_OR_OPEN);
 254         old_docs_not_seen = db.get_doccount();
 255         // Handle an initially empty database exactly the same way as when
 256         // overwrite is true.
 257         if (old_docs_not_seen != 0) {
 258             old_lastdocid = db.get_lastdocid();
 259             if (delete_removed_documents) {
 260                 // + 1 so that old_lastdocid is a valid subscript.
 261                 updated.resize(old_lastdocid + 1);
 262             }
 263             Xapian::valueno slot = use_ctime ? VALUE_CTIME : VALUE_LASTMOD;
 264             string ubound = db.get_value_upper_bound(slot);
 265             if (!ubound.empty())
 266                 last_altered_max = binary_string_to_int(ubound);
 267         }
 268     } else {
 269         db = Xapian::WritableDatabase(dbpath, Xapian::DB_CREATE_OR_OVERWRITE);
 270     }
 271
 272     if (spelling) {
 273         indexer.set_database(db);
 274         indexer.set_flags(indexer.FLAG_SPELLING);
 275     }
 276     indexer.set_stemmer(stemmer);
 277
 278     runfilter_init();
 279
 280     failed.init(db);
 281
 282     if (overwrite) {
 283         // There are no failures to retry, so setting this flag doesn't
 284         // change the outcome, but does mean we avoid the overhead of
 285         // checking for a previous failure.
 286         retry_failed = true;
 287     } else if (retry_failed_) {
 288         failed.clear();
 289         retry_failed = true;
 290     } else {
 291         // If there are no existing failures, setting this flag doesn't
 292         // change the outcome, but does mean we avoid the overhead of
 293         // checking for a previous failure.
 294         retry_failed = failed.empty();
 295     }
 296 }
 297
 298 static void
 299 parse_pdfinfo_field(const char * p, const char * end, string & out, const char * field, size_t len)
 300 {
 301     if (size_t(end - p) > len && memcmp(p, field, len) == 0) {
 302         p += len;
 303         while (p != end && *p == ' ')
 304             ++p;
 305         if (p != end && (end[-1] != '\r' || --end != p))
 306             out.assign(p, end - p);
 307     }
 308 }
 309
 310 #define PARSE_PDFINFO_FIELD(P, END, OUT, FIELD) \
 311     parse_pdfinfo_field((P), (END), (OUT), FIELD":", CONST_STRLEN(FIELD) + 1)
 312
 313 static void
 314 parse_pdf_metainfo(const string& pdfinfo, string &author, string &title,
 315                    string &keywords, string &topic, int& pages)
 316 {
 317     const char * p = pdfinfo.data();
 318     const char * end = p + pdfinfo.size();
 319     while (p != end) {
 320         const char * start = p;
 321         p = static_cast<const char *>(memchr(p, '\n', end - p));
 322         const char * eol;
 323         if (p) {
 324             eol = p;
 325             ++p;
 326         } else {
 327             p = eol = end;
 328         }
 329         switch (*start) {
 330             case 'A':
 331                 PARSE_PDFINFO_FIELD(start, eol, author, "Author");
 332                 break;
 333             case 'K':
 334                 PARSE_PDFINFO_FIELD(start, eol, keywords, "Keywords");
 335                 break;
 336             case 'P': {
 337                 string s;
 338                 PARSE_PDFINFO_FIELD(start, eol, s, "Pages");
 339                 if (!s.empty())
 340                     pages = atoi(s.c_str());
 341                 break;
 342             }
 343             case 'S':
 344                 PARSE_PDFINFO_FIELD(start, eol, topic, "Subject");
 345                 break;
 346             case 'T':
 347                 PARSE_PDFINFO_FIELD(start, eol, title, "Title");
 348                 break;
 349         }
 350     }
 351 }
 352
 353 static void
 354 get_pdf_metainfo(int fd, string &author, string &title,
 355                  string &keywords, string &topic, int& pages)
 356 {
 357     try {
 358         string pdfinfo;
 359         run_filter(fd, "pdfinfo -enc UTF-8 -", false, &pdfinfo);
 360         parse_pdf_metainfo(pdfinfo, author, title, keywords, topic, pages);
 361     } catch (const ReadError&) {
 362         // It's probably best to index the document even if pdfinfo fails.
 363     }
 364 }
 365
 366 static void
 367 get_pdf_metainfo(const string& file, string &author, string &title,
 368                  string &keywords, string &topic, int& pages)
 369 {
 370     try {
 371         string cmd = "pdfinfo -enc UTF-8";
 372         append_filename_argument(cmd, file);
 373         parse_pdf_metainfo(stdout_to_string(cmd, false),
 374                            author, title, keywords, topic, pages);
 375     } catch (const ReadError&) {
 376         // It's probably best to index the document even if pdfinfo fails.
 377     }
 378 }
 379
 380 static void
 381 generate_sample_from_csv(const string & csv_data, string & sample)
 382 {
 383     // Add 3 to allow for a 4 byte utf-8 sequence being appended when
 384     // output is sample_size - 1 bytes long.  Use csv_data.size() if smaller
 385     // since the user might reasonably set sample_size really high.
 386     sample.reserve(min(sample_size + 3, csv_data.size()));
 387     size_t last_word_end = 0;
 388     bool in_space = true;
 389     bool in_quotes = false;
 390     for (Xapian::Utf8Iterator i(csv_data); i != Xapian::Utf8Iterator(); ++i) {
 391         unsigned ch = *i;
 392
 393         if (!in_quotes) {
 394             // If not already in double quotes, '"' starts quoting and
 395             // ',' starts a new field.
 396             if (ch == '"') {
 397                 in_quotes = true;
 398                 continue;
 399             }
 400             if (ch == ',')
 401                 ch = ' ';
 402         } else if (ch == '"') {
 403             // In double quotes, '"' either ends double quotes, or
 404             // if followed by another '"', means a literal '"'.
 405             if (++i == Xapian::Utf8Iterator())
 406                 break;
 407             ch = *i;
 408             if (ch != '"') {
 409                 in_quotes = false;
 410                 if (ch == ',')
 411                     ch = ' ';
 412             }
 413         }
 414
 415         if (ch <= ' ' || ch == 0xa0) {
 416             // FIXME: if all the whitespace characters between two
 417             // words are 0xa0 (non-breaking space) then perhaps we
 418             // should output 0xa0.
 419             if (in_space)
 420                 continue;
 421             last_word_end = sample.size();
 422             sample += ' ';
 423             in_space = true;
 424         } else {
 425             Xapian::Unicode::append_utf8(sample, ch);
 426             in_space = false;
 427         }
 428
 429         if (sample.size() >= sample_size) {
 430             // Need to truncate sample.
 431             if (last_word_end <= sample_size / 2) {
 432                 // Monster word!  We'll have to just split it.
 433                 sample.replace(sample_size - 3, string::npos, "...", 3);
 434             } else {
 435                 sample.replace(last_word_end, string::npos, " ...", 4);
 436             }
 437             break;
 438         }
 439     }
 440 }
 441
 442 static bool
 443 index_check_existing(const string & urlterm, time_t last_altered,
 444                      Xapian::docid & did)
 445 {
 446     switch (dup_action) {
 447         case DUP_SKIP: {
 448             Xapian::PostingIterator p = db.postlist_begin(urlterm);
 449             if (p != db.postlist_end(urlterm)) {
 450                 if (verbose)
 451                     cout << "already indexed, not updating" << endl;
 452                 did = *p;
 453                 mark_as_seen(did);
 454                 return true;
 455             }
 456             break;
 457         }
 458         case DUP_CHECK_LAZILY: {
 459             // If last_altered > last_altered_max, we know for sure that the
 460             // file is new or updated.
 461             if (last_altered > last_altered_max) {
 462                 return false;
 463             }
 464
 465             Xapian::PostingIterator p = db.postlist_begin(urlterm);
 466             if (p != db.postlist_end(urlterm)) {
 467                 did = *p;
 468                 Xapian::Document doc = db.get_document(did);
 469                 Xapian::valueno slot = use_ctime ? VALUE_CTIME : VALUE_LASTMOD;
 470                 string value = doc.get_value(slot);
 471                 time_t old_last_altered = binary_string_to_int(value);
 472                 if (last_altered <= old_last_altered) {
 473                     if (verbose)
 474                         cout << "already indexed" << endl;
 475                     // The docid should be in updated - the only valid
 476                     // exception is if the URL was long and hashed to the
 477                     // same URL as an existing document indexed in the same
 478                     // batch.
 479                     mark_as_seen(did);
 480                     return true;
 481                 }
 482             }
 483             break;
 484         }
 485     }
 486     return false;
 487 }
 488
 489 void
 490 index_remove_failed_entry(const string& urlterm)
 491 {
 492     failed.del(urlterm);
 493 }
 494
 495 void
 496 index_add_document(const string & urlterm, time_t last_altered,
 497                    Xapian::docid did, const Xapian::Document & doc)
 498 {
 499     if (dup_action != DUP_SKIP) {
 500         // If this document has already been indexed, update the existing
 501         // entry.
 502         if (did) {
 503             // We already found out the document id above.
 504             db.replace_document(did, doc);
 505         } else if (last_altered <= last_altered_max) {
 506             // We checked for the UID term and didn't find it.
 507             did = db.add_document(doc);
 508         } else {
 509             did = db.replace_document(urlterm, doc);
 510         }
 511         mark_as_seen(did);
 512         if (verbose) {
 513             if (did <= old_lastdocid) {
 514                 cout << "updated" << endl;
 515             } else {
 516                 cout << "added" << endl;
 517             }
 518         }
 519     } else {
 520         // If this were a duplicate, we'd have skipped it above.
 521         db.add_document(doc);
 522         if (verbose)
 523             cout << "added" << endl;
 524     }
 525 }
 526
 527 void
 528 index_mimetype(const string & file, const string & urlterm, const string & url,
 529                const string & ext,
 530                const string &mimetype, DirectoryIterator &d,
 531                Xapian::Document & newdocument,
 532                string record)
 533 {
 534     string context(file, root.size(), string::npos);
 535
 536     // FIXME: We could be cleverer here and check mtime too when use_ctime is
 537     // set - if the ctime has changed but the mtime is unchanged, we can just
 538     // update the existing Document and avoid having to re-extract text, etc.
 539     time_t last_altered = use_ctime ? d.get_ctime() : d.get_mtime();
 540
 541     Xapian::docid did = 0;
 542     if (index_check_existing(urlterm, last_altered, did))
 543         return;
 544
 545     if (!retry_failed) {
 546         // We only store and check the mtime (last modified) - a change to the
 547         // metadata won't generally cause a previous failure to now work
 548         // (FIXME: except permissions).
 549         time_t failed_last_mod;
 550         off_t failed_size;
 551         if (failed.contains(urlterm, failed_last_mod, failed_size)) {
 552             if (d.get_mtime() <= failed_last_mod &&
 553                 d.get_size() == failed_size) {
 554                 if (verbose)
 555                     cout << "failed to extract text on earlier run" << endl;
 556                 return;
 557             }
 558             // The file has changed, so remove the entry for it.  If it fails
 559             // again on this attempt, we'll add a new one.
 560             failed.del(urlterm);
 561         }
 562     }
 563
 564     if (verbose) cout << flush;
 565
 566     string author, title, sample, keywords, topic, dump;
 567     string md5;
 568     time_t created = time_t(-1);
 569     int pages = -1;
 570
 571     map<string, Filter>::const_iterator cmd_it = commands.find(mimetype);
 572     if (cmd_it == commands.end()) {
 573         size_t slash = mimetype.find('/');
 574         if (slash != string::npos) {
 575             string wildtype(mimetype, 0, slash + 2);
 576             wildtype[slash + 1] = '*';
 577             cmd_it = commands.find(wildtype);
 578             if (cmd_it == commands.end()) {
 579                 cmd_it = commands.find("*/*");
 580             }
 581         }
 582         if (cmd_it == commands.end()) {
 583             cmd_it = commands.find("*");
 584         }
 585     }
 586     try {
 587         if (cmd_it != commands.end()) {
 588             // Easy "run a command and read text or HTML from stdout or a
 589             // temporary file" cases.
 590             auto& filter = cmd_it->second;
 591             string cmd = filter.cmd;
 592             if (cmd.empty()) {
 593                 skip(urlterm, context, "required filter not installed",
 594                      d.get_size(), d.get_mtime(), SKIP_VERBOSE_ONLY);
 595                 return;
 596             }
 597             if (cmd == "false") {
 598                 // Allow setting 'false' as a filter to mean that a MIME type
 599                 // should be quietly ignored.
 600                 string m = "ignoring MIME type '";
 601                 m += cmd_it->first;
 602                 m += "'";
 603                 skip(urlterm, context, m, d.get_size(), d.get_mtime(),
 604                      SKIP_VERBOSE_ONLY);
 605                 return;
 606             }
 607             bool use_shell = filter.use_shell();
 608             bool substituted = false;
 609             string tmpout;
 610             size_t pcent = 0;
 611             while (true) {
 612                 pcent = cmd.find('%', pcent);
 613                 if (pcent >= cmd.size() - 1)
 614                     break;
 615                 switch (cmd[pcent + 1]) {
 616                     case '%': // %% -> %.
 617                         cmd.erase(++pcent, 1);
 618                         break;
 619                     case 'f': { // %f -> escaped filename.
 620                         substituted = true;
 621                         string tail(cmd, pcent + 2);
 622                         cmd.resize(pcent);
 623                         // Suppress the space append_filename_argument()
 624                         // usually adds before the argument - the command
 625                         // string either includes one, or won't expect one
 626                         // (e.g. --input=%f).
 627                         append_filename_argument(cmd, file, false);
 628                         pcent = cmd.size();
 629                         cmd += tail;
 630                         break;
 631                     }
 632                     case 't': { // %t -> temporary output file.
 633                         if (tmpout.empty()) {
 634                             // Use a temporary file with a suitable extension
 635                             // in case the command cares, and for more helpful
 636                             // error messages from the command.
 637                             if (filter.output_type == "text/html") {
 638                                 tmpout = get_tmpfile("tmp.html");
 639                             } else if (filter.output_type == "image/svg+xml") {
 640                                 tmpout = get_tmpfile("tmp.svg");
 641                             } else {
 642                                 tmpout = get_tmpfile("tmp.txt");
 643                             }
 644                         }
 645                         substituted = true;
 646                         string tail(cmd, pcent + 2);
 647                         cmd.resize(pcent);
 648                         // Suppress the space append_filename_argument()
 649                         // usually adds before the argument - the command
 650                         // string either includes one, or won't expect one
 651                         // (e.g. --output=%t).
 652                         append_filename_argument(cmd, tmpout, false);
 653                         pcent = cmd.size();
 654                         cmd += tail;
 655                         break;
 656                     }
 657                     default:
 658                         // Leave anything else alone for now.
 659                         pcent += 2;
 660                         break;
 661                 }
 662             }
 663             if (!substituted && cmd != "true") {
 664                 // If no %f, append the filename to the command.
 665                 append_filename_argument(cmd, file);
 666             }
 667             try {
 668                 if (!tmpout.empty()) {
 669                     // Output in temporary file.
 670                     run_filter(cmd, use_shell);
 671                     if (!load_file(tmpout, dump, NOCACHE)) {
 672                         throw ReadError("Couldn't read output file");
 673                     }
 674                     unlink(tmpout.c_str());
 675                 } else if (cmd == "true") {
 676                     // Ignore the file's contents, just index metadata from the
 677                     // filing system.
 678                 } else {
 679                     // Output on stdout.
 680                     run_filter(cmd, use_shell, &dump);
 681                 }
 682                 const string & charset = filter.output_charset;
 683                 if (filter.output_type == "text/html") {
 684                     MyHtmlParser p;
 685                     p.ignore_metarobots();
 686                     p.description_as_sample = description_as_sample;
 687                     try {
 688                         p.parse_html(dump, charset, false);
 689                     } catch (const string & newcharset) {
 690                         p.reset();
 691                         p.ignore_metarobots();
 692                         p.description_as_sample = description_as_sample;
 693                         p.parse_html(dump, newcharset, true);
 694                     } catch (const ReadError&) {
 695                         skip_cmd_failed(urlterm, context, cmd,
 696                                         d.get_size(), d.get_mtime());
 697                         return;
 698                     }
 699                     dump = p.dump;
 700                     title = p.title;
 701                     keywords = p.keywords;
 702                     topic = p.topic;
 703                     sample = p.sample;
 704                     author = p.author;
 705                     created = p.created;
 706                 } else if (filter.output_type == "image/svg+xml") {
 707                     SvgParser svgparser;
 708                     svgparser.parse(dump);
 709                     dump = svgparser.dump;
 710                     title = svgparser.title;
 711                     keywords = svgparser.keywords;
 712                     // FIXME: topic = svgparser.topic;
 713                     author = svgparser.author;
 714                 } else if (!charset.empty()) {
 715                     convert_to_utf8(dump, charset);
 716                 }
 717             } catch (const ReadError&) {
 718                 skip_cmd_failed(urlterm, context, cmd,
 719                                 d.get_size(), d.get_mtime());
 720                 return;
 721             }
 722         } else if (mimetype == "text/html" || mimetype == "text/x-php") {
 723             const string & text = d.file_to_string();
 724             MyHtmlParser p;
 725             if (ignore_exclusions) p.ignore_metarobots();
 726             p.description_as_sample = description_as_sample;
 727             try {
 728                 // Default HTML character set is latin 1, though not specifying
 729                 // one is deprecated these days.
 730                 p.parse_html(text, "iso-8859-1", false);
 731             } catch (const string & newcharset) {
 732                 p.reset();
 733                 if (ignore_exclusions) p.ignore_metarobots();
 734                 p.description_as_sample = description_as_sample;
 735                 p.parse_html(text, newcharset, true);
 736             }
 737             if (!p.indexing_allowed) {
 738                 skip_meta_tag(urlterm, context,
 739                               d.get_size(), d.get_mtime());
 740                 return;
 741             }
 742             dump = p.dump;
 743             title = p.title;
 744             keywords = p.keywords;
 745             topic = p.topic;
 746             sample = p.sample;
 747             author = p.author;
 748             created = p.created;
 749             md5_string(text, md5);
 750         } else if (mimetype == "text/plain") {
 751             // Currently we assume that text files are UTF-8 unless they have a
 752             // byte-order mark.
 753             dump = d.file_to_string();
 754             md5_string(dump, md5);
 755
 756             // Look for Byte-Order Mark (BOM).
 757             if (startswith(dump, "\xfe\xff") || startswith(dump, "\xff\xfe")) {
 758                 // UTF-16 in big-endian/little-endian order - we just convert
 759                 // it as "UTF-16" and let the conversion handle the BOM as that
 760                 // way we avoid the copying overhead of erasing 2 bytes from
 761                 // the start of dump.
 762                 convert_to_utf8(dump, "UTF-16");
 763             } else if (startswith(dump, "\xef\xbb\xbf")) {
 764                 // UTF-8 with stupid Windows not-the-byte-order mark.
 765                 dump.erase(0, 3);
 766             } else {
 767                 // FIXME: What charset is the file?  Look at contents?
 768             }
 769         } else if (mimetype == "application/pdf") {
 770             const char* cmd = "pdftotext -enc UTF-8 - -";
 771             try {
 772                 run_filter(d.get_fd(), cmd, false, &dump);
 773             } catch (const ReadError&) {
 774                 skip_cmd_failed(urlterm, context, cmd,
 775                                 d.get_size(), d.get_mtime());
 776                 return;
 777             }
 778             get_pdf_metainfo(d.get_fd(), author, title, keywords, topic, pages);
 779         } else if (mimetype == "application/postscript") {
 780             // There simply doesn't seem to be a Unicode capable PostScript to
 781             // text converter (e.g. pstotext always outputs ISO-8859-1).  The
 782             // only solution seems to be to convert via PDF using ps2pdf and
 783             // then pdftotext.  This gives plausible looking UTF-8 output for
 784             // some Chinese PostScript files I found using Google.  It also has
 785             // the benefit of allowing us to extract meta information from
 786             // PostScript files.
 787             string tmpfile = get_tmpfile("tmp.pdf");
 788             if (tmpfile.empty()) {
 789                 // FIXME: should this be fatal?  Or disable indexing postscript?
 790                 string msg = "Couldn't create temporary directory (";
 791                 msg += strerror(errno);
 792                 msg += ")";
 793                 skip(urlterm, context, msg,
 794                      d.get_size(), d.get_mtime());
 795                 return;
 796             }
 797             string cmd = "ps2pdf -";
 798             append_filename_argument(cmd, tmpfile);
 799             try {
 800                 run_filter(d.get_fd(), cmd, false);
 801                 cmd = "pdftotext -enc UTF-8";
 802                 append_filename_argument(cmd, tmpfile);
 803                 cmd += " -";
 804                 run_filter(cmd, false, &dump);
 805             } catch (const ReadError&) {
 806                 skip_cmd_failed(urlterm, context, cmd,
 807                                 d.get_size(), d.get_mtime());
 808                 unlink(tmpfile.c_str());
 809                 return;
 810             } catch (...) {
 811                 unlink(tmpfile.c_str());
 812                 throw;
 813             }
 814             try {
 815                 get_pdf_metainfo(tmpfile, author, title, keywords, topic,
 816                                  pages);
 817             } catch (...) {
 818                 unlink(tmpfile.c_str());
 819                 throw;
 820             }
 821             unlink(tmpfile.c_str());
 822         } else if (startswith(mimetype, "application/vnd.sun.xml.") ||
 823                    startswith(mimetype, "application/vnd.oasis.opendocument."))
 824         {
 825             // Inspired by http://mjr.towers.org.uk/comp/sxw2text
 826             string cmd = "unzip -p";
 827             append_filename_argument(cmd, file);
 828             cmd += " content.xml ; unzip -p";
 829             append_filename_argument(cmd, file);
 830             cmd += " styles.xml";
 831             try {
 832                 OpenDocParser parser;
 833                 parser.parse(stdout_to_string(cmd, true));
 834                 dump = parser.dump;
 835             } catch (const ReadError&) {
 836                 skip_cmd_failed(urlterm, context, cmd,
 837                                 d.get_size(), d.get_mtime());
 838                 return;
 839             }
 840
 841             cmd = "unzip -p";
 842             append_filename_argument(cmd, file);
 843             cmd += " meta.xml";
 844             try {
 845                 MetaXmlParser metaxmlparser;
 846                 metaxmlparser.parse(stdout_to_string(cmd, false));
 847                 title = metaxmlparser.title;
 848                 keywords = metaxmlparser.keywords;
 849                 // FIXME: topic = metaxmlparser.topic;
 850                 sample = metaxmlparser.sample;
 851                 author = metaxmlparser.author;
 852             } catch (const ReadError&) {
 853                 // It's probably best to index the document even if this fails.
 854             }
 855         } else if (startswith(mimetype, "application/vnd.openxmlformats-officedocument.")) {
 856             const char * args = NULL;
 857             string tail(mimetype, 46);
 858             if (startswith(tail, "wordprocessingml.")) {
 859                 // unzip returns exit code 11 if a file to extract wasn't found
 860                 // which we want to ignore, because there may be no headers or
 861                 // no footers.
 862                 args = " word/document.xml 'word/header*.xml' 'word/footer*.xml' 2>/dev/null";
 863             } else if (startswith(tail, "spreadsheetml.")) {
 864                 // Extract the shared string table first, so our parser can
 865                 // grab those ready for parsing the sheets which will reference
 866                 // the shared strings.
 867                 string cmd = "unzip -p";
 868                 append_filename_argument(cmd, file);
 869                 cmd += " xl/styles.xml xl/workbook.xml xl/sharedStrings.xml ; unzip -p";
 870                 append_filename_argument(cmd, file);
 871                 cmd += " xl/worksheets/sheet\\*.xml";
 872                 try {
 873                     XlsxParser parser;
 874                     parser.parse(stdout_to_string(cmd, true));
 875                     dump = parser.dump;
 876                 } catch (const ReadError&) {
 877                     skip_cmd_failed(urlterm, context, cmd,
 878                                     d.get_size(), d.get_mtime());
 879                     return;
 880                 }
 881             } else if (startswith(tail, "presentationml.")) {
 882                 // unzip returns exit code 11 if a file to extract wasn't found
 883                 // which we want to ignore, because there may be no notesSlides
 884                 // or comments.
 885                 args = " 'ppt/slides/slide*.xml' 'ppt/notesSlides/notesSlide*.xml' 'ppt/comments/comment*.xml' 2>/dev/null";
 886             } else {
 887                 // Don't know how to index this type.
 888                 skip_unknown_mimetype(urlterm, context, mimetype,
 889                                       d.get_size(), d.get_mtime());
 890                 return;
 891             }
 892
 893             if (args) {
 894                 string cmd = "unzip -p";
 895                 append_filename_argument(cmd, file);
 896                 cmd += args;
 897                 try {
 898                     MSXmlParser xmlparser;
 899                     // Treat exit status 11 from unzip as success - this is
 900                     // what we get if one of the listed filenames to extract
 901                     // doesn't match anything in the zip file.
 902                     xmlparser.parse_xml(stdout_to_string(cmd, false, 11));
 903                     dump = xmlparser.dump;
 904                 } catch (const ReadError&) {
 905                     skip_cmd_failed(urlterm, context, cmd,
 906                                     d.get_size(), d.get_mtime());
 907                     return;
 908                 }
 909             }
 910
 911             string cmd = "unzip -p";
 912             append_filename_argument(cmd, file);
 913             cmd += " docProps/core.xml";
 914             try {
 915                 MetaXmlParser metaxmlparser;
 916                 metaxmlparser.parse(stdout_to_string(cmd, false));
 917                 title = metaxmlparser.title;
 918                 keywords = metaxmlparser.keywords;
 919                 // FIXME: topic = metaxmlparser.topic;
 920                 sample = metaxmlparser.sample;
 921                 author = metaxmlparser.author;
 922             } catch (const ReadError&) {
 923                 // It's probably best to index the document even if this fails.
 924             }
 925         } else if (mimetype == "application/x-abiword") {
 926             // FIXME: Implement support for metadata.
 927             XmlParser xmlparser;
 928             const string & text = d.file_to_string();
 929             xmlparser.parse_xml(text);
 930             dump = xmlparser.dump;
 931             md5_string(text, md5);
 932         } else if (mimetype == "application/x-abiword-compressed") {
 933             // FIXME: Implement support for metadata.
 934             XmlParser xmlparser;
 935             xmlparser.parse_xml(d.gzfile_to_string());
 936             dump = xmlparser.dump;
 937         } else if (mimetype == "application/oxps" ||
 938                    mimetype == "application/vnd.ms-xpsdocument") {
 939             string cmd = "unzip -p";
 940             append_filename_argument(cmd, file);
 941             cmd += " 'Documents/1/Pages/*.fpage'";
 942             try {
 943                 XpsXmlParser xpsparser;
 944                 run_filter(cmd, false, &dump);
 945                 xpsparser.parse(dump);
 946                 dump = xpsparser.dump;
 947             } catch (const ReadError&) {
 948                 skip_cmd_failed(urlterm, context, cmd,
 949                                 d.get_size(), d.get_mtime());
 950                 return;
 951             }
 952
 953             cmd = "unzip -p";
 954             append_filename_argument(cmd, file);
 955             cmd += " docProps/core.xml";
 956             try {
 957                 MetaXmlParser metaparser;
 958                 metaparser.parse(stdout_to_string(cmd, false));
 959                 title = metaparser.title;
 960                 keywords = metaparser.keywords;
 961                 // FIXME: topic = metaparser.topic;
 962                 sample = metaparser.sample;
 963                 author = metaparser.author;
 964             } catch (const ReadError&) {
 965                 // Ignore errors as not all XPS files contain this file.
 966             }
 967         } else if (mimetype == "text/csv") {
 968             // Currently we assume that text files are UTF-8 unless they have a
 969             // byte-order mark.
 970             dump = d.file_to_string();
 971             md5_string(dump, md5);
 972
 973             // Look for Byte-Order Mark (BOM).
 974             if (startswith(dump, "\xfe\xff") || startswith(dump, "\xff\xfe")) {
 975                 // UTF-16 in big-endian/little-endian order - we just convert
 976                 // it as "UTF-16" and let the conversion handle the BOM as that
 977                 // way we avoid the copying overhead of erasing 2 bytes from
 978                 // the start of dump.
 979                 convert_to_utf8(dump, "UTF-16");
 980             } else if (startswith(dump, "\xef\xbb\xbf")) {
 981                 // UTF-8 with stupid Windows not-the-byte-order mark.
 982                 dump.erase(0, 3);
 983             } else {
 984                 // FIXME: What charset is the file?  Look at contents?
 985             }
 986
 987             generate_sample_from_csv(dump, sample);
 988         } else if (mimetype == "image/svg+xml") {
 989             SvgParser svgparser;
 990             const string & text = d.file_to_string();
 991             md5_string(text, md5);
 992             svgparser.parse(text);
 993             dump = svgparser.dump;
 994             title = svgparser.title;
 995             keywords = svgparser.keywords;
 996             // FIXME: topic = svgparser.topic;
 997             author = svgparser.author;
 998         } else if (mimetype == "application/vnd.debian.binary-package" ||
 999                    mimetype == "application/x-debian-package") {
1000             const char* cmd = "dpkg-deb -f - Description";
1001             string desc;
1002             run_filter(d.get_fd(), cmd, false, &desc);
1003             // First line is short description, which we use as the title.
1004             string::size_type idx = desc.find('\n');
1005             title.assign(desc, 0, idx);
1006             if (idx != string::npos) {
1007                 dump.assign(desc, idx + 1, string::npos);
1008             }
1009         } else if (mimetype == "application/x-redhat-package-manager" ||
1010                    mimetype == "application/x-rpm") {
1011             string cmd("rpm -q --qf '%{SUMMARY}\\n%{DESCRIPTION}' -p");
1012             append_filename_argument(cmd, file);
1013             string desc;
1014             run_filter(cmd, false, &desc);
1015             // First line is summary, which we use as the title.
1016             string::size_type idx = desc.find('\n');
1017             title.assign(desc, 0, idx);
1018             if (idx != string::npos) {
1019                 dump.assign(desc, idx + 1, string::npos);
1020             }
1021         } else if (mimetype == "application/atom+xml") {
1022             AtomParser atomparser;
1023             const string & text = d.file_to_string();
1024             md5_string(text, md5);
1025             atomparser.parse(text);
1026             dump = atomparser.dump;
1027             title = atomparser.title;
1028             keywords = atomparser.keywords;
1029             // FIXME: topic = atomparser.topic;
1030             author = atomparser.author;
1031         } else {
1032             // Don't know how to index this type.
1033             skip_unknown_mimetype(urlterm, context, mimetype,
1034                                   d.get_size(), d.get_mtime());
1035             return;
1036         }
1037
1038         // Compute the MD5 of the file if we haven't already.
1039         if (md5.empty() && !d.md5(md5)) {
1040             if (errno == ENOENT || errno == ENOTDIR) {
1041                 skip(urlterm, context, "File removed during indexing",
1042                      d.get_size(), d.get_mtime(),
1043                      SKIP_VERBOSE_ONLY | SKIP_SHOW_FILENAME);
1044             } else {
1045                 skip(urlterm, context, "failed to read file to calculate MD5 checksum",
1046                      d.get_size(), d.get_mtime());
1047             }
1048             return;
1049         }
1050
1051         // Remove any trailing formfeeds, so we don't consider them when
1052         // considering if we extracted any text (e.g. pdftotext outputs a
1053         // formfeed between each page, even for blank pages).
1054         //
1055         // If dump contain only formfeeds, then trim_end will be string::npos
1056         // and ++trim_end will be 0, which is the correct new size.
1057         string::size_type trim_end = dump.find_last_not_of('\f');
1058         if (++trim_end != dump.size())
1059             dump.resize(trim_end);
1060
1061         if (dump.empty()) {
1062             switch (empty_body) {
1063                 case EMPTY_BODY_INDEX:
1064                     break;
1065                 case EMPTY_BODY_WARN:
1066                     cout << "no text extracted from document body, "
1067                             "but indexing metadata anyway" << endl;
1068                     break;
1069                 case EMPTY_BODY_SKIP:
1070                     skip(urlterm, context, "no text extracted from document body",
1071                          d.get_size(), d.get_mtime());
1072                     return;
1073             }
1074         }
1075
1076         // Produce a sample
1077         if (sample.empty()) {
1078             sample = generate_sample(dump, sample_size, "...", " ...");
1079         } else {
1080             sample = generate_sample(sample, sample_size, "...", " ...");
1081         }
1082
1083         // Put the data in the document
1084         if (record.empty()) {
1085             record = "url=";
1086         } else {
1087             record += "\nurl=";
1088         }
1089         record += url;
1090         record += "\nsample=";
1091         record += sample;
1092         if (!title.empty()) {
1093             record += "\ncaption=";
1094             record += generate_sample(title, title_size, "...", " ...");
1095         }
1096         if (!author.empty()) {
1097             record += "\nauthor=";
1098             record += author;
1099         }
1100         record += "\ntype=";
1101         record += mimetype;
1102         time_t mtime = d.get_mtime();
1103         if (mtime != static_cast<time_t>(-1)) {
1104             record += "\nmodtime=";
1105             record += str(mtime);
1106         }
1107         if (created != static_cast<time_t>(-1)) {
1108             record += "\ncreated=";
1109             record += str(created);
1110         }
1111         if (pages >= 0) {
1112             record += "\npages=";
1113             record += str(pages);
1114         }
1115         off_t size = d.get_size();
1116         record += "\nsize=";
1117         record += str(size);
1118         newdocument.set_data(record);
1119
1120         // Index the title, document text, keywords and topic.
1121         indexer.set_document(newdocument);
1122         if (!title.empty()) {
1123             indexer.index_text(title, 5, "S");
1124             indexer.increase_termpos(100);
1125         }
1126         if (!dump.empty()) {
1127             indexer.index_text(dump);
1128         }
1129         if (!keywords.empty()) {
1130             indexer.increase_termpos(100);
1131             indexer.index_text(keywords);
1132         }
1133         if (!topic.empty()) {
1134             indexer.increase_termpos(100);
1135             indexer.index_text(topic, 1, "B");
1136         }
1137         // Index the leafname of the file.
1138         {
1139             indexer.increase_termpos(100);
1140             string leaf = d.leafname();
1141             string::size_type dot = leaf.find_last_of('.');
1142             if (dot != string::npos && leaf.size() - dot - 1 <= max_ext_len)
1143                 leaf.resize(dot);
1144             indexer.index_text(leaf, 1, "F");
1145
1146             // Also index with underscores and ampersands replaced by spaces.
1147             bool modified = false;
1148             string::size_type rep = 0;
1149             while ((rep = leaf.find_first_of("_&", rep)) != string::npos) {
1150                 leaf[rep++] = ' ';
1151                 modified = true;
1152             }
1153             if (modified) {
1154                 indexer.increase_termpos(100);
1155                 indexer.index_text(leaf, 1, "F");
1156             }
1157         }
1158
1159         if (!author.empty()) {
1160             indexer.increase_termpos(100);
1161             indexer.index_text(author, 1, "A");
1162         }
1163
1164         // mimeType:
1165         newdocument.add_boolean_term("T" + mimetype);
1166
1167         newdocument.add_boolean_term(site_term);
1168
1169         if (!host_term.empty())
1170             newdocument.add_boolean_term(host_term);
1171
1172         struct tm *tm = localtime(&mtime);
1173         string date_term = "D" + date_to_string(tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday);
1174         newdocument.add_boolean_term(date_term); // Date (YYYYMMDD)
1175         date_term.resize(7);
1176         date_term[0] = 'M';
1177         newdocument.add_boolean_term(date_term); // Month (YYYYMM)
1178         date_term.resize(5);
1179         date_term[0] = 'Y';
1180         newdocument.add_boolean_term(date_term); // Year (YYYY)
1181
1182         newdocument.add_boolean_term(urlterm); // Url
1183
1184         // Add mtime as a value to allow "sort by date".
1185         newdocument.add_value(VALUE_LASTMOD,
1186                               int_to_binary_string(uint32_t(mtime)));
1187         if (use_ctime) {
1188             // Add ctime as a value to track modifications.
1189             time_t ctime = d.get_ctime();
1190             newdocument.add_value(VALUE_CTIME,
1191                                   int_to_binary_string(uint32_t(ctime)));
1192         }
1193
1194         // Add MD5 as a value to allow duplicate documents to be collapsed
1195         // together.
1196         newdocument.add_value(VALUE_MD5, md5);
1197
1198         // Add the file size as a value to allow "sort by size" and size ranges.
1199         newdocument.add_value(VALUE_SIZE,
1200                               Xapian::sortable_serialise(size));
1201
1202         bool inc_tag_added = false;
1203         if (d.is_other_readable()) {
1204             inc_tag_added = true;
1205             newdocument.add_boolean_term("I*");
1206         } else if (d.is_group_readable()) {
1207             const char * group = d.get_group();
1208             if (group) {
1209                 newdocument.add_boolean_term(string("I#") + group);
1210             }
1211         }
1212         const char * owner = d.get_owner();
1213         if (owner) {
1214             newdocument.add_boolean_term(string("O") + owner);
1215             if (!inc_tag_added && d.is_owner_readable())
1216                 newdocument.add_boolean_term(string("I@") + owner);
1217         }
1218
1219         string ext_term("E");
1220         for (string::const_iterator i = ext.begin(); i != ext.end(); ++i) {
1221             char ch = *i;
1222             if (ch >= 'A' && ch <= 'Z')
1223                 ch |= 32;
1224             ext_term += ch;
1225         }
1226         newdocument.add_boolean_term(ext_term);
1227
1228         index_add_document(urlterm, last_altered, did, newdocument);
1229     } catch (const ReadError&) {
1230         skip(urlterm, context, string("can't read file: ") + strerror(errno),
1231              d.get_size(), d.get_mtime());
1232     } catch (const NoSuchFilter&) {
1233         string filter_entry;
1234         if (cmd_it != commands.end()) {
1235             filter_entry = cmd_it->first;
1236         } else {
1237             filter_entry = mimetype;
1238         }
1239         string m = "Filter for \"";
1240         m += filter_entry;
1241         m += "\" not installed";
1242         skip(urlterm, context, m, d.get_size(), d.get_mtime());
1243         commands[filter_entry] = Filter();
1244     } catch (const FileNotFound&) {
1245         skip(urlterm, context, "File removed during indexing",
1246              d.get_size(), d.get_mtime(),
1247              SKIP_VERBOSE_ONLY | SKIP_SHOW_FILENAME);
1248     } catch (const std::string & error) {
1249         skip(urlterm, context, error, d.get_size(), d.get_mtime());
1250     } catch (const std::bad_alloc&) {
1251         // Attempt to flag the file as failed and commit changes, though that
1252         // might fail too if we're low on memory rather than being asked to
1253         // allocate a ludicrous amount.
1254         skip(urlterm, context, "Out of memory trying to extract text from file",
1255              d.get_size(), d.get_mtime(),
1256              SKIP_SHOW_FILENAME);
1257         throw CommitAndExit("Caught std::bad_alloc", "");
1258     }
1259 }
1260
1261 void
1262 index_handle_deletion()
1263 {
1264     if (updated.empty() || old_docs_not_seen == 0) return;
1265
1266     if (verbose) {
1267         cout << "Deleting " << old_docs_not_seen << " old documents which weren't found" << endl;
1268     }
1269     Xapian::PostingIterator alldocs = db.postlist_begin(string());
1270     Xapian::docid did = *alldocs;
1271     while (did < updated.size()) {
1272         if (!updated[did]) {
1273             alldocs.skip_to(did);
1274             if (alldocs == db.postlist_end(string()))
1275                 break;
1276             if (*alldocs != did) {
1277                 // Document #did didn't exist before we started.
1278                 did = *alldocs;
1279                 continue;
1280             }
1281             db.delete_document(did);
1282             if (--old_docs_not_seen == 0)
1283                 break;
1284         }
1285         ++did;
1286     }
1287 }
1288
1289 void
1290 index_commit()
1291 {
1292     db.commit();
1293 }
1294
1295 void
1296 index_done()
1297 {
1298     // If we created a temporary directory then delete it.
1299     remove_tmpdir();
1300 }