/** @file
 * @brief Handle indexing a document from a file
 */
/* Copyright 1999,2000,2001 BrightStation PLC
 * Copyright 2001,2005 James Aylett
 * Copyright 2001,2002 Ananova Ltd
 * Copyright 2002-2023 Olly Betts
 * Copyright 2009 Frank J Bruzzaniti
 * Copyright 2012 Mihai Bivol
 * Copyright 2019 Bruno Baruffaldi
 * Copyright 2020 Parth Kapadia
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
 * USA
 */

#include <config.h>

#include "index_file.h"

#include <algorithm>
#include <iostream>
#include <limits>
#include <string>
#include <map>
#include <vector>

#include <sys/types.h>
#include "safeunistd.h"
#include <cassert>
#include <cerrno>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include "safefcntl.h"
#include <ctime>

#include <xapian.h>

#include "abiwordparser.h"
#include "append_filename_arg.h"
#include "atomparser.h"
#include "datetime.h"
#include "diritor.h"
#include "failed.h"
#include "hashterm.h"
#include "htmlparser.h"
#include "md5wrap.h"
#include "mimemap.h"
#include "msxmlparser.h"
#include "opendocmetaparser.h"
#include "opendocparser.h"
#include "pkglibbindir.h"
#include "runfilter.h"
#include "sample.h"
#include "str.h"
#include "stringutils.h"
#include "svgparser.h"
#include "tmpdir.h"
#include "utf8convert.h"
#include "values.h"
#include "worker.h"
#include "xlsxparser.h"
#include "xpsparser.h"

using namespace std;

static Xapian::WritableDatabase db;
static Xapian::TermGenerator indexer;

static Xapian::doccount old_docs_not_seen;
static Xapian::docid old_lastdocid;
static vector<bool> updated;

static bool verbose;
static bool retry_failed;
static bool use_ctime;
static dup_action_type dup_action;
static bool ignore_exclusions;
static bool description_as_sample;
static bool date_terms;

static time_t last_altered_max;
static size_t sample_size;
static size_t title_size;
static size_t max_ext_len;

static empty_body_type empty_body;

static string root;
static string site_term, host_term;

static Failed failed;

map<string, Filter> commands;
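
// Mark a document surviving from a previous run as still present, so that
// index_handle_deletion() below won't delete it as removed.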
static void
mark_as_seen(Xapian::docid did)
{
    if (usual(did < updated.size() && !updated[did])) {
        updated[did] = true;
        --old_docs_not_seen;
    }
}

void
skip(const string& urlterm, const string& context, const string& msg,
     off_t size, time_t last_mod, unsigned flags)
{
    failed.add(urlterm, last_mod, size);

    if (!verbose || (flags & SKIP_SHOW_FILENAME)) {
        if (!verbose && (flags & SKIP_VERBOSE_ONLY)) return;
        cout << context << ": ";
    }

    cout << "Skipping - " << msg << endl;
}

static void
skip_cmd_failed(const string& urlterm, const string& context,
                const char* const cmd[],
                off_t size, time_t last_mod)
{
    string message;
    const char* sep = "['";
    for (auto i = cmd; *i; ++i) {
        message += sep;
        message += *i;
        sep = "', '";
    }
    message += "'] failed";
    skip(urlterm, context, message, size, last_mod);
}

static void
skip_cmd_failed(const string& urlterm, const string& context, const string& cmd,
                off_t size, time_t last_mod)
{
    skip(urlterm, context, "\"" + cmd + "\" failed", size, last_mod);
}

static void
skip_meta_tag(const string& urlterm, const string& context,
              off_t size, time_t last_mod)
{
    skip(urlterm, context, "indexing disallowed by meta tag", size, last_mod);
}

static void
skip_unknown_mimetype(const string& urlterm, const string& context,
                      const string& mimetype, off_t size, time_t last_mod)
{
    skip(urlterm, context, "unknown MIME type '" + mimetype + "'",
         size, last_mod);
}

void
index_add_default_libraries()
{
#if defined HAVE_POPPLER
    Worker* omindex_poppler = new Worker("omindex_poppler");
    index_library("application/pdf", omindex_poppler);
#endif
#if defined HAVE_LIBEBOOK
    Worker* omindex_libebook = new Worker("omindex_libebook");
    index_library("application/vnd.palm", omindex_libebook);
    index_library("application/x-fictionbook+xml", omindex_libebook);
    index_library("application/x-zip-compressed-fb2", omindex_libebook);
    index_library("application/x-sony-bbeb", omindex_libebook);
    index_library("application/x-tcr-ebook", omindex_libebook);
    index_library("application/x-qioo-ebook", omindex_libebook);
#endif
#if defined HAVE_LIBETONYEK
    Worker* omindex_libetonyek = new Worker("omindex_libetonyek");
    index_library("application/vnd.apple.keynote", omindex_libetonyek);
    index_library("application/vnd.apple.pages", omindex_libetonyek);
    index_library("application/vnd.apple.numbers", omindex_libetonyek);
#endif
#if defined HAVE_LIBGEPUB
    Worker* omindex_libgepub = new Worker("omindex_libgepub");
    index_library("application/epub+zip", omindex_libgepub);
#endif
#if defined HAVE_TESSERACT
    Worker* omindex_tesseract = new Worker("omindex_tesseract");
    index_library("image/gif", omindex_tesseract);
    index_library("image/jpeg", omindex_tesseract);
    index_library("image/png", omindex_tesseract);
    index_library("image/webp", omindex_tesseract);
    index_library("image/tiff", omindex_tesseract);
    index_library("image/x-portable-bitmap", omindex_tesseract);
    index_library("image/x-portable-graymap", omindex_tesseract);
    index_library("image/x-portable-anymap", omindex_tesseract);
    index_library("image/x-portable-pixmap", omindex_tesseract);
#endif
#if defined HAVE_GMIME
    Worker* omindex_gmime = new Worker("omindex_gmime");
    index_library("message/rfc822", omindex_gmime);
    index_library("message/news", omindex_gmime);
#endif
#if defined HAVE_LIBARCHIVE
    Worker* omindex_libarchive = new Worker("omindex_libarchive");
    index_library("application/oxps", omindex_libarchive);
    index_library("application/vnd.ms-xpsdocument", omindex_libarchive);
    index_library("application/vnd.oasis.opendocument.text",
                  omindex_libarchive);
    index_library("application/vnd.oasis.opendocument.spreadsheet",
                  omindex_libarchive);
    index_library("application/vnd.oasis.opendocument.presentation",
                  omindex_libarchive);
    index_library("application/vnd.oasis.opendocument.graphics",
                  omindex_libarchive);
    index_library("application/vnd.oasis.opendocument.chart",
                  omindex_libarchive);
    index_library("application/vnd.oasis.opendocument.formula",
                  omindex_libarchive);
    index_library("application/vnd.oasis.opendocument.database",
                  omindex_libarchive);
    index_library("application/vnd.oasis.opendocument.image",
                  omindex_libarchive);
    index_library("application/vnd.oasis.opendocument.text-master",
                  omindex_libarchive);
    index_library("application/vnd.oasis.opendocument.text-template",
                  omindex_libarchive);
    index_library("application/vnd.oasis.opendocument.spreadsheet-template",
                  omindex_libarchive);
    index_library("application/vnd.oasis.opendocument.presentation-template",
                  omindex_libarchive);
    index_library("application/vnd.oasis.opendocument.graphics-template",
                  omindex_libarchive);
    index_library("application/vnd.oasis.opendocument.chart-template",
                  omindex_libarchive);
    index_library("application/vnd.oasis.opendocument.formula-template",
                  omindex_libarchive);
    index_library("application/vnd.oasis.opendocument.image-template",
                  omindex_libarchive);
    index_library("application/vnd.oasis.opendocument.text-web",
                  omindex_libarchive);
    index_library("application/vnd.sun.xml.calc",
                  omindex_libarchive);
    index_library("application/vnd.sun.xml.calc.template",
                  omindex_libarchive);
    index_library("application/vnd.sun.xml.draw",
                  omindex_libarchive);
    index_library("application/vnd.sun.xml.draw.template",
                  omindex_libarchive);
    index_library("application/vnd.sun.xml.impress",
                  omindex_libarchive);
    index_library("application/vnd.sun.xml.impress.template",
                  omindex_libarchive);
    index_library("application/vnd.sun.xml.math",
                  omindex_libarchive);
    index_library("application/vnd.sun.xml.writer",
                  omindex_libarchive);
    index_library("application/vnd.sun.xml.writer.global",
                  omindex_libarchive);
    index_library("application/vnd.sun.xml.writer.template",
                  omindex_libarchive);
    index_library("application/vnd.openxmlformats-officedocument."
                  "wordprocessingml.document", omindex_libarchive);
    index_library("application/vnd.openxmlformats-officedocument."
                  "wordprocessingml.template", omindex_libarchive);
    index_library("application/vnd.openxmlformats-officedocument."
                  "spreadsheetml.sheet", omindex_libarchive);
    index_library("application/vnd.openxmlformats-officedocument."
                  "spreadsheetml.template", omindex_libarchive);
    index_library("application/vnd.openxmlformats-officedocument."
                  "presentationml.presentation", omindex_libarchive);
    index_library("application/vnd.openxmlformats-officedocument."
                  "presentationml.slideshow", omindex_libarchive);
    index_library("application/vnd.openxmlformats-officedocument."
                  "presentationml.template", omindex_libarchive);
#endif
#if defined HAVE_LIBABW
    Worker* omindex_libabw = new Worker("omindex_libabw");
    index_library("application/x-abiword", omindex_libabw);
    index_library("application/x-abiword-compressed", omindex_libabw);
#endif
#if defined HAVE_LIBCDR
    Worker* omindex_libcdr = new Worker("omindex_libcdr");
    index_library("image/x-coreldraw", omindex_libcdr);
#endif
#if defined HAVE_LIBEXTRACTOR
    Worker* omindex_libextractor = new Worker("omindex_libextractor");
    index_library("video/mpeg", omindex_libextractor);
    index_library("video/x-flv", omindex_libextractor);
    index_library("video/x-msvideo", omindex_libextractor);
    index_library("video/x-ms-asf", omindex_libextractor);
    index_library("video/quicktime", omindex_libextractor);
    index_library("video/ogg", omindex_libextractor);
    index_library("audio/flac", omindex_libextractor);
    index_library("audio/mpeg", omindex_libextractor);
    index_library("audio/ogg", omindex_libextractor);
    index_library("audio/x-wav", omindex_libextractor);
    index_library("audio/x-mod", omindex_libextractor);
    index_library("audio/x-s3m", omindex_libextractor);
#endif
#if defined HAVE_LIBMWAW
    Worker* omindex_libmwaw = new Worker("omindex_libmwaw");
    index_library("application/clarisworks", omindex_libmwaw);
    index_library("image/x-pict", omindex_libmwaw);
#endif
}

void
index_add_default_filters()
{
    // Command needs to be run using /bin/sh.
    auto USE_SHELL = Filter::USE_SHELL;
    // Currently none of these commands needs USE_SHELL.
    (void)USE_SHELL;
    // Input should be piped to stdin.
    auto PIPE_IN = Filter::PIPE_IN;
    // Filename can be /dev/stdin (which must be seekable).
    auto SEEK_DEV_STDIN = Filter::SEEK_DEV_STDIN;
    // Filename can be /dev/stdin (which can be a pipe).
    auto PIPE_DEV_STDIN = Filter::PIPE_DEV_STDIN;
    index_command("application/msword",
                  Filter("antiword -mUTF-8.txt -", PIPE_IN));
    index_command("application/vnd.ms-excel",
                  Filter("xls2csv -c' ' -q0 -dutf-8", PIPE_DEV_STDIN));
    index_command("application/vnd.ms-powerpoint",
                  Filter("catppt -dutf-8", PIPE_DEV_STDIN));
    // Looking at the source of wpd2html and wpd2text I think both output
    // UTF-8, but it's hard to be sure without sample Unicode .wpd files
    // as they don't seem to be at all well documented.
    index_command("application/vnd.wordperfect",
                  Filter("wpd2text", SEEK_DEV_STDIN));
    // wps2text produces UTF-8 output from the sample files I've tested.
    index_command("application/vnd.ms-works",
                  Filter("wps2text", SEEK_DEV_STDIN));
    // Output is UTF-8 according to "man djvutxt".  Generally this seems to
    // be true, though some examples from djvu.org generate isolated byte
    // 0x95 in a context which suggests it might be intended to be a bullet
    // (as it is in CP1252).
    index_command("image/vnd.djvu", Filter("djvutxt -", PIPE_IN));
    index_command("text/markdown",
                  Filter("markdown", "text/html", PIPE_IN));
    // The --text option unhelpfully converts all non-ASCII characters to "?"
    // so we use --html instead, which produces HTML entities.  The --nopict
    // option suppresses exporting picture files as pictNNNN.wmf in the
    // current directory.  Note that this option was ignored in some older
    // versions, but it was fixed in unrtf 0.20.4.
    index_command("application/rtf",
                  Filter("unrtf --nopict --html 2>/dev/null", "text/html",
                         PIPE_IN));
    index_command("text/rtf",
                  Filter("unrtf --nopict --html 2>/dev/null", "text/html",
                         PIPE_IN));
    index_command("text/x-rst",
                  Filter("rst2html", "text/html", PIPE_IN));
    index_command("application/x-mspublisher",
                  Filter("pub2xhtml", "text/html", SEEK_DEV_STDIN));
    index_command("application/vnd.ms-outlook",
                  Filter(get_pkglibbindir() + "/outlookmsg2html",
                         "text/html", SEEK_DEV_STDIN));
    index_command("application/vnd.ms-visio.drawing",
                  Filter("vsd2xhtml", "image/svg+xml", SEEK_DEV_STDIN));
    index_command("application/vnd.ms-visio.stencil",
                  Filter("vsd2xhtml", "image/svg+xml", SEEK_DEV_STDIN));
    index_command("application/vnd.ms-visio.template",
                  Filter("vsd2xhtml", "image/svg+xml", SEEK_DEV_STDIN));
    index_command("application/vnd.visio",
                  Filter("vsd2xhtml", "image/svg+xml", SEEK_DEV_STDIN));
    // pod2text's output character set doesn't seem to be documented, but
    // from inspecting the source it looks like it's probably iso-8859-1.
    // We need to pass "--errors=stderr" or else minor POD formatting errors
    // cause a file not to be indexed.
    index_command("text/x-perl",
                  Filter("pod2text --errors=stderr",
                         "text/plain", "iso-8859-1", PIPE_IN));
    // FIXME: -e0 means "UTF-8", but that results in "fi", "ff", "ffi", etc
    // appearing as single ligatures.  For European languages, it's actually
    // better to use -e2 (ISO-8859-1) and then convert, so let's do that for
    // now until we handle Unicode "compatibility decompositions".
    index_command("application/x-dvi",
                  Filter("catdvi -e2 -s", "text/plain", "iso-8859-1",
                         PIPE_IN));
    // Simplistic - ought to look in index.rdf files for filename and
    // character set.
    index_command("application/x-maff",
                  Filter("unzip -p %f '*/*.*htm*'", "text/html", "iso-8859-1",
                         SEEK_DEV_STDIN));
    index_command("application/x-mimearchive",
                  Filter(get_pkglibbindir() + "/mhtml2html", "text/html",
                         PIPE_DEV_STDIN));
    index_command("message/news",
                  Filter(get_pkglibbindir() + "/rfc822tohtml", "text/html",
                         PIPE_DEV_STDIN));
    index_command("message/rfc822",
                  Filter(get_pkglibbindir() + "/rfc822tohtml", "text/html",
                         PIPE_DEV_STDIN));
    index_command("text/vcard",
                  Filter(get_pkglibbindir() + "/vcard2text", PIPE_DEV_STDIN));
    index_command("application/vnd.apple.keynote",
                  Filter("key2text", SEEK_DEV_STDIN));
    index_command("application/vnd.apple.numbers",
                  Filter("numbers2text", SEEK_DEV_STDIN));
    index_command("application/vnd.apple.pages",
                  Filter("pages2text", SEEK_DEV_STDIN));
}

void
index_init(const string& dbpath, const Xapian::Stem& stemmer,
           const string& root_, const string& site_term_,
           const string& host_term_,
           empty_body_type empty_body_, dup_action_type dup_action_,
           size_t sample_size_, size_t title_size_, size_t max_ext_len_,
           bool overwrite, bool retry_failed_,
           bool delete_removed_documents, bool verbose_, bool use_ctime_,
           bool spelling, bool ignore_exclusions_, bool description_as_sample_,
           bool date_terms_)
{
    root = root_;
    site_term = site_term_;
    host_term = host_term_;
    empty_body = empty_body_;
    dup_action = dup_action_;
    sample_size = sample_size_;
    title_size = title_size_;
    max_ext_len = max_ext_len_;
    verbose = verbose_;
    use_ctime = use_ctime_;
    ignore_exclusions = ignore_exclusions_;
    description_as_sample = description_as_sample_;
    date_terms = date_terms_;

    if (!overwrite) {
        db = Xapian::WritableDatabase(dbpath, Xapian::DB_CREATE_OR_OPEN);
        old_docs_not_seen = db.get_doccount();
        // Handle an initially empty database exactly the same way as when
        // overwrite is true.
        if (old_docs_not_seen != 0) {
            old_lastdocid = db.get_lastdocid();
            if (delete_removed_documents) {
                // + 1 so that old_lastdocid is a valid subscript.
                updated.resize(old_lastdocid + 1);
            }
            Xapian::valueno slot = use_ctime ? VALUE_CTIME : VALUE_LASTMOD;
            string ubound = db.get_value_upper_bound(slot);
            if (!ubound.empty())
                last_altered_max = binary_string_to_int(ubound);
        }
    } else {
        db = Xapian::WritableDatabase(dbpath, Xapian::DB_CREATE_OR_OVERWRITE);
    }

    if (spelling) {
        indexer.set_database(db);
        indexer.set_flags(indexer.FLAG_SPELLING);
    }
    indexer.set_stemmer(stemmer);

    runfilter_init();

    failed.init(db);

    if (overwrite) {
        // There are no failures to retry, so setting this flag doesn't
        // change the outcome, but does mean we avoid the overhead of
        // checking for a previous failure.
        retry_failed = true;
    } else if (retry_failed_) {
        failed.clear();
        retry_failed = true;
    } else {
        // If there are no existing failures, setting this flag doesn't
        // change the outcome, but does mean we avoid the overhead of
        // checking for a previous failure.
        retry_failed = failed.empty();
    }
}
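
// A minimal usage sketch (illustrative only - the real call sites live in
// omindex's option handling, and the argument values here are assumptions):
//
//   index_init("/srv/omega/default", Xapian::Stem("english"),
//              root, site_term, host_term,
//              EMPTY_BODY_WARN, DUP_CHECK_LAZILY,
//              /*sample_size*/ 512, /*title_size*/ 128, /*max_ext_len*/ 8,
//              /*overwrite*/ false, /*retry_failed*/ false,
//              /*delete_removed_documents*/ true, /*verbose*/ false,
//              /*use_ctime*/ false, /*spelling*/ true,
//              /*ignore_exclusions*/ false, /*description_as_sample*/ false,
//              /*date_terms*/ true);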

static void
parse_pdfinfo_field(const char* p, const char* end, string& out,
                    const char* field, size_t len)
{
    if (size_t(end - p) > len && memcmp(p, field, len) == 0) {
        p += len;
        while (p != end && *p == ' ')
            ++p;
        if (p != end && (end[-1] != '\r' || --end != p))
            out.assign(p, end - p);
    }
}

#define PARSE_PDFINFO_FIELD(P, END, OUT, FIELD) \
    parse_pdfinfo_field((P), (END), (OUT), FIELD":", CONST_STRLEN(FIELD) + 1)
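
// So, for example, PARSE_PDFINFO_FIELD(start, eol, author, "Author") checks
// for a line starting "Author:", skips any spaces after the colon, and
// assigns the rest of the line (minus any trailing '\r') to `author`.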

static void
parse_pdf_metainfo(const string& pdfinfo, string& author, string& title,
                   string& keywords, string& topic, int& pages)
{
    const char* p = pdfinfo.data();
    const char* end = p + pdfinfo.size();
    while (p != end) {
        const char* start = p;
        p = static_cast<const char*>(memchr(p, '\n', end - p));
        const char* eol;
        if (p) {
            eol = p;
            ++p;
        } else {
            p = eol = end;
        }
        switch (*start) {
            case 'A':
                PARSE_PDFINFO_FIELD(start, eol, author, "Author");
                break;
            case 'K':
                PARSE_PDFINFO_FIELD(start, eol, keywords, "Keywords");
                break;
            case 'P': {
                string s;
                PARSE_PDFINFO_FIELD(start, eol, s, "Pages");
                if (!s.empty())
                    pages = atoi(s.c_str());
                break;
            }
            case 'S':
                PARSE_PDFINFO_FIELD(start, eol, topic, "Subject");
                break;
            case 'T':
                PARSE_PDFINFO_FIELD(start, eol, title, "Title");
                break;
        }
    }
}
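
// For instance (illustrative), feeding parse_pdf_metainfo() typical
// "Field: value" lines as produced by `pdfinfo -enc UTF-8`, such as:
//
//   Title:          Annual Report
//   Author:         J. Smith
//   Pages:          42
//
// sets title, author and pages accordingly; unrecognised fields are ignored.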

static void
get_pdf_metainfo(int fd, string& author, string& title,
                 string& keywords, string& topic, int& pages)
{
    try {
        string pdfinfo;
        static const char* const cmd[] = {
            "pdfinfo", "-enc", "UTF-8", "-", NULL
        };
        // Capture pdfinfo's output so it can be parsed below.
        run_filter(fd, cmd, &pdfinfo);
        parse_pdf_metainfo(pdfinfo, author, title, keywords, topic, pages);
    } catch (const ReadError&) {
        // It's probably best to index the document even if pdfinfo fails.
    }
}

static void
get_pdf_metainfo(const string& file, string& author, string& title,
                 string& keywords, string& topic, int& pages)
{
    try {
        const char* cmd[] = {
            "pdfinfo", "-enc", "UTF-8", NULL, NULL
        };
        cmd[3] = file.c_str();
        parse_pdf_metainfo(stdout_to_string(cmd),
                           author, title, keywords, topic, pages);
    } catch (const ReadError&) {
        // It's probably best to index the document even if pdfinfo fails.
    }
}

static void
generate_sample_from_csv(const string& csv_data, string& sample)
{
    // Add 3 to allow for a 4 byte utf-8 sequence being appended when
    // output is sample_size - 1 bytes long.  Use csv_data.size() if smaller
    // since the user might reasonably set sample_size really high.
    sample.reserve(min(sample_size + 3, csv_data.size()));
    size_t last_word_end = 0;
    bool in_space = true;
    bool in_quotes = false;
    for (Xapian::Utf8Iterator i(csv_data); i != Xapian::Utf8Iterator(); ++i) {
        unsigned ch = *i;

        if (!in_quotes) {
            // If not already in double quotes, '"' starts quoting and
            // ',' starts a new field.
            if (ch == '"') {
                in_quotes = true;
                continue;
            }
            if (ch == ',')
                ch = ' ';
        } else if (ch == '"') {
            // In double quotes, '"' either ends double quotes, or
            // if followed by another '"', means a literal '"'.
            if (++i == Xapian::Utf8Iterator())
                break;
            ch = *i;
            if (ch != '"') {
                in_quotes = false;
                if (ch == ',')
                    ch = ' ';
            }
        }

        if (ch <= ' ' || ch == 0xa0) {
            // FIXME: if all the whitespace characters between two
            // words are 0xa0 (non-breaking space) then perhaps we
            // should output 0xa0.
            if (in_space)
                continue;
            last_word_end = sample.size();
            sample += ' ';
            in_space = true;
        } else {
            Xapian::Unicode::append_utf8(sample, ch);
            in_space = false;
        }

        if (sample.size() >= sample_size) {
            // Need to truncate sample.
            if (last_word_end <= sample_size / 2) {
                // Monster word!  We'll have to just split it.
                sample.replace(sample_size - 3, string::npos, "...", 3);
            } else {
                sample.replace(last_word_end, string::npos, " ...", 4);
            }
            break;
        }
    }
}
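
// An illustrative example (not from the original source): the CSV input
//
//   "name","note"
//   alice,"says ""hi"""
//
// yields the sample text: name note alice says "hi"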

static bool
index_check_existing(const string& urlterm, time_t last_altered,
                     Xapian::docid& did)
{
    switch (dup_action) {
        case DUP_SKIP: {
            Xapian::PostingIterator p = db.postlist_begin(urlterm);
            if (p != db.postlist_end(urlterm)) {
                if (verbose)
                    cout << "already indexed, not updating" << endl;
                did = *p;
                mark_as_seen(did);
                return true;
            }
            break;
        }
        case DUP_CHECK_LAZILY: {
            // If last_altered > last_altered_max, we know for sure that the
            // file is new or updated.
            if (last_altered > last_altered_max) {
                return false;
            }

            Xapian::PostingIterator p = db.postlist_begin(urlterm);
            if (p != db.postlist_end(urlterm)) {
                did = *p;
                Xapian::Document doc = db.get_document(did);
                Xapian::valueno slot = use_ctime ? VALUE_CTIME : VALUE_LASTMOD;
                string value = doc.get_value(slot);
                time_t old_last_altered = binary_string_to_int(value);
                if (last_altered <= old_last_altered) {
                    if (verbose)
                        cout << "already indexed" << endl;
                    // The docid should be in updated - the only valid
                    // exception is if the URL was long and hashed to the
                    // same URL as an existing document indexed in the same
                    // batch.
                    mark_as_seen(did);
                    return true;
                }
            }
            break;
        }
    }
    return false;
}

void
index_remove_failed_entry(const string& urlterm)
{
    failed.del(urlterm);
}

void
index_add_document(const string& urlterm, time_t last_altered,
                   Xapian::docid did, const Xapian::Document& doc)
{
    if (dup_action != DUP_SKIP) {
        // If this document has already been indexed, update the existing
        // entry.
        if (did) {
            // We already found out the document id above.
            db.replace_document(did, doc);
        } else if (last_altered <= last_altered_max) {
            // We checked for the UID term and didn't find it.
            did = db.add_document(doc);
        } else {
            did = db.replace_document(urlterm, doc);
        }
        mark_as_seen(did);
        if (verbose) {
            if (did <= old_lastdocid) {
                cout << "updated" << endl;
            } else {
                cout << "added" << endl;
            }
        }
    } else {
        // If this were a duplicate, we'd have skipped it above.
        db.add_document(doc);
        if (verbose)
            cout << "added" << endl;
    }
}

void
index_mimetype(const string& file, const string& urlterm, const string& url,
               const string& ext,
               string mimetype,
               DirectoryIterator& d,
               string pathterm,
               string record)
{
    string context(file, root.size(), string::npos);

    // FIXME: We could be cleverer here and check mtime too when use_ctime is
    // set - if the ctime has changed but the mtime is unchanged, we can just
    // update the existing Document and avoid having to re-extract text, etc.
    time_t last_altered = use_ctime ? d.get_ctime() : d.get_mtime();

    Xapian::docid did = 0;
    if (index_check_existing(urlterm, last_altered, did))
        return;

    if (!retry_failed) {
        // We only store and check the mtime (last modified) - a change to the
        // metadata won't generally cause a previous failure to now work
        // (FIXME: except permissions).
        time_t failed_last_mod;
        off_t failed_size;
        if (failed.contains(urlterm, failed_last_mod, failed_size)) {
            if (d.get_mtime() <= failed_last_mod &&
                d.get_size() == failed_size) {
                if (verbose)
                    cout << "failed to extract text on earlier run" << endl;
                return;
            }
            // The file has changed, so remove the entry for it.  If it fails
            // again on this attempt, we'll add a new one.
            failed.del(urlterm);
        }
    }

    // If we didn't get the mime type from the extension, call libmagic to
    // get it.
    if (mimetype.empty()) {
        mimetype = d.get_magic_mimetype();
        if (mimetype.empty()) {
            skip(urlterm, file.substr(root.size()),
                 "Unknown extension and unrecognised format",
                 d.get_size(), d.get_mtime(), SKIP_SHOW_FILENAME);
            return;
        }
    }

    if (verbose)
        cout << "Indexing \"" << file.substr(root.size()) << "\" as "
             << mimetype << " ... " << flush;

    // Use `file` as the basis, as we don't want URL encoding in these terms,
    // but need to switch over the initial part so we get `/~olly/foo/bar`
    // rather than `/home/olly/public_html/foo/bar`.
    Xapian::Document newdocument;
    size_t j;
    while ((j = pathterm.rfind('/')) > 1 && j != string::npos) {
        pathterm.resize(j);
        if (pathterm.length() > MAX_SAFE_TERM_LENGTH) {
            string term_hash = hash_long_term(pathterm, MAX_SAFE_TERM_LENGTH);
            newdocument.add_boolean_term(term_hash);
        } else {
            newdocument.add_boolean_term(pathterm);
        }
    }
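
    // For example (hypothetical path): a pathterm of "P/~olly/docs/a.pdf"
    // adds the boolean terms "P/~olly/docs" and "P/~olly" for its ancestor
    // directories.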

    string author, title, sample, keywords, topic, dump;
    string to, cc, bcc, message_id;
    string md5;
    time_t created = time_t(-1);
    int pages = -1;
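
    // Look up the filter to use: first try the exact MIME type, then the
    // "type/*" wildcard, then "*/*", and finally "*".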
    map<string, Filter>::const_iterator cmd_it = commands.find(mimetype);
    if (cmd_it == commands.end()) {
        size_t slash = mimetype.find('/');
        if (slash != string::npos) {
            string wildtype(mimetype, 0, slash + 2);
            wildtype[slash + 1] = '*';
            cmd_it = commands.find(wildtype);
            if (cmd_it == commands.end()) {
                cmd_it = commands.find("*/*");
            }
        }
        if (cmd_it == commands.end()) {
            cmd_it = commands.find("*");
        }
    }
    try {
        if (cmd_it != commands.end() && cmd_it->second.worker) {
            // Use a worker process to extract the content.
            Worker* wrk = cmd_it->second.worker;
            int r = wrk->extract(file, mimetype, dump, title, keywords, author,
                                 to, cc, bcc, message_id, pages, created);
            if (r != 0) {
                string msg = wrk->get_error();
                assert(!msg.empty());
                skip(urlterm, context, msg, d.get_size(), d.get_mtime());
                if (r < 0) {
                    // Hard failure - don't try this filter again for this run.
                    string filter_entry;
                    if (cmd_it != commands.end()) {
                        filter_entry = cmd_it->first;
                    } else {
                        filter_entry = mimetype;
                    }
                    commands[filter_entry] = Filter();
                }
                return;
            }
        } else if (cmd_it != commands.end()) {
            // Easy "run a command and read text or HTML from stdout or a
            // temporary file" cases.
            auto& filter = cmd_it->second;
            string cmd = filter.cmd;
            if (cmd.empty()) {
                skip(urlterm, context, "required filter not installed",
                     d.get_size(), d.get_mtime(), SKIP_VERBOSE_ONLY);
                return;
            }
            if (cmd == "false") {
                // Allow setting 'false' as a filter to mean that a MIME type
                // should be quietly ignored.
                string m = "ignoring MIME type '";
                m += cmd_it->first;
                m += "'";
                skip(urlterm, context, m, d.get_size(), d.get_mtime(),
                     SKIP_VERBOSE_ONLY);
                return;
            }
            bool use_shell = filter.use_shell();
            bool input_on_stdin = filter.input_on_stdin();
            bool substituted = false;
            string tmpout;
            size_t pcent = 0;
            while (true) {
                pcent = cmd.find('%', pcent);
                if (pcent >= cmd.size() - 1)
                    break;
                switch (cmd[pcent + 1]) {
                    case '%': // %% -> %.
                        cmd.erase(++pcent, 1);
                        break;
                    case 'f': { // %f -> escaped filename.
                        substituted = true;
                        if (filter.dev_stdin()) {
                            cmd.replace(pcent, 2, "/dev/stdin",
                                        CONST_STRLEN("/dev/stdin"));
                            break;
                        }
                        string tail(cmd, pcent + 2);
                        cmd.resize(pcent);
                        // Suppress the space append_filename_argument()
                        // usually adds before the argument - the command
                        // string either includes one, or won't expect one
                        // (e.g. --input=%f).
                        append_filename_argument(cmd, file, false);
                        pcent = cmd.size();
                        cmd += tail;
                        break;
                    }
                    case 't': { // %t -> temporary output file.
                        if (tmpout.empty()) {
                            // Use a temporary file with a suitable extension
                            // in case the command cares, and for more helpful
                            // error messages from the command.
                            if (filter.output_type == "text/html") {
                                tmpout = get_tmpfile("tmp.html");
                            } else if (filter.output_type == "image/svg+xml") {
                                tmpout = get_tmpfile("tmp.svg");
                            } else {
                                tmpout = get_tmpfile("tmp.txt");
                            }
                        }
                        substituted = true;
                        string tail(cmd, pcent + 2);
                        cmd.resize(pcent);
                        // Suppress the space append_filename_argument()
                        // usually adds before the argument - the command
                        // string either includes one, or won't expect one
                        // (e.g. --output=%t).
                        append_filename_argument(cmd, tmpout, false);
                        pcent = cmd.size();
                        cmd += tail;
                        break;
                    }
                    default:
                        // Leave anything else alone for now.
                        pcent += 2;
                        break;
                }
            }
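
            // For example (hypothetical filter command): given
            //   "sometool --input=%f --output=%t"
            // %f is replaced by the escaped input filename and %t by a
            // temporary output file, which is read into `dump` below.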
            if (!substituted && cmd != "true") {
                if (input_on_stdin) {
                    if (filter.dev_stdin()) {
                        cmd += " /dev/stdin";
                    }
                } else {
                    // If no %f, append the filename to the command.
                    append_filename_argument(cmd, file);
                }
            }
            try {
                if (!tmpout.empty()) {
                    // Output in temporary file.
                    if (input_on_stdin) {
                        run_filter(d.get_fd(), cmd, use_shell);
                    } else {
                        run_filter(cmd, use_shell);
                    }
                    if (!load_file(tmpout, dump, NOCACHE)) {
                        throw ReadError("Couldn't read output file");
                    }
                    unlink(tmpout.c_str());
                } else if (cmd == "true") {
                    // Ignore the file's contents, just index metadata from
                    // the filing system.
                } else {
                    // Output on stdout.
                    if (input_on_stdin) {
                        run_filter(d.get_fd(), cmd, use_shell, &dump);
                    } else {
                        run_filter(cmd, use_shell, &dump);
                    }
                }
                const string& charset = filter.output_charset;
                if (filter.output_type == "text/html") {
                    HtmlParser p;
                    p.ignore_metarobots();
                    p.description_as_sample = description_as_sample;
                    try {
                        p.parse(dump, charset, false);
                    } catch (const string& newcharset) {
                        p.reset();
                        p.ignore_metarobots();
                        p.description_as_sample = description_as_sample;
                        p.parse(dump, newcharset, true);
                    } catch (const ReadError&) {
                        skip_cmd_failed(urlterm, context, cmd,
                                        d.get_size(), d.get_mtime());
                        return;
                    }
                    dump = p.dump;
                    title = p.title;
                    keywords = p.keywords;
                    topic = p.topic;
                    sample = p.sample;
                    author = p.author;
                    created = p.created;
                } else if (filter.output_type == "image/svg+xml") {
                    SvgParser svgparser;
                    svgparser.parse(dump);
                    dump = svgparser.dump;
                    title = svgparser.title;
                    keywords = svgparser.keywords;
                    // FIXME: topic = svgparser.topic;
                    author = svgparser.author;
                } else if (!charset.empty()) {
                    convert_to_utf8(dump, charset);
                }
            } catch (const ReadError&) {
                skip_cmd_failed(urlterm, context, cmd,
                                d.get_size(), d.get_mtime());
                return;
            }
        } else if (mimetype == "text/html" || mimetype == "text/x-php") {
            const string& text = d.file_to_string();
            HtmlParser p;
            if (ignore_exclusions) p.ignore_metarobots();
            p.description_as_sample = description_as_sample;
            try {
                // Default HTML character set is latin 1, though not
                // specifying one is deprecated these days.
                p.parse(text, "iso-8859-1", false);
            } catch (const string& newcharset) {
                p.reset();
                if (ignore_exclusions) p.ignore_metarobots();
                p.description_as_sample = description_as_sample;
                p.parse(text, newcharset, true);
            }
            if (!p.indexing_allowed) {
                skip_meta_tag(urlterm, context,
                              d.get_size(), d.get_mtime());
                return;
            }
            dump = p.dump;
            title = p.title;
            keywords = p.keywords;
            topic = p.topic;
            sample = p.sample;
            author = p.author;
            created = p.created;
            md5_string(text, md5);
        } else if (mimetype == "text/plain") {
            // Currently we assume that text files are UTF-8 unless they
            // have a byte-order mark.
            dump = d.file_to_string();
            md5_string(dump, md5);

            // Look for Byte-Order Mark (BOM).
            if (startswith(dump, "\xfe\xff") || startswith(dump, "\xff\xfe")) {
                // UTF-16 in big-endian/little-endian order - we just convert
                // it as "UTF-16" and let the conversion handle the BOM as
                // that way we avoid the copying overhead of erasing 2 bytes
                // from the start of dump.
                convert_to_utf8(dump, "UTF-16");
            } else if (startswith(dump, "\xef\xbb\xbf")) {
                // UTF-8 with stupid Windows not-the-byte-order mark.
                dump.erase(0, 3);
            } else {
                // FIXME: What charset is the file?  Look at contents?
            }
        } else if (mimetype == "application/pdf") {
            const char* const cmd[] = {
                "pdftotext", "-enc", "UTF-8", "-", "-", NULL
            };
            try {
                run_filter(d.get_fd(), cmd, &dump);
            } catch (const ReadError&) {
                skip_cmd_failed(urlterm, context, cmd,
                                d.get_size(), d.get_mtime());
                return;
            }
            get_pdf_metainfo(d.get_fd(), author, title, keywords, topic,
                             pages);
        } else if (mimetype == "application/postscript") {
            // There simply doesn't seem to be a Unicode capable PostScript
            // to text converter (e.g. pstotext always outputs ISO-8859-1).
            // The only solution seems to be to convert via PDF using ps2pdf
            // and then pdftotext.  This gives plausible looking UTF-8 output
            // for some Chinese PostScript files I found using Google.  It
            // also has the benefit of allowing us to extract meta
            // information from PostScript files.
            string tmpfile = get_tmpfile("tmp.pdf");
            if (tmpfile.empty()) {
                // FIXME: should this be fatal?  Or disable indexing
                // postscript?
                string msg = "Couldn't create temporary directory (";
                msg += strerror(errno);
                msg += ")";
                skip(urlterm, context, msg,
                     d.get_size(), d.get_mtime());
                return;
            }
            const char* cmd[] = {
                "ps2pdf", "-", NULL, NULL
            };
            cmd[2] = tmpfile.c_str();
            try {
                run_filter(d.get_fd(), cmd);
                const char* cmd2[] = {
                    "pdftotext", "-enc", "UTF-8", NULL, "-", NULL
                };
                cmd2[3] = tmpfile.c_str();
                run_filter(cmd2, &dump);
            } catch (const ReadError&) {
                skip_cmd_failed(urlterm, context, cmd,
                                d.get_size(), d.get_mtime());
                unlink(tmpfile.c_str());
                return;
            } catch (...) {
                unlink(tmpfile.c_str());
                throw;
            }
            try {
                get_pdf_metainfo(tmpfile, author, title, keywords, topic,
                                 pages);
            } catch (...) {
                unlink(tmpfile.c_str());
                throw;
            }
            unlink(tmpfile.c_str());
        } else if (startswith(mimetype, "application/vnd.sun.xml.") ||
                   startswith(mimetype, "application/vnd.oasis.opendocument."))
        {
            // Inspired by http://mjr.towers.org.uk/comp/sxw2text
            string cmd = "unzip -p";
            append_filename_argument(cmd, file);
            cmd += " content.xml ; unzip -p";
            append_filename_argument(cmd, file);
            cmd += " styles.xml";
            try {
                OpenDocParser parser;
                parser.parse(stdout_to_string(cmd, true));
                dump = parser.dump;
            } catch (const ReadError&) {
                skip_cmd_failed(urlterm, context, cmd,
                                d.get_size(), d.get_mtime());
                return;
            }

            cmd = "unzip -p";
            append_filename_argument(cmd, file);
            cmd += " meta.xml";
            try {
                OpenDocMetaParser metaparser;
                metaparser.parse(stdout_to_string(cmd, false));
                title = metaparser.title;
                keywords = metaparser.keywords;
                // FIXME: topic = metaparser.topic;
                sample = metaparser.sample;
                author = metaparser.author;
                pages = metaparser.pages;
            } catch (const ReadError&) {
                // It's probably best to index the document even if this
                // fails.
            }
        } else if (startswith(mimetype,
                              "application/vnd.openxmlformats-officedocument."))
        {
            const char* args = NULL;
            string tail(mimetype, 46);
            if (startswith(tail, "wordprocessingml.")) {
                // unzip returns exit code 11 if a file to extract wasn't
                // found which we want to ignore, because there may be no
                // headers or no footers.
                args = " word/document.xml"
                       " 'word/header*.xml'"
                       " 'word/footer*.xml'"
                       " 2>/dev/null";
            } else if (startswith(tail, "spreadsheetml.")) {
                // Extract the shared string table first, so our parser can
                // grab those ready for parsing the sheets which will
                // reference the shared strings.
                string cmd = "unzip -p";
                append_filename_argument(cmd, file);
                cmd += " xl/styles.xml xl/workbook.xml xl/sharedStrings.xml ; "
                       "unzip -p";
                append_filename_argument(cmd, file);
                cmd += " xl/worksheets/sheet\\*.xml";
                try {
                    XlsxParser parser;
                    parser.parse(stdout_to_string(cmd, true));
                    dump = parser.dump;
                } catch (const ReadError&) {
                    skip_cmd_failed(urlterm, context, cmd,
                                    d.get_size(), d.get_mtime());
                    return;
                }
            } else if (startswith(tail, "presentationml.")) {
                // unzip returns exit code 11 if a file to extract wasn't
                // found which we want to ignore, because there may be no
                // notesSlides or comments.
                args = " 'ppt/slides/slide*.xml'"
                       " 'ppt/notesSlides/notesSlide*.xml'"
                       " 'ppt/comments/comment*.xml'"
                       " 2>/dev/null";
            } else {
                // Don't know how to index this type.
                skip_unknown_mimetype(urlterm, context, mimetype,
                                      d.get_size(), d.get_mtime());
                return;
            }

            if (args) {
                string cmd = "unzip -p";
                append_filename_argument(cmd, file);
                cmd += args;
                try {
                    MSXmlParser xmlparser;
                    // Treat exit status 11 from unzip as success - this is
                    // what we get if one of the listed filenames to extract
                    // doesn't match anything in the zip file.
                    xmlparser.parse(stdout_to_string(cmd, false, 11));
                    dump = xmlparser.dump;
                } catch (const ReadError&) {
                    skip_cmd_failed(urlterm, context, cmd,
                                    d.get_size(), d.get_mtime());
                    return;
                }
            }

            string cmd = "unzip -p";
            append_filename_argument(cmd, file);
            cmd += " docProps/core.xml";
            try {
                OpenDocMetaParser metaparser;
                metaparser.parse(stdout_to_string(cmd, false));
                title = metaparser.title;
                keywords = metaparser.keywords;
                // FIXME: topic = metaparser.topic;
                sample = metaparser.sample;
                author = metaparser.author;
            } catch (const ReadError&) {
                // It's probably best to index the document even if this
                // fails.
            }
        } else if (mimetype == "application/x-abiword") {
            AbiwordParser abiwordparser;
            const string& text = d.file_to_string();
            abiwordparser.parse(text);
            dump = abiwordparser.dump;
            md5_string(text, md5);
        } else if (mimetype == "application/x-abiword-compressed") {
            AbiwordParser abiwordparser;
            abiwordparser.parse(d.gzfile_to_string());
            dump = abiwordparser.dump;
        } else if (mimetype == "application/oxps" ||
                   mimetype == "application/vnd.ms-xpsdocument") {
            const char* cmd[] = {
                "unzip", "-p", NULL, "Documents/*/Pages/*.fpage", NULL
            };
            cmd[2] = file.c_str();
            try {
                XpsParser xpsparser;
                run_filter(cmd, &dump);
                xpsparser.parse(dump);
                dump = xpsparser.dump;
            } catch (const ReadError&) {
                skip_cmd_failed(urlterm, context, cmd,
                                d.get_size(), d.get_mtime());
                return;
            }

            const char* cmd2[] = {
                "unzip", "-p", NULL, "docProps/core.xml", NULL
            };
            cmd2[2] = file.c_str();
            try {
                OpenDocMetaParser metaparser;
                metaparser.parse(stdout_to_string(cmd2));
                title = metaparser.title;
                keywords = metaparser.keywords;
                // FIXME: topic = metaparser.topic;
                sample = metaparser.sample;
                author = metaparser.author;
            } catch (const ReadError&) {
                // Ignore errors as not all XPS files contain this file.
            }
        } else if (mimetype == "text/csv") {
            // Currently we assume that text files are UTF-8 unless they
            // have a byte-order mark.
            dump = d.file_to_string();
            md5_string(dump, md5);

            // Look for Byte-Order Mark (BOM).
            if (startswith(dump, "\xfe\xff") || startswith(dump, "\xff\xfe")) {
                // UTF-16 in big-endian/little-endian order - we just convert
                // it as "UTF-16" and let the conversion handle the BOM as
                // that way we avoid the copying overhead of erasing 2 bytes
                // from the start of dump.
                convert_to_utf8(dump, "UTF-16");
            } else if (startswith(dump, "\xef\xbb\xbf")) {
                // UTF-8 with stupid Windows not-the-byte-order mark.
                dump.erase(0, 3);
            } else {
                // FIXME: What charset is the file?  Look at contents?
            }

            generate_sample_from_csv(dump, sample);
        } else if (mimetype == "image/svg+xml") {
            SvgParser svgparser;
            const string& text = d.file_to_string();
            md5_string(text, md5);
            svgparser.parse(text);
            dump = svgparser.dump;
            title = svgparser.title;
            keywords = svgparser.keywords;
            // FIXME: topic = svgparser.topic;
            author = svgparser.author;
        } else if (mimetype == "image/svg+xml-compressed") {
            SvgParser svgparser;
            const string& text = d.gzfile_to_string();
            svgparser.parse(text);
            dump = svgparser.dump;
            title = svgparser.title;
            keywords = svgparser.keywords;
            // FIXME: topic = svgparser.topic;
            author = svgparser.author;
        } else if (mimetype == "application/vnd.debian.binary-package" ||
                   mimetype == "application/x-debian-package") {
            const char* cmd = "dpkg-deb -f - Description";
            string desc;
            run_filter(d.get_fd(), cmd, false, &desc);
            // First line is short description, which we use as the title.
            string::size_type idx = desc.find('\n');
            title.assign(desc, 0, idx);
            if (idx != string::npos) {
                dump.assign(desc, idx + 1, string::npos);
            }
        } else if (mimetype == "application/x-redhat-package-manager" ||
                   mimetype == "application/x-rpm") {
            string cmd("rpm -q --qf '%{SUMMARY}\\n%{DESCRIPTION}' -p");
            append_filename_argument(cmd, file);
            string desc;
            run_filter(cmd, false, &desc);
            // First line is summary, which we use as the title.
            string::size_type idx = desc.find('\n');
            title.assign(desc, 0, idx);
            if (idx != string::npos) {
                dump.assign(desc, idx + 1, string::npos);
            }
        } else if (mimetype == "application/atom+xml") {
            AtomParser atomparser;
            const string& text = d.file_to_string();
            md5_string(text, md5);
            atomparser.parse(text);
            dump = atomparser.dump;
            title = atomparser.title;
            keywords = atomparser.keywords;
            // FIXME: topic = atomparser.topic;
            author = atomparser.author;
        } else {
            // Don't know how to index this type.
            skip_unknown_mimetype(urlterm, context, mimetype,
                                  d.get_size(), d.get_mtime());
            return;
        }

        // Compute the MD5 of the file if we haven't already.
        if (md5.empty() && !d.md5(md5)) {
            if (errno == ENOENT || errno == ENOTDIR) {
                skip(urlterm, context, "File removed during indexing",
                     d.get_size(), d.get_mtime(),
                     SKIP_VERBOSE_ONLY | SKIP_SHOW_FILENAME);
            } else {
                skip(urlterm, context,
                     "failed to read file to calculate MD5 checksum",
                     d.get_size(), d.get_mtime());
            }
            return;
        }

        // Remove any trailing formfeeds, so we don't consider them when
        // deciding if we extracted any text (e.g. pdftotext outputs a
        // formfeed between each page, even for blank pages).
        //
        // If dump contains only formfeeds, then trim_end will be
        // string::npos and ++trim_end will be 0, which is the correct new
        // size.
        string::size_type trim_end = dump.find_last_not_of('\f');
        if (UNSIGNED_OVERFLOW_OK(++trim_end) != dump.size())
            dump.resize(trim_end);

        if (dump.empty()) {
            switch (empty_body) {
                case EMPTY_BODY_INDEX:
                    break;
                case EMPTY_BODY_WARN:
                    cout << "no text extracted from document body, "
                            "but indexing metadata anyway" << endl;
                    break;
                case EMPTY_BODY_SKIP:
                    skip(urlterm, context,
                         "no text extracted from document body",
                         d.get_size(), d.get_mtime());
                    return;
            }
        }

        // Produce a sample.
        if (sample.empty()) {
            sample = generate_sample(dump, sample_size, "...", " ...");
        } else {
            sample = generate_sample(sample, sample_size, "...", " ...");
        }

        // Put the data in the document.
        if (record.empty()) {
            record = "url=";
        } else {
            record += "\nurl=";
        }
        record += url;
        record += "\nsample=";
        record += sample;
        if (!title.empty()) {
            record += "\ncaption=";
            record += generate_sample(title, title_size, "...", " ...");
        }
        if (!author.empty()) {
            record += "\nauthor=";
            record += author;
        }
        if (!to.empty()) {
            record += "\nto=";
            record += to;
        }
        if (!cc.empty()) {
            record += "\ncc=";
            record += cc;
        }
        if (!bcc.empty()) {
            record += "\nbcc=";
            record += bcc;
        }
        if (!message_id.empty()) {
            record += "\nmsgid=";
            record += message_id;
        }
        record += "\ntype=";
        record += mimetype;
        time_t mtime = d.get_mtime();
        if (mtime != static_cast<time_t>(-1)) {
            record += "\nmodtime=";
            record += str(mtime);
        }
        if (created != static_cast<time_t>(-1)) {
            record += "\ncreated=";
            record += str(created);
        }
        if (pages >= 0) {
            record += "\npages=";
            record += str(pages);
        }
        off_t size = d.get_size();
        record += "\nsize=";
        record += str(size);
        newdocument.set_data(record);

        // Index the title, document text, keywords and topic.
        indexer.set_document(newdocument);
        if (!title.empty()) {
            indexer.index_text(title, 5, "S");
            indexer.increase_termpos(100);
        }
        if (!dump.empty()) {
            indexer.index_text(dump);
        }
        if (!keywords.empty()) {
            indexer.increase_termpos(100);
            indexer.index_text(keywords);
        }
        if (!topic.empty()) {
            indexer.increase_termpos(100);
            indexer.index_text(topic, 1, "B");
        }
        // Index the leafname of the file.
        {
            indexer.increase_termpos(100);
            string leaf = d.leafname();
            string::size_type dot = leaf.find_last_of('.');
            if (dot != string::npos && leaf.size() - dot - 1 <= max_ext_len)
                leaf.resize(dot);
            indexer.index_text(leaf, 1, "F");

            // Also index with underscores and ampersands replaced by spaces.
            bool modified = false;
            string::size_type rep = 0;
            while ((rep = leaf.find_first_of("_&", rep)) != string::npos) {
                leaf[rep++] = ' ';
                modified = true;
            }
            if (modified) {
                indexer.increase_termpos(100);
                indexer.index_text(leaf, 1, "F");
            }
        }

        if (!author.empty()) {
            indexer.increase_termpos(100);
            indexer.index_text(author, 1, "A");
        }

        if (!to.empty()) {
            indexer.increase_termpos(100);
            indexer.index_text(to, 1, "XTO");
        }

        if (!cc.empty()) {
            indexer.increase_termpos(100);
            indexer.index_text(cc, 1, "XCC");
        }

        if (!bcc.empty()) {
            indexer.increase_termpos(100);
            indexer.index_text(bcc, 1, "XBCC");
        }

        if (!message_id.empty()) {
            newdocument.add_boolean_term("XMID:" + message_id);
        }

        // mimeType:
        newdocument.add_boolean_term("T" + mimetype);

        newdocument.add_boolean_term(site_term);

        if (!host_term.empty())
            newdocument.add_boolean_term(host_term);

        if (date_terms) {
            struct tm* tm = localtime(&mtime);
            string date_term = "D";
            date_term += date_to_string(tm->tm_year + 1900,
                                        tm->tm_mon + 1,
                                        tm->tm_mday);
            newdocument.add_boolean_term(date_term); // Date (YYYYMMDD)
            date_term.resize(7);
            date_term[0] = 'M';
            newdocument.add_boolean_term(date_term); // Month (YYYYMM)
            date_term.resize(5);
            date_term[0] = 'Y';
            newdocument.add_boolean_term(date_term); // Year (YYYY)
        }
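
        // For example, an mtime falling on 2023-11-05 (local time) adds the
        // boolean terms "D20231105", "M202311" and "Y2023".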

        newdocument.add_boolean_term(urlterm); // Url

        // Add mtime as a value to allow "sort by date".
        newdocument.add_value(VALUE_LASTMOD,
                              int_to_binary_string(uint32_t(mtime)));
        if (use_ctime) {
            // Add ctime as a value to track modifications.
            time_t ctime = d.get_ctime();
            newdocument.add_value(VALUE_CTIME,
                                  int_to_binary_string(uint32_t(ctime)));
        }

        // Add MD5 as a value to allow duplicate documents to be collapsed
        // together.
        newdocument.add_value(VALUE_MD5, md5);

        // Add the file size as a value to allow "sort by size" and size
        // ranges.
        newdocument.add_value(VALUE_SIZE,
                              Xapian::sortable_serialise(size));

        if (created != static_cast<time_t>(-1)) {
            // Add created time as a value to allow "sort by created date".
            newdocument.add_value(VALUE_CREATED,
                                  int_to_binary_string(uint32_t(created)));
        }

        bool inc_tag_added = false;
        if (d.is_other_readable()) {
            inc_tag_added = true;
            newdocument.add_boolean_term("I*");
        } else if (d.is_group_readable()) {
            const char* group = d.get_group();
            if (group) {
                newdocument.add_boolean_term(string("I#") + group);
            }
        }
        const char* owner = d.get_owner();
        if (owner) {
            newdocument.add_boolean_term(string("O") + owner);
            if (!inc_tag_added && d.is_owner_readable())
                newdocument.add_boolean_term(string("I@") + owner);
        }

        string ext_term("E");
        for (string::const_iterator i = ext.begin(); i != ext.end(); ++i) {
            char ch = *i;
            if (ch >= 'A' && ch <= 'Z')
                ch |= 32;
            ext_term += ch;
        }
        newdocument.add_boolean_term(ext_term);
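        // e.g. a file extension of "PDF" adds the boolean term "Epdf".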

        index_add_document(urlterm, last_altered, did, newdocument);
    } catch (const ReadError&) {
        skip(urlterm, context, string("can't read file: ") + strerror(errno),
             d.get_size(), d.get_mtime());
    } catch (const NoSuchFilter&) {
        string filter_entry;
        if (cmd_it != commands.end()) {
            filter_entry = cmd_it->first;
        } else {
            filter_entry = mimetype;
        }
        string m = "Filter for \"";
        m += filter_entry;
        m += "\" not installed";
        skip(urlterm, context, m, d.get_size(), d.get_mtime());
        commands[filter_entry] = Filter();
    } catch (const FileNotFound&) {
        skip(urlterm, context, "File removed during indexing",
             d.get_size(), d.get_mtime(),
             SKIP_VERBOSE_ONLY | SKIP_SHOW_FILENAME);
    } catch (const std::string& error) {
        skip(urlterm, context, error, d.get_size(), d.get_mtime());
    } catch (const std::bad_alloc&) {
        // Attempt to flag the file as failed and commit changes, though that
        // might fail too if we're low on memory rather than being asked to
        // allocate a ludicrous amount.
        skip(urlterm, context, "Out of memory trying to extract text from file",
             d.get_size(), d.get_mtime(),
             SKIP_SHOW_FILENAME);
        throw CommitAndExit("Caught std::bad_alloc", "");
    }
}

void
index_handle_deletion()
{
    if (updated.empty() || old_docs_not_seen == 0) return;

    if (verbose) {
        cout << "Deleting " << old_docs_not_seen
             << " old documents which weren't found" << endl;
    }
    Xapian::PostingIterator alldocs = db.postlist_begin(string());
    Xapian::docid did = *alldocs;
    while (did < updated.size()) {
        if (!updated[did]) {
            alldocs.skip_to(did);
            if (alldocs == db.postlist_end(string()))
                break;
            if (*alldocs != did) {
                // Document #did didn't exist before we started.
                did = *alldocs;
                continue;
            }
            db.delete_document(did);
            if (--old_docs_not_seen == 0)
                break;
        }
        ++did;
    }
}

void
index_commit()
{
    db.commit();
}

void
index_done()
{
    // If we created a temporary directory then delete it.
    remove_tmpdir();
}