xapian-applications/omega/omindex.cc

   1 /** @file
   2  * @brief index static documents into the omega db
   3  */
   4 /* Copyright 1999,2000,2001 BrightStation PLC
   5  * Copyright 2001,2005 James Aylett
   6  * Copyright 2001,2002 Ananova Ltd
   7  * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2017,2018 Olly Betts
   8  * Copyright 2009 Frank J Bruzzaniti
   9  * Copyright 2012 Mihai Bivol
  10  *
  11  * This program is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU General Public License as
  13  * published by the Free Software Foundation; either version 2 of the
  14  * License, or (at your option) any later version.
  15  *
  16  * This program is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19  * GNU General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU General Public License
  22  * along with this program; if not, write to the Free Software
  23  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
  24  * USA
  25  */
  26
  27 #include <config.h>
  28
  29 #include <algorithm>
  30 #include <fstream>
  31 #include <iostream>
  32 #include <string>
  33 #include <map>
  34
  35 #include <sys/types.h>
  36 #include "safeunistd.h"
  37 #include <cerrno>
  38 #include <cstdio>
  39 #include <cstdlib>
  40 #include <cstring>
  41 #include "safefcntl.h"
  42
  43 #ifdef HAVE_FNMATCH
  44 # include <fnmatch.h>
  45 #endif
  46
  47 #include <xapian.h>
  48
  49 #include "commonhelp.h"
  50 #include "diritor.h"
  51 #include "hashterm.h"
  52 #include "index_file.h"
  53 #include "mime.h"
  54 #include "realtime.h"
  55 #include "str.h"
  56 #include "stringutils.h"
  57 #include "urlencode.h"
  58
  59 #include "gnu_getopt.h"
  60
  61 using namespace std;
  62
// Program name and one-line description, used in the help output and as the
// prefix of error messages.
#define PROG_NAME "omindex"
#define PROG_DESC "Index static website data via the filesystem"

// Defaults for --title-size and --sample-size respectively.
#define TITLE_SIZE 128
#define SAMPLE_SIZE 512

// Set by -f/--follow and passed to each DirectoryIterator.
static bool follow_symlinks = false;
// Size limit set by -m/--max-size; files larger than this are skipped.
// Zero means no limit is applied.
static off_t max_size = 0;
// Human readable form of max_size (e.g. "10MB"), used in the message
// reported when a file is skipped for exceeding the limit.
static std::string pretty_max_size;
// Set by -v/--verbose; enables progress messages on stdout.
static bool verbose = false;
// Set by --opendir-sleep; a value greater than zero makes index_directory()
// sleep for that long before opening each directory.
static double sleep_before_opendir = 0;

// Directory indexing starts from, always ending in '/'.  This prefix is
// stripped from file paths before they are reported or turned into terms.
static string root;
// Path part of the base URL (the base URL without any scheme and host),
// used as the prefix when building the "P" path terms.
static string url_start_path;

#ifdef HAVE_FNMATCH
// (GLOB, TYPE) pairs from -G/--mime-type-match, in command line order.  The
// pointers refer into the option argument itself, whose ':' separator is
// overwritten with a NUL when the option is parsed.
static vector<pair<const char*, const char*>> mime_patterns;
#endif
  81
  82 static inline bool
  83 p_notalnum(unsigned int c)
  84 {
  85     return !C_isalnum(c);
  86 }
  87
  88 static void
  89 index_file(const string &file, const string &url, DirectoryIterator & d,
  90            map<string, string>& mime_map)
  91 {
  92     string urlterm("U");
  93     urlterm += url;
  94
  95     if (urlterm.length() > MAX_SAFE_TERM_LENGTH)
  96         urlterm = hash_long_term(urlterm, MAX_SAFE_TERM_LENGTH);
  97
  98     const char* leafname = d.leafname();
  99
 100     string mimetype;
 101 #ifdef HAVE_FNMATCH
 102     for (auto&& i : mime_patterns) {
 103         if (fnmatch(i.first, leafname, 0) == 0) {
 104             if (strcmp(i.second, "ignore") == 0)
 105                 return;
 106             if (strcmp(i.second, "skip") == 0) {
 107                 string m = "Leafname '";
 108                 m += leafname;
 109                 m += "' matches pattern: ";
 110                 m += i.first;
 111                 skip(urlterm, file.substr(root.size()), m,
 112                      d.get_size(), d.get_mtime());
 113                 return;
 114             }
 115             mimetype = i.second;
 116             break;
 117         }
 118     }
 119 #endif
 120
 121     string ext;
 122     const char * dot_ptr = strrchr(leafname, '.');
 123     if (dot_ptr) {
 124         ext.assign(dot_ptr + 1);
 125         if (ext.size() > max_ext_len)
 126             ext.resize(0);
 127     }
 128
 129     if (mimetype.empty()) {
 130         mimetype = mimetype_from_ext(mime_map, ext);
 131         if (mimetype == "ignore") {
 132             // Remove any existing failed entry for this file.
 133             index_remove_failed_entry(urlterm);
 134             return;
 135         } else if (mimetype == "skip") {
 136             // Ignore mimetype, skipped mimetype should not be quietly ignored.
 137             string m = "skipping extension '";
 138             m += ext;
 139             m += "'";
 140             skip(urlterm, file.substr(root.size()), m,
 141                  d.get_size(), d.get_mtime());
 142             return;
 143         }
 144     }
 145
 146     // Check the file size.
 147     off_t size = d.get_size();
 148     if (size == 0) {
 149         skip(urlterm, file.substr(root.size()), "Zero-sized file",
 150              size, d.get_mtime(), SKIP_VERBOSE_ONLY);
 151         return;
 152     }
 153
 154     if (max_size > 0 && size > max_size) {
 155         skip(urlterm, file.substr(root.size()),
 156              "Larger than size limit of " + pretty_max_size,
 157              size, d.get_mtime(),
 158              SKIP_VERBOSE_ONLY);
 159         return;
 160     }
 161
 162     // If we didn't get the mime type from the extension, call libmagic to get
 163     // it.
 164     if (mimetype.empty()) {
 165         mimetype = d.get_magic_mimetype();
 166         if (mimetype.empty()) {
 167             skip(urlterm, file.substr(root.size()), "Unknown extension and unrecognised format",
 168                  d.get_size(), d.get_mtime(), SKIP_SHOW_FILENAME);
 169             return;
 170         }
 171     }
 172
 173     if (verbose)
 174         cout << "Indexing \"" << file.substr(root.size()) << "\" as "
 175              << mimetype << " ... ";
 176
 177     Xapian::Document new_doc;
 178
 179     // Use `file` as the basis, as we don't want URL encoding in these terms,
 180     // but need to switch over the initial part so we get `/~olly/foo/bar` not
 181     // `/home/olly/public_html/foo/bar`.
 182     string path_term("P");
 183     path_term += url_start_path;
 184     path_term.append(file, root.size(), string::npos);
 185
 186     size_t i;
 187     while ((i = path_term.rfind('/')) > 1 && i != string::npos) {
 188         path_term.resize(i);
 189         if (path_term.length() > MAX_SAFE_TERM_LENGTH) {
 190             new_doc.add_boolean_term(hash_long_term(path_term, MAX_SAFE_TERM_LENGTH));
 191         } else {
 192             new_doc.add_boolean_term(path_term);
 193         }
 194     }
 195
 196     index_mimetype(file, urlterm, url, ext, mimetype, d, new_doc, string());
 197 }
 198
 199 static void
 200 index_directory(const string &path, const string &url_, size_t depth_limit,
 201                 map<string, string>& mime_map)
 202 {
 203     if (verbose)
 204         cout << "[Entering directory \"" << path.substr(root.size()) << "\"]"
 205              << endl;
 206
 207     DirectoryIterator d(follow_symlinks);
 208     try {
 209         // Crude workaround for MS-DFS share misbehaviour.
 210         if (sleep_before_opendir > 0.0)
 211             RealTime::sleep(RealTime::now() + sleep_before_opendir);
 212
 213         d.start(path);
 214
 215         while (d.next()) {
 216             string url = url_;
 217             url_encode(url, d.leafname());
 218             string file = path;
 219             file += d.leafname();
 220
 221             try {
 222                 switch (d.get_type()) {
 223                     case DirectoryIterator::DIRECTORY: {
 224                         size_t new_limit = depth_limit;
 225                         if (new_limit) {
 226                             if (--new_limit == 0) continue;
 227                         }
 228                         url += '/';
 229                         file += '/';
 230                         index_directory(file, url, new_limit, mime_map);
 231                         break;
 232                     }
 233                     case DirectoryIterator::REGULAR_FILE:
 234                         index_file(file, url, d, mime_map);
 235                         break;
 236                     default:
 237                         skip("U" + url, file.substr(root.size()), "Not a regular file",
 238                              d.get_size(), d.get_mtime(),
 239                              SKIP_VERBOSE_ONLY | SKIP_SHOW_FILENAME);
 240                 }
 241             } catch (const FileNotFound & e) {
 242                 skip("U" + url, file.substr(root.size()), "File removed during indexing",
 243                      d.get_size(), d.get_mtime(),
 244                      /*SKIP_VERBOSE_ONLY |*/ SKIP_SHOW_FILENAME);
 245             } catch (const std::string & error) {
 246                 skip("U" + url, file.substr(root.size()), error,
 247                      d.get_size(), d.get_mtime(), SKIP_SHOW_FILENAME);
 248             }
 249         }
 250     } catch (const FileNotFound&) {
 251         if (verbose)
 252             cout << "Directory \"" << path.substr(root.size()) << "\" "
 253                     "deleted during indexing" << endl;
 254     } catch (const std::string & error) {
 255         cout << error << " - skipping directory "
 256                 "\"" << path.substr(root.size()) << "\"" << endl;
 257     }
 258 }
 259
 260 static off_t
 261 parse_size(char* p)
 262 {
 263     // Don't want negative numbers, infinity, NaN, or hex numbers.
 264     if (C_isdigit(p[0]) && (p[1] | 32) != 'x') {
 265         double arg = strtod(p, &p);
 266         switch (*p) {
 267             case '\0':
 268                 break;
 269             case 'k': case 'K':
 270                 arg *= 1024;
 271                 ++p;
 272                 break;
 273             case 'm': case 'M':
 274                 arg *= (1024 * 1024);
 275                 ++p;
 276                 break;
 277             case 'g': case 'G':
 278                 arg *= (1024 * 1024 * 1024);
 279                 ++p;
 280                 break;
 281         }
 282         if (*p == '\0') {
 283             return off_t(arg);
 284         }
 285     }
 286     return -1;
 287 }
 288
 289 static bool
 290 parse_filter_rule(const char* rule, map<string, string>& mime_map)
 291 {
 292     const char* s = strchr(rule, ':');
 293     if (s == NULL || s[1] == '\0') {
 294         cerr << "Invalid filter mapping '" << rule << "'\n"
 295                 "Should be of the form TYPE:COMMAND or TYPE1,TYPE2:COMMAND or "
 296                 "TYPE,EXT:COMMAND\n"
 297                 "e.g. 'application/octet-stream:strings -n8'"
 298              << endl;
 299         return false;
 300     }
 301
 302     const char* c = static_cast<const char*>(memchr(rule, ',', s - rule));
 303     string output_type, output_charset;
 304     if (c) {
 305         // Filter produces a specified content-type.
 306         ++c;
 307         const char* c2 = static_cast<const char *>(memchr(c, ',', s - c));
 308         if (c2) {
 309             output_type.assign(c, c2 - c);
 310             ++c2;
 311             output_charset.assign(c2, s - c2);
 312         } else {
 313             output_type.assign(c, s - c);
 314         }
 315         --c;
 316         if (output_type.find('/') == string::npos) {
 317             auto m = mime_map.find(output_type);
 318             if (m != mime_map.end()) {
 319                 output_type = m->second;
 320             } else {
 321                 const char* r = built_in_mime_map(output_type);
 322                 if (r) output_type = r;
 323             }
 324         }
 325         if (output_type != "text/html" &&
 326             output_type != "text/plain" &&
 327             output_type != "image/svg+xml") {
 328             cerr << "Currently only output types 'image/svg+xml', "
 329                     "'text/html' and 'text/plain' are supported."
 330                  << endl;
 331             return false;
 332         }
 333     } else {
 334         c = s;
 335     }
 336
 337     const char* cmd = s + 1;
 338     // Analyse the command string to decide if it needs a shell.
 339     bool use_shell = command_needs_shell(cmd);
 340     index_command(string(rule, c - rule),
 341                   Filter(string(cmd), output_type, output_charset, use_shell));
 342
 343     return true;
 344 }
 345
 346 int
 347 main(int argc, char **argv)
 348 {
 349     // If overwrite is true, the database will be created anew even if it
 350     // already exists.
 351     bool overwrite = false;
 352     // If delete_removed_documents is true, delete any documents we don't see.
 353     bool delete_removed_documents = true;
 354     // Retry files which we failed to index on a previous run?
 355     bool retry_failed = false;
 356     bool use_ctime = false;
 357     bool spelling = false;
 358     bool skip_duplicates = false;
 359     bool ignore_exclusions = false;
 360     bool description_as_sample = false;
 361     string baseurl;
 362     size_t depth_limit = 0;
 363     size_t title_size = TITLE_SIZE;
 364     size_t sample_size = SAMPLE_SIZE;
 365     empty_body_type empty_body = EMPTY_BODY_WARN;
 366     string site_term, host_term;
 367     Xapian::Stem stemmer("english");
 368
 369     enum {
 370         OPT_OPENDIR_SLEEP = 256,
 371         OPT_SAMPLE,
 372         OPT_READ_FILTERS
 373     };
 374     constexpr auto NO_ARG = no_argument;
 375     constexpr auto REQ_ARG = required_argument;
 376     static const struct option longopts[] = {
 377         { "help",               NO_ARG,         NULL, 'h' },
 378         { "version",            NO_ARG,         NULL, 'V' },
 379         { "overwrite",          NO_ARG,         NULL, 'o' },
 380         { "duplicates",         REQ_ARG,        NULL, 'd' },
 381         { "no-delete",          NO_ARG,         NULL, 'p' },
 382         { "preserve-nonduplicates", NO_ARG,     NULL, 'p' },
 383         { "db",                 REQ_ARG,        NULL, 'D' },
 384         { "url",                REQ_ARG,        NULL, 'U' },
 385         { "mime-type",          REQ_ARG,        NULL, 'M' },
 386         { "mime-type-match",    REQ_ARG,        NULL, 'G' },
 387         { "filter",             REQ_ARG,        NULL, 'F' },
 388         { "read-filters",       REQ_ARG,        NULL, OPT_READ_FILTERS },
 389         { "depth-limit",        REQ_ARG,        NULL, 'l' },
 390         { "follow",             NO_ARG,         NULL, 'f' },
 391         { "ignore-exclusions",  NO_ARG,         NULL, 'i' },
 392         { "stemmer",            REQ_ARG,        NULL, 's' },
 393         { "spelling",           NO_ARG,         NULL, 'S' },
 394         { "verbose",            NO_ARG,         NULL, 'v' },
 395         { "empty-docs",         REQ_ARG,        NULL, 'e' },
 396         { "max-size",           REQ_ARG,        NULL, 'm' },
 397         { "sample",             REQ_ARG,        NULL, OPT_SAMPLE },
 398         { "sample-size",        REQ_ARG,        NULL, 'E' },
 399         { "title-size",         REQ_ARG,        NULL, 'T' },
 400         { "retry-failed",       NO_ARG,         NULL, 'R' },
 401         { "opendir-sleep",      REQ_ARG,        NULL, OPT_OPENDIR_SLEEP },
 402         { "track-ctime",        NO_ARG,         NULL, 'C' },
 403         { 0, 0, NULL, 0 }
 404     };
 405
 406     map<string, string> mime_map;
 407
 408     index_add_default_filters();
 409
 410     if (argc == 2 && strcmp(argv[1], "-v") == 0) {
 411         // -v was the short option for --version in 1.2.3 and earlier, but
 412         // now it is short for --verbose (for consistency with scriptindex)
 413         // so if "-v" is the only option, translate it to "--version" for
 414         // backwards compatibility.
 415         argv[1] = const_cast<char *>("--version");
 416     }
 417
 418     string dbpath;
 419     int getopt_ret;
 420     while ((getopt_ret = gnu_getopt_long(argc, argv, "hvd:D:U:M:F:l:s:pfRSVe:im:E:T:",
 421                                          longopts, NULL)) != -1) {
 422         switch (getopt_ret) {
 423         case 'h': {
 424             cout << PROG_NAME " - " PROG_DESC "\n\n"
 425 "Usage: " PROG_NAME " [OPTIONS] --db DATABASE [BASEDIR] DIRECTORY\n"
 426 "\n"
 427 "DIRECTORY is the directory to start indexing from.\n"
 428 "\n"
 429 "BASEDIR is the directory corresponding to URL (default: DIRECTORY).\n"
 430 "\n"
 431 "Options:\n"
 432 "  -d, --duplicates=ARG      set duplicate handling: ARG can be 'ignore' or\n"
 433 "                            'replace' (default: replace)\n"
 434 "  -p, --no-delete           skip the deletion of documents corresponding to\n"
 435 "                            deleted files (--preserve-nonduplicates is a\n"
 436 "                            deprecated alias for --no-delete)\n"
 437 "  -e, --empty-docs=ARG      how to handle documents we extract no text from:\n"
 438 "                            ARG can be index, warn (issue a diagnostic and\n"
 439 "                            index), or skip.  (default: warn)\n"
 440 "  -D, --db=DATABASE         path to database to use\n"
 441 "  -U, --url=URL             base url BASEDIR corresponds to (default: /)\n"
 442 "  -M, --mime-type=EXT:TYPE  assume any file with extension EXT has MIME\n"
 443 "                            Content-Type TYPE, instead of using libmagic\n"
 444 "                            (empty TYPE removes any existing mapping for EXT;\n"
 445 "                            other special TYPE values: 'ignore' and 'skip')\n"
 446 "  -G, --mime-type-match=GLOB:TYPE\n"
 447 "                            assume any file with leaf name matching shell\n"
 448 "                            wildcard pattern GLOB has MIME Content-Type TYPE\n"
 449 "                            (special TYPE values: 'ignore' and 'skip')\n"
 450 "  -F, --filter=M[,[T][,C]]:CMD\n"
 451 "                            process files with MIME Content-Type M using\n"
 452 "                            command CMD, which produces output (on stdout or\n"
 453 "                            in a temporary file) with format T (Content-Type\n"
 454 "                            or file extension; currently txt (default), html\n"
 455 "                            or svg) in character encoding C (default: UTF-8).\n"
 456 "                            E.g. -Fapplication/octet-stream:'strings -n8'\n"
 457 "                            or -Ftext/x-foo,,utf-16:'foo2utf16 %f %t'\n"
 458 "      --read-filters=FILE   bulk-load --filter arguments from FILE, which\n"
 459 "                            should contain one such argument per line (e.g.\n"
 460 "                            text/x-bar:bar2txt --utf8).  Lines starting with #\n"
 461 "                            are treated as comments and ignored.\n"
 462 "  -l, --depth-limit=LIMIT   set recursion limit (0 = unlimited)\n"
 463 "  -f, --follow              follow symbolic links\n"
 464 "  -i, --ignore-exclusions   ignore meta robots tags and similar exclusions\n"
 465 "  -S, --spelling            index data for spelling correction\n"
 466 "  -m, --max-size            maximum size of file to index (in bytes or with a\n"
 467 "                            suffix of 'K'/'k', 'M'/'m', 'G'/'g')\n"
 468 "                            (default: unlimited)\n"
 469 "      --sample=SOURCE       what to use for the stored sample of text for\n"
 470 "                            HTML documents - SOURCE can be 'body' or\n"
 471 "                            'description' (default: 'body')\n"
 472 "  -E, --sample-size=SIZE    maximum size for the document text sample\n"
 473 "                            (supports the same formats as --max-size).\n"
 474 "                            (default: " STRINGIZE(SAMPLE_SIZE) ")\n"
 475 "  -T, --title-size=SIZE     maximum size for the document title\n"
 476 "                            (supports the same formats as --max-size).\n"
 477 "                            (default: " STRINGIZE(TITLE_SIZE) ")\n"
 478 "  -R, --retry-failed        retry files which omindex failed to extract text\n"
 479 "                            from on a previous run\n"
 480 "      --opendir-sleep=SECS  sleep for SECS seconds before opening each\n"
 481 "                            directory - sleeping for 2 seconds seems to\n"
 482 "                            reliably work around problems with indexing files\n"
 483 "                            on Microsoft DFS shares.\n"
 484 "  -C, --track-ctime         track each file's ctime so we can detect changes\n"
 485 "                            to ownership or permissions.\n"
 486 "  -v, --verbose             show more information about what is happening\n"
 487 "      --overwrite           create the database anew (the default is to update\n"
 488 "                            if the database already exists)" << endl;
 489             print_stemmer_help("      ");
 490             print_help_and_version_help("      ");
 491             return 0;
 492         }
 493         case 'V':
 494             print_package_info(PROG_NAME);
 495             return 0;
 496         case 'd': // how shall we handle duplicate documents?
 497             switch (optarg[0]) {
 498             case 'i':
 499                 skip_duplicates = true;
 500                 break;
 501             case 'r':
 502                 skip_duplicates = false;
 503                 break;
 504             }
 505             break;
 506         case 'e':
 507             if (strcmp(optarg, "index") == 0) {
 508                 empty_body = EMPTY_BODY_INDEX;
 509             } else if (strcmp(optarg, "warn") == 0) {
 510                 empty_body = EMPTY_BODY_WARN;
 511             } else if (strcmp(optarg, "skip") == 0) {
 512                 empty_body = EMPTY_BODY_SKIP;
 513             } else {
 514                 cerr << "Invalid --empty-docs value '" << optarg << "'\n"
 515                         "Valid values are index, warn, and skip." << endl;
 516                 return 1;
 517             }
 518             break;
 519         case 'p': // Keep documents even if the files have been removed.
 520             delete_removed_documents = false;
 521             break;
 522         case 'l': { // Set recursion limit
 523             int arg = atoi(optarg);
 524             if (arg < 0) arg = 0;
 525             depth_limit = size_t(arg);
 526             break;
 527         }
 528         case 'f': // Turn on following of symlinks
 529             follow_symlinks = true;
 530             break;
 531         case 'M': {
 532             const char * s = strrchr(optarg, ':');
 533             if (s == NULL) {
 534                 cerr << "Invalid MIME mapping '" << optarg << "'\n"
 535                         "Should be of the form EXT:TYPE, e.g. txt:text/plain\n"
 536                         "(or txt: to delete a default mapping)" << endl;
 537                 return 1;
 538             }
 539
 540             // -Mtxt: results in an empty string, which effectively removes the
 541             // default mapping for .txt files.
 542             mime_map[string(optarg, s - optarg)] = string(s + 1);
 543             max_ext_len = max(max_ext_len, strlen(s + 1));
 544             break;
 545         }
 546         case 'F':
 547             if (!parse_filter_rule(optarg, mime_map))
 548                 return 1;
 549             break;
 550         case OPT_READ_FILTERS: {
 551             ifstream stream(optarg);
 552             if (!stream) {
 553                 cerr << "Unable to open filter file '" << optarg << "' "
 554                         "(" << strerror(errno) << ')' << endl;
 555                 return 1;
 556             }
 557             string rule;
 558             bool all_ok = true;
 559             while (getline(stream, rule)) {
 560                 if (startswith(rule, '#')) continue;
 561                 if (!parse_filter_rule(rule.c_str(), mime_map))
 562                     all_ok = false;
 563             }
 564             if (!all_ok)
 565                 return 1;
 566             break;
 567         }
 568         case 'D':
 569             dbpath = optarg;
 570             break;
 571         case 'U':
 572             baseurl = optarg;
 573             break;
 574         case 'o': // --overwrite
 575             overwrite = true;
 576             break;
 577         case 'i':
 578             ignore_exclusions = true;
 579             break;
 580         case 'R': // --retry-failed
 581             retry_failed = true;
 582             break;
 583         case 's':
 584             try {
 585                 stemmer = Xapian::Stem(optarg);
 586             } catch (const Xapian::InvalidArgumentError &) {
 587                 cerr << "Unknown stemming language '" << optarg << "'.\n"
 588                         "Available language names are: "
 589                      << Xapian::Stem::get_available_languages() << endl;
 590                 return 1;
 591             }
 592             break;
 593         case 'S':
 594             spelling = true;
 595             break;
 596         case 'v':
 597             verbose = true;
 598             break;
 599         case 'E': {
 600             off_t arg = parse_size(optarg);
 601             if (arg >= 0) {
 602                 sample_size = size_t(arg);
 603                 break;
 604             }
 605             cerr << PROG_NAME": bad sample size '" << optarg << "'" << endl;
 606             return 1;
 607         }
 608         case 'T': {
 609             off_t arg = parse_size(optarg);
 610             if (arg >= 0) {
 611                 title_size = size_t(arg);
 612                 break;
 613             }
 614             cerr << PROG_NAME": bad title size '" << optarg << "'" << endl;
 615             return 1;
 616         }
 617         case 'm': {
 618             off_t size = parse_size(optarg);
 619             if (size >= 0) {
 620                 max_size = size;
 621                 const char * suffix;
 622                 // Set lsb to the lowest set bit in max_size.
 623                 off_t lsb = max_size & -max_size;
 624                 if (lsb >= off_t(1L << 30)) {
 625                     size >>= 30;
 626                     suffix = "GB";
 627                 } else if (lsb >= off_t(1L << 20)) {
 628                     size >>= 20;
 629                     suffix = "MB";
 630                 } else if (lsb >= off_t(1L << 10)) {
 631                     size >>= 10;
 632                     suffix = "KB";
 633                 } else {
 634                     suffix = "B";
 635                 }
 636                 pretty_max_size = str(size);
 637                 pretty_max_size += suffix;
 638                 break;
 639             }
 640             cerr << PROG_NAME": bad max size '" << optarg << "'" << endl;
 641             return 1;
 642         }
 643         case OPT_OPENDIR_SLEEP: {
 644             // Don't want negative numbers, infinity, NaN, or hex numbers.
 645             char * p = optarg;
 646             if (C_isdigit(p[0]) && (p[1] | 32) != 'x') {
 647                 sleep_before_opendir = strtod(p, &p);
 648                 if (*p == '\0')
 649                     break;
 650             }
 651             cerr << PROG_NAME": bad --opendir-sleep argument: "
 652                  "'" << optarg << "'" << endl;
 653             return 1;
 654         }
 655         case OPT_SAMPLE:
 656             if (strcmp(optarg, "description") == 0) {
 657                 description_as_sample = true;
 658             } else if (strcmp(optarg, "body") == 0) {
 659                 description_as_sample = false;
 660             } else {
 661                 cerr << "Invalid --sample value '" << optarg << "'\n"
 662                         "Valid values are body and description." << endl;
 663                 return 1;
 664             }
 665             break;
 666         case 'C':
 667             use_ctime = true;
 668             break;
 669         case 'G': {
 670             char * s = strrchr(optarg, ':');
 671             if (s == NULL) {
 672                 cerr << "Invalid MIME mapping '" << optarg << "'\n"
 673                         "Should be of the form GLOB:TYPE, e.g. *~:ignore"
 674                      << endl;
 675                 return 1;
 676             }
 677 #ifndef HAVE_FNMATCH
 678             cerr << "--mime-type-match isn't supported in this build because "
 679                     "the fnmatch() function wasn't found at configure time."
 680                  << endl;
 681             return 1;
 682 #else
 683             if (s == optarg) {
 684                 cerr << "--mime-type-match with an empty pattern can never "
 685                         "match." << endl;
 686                 return 1;
 687             }
 688             if (memchr(optarg, '/', s - optarg)) {
 689                 cerr << "--mime-type-match only matches against the leaf "
 690                         "filename so a pattern containing '/' can never match."
 691                      << endl;
 692                 return 1;
 693             }
 694             const char* type = s + 1;
 695             if (*type == '\0') {
 696                 cerr << "--mime-type-match doesn't support an empty MIME type"
 697                      << endl;
 698                 return 1;
 699             }
 700             *s = '\0';
 701             mime_patterns.emplace_back(optarg, type);
 702             break;
 703 #endif
 704         }
 705         case ':': // missing param
 706             return 1;
 707         case '?': // unknown option: FIXME -> char
 708             return 1;
 709         }
 710     }
 711
 712     if (dbpath.empty()) {
 713         cerr << PROG_NAME": you must specify a database with --db." << endl;
 714         return 1;
 715     }
 716     if (baseurl.empty()) {
 717         cerr << PROG_NAME": --url not specified, assuming '/'." << endl;
 718     }
 719     // baseurl must end in a '/'.
 720     if (!endswith(baseurl, '/')) {
 721         baseurl += '/';
 722     }
 723
 724     // Site term (omits the trailing slash):
 725     site_term = "J";
 726     site_term.append(baseurl, 0, baseurl.size() - 1);
 727     if (site_term.size() > MAX_SAFE_TERM_LENGTH)
 728         site_term = hash_long_term(site_term, MAX_SAFE_TERM_LENGTH);
 729
 730     // Host term, if the URL contains a hostname (omits any port number):
 731     string::size_type j;
 732     j = find_if(baseurl.begin(), baseurl.end(), p_notalnum) - baseurl.begin();
 733     if (j > 0 && baseurl.substr(j, 3) == "://" && j + 3 < baseurl.size()) {
 734         j += 3;
 735         // We must find a '/' - we ensured baseurl ended with a '/' above.
 736         string::size_type k = baseurl.find('/', j);
 737         url_start_path.assign(baseurl, k, string::npos);
 738         string::const_iterator l;
 739         l = find(baseurl.begin() + j, baseurl.begin() + k, ':');
 740         string::size_type host_len = l - baseurl.begin() - j;
 741         host_term = "H";
 742         host_term.append(baseurl, j, host_len);
 743         // DNS hostname limit is 253.
 744         if (host_term.size() > MAX_SAFE_TERM_LENGTH)
 745             host_term = hash_long_term(host_term, MAX_SAFE_TERM_LENGTH);
 746     } else {
 747         url_start_path = baseurl;
 748     }
 749
 750     if (optind >= argc || optind + 2 < argc) {
 751         cerr << PROG_NAME": you must specify a directory to index.\n"
 752 "Do this either as a single directory (corresponding to the base URL)\n"
 753 "or two directories - the first corresponding to the base URL and the second\n"
 754 "a subdirectory of that to index." << endl;
 755         return 1;
 756     }
 757
 758     root = argv[optind];
 759     if (root.empty()) {
 760         cerr << PROG_NAME": start directory can not be empty." << endl;
 761         return 1;
 762     }
 763     if (!endswith(root, '/')) {
 764         root += '/';
 765     }
 766     if (optind + 2 == argc) {
 767         string start_url = argv[optind + 1];
 768         if (startswith(start_url, '/')) {
 769             // Make relative to root.
 770             if (!startswith(start_url, root)) {
 771                 cerr << PROG_NAME": '" << argv[optind + 1] << "' "
 772                     "is not a subdirectory of '" << argv[optind] << "'."
 773                      << endl;
 774                 return 1;
 775             }
 776             start_url.erase(0, root.size());
 777         }
 778         if (!endswith(start_url, '/')) {
 779             start_url += '/';
 780         }
 781         root += start_url;
 782         url_encode_path(baseurl, start_url);
 783     }
 784
 785     int exitcode = 1;
 786     try {
 787         index_init(dbpath, stemmer, root, site_term, host_term, empty_body,
 788                    (skip_duplicates ? DUP_SKIP : DUP_CHECK_LAZILY),
 789                    sample_size, title_size, max_ext_len,
 790                    overwrite, retry_failed, delete_removed_documents, verbose,
 791                    use_ctime, spelling, ignore_exclusions,
 792                    description_as_sample);
 793         index_directory(root, baseurl, depth_limit, mime_map);
 794         index_handle_deletion();
 795         index_commit();
 796         exitcode = 0;
 797     } catch (const CommitAndExit &e) {
 798         cout << "Exception: " << e.what() << endl;
 799         cout << "Committing pending changes..." << endl;
 800         index_commit();
 801     } catch (const Xapian::Error &e) {
 802         cout << "Exception: " << e.get_description() << endl;
 803     } catch (const exception &e) {
 804         cout << "Exception: " << e.what() << endl;
 805     } catch (const string &s) {
 806         cout << "Exception: " << s << endl;
 807     } catch (const char *s) {
 808         cout << "Exception: " << s << endl;
 809     } catch (...) {
 810         cout << "Caught unknown exception" << endl;
 811     }
 812
 813     index_done();
 814
 815     return exitcode;
 816 }