xapian-applications/omega/omindex.cc

   1 /** @file
   2  * @brief index static documents into the omega db
   3  */
   4 /* Copyright 1999,2000,2001 BrightStation PLC
   5  * Copyright 2001,2005 James Aylett
   6  * Copyright 2001,2002 Ananova Ltd
   7  * Copyright 2002-2022 Olly Betts
   8  * Copyright 2009 Frank J Bruzzaniti
   9  * Copyright 2012 Mihai Bivol
  10  *
  11  * This program is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU General Public License as
  13  * published by the Free Software Foundation; either version 2 of the
  14  * License, or (at your option) any later version.
  15  *
  16  * This program is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19  * GNU General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU General Public License
  22  * along with this program; if not, write to the Free Software
  23  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
  24  * USA
  25  */
  26
  27 #include <config.h>
  28
#include <algorithm>
#include <fstream>
#include <iostream>
#include <limits>
#include <map>
#include <string>
  34
  35 #include <sys/types.h>
  36 #include "safeunistd.h"
  37 #include <cerrno>
  38 #include <cstdio>
  39 #include <cstdlib>
  40 #include <cstring>
  41 #include "safefcntl.h"
  42
  43 #ifdef HAVE_FNMATCH
  44 # include <fnmatch.h>
  45 #endif
  46
  47 #include <xapian.h>
  48
  49 #include "commonhelp.h"
  50 #include "diritor.h"
  51 #include "hashterm.h"
  52 #include "index_file.h"
  53 #include "mime.h"
  54 #include "parseint.h"
  55 #include "pkglibbindir.h"
  56 #include "realtime.h"
  57 #include "str.h"
  58 #include "stringutils.h"
  59 #include "urlencode.h"
  60 #include "worker.h"
  61
  62 #include "gnu_getopt.h"
  63
  64 using namespace std;
  65
// Program name and one-line description, used in the --help output and as a
// prefix for diagnostics.
#define PROG_NAME "omindex"
#define PROG_DESC "Index static website data via the filesystem"

// Default limits for the stored title and text sample: main() initialises
// title_size and sample_size from these and quotes them in the --help text.
#define TITLE_SIZE 128
#define SAMPLE_SIZE 512

// Passed to every DirectoryIterator we create; turned on by -f/--follow.
static bool follow_symlinks = false;
// Maximum file size to index, set by -m/--max-size.  Values <= 0 mean no
// limit is applied (index_file() only checks the limit when this is > 0).
static off_t max_size = 0;
// Human-readable form of max_size (e.g. "10MB"), built when --max-size is
// parsed and quoted in the message for files skipped for being too large.
static std::string pretty_max_size;
// Set by -v/--verbose; enables the directory progress messages.
static bool verbose = false;
// Value given to --opendir-sleep.  When > 0, index_directory() sleeps for
// this long before starting to read each directory.
static double sleep_before_opendir = 0;
// Set by --date-terms.  It isn't read in this part of the file; the help
// text describes the option as indexing D, M and Y prefixed terms.
static bool date_terms = false;

// Leading part of each file path which is removed (via substr(root.size()))
// before the path is reported in messages or appended to a path term.
static string root;
// Path part of the base URL (the whole base URL if it has no "scheme://"
// prefix); index_file() prepends it when building the "P" path term.
static string url_start_path;

#ifdef HAVE_FNMATCH
// (GLOB, TYPE) pairs from -G/--mime-type-match, in the order given.  Both
// pointers point into the option's argument string, which main() splits in
// place at the ':'.  index_file() uses the first pattern which matches.
static vector<pair<const char*, const char*>> mime_patterns;
#endif
  85
  86 static inline bool
  87 p_notalnum(unsigned int c)
  88 {
  89     return !C_isalnum(c);
  90 }
  91
  92 static void
  93 index_file(const string &file, const string &url, DirectoryIterator & d,
  94            map<string, string>& mime_map)
  95 {
  96     string urlterm("U");
  97     urlterm += url;
  98
  99     if (urlterm.length() > MAX_SAFE_TERM_LENGTH)
 100         urlterm = hash_long_term(urlterm, MAX_SAFE_TERM_LENGTH);
 101
 102     const char* leafname = d.leafname();
 103
 104     string mimetype;
 105 #ifdef HAVE_FNMATCH
 106     for (auto&& i : mime_patterns) {
 107         if (fnmatch(i.first, leafname, 0) == 0) {
 108             if (strcmp(i.second, "ignore") == 0)
 109                 return;
 110             if (strcmp(i.second, "skip") == 0) {
 111                 string m = "Leafname '";
 112                 m += leafname;
 113                 m += "' matches pattern: ";
 114                 m += i.first;
 115                 skip(urlterm, file.substr(root.size()), m,
 116                      d.get_size(), d.get_mtime());
 117                 return;
 118             }
 119             mimetype = i.second;
 120             break;
 121         }
 122     }
 123 #endif
 124
 125     string ext;
 126     const char * dot_ptr = strrchr(leafname, '.');
 127     if (dot_ptr) {
 128         ext.assign(dot_ptr + 1);
 129         if (ext.size() > max_ext_len)
 130             ext.resize(0);
 131     }
 132
 133     if (mimetype.empty()) {
 134         mimetype = mimetype_from_ext(mime_map, ext);
 135         if (mimetype == "ignore") {
 136             // Remove any existing failed entry for this file.
 137             index_remove_failed_entry(urlterm);
 138             return;
 139         } else if (mimetype == "skip") {
 140             // Ignore mimetype, skipped mimetype should not be quietly ignored.
 141             string m = "skipping extension '";
 142             m += ext;
 143             m += "'";
 144             skip(urlterm, file.substr(root.size()), m,
 145                  d.get_size(), d.get_mtime());
 146             return;
 147         }
 148     }
 149
 150     // Check the file size.
 151     off_t size = d.get_size();
 152     if (size == 0) {
 153         skip(urlterm, file.substr(root.size()), "Zero-sized file",
 154              size, d.get_mtime(), SKIP_VERBOSE_ONLY);
 155         return;
 156     }
 157
 158     if (max_size > 0 && size > max_size) {
 159         skip(urlterm, file.substr(root.size()),
 160              "Larger than size limit of " + pretty_max_size,
 161              size, d.get_mtime(),
 162              SKIP_VERBOSE_ONLY);
 163         return;
 164     }
 165
 166     string path_term("P");
 167     path_term += url_start_path;
 168     path_term.append(file, root.size(), string::npos);
 169
 170     index_mimetype(file, urlterm, url, ext, mimetype, d, path_term, string());
 171 }
 172
 173 static void
 174 index_directory(const string &path, const string &url_, size_t depth_limit,
 175                 map<string, string>& mime_map)
 176 {
 177     if (verbose)
 178         cout << "[Entering directory \"" << path.substr(root.size()) << "\"]"
 179              << endl;
 180
 181     DirectoryIterator d(follow_symlinks);
 182     try {
 183         // Crude workaround for MS-DFS share misbehaviour.
 184         if (sleep_before_opendir > 0.0)
 185             RealTime::sleep(RealTime::now() + sleep_before_opendir);
 186
 187         d.start(path);
 188
 189         while (d.next()) {
 190             string url = url_;
 191             url_encode(url, d.leafname());
 192             string file = path;
 193             file += d.leafname();
 194
 195             try {
 196                 switch (d.get_type()) {
 197                     case DirectoryIterator::DIRECTORY: {
 198                         size_t new_limit = depth_limit;
 199                         if (new_limit) {
 200                             if (--new_limit == 0) continue;
 201                         }
 202                         url += '/';
 203                         file += '/';
 204                         index_directory(file, url, new_limit, mime_map);
 205                         break;
 206                     }
 207                     case DirectoryIterator::REGULAR_FILE:
 208                         index_file(file, url, d, mime_map);
 209                         break;
 210                     default:
 211                         skip("U" + url, file.substr(root.size()), "Not a regular file",
 212                              d.get_size(), d.get_mtime(),
 213                              SKIP_VERBOSE_ONLY | SKIP_SHOW_FILENAME);
 214                 }
 215             } catch (const FileNotFound & e) {
 216                 skip("U" + url, file.substr(root.size()), "File removed during indexing",
 217                      d.get_size(), d.get_mtime(),
 218                      /*SKIP_VERBOSE_ONLY |*/ SKIP_SHOW_FILENAME);
 219             } catch (const std::string & error) {
 220                 skip("U" + url, file.substr(root.size()), error,
 221                      d.get_size(), d.get_mtime(), SKIP_SHOW_FILENAME);
 222             }
 223         }
 224     } catch (const FileNotFound&) {
 225         if (verbose)
 226             cout << "Directory \"" << path.substr(root.size()) << "\" "
 227                     "deleted during indexing" << endl;
 228     } catch (const std::string & error) {
 229         cout << error << " - skipping directory "
 230                 "\"" << path.substr(root.size()) << "\"" << endl;
 231     }
 232 }
 233
 234 static off_t
 235 parse_size(char* p)
 236 {
 237     // Don't want negative numbers, infinity, NaN, or hex numbers.
 238     if (C_isdigit(p[0]) && (p[1] | 32) != 'x') {
 239         double arg = strtod(p, &p);
 240         switch (*p) {
 241             case '\0':
 242                 break;
 243             case 'k': case 'K':
 244                 arg *= 1024;
 245                 ++p;
 246                 break;
 247             case 'm': case 'M':
 248                 arg *= (1024 * 1024);
 249                 ++p;
 250                 break;
 251             case 'g': case 'G':
 252                 arg *= (1024 * 1024 * 1024);
 253                 ++p;
 254                 break;
 255         }
 256         if (*p == '\0') {
 257             return off_t(arg);
 258         }
 259     }
 260     return -1;
 261 }
 262
 263 static bool
 264 parse_filter_rule(const char* rule, map<string, string>& mime_map)
 265 {
 266     const char* s = strchr(rule, ':');
 267     if (s == NULL || s[1] == '\0') {
 268         cerr << "Invalid filter mapping '" << rule << "'\n"
 269                 "Should be of the form TYPE:COMMAND or TYPE1,TYPE2:COMMAND or "
 270                 "TYPE,EXT:COMMAND\n"
 271                 "e.g. 'application/octet-stream:strings -n8'"
 272              << endl;
 273         return false;
 274     }
 275
 276     const char* c = static_cast<const char*>(memchr(rule, ',', s - rule));
 277     string output_type, output_charset;
 278     if (c) {
 279         // Filter produces a specified content-type.
 280         ++c;
 281         const char* c2 = static_cast<const char *>(memchr(c, ',', s - c));
 282         if (c2) {
 283             output_type.assign(c, c2 - c);
 284             ++c2;
 285             output_charset.assign(c2, s - c2);
 286         } else {
 287             output_type.assign(c, s - c);
 288         }
 289         --c;
 290         if (output_type.find('/') == string::npos) {
 291             auto m = mime_map.find(output_type);
 292             if (m != mime_map.end()) {
 293                 output_type = m->second;
 294             } else {
 295                 const char* r = built_in_mime_map(output_type);
 296                 if (r) output_type = r;
 297             }
 298         }
 299         if (output_type != "text/html" &&
 300             output_type != "text/plain" &&
 301             output_type != "image/svg+xml") {
 302             cerr << "Currently only output types 'image/svg+xml', "
 303                     "'text/html' and 'text/plain' are supported."
 304                  << endl;
 305             return false;
 306         }
 307     } else {
 308         c = s;
 309     }
 310
 311     const char* cmd = s + 1;
 312     unsigned flags = 0;
 313     if (cmd[0] == '|') {
 314         flags |= Filter::PIPE_IN;
 315         ++cmd;
 316         // FIXME: Do we need a way to set PIPE_DEV_STDIN and SEEK_DEV_STDIN?
 317         //
 318         // PIPE_DEV_STDIN doesn't seem to offer much over |foo2txt /dev/stdin
 319         // for user-specified filters (internally it provides a way to
 320         // gracefully handle platforms without /dev/stdin).
 321         //
 322         // SEEK_DEV_STDIN isn't currently easily approximated though.
 323     }
 324     // Analyse the command string to decide if it needs a shell.
 325     if (command_needs_shell(cmd))
 326         flags |= Filter::USE_SHELL;
 327     index_command(string(rule, c - rule),
 328                   Filter(string(cmd), output_type, output_charset, flags));
 329
 330     return true;
 331 }
 332
 333 static bool
 334 parse_worker_rule(const char* rule)
 335 {
 336     const char* s = strchr(rule, ':');
 337     if (s == NULL || s[1] == '\0') {
 338         cerr << "Invalid worker mapping '" << rule << "'\n"
 339                 "Should be of the form TYPE:WORKER\n"
 340                 "e.g. 'application/msword:omindex_libreofficekit\n";
 341         return false;
 342     }
 343
 344     index_library(string(rule, s - rule), new Worker(string(s + 1)));
 345     return true;
 346 }
 347
 348 int
 349 main(int argc, char **argv)
 350 {
 351     // If overwrite is true, the database will be created anew even if it
 352     // already exists.
 353     bool overwrite = false;
 354     // If delete_removed_documents is true, delete any documents we don't see.
 355     bool delete_removed_documents = true;
 356     // Retry files which we failed to index on a previous run?
 357     bool retry_failed = false;
 358     bool use_ctime = false;
 359     bool spelling = false;
 360     bool skip_duplicates = false;
 361     bool ignore_exclusions = false;
 362     bool description_as_sample = false;
 363     string baseurl;
 364     size_t depth_limit = 0;
 365     size_t title_size = TITLE_SIZE;
 366     size_t sample_size = SAMPLE_SIZE;
 367     empty_body_type empty_body = EMPTY_BODY_WARN;
 368     string site_term, host_term;
 369     Xapian::Stem stemmer("english");
 370
 371     enum {
 372         OPT_OPENDIR_SLEEP = 256,
 373         OPT_SAMPLE,
 374         OPT_DATE_TERMS,
 375         OPT_NO_DATE_TERMS,
 376         OPT_READ_FILTERS,
 377         OPT_READ_WORKERS
 378     };
 379     constexpr auto NO_ARG = no_argument;
 380     constexpr auto REQ_ARG = required_argument;
 381     static const struct option longopts[] = {
 382         { "help",               NO_ARG,         NULL, 'h' },
 383         { "version",            NO_ARG,         NULL, 'V' },
 384         { "overwrite",          NO_ARG,         NULL, 'o' },
 385         { "duplicates",         REQ_ARG,        NULL, 'd' },
 386         { "no-delete",          NO_ARG,         NULL, 'p' },
 387         { "db",                 REQ_ARG,        NULL, 'D' },
 388         { "url",                REQ_ARG,        NULL, 'U' },
 389         { "mime-type",          REQ_ARG,        NULL, 'M' },
 390         { "mime-type-match",    REQ_ARG,        NULL, 'G' },
 391         { "filter",             REQ_ARG,        NULL, 'F' },
 392         { "worker",             REQ_ARG,        NULL, 'W' },
 393         { "read-filters",       REQ_ARG,        NULL, OPT_READ_FILTERS },
 394         { "read-workers",       REQ_ARG,        NULL, OPT_READ_WORKERS },
 395         { "depth-limit",        REQ_ARG,        NULL, 'l' },
 396         { "follow",             NO_ARG,         NULL, 'f' },
 397         { "ignore-exclusions",  NO_ARG,         NULL, 'i' },
 398         { "stemmer",            REQ_ARG,        NULL, 's' },
 399         { "spelling",           NO_ARG,         NULL, 'S' },
 400         { "verbose",            NO_ARG,         NULL, 'v' },
 401         { "empty-docs",         REQ_ARG,        NULL, 'e' },
 402         { "max-size",           REQ_ARG,        NULL, 'm' },
 403         { "sample",             REQ_ARG,        NULL, OPT_SAMPLE },
 404         { "sample-size",        REQ_ARG,        NULL, 'E' },
 405         { "title-size",         REQ_ARG,        NULL, 'T' },
 406         { "retry-failed",       NO_ARG,         NULL, 'R' },
 407         { "opendir-sleep",      REQ_ARG,        NULL, OPT_OPENDIR_SLEEP },
 408         { "track-ctime",        NO_ARG,         NULL, 'C' },
 409         { "date-terms",         NO_ARG,         NULL, OPT_DATE_TERMS },
 410         { "no-date-terms",      NO_ARG,         NULL, OPT_NO_DATE_TERMS },
 411         { 0, 0, NULL, 0 }
 412     };
 413
 414     map<string, string> mime_map;
 415
 416     index_add_default_filters();
 417     index_add_default_libraries();
 418
 419     if (argc == 2 && strcmp(argv[1], "-v") == 0) {
 420         // -v was the short option for --version in 1.2.3 and earlier, but
 421         // now it is short for --verbose (for consistency with scriptindex)
 422         // so if "-v" is the only option, translate it to "--version" for
 423         // backwards compatibility.
 424         argv[1] = const_cast<char *>("--version");
 425     }
 426
 427     string dbpath;
 428     int getopt_ret;
 429     while ((getopt_ret = gnu_getopt_long(argc, argv,
 430                                          "hvd:D:U:M:G:F:W:l:s:pfRSVe:im:E:T:C",
 431                                          longopts, NULL)) != -1) {
 432         switch (getopt_ret) {
 433         case 'h': {
 434             cout << PROG_NAME " - " PROG_DESC "\n\n"
 435 "Usage: " PROG_NAME " [OPTIONS] --db DATABASE [BASEDIR] DIRECTORY\n"
 436 "\n"
 437 "DIRECTORY is the directory to start indexing from.\n"
 438 "\n"
 439 "BASEDIR is the directory corresponding to URL (default: DIRECTORY).\n"
 440 "\n"
 441 "Options:\n"
 442 "  -d, --duplicates=ARG      set duplicate handling: ARG can be 'ignore' or\n"
 443 "                            'replace' (default: replace)\n"
 444 "  -p, --no-delete           skip the deletion of documents corresponding to\n"
 445 "                            deleted files\n"
 446 "  -e, --empty-docs=ARG      how to handle documents we extract no text from:\n"
 447 "                            ARG can be index, warn (issue a diagnostic and\n"
 448 "                            index), or skip.  (default: warn)\n"
 449 "  -D, --db=DATABASE         path to database to use\n"
 450 "  -U, --url=URL             base url BASEDIR corresponds to (default: /)\n"
 451 "  -M, --mime-type=EXT:TYPE  assume any file with extension EXT has MIME\n"
 452 "                            Content-Type TYPE, instead of using libmagic\n"
 453 "                            (empty TYPE removes any existing mapping for EXT;\n"
 454 "                            other special TYPE values: 'ignore' and 'skip')\n"
 455 "  -G, --mime-type-match=GLOB:TYPE\n"
 456 "                            assume any file with leaf name matching shell\n"
 457 "                            wildcard pattern GLOB has MIME Content-Type TYPE\n"
 458 "                            (special TYPE values: 'ignore' and 'skip')\n"
 459 "  -F, --filter=M[,[T][,C]]:CMD\n"
 460 "                            process files with MIME Content-Type M using\n"
 461 "                            command CMD, which produces output (on stdout or\n"
 462 "                            in a temporary file) with format T (Content-Type\n"
 463 "                            or file extension; currently txt (default), html\n"
 464 "                            or svg) in character encoding C (default: UTF-8).\n"
 465 "                            E.g. -Fapplication/octet-stream:'|strings -n8'\n"
 466 "                            or -Ftext/x-foo,,utf-16:'foo2utf16 %f %t'\n"
 467 "  -W, --worker=TYPE:WORKER  process files with MIME Content-Type TYPE using\n"
 468 "                            worker sub-process WORKER.  WORKER is the name of\n"
 469 "                            the program to run to start the worker. If it has\n"
 470 "                            no path then it's looked for in pkglibbindir (which\n"
 471 "                            can be overridden by setting environment variable\n"
 472 "                            XAPIAN_OMEGA_PKGLIBBINDIR).  This invocation will\n"
 473 "                            look in: " << get_pkglibbindir() << "\n"
 474 "      --read-filters=FILE   bulk-load --filter arguments from FILE, which\n"
 475 "                            should contain one such argument per line (e.g.\n"
 476 "                            text/x-bar:bar2txt --utf8).  Lines starting with #\n"
 477 "                            are treated as comments and ignored.\n"
 478 "      --read-workers=FILE   bulk-load --worker arguments from FILE, which\n"
 479 "                            should contain one such argument per line (e.g.\n"
 480 "                            text/x-bar:omindex_libbar).  Lines starting with #\n"
 481 "                            are treated as comments and ignored.\n"
 482 "  -l, --depth-limit=LIMIT   set recursion limit (0 = unlimited)\n"
 483 "  -f, --follow              follow symbolic links\n"
 484 "  -i, --ignore-exclusions   ignore meta robots tags and similar exclusions\n"
 485 "  -S, --spelling            index data for spelling correction\n"
 486 "  -m, --max-size=N[SUFFIX]  maximum size of file to index (in bytes or with a\n"
 487 "                            suffix of 'K'/'k', 'M'/'m', 'G'/'g')\n"
 488 "                            (default: unlimited)\n"
 489 "      --sample=SOURCE       what to use for the stored sample of text for\n"
 490 "                            HTML documents - SOURCE can be 'body' or\n"
 491 "                            'description' (default: 'body')\n"
 492 "  -E, --sample-size=SIZE    maximum size for the document text sample\n"
 493 "                            (supports the same formats as --max-size).\n"
 494 "                            (default: " STRINGIZE(SAMPLE_SIZE) ")\n"
 495 "  -T, --title-size=SIZE     maximum size for the document title\n"
 496 "                            (supports the same formats as --max-size).\n"
 497 "                            (default: " STRINGIZE(TITLE_SIZE) ")\n"
 498 "  -R, --retry-failed        retry files which omindex failed to extract text\n"
 499 "                            from on a previous run\n"
 500 "      --opendir-sleep=SECS  sleep for SECS seconds before opening each\n"
 501 "                            directory - sleeping for 2 seconds seems to\n"
 502 "                            reliably work around problems with indexing files\n"
 503 "                            on Microsoft DFS shares.\n"
 504 "  -C, --track-ctime         track each file's ctime so we can detect changes\n"
 505 "                            to ownership or permissions.\n"
 506 "      --date-terms          index D, M and Y prefixed terms to support date\n"
 507 "                            range filtering using terms (we now recommend\n"
 508 "                            using a value slot for this instead).\n"
 509 "      --no-date-terms       ignored for compatibility with Omega 1.4.x.\n"
 510 "  -v, --verbose             show more information about what is happening\n"
 511 "      --overwrite           create the database anew (the default is to update\n"
 512 "                            if the database already exists)" << endl;
 513             print_stemmer_help("      ");
 514             print_help_and_version_help("      ");
 515             return 0;
 516         }
 517         case 'V':
 518             print_package_info(PROG_NAME);
 519             return 0;
 520         case 'd': // how shall we handle duplicate documents?
 521             switch (optarg[0]) {
 522             case 'i':
 523                 skip_duplicates = true;
 524                 break;
 525             case 'r':
 526                 skip_duplicates = false;
 527                 break;
 528             }
 529             break;
 530         case 'e':
 531             if (strcmp(optarg, "index") == 0) {
 532                 empty_body = EMPTY_BODY_INDEX;
 533             } else if (strcmp(optarg, "warn") == 0) {
 534                 empty_body = EMPTY_BODY_WARN;
 535             } else if (strcmp(optarg, "skip") == 0) {
 536                 empty_body = EMPTY_BODY_SKIP;
 537             } else {
 538                 cerr << "Invalid --empty-docs value '" << optarg << "'\n"
 539                         "Valid values are index, warn, and skip." << endl;
 540                 return 1;
 541             }
 542             break;
 543         case 'p': // Keep documents even if the files have been removed.
 544             delete_removed_documents = false;
 545             break;
 546         case 'l': { // Set recursion limit
 547             int arg;
 548             if (!parse_signed(optarg, arg)) {
 549                 throw "Recursion limit must be an integer";
 550             }
 551             if (arg < 0) arg = 0;
 552             depth_limit = size_t(arg);
 553             break;
 554         }
 555         case 'f': // Turn on following of symlinks
 556             follow_symlinks = true;
 557             break;
 558         case 'M': {
 559             const char * s = strrchr(optarg, ':');
 560             if (s == NULL) {
 561                 cerr << "Invalid MIME mapping '" << optarg << "'\n"
 562                         "Should be of the form EXT:TYPE, e.g. txt:text/plain\n"
 563                         "(or txt: to delete a default mapping)" << endl;
 564                 return 1;
 565             }
 566
 567             // -Mtxt: results in an empty string, which effectively removes the
 568             // default mapping for .txt files.
 569             size_t ext_len = s - optarg;
 570             mime_map[string(optarg, ext_len)] = string(s + 1);
 571             max_ext_len = max(max_ext_len, ext_len);
 572             break;
 573         }
 574         case 'F':
 575             if (!parse_filter_rule(optarg, mime_map))
 576                 return 1;
 577             break;
 578         case 'W':
 579             if (!parse_worker_rule(optarg))
 580                 return 1;
 581             break;
 582         case OPT_READ_FILTERS: {
 583             ifstream stream(optarg);
 584             if (!stream) {
 585                 cerr << "Unable to open filter file '" << optarg << "' "
 586                         "(" << strerror(errno) << ')' << endl;
 587                 return 1;
 588             }
 589             string rule;
 590             bool all_ok = true;
 591             while (getline(stream, rule)) {
 592                 if (startswith(rule, '#')) continue;
 593                 if (!parse_filter_rule(rule.c_str(), mime_map))
 594                     all_ok = false;
 595             }
 596             if (!all_ok)
 597                 return 1;
 598             break;
 599         }
 600         case OPT_READ_WORKERS: {
 601             ifstream stream(optarg);
 602             if (!stream) {
 603                 cerr << "Unable to open worker file '" << optarg << "' "
 604                         "(" << strerror(errno) << ')' << endl;
 605                 return 1;
 606             }
 607             string rule;
 608             bool all_ok = true;
 609             while (getline(stream, rule)) {
 610                 if (startswith(rule, '#')) continue;
 611                 if (!parse_worker_rule(rule.c_str()))
 612                     all_ok = false;
 613             }
 614             if (!all_ok)
 615                 return 1;
 616             break;
 617         }
 618         case 'D':
 619             dbpath = optarg;
 620             break;
 621         case 'U':
 622             baseurl = optarg;
 623             break;
 624         case 'o': // --overwrite
 625             overwrite = true;
 626             break;
 627         case 'i':
 628             ignore_exclusions = true;
 629             break;
 630         case 'R': // --retry-failed
 631             retry_failed = true;
 632             break;
 633         case 's':
 634             try {
 635                 stemmer = Xapian::Stem(optarg);
 636             } catch (const Xapian::InvalidArgumentError &) {
 637                 cerr << "Unknown stemming language '" << optarg << "'.\n"
 638                         "Available language names are: "
 639                      << Xapian::Stem::get_available_languages() << endl;
 640                 return 1;
 641             }
 642             break;
 643         case 'S':
 644             spelling = true;
 645             break;
 646         case 'v':
 647             verbose = true;
 648             break;
 649         case 'E': {
 650             off_t arg = parse_size(optarg);
 651             if (arg >= 0) {
 652                 sample_size = size_t(arg);
 653                 break;
 654             }
 655             cerr << PROG_NAME": bad sample size '" << optarg << "'" << endl;
 656             return 1;
 657         }
 658         case 'T': {
 659             off_t arg = parse_size(optarg);
 660             if (arg >= 0) {
 661                 title_size = size_t(arg);
 662                 break;
 663             }
 664             cerr << PROG_NAME": bad title size '" << optarg << "'" << endl;
 665             return 1;
 666         }
 667         case 'm': {
 668             off_t size = parse_size(optarg);
 669             if (size >= 0) {
 670                 max_size = size;
 671                 const char * suffix;
 672                 // Set lsb to the lowest set bit in max_size.
 673                 off_t lsb = max_size & -max_size;
 674                 if (lsb >= off_t(1L << 30)) {
 675                     size >>= 30;
 676                     suffix = "GB";
 677                 } else if (lsb >= off_t(1L << 20)) {
 678                     size >>= 20;
 679                     suffix = "MB";
 680                 } else if (lsb >= off_t(1L << 10)) {
 681                     size >>= 10;
 682                     suffix = "KB";
 683                 } else {
 684                     suffix = "B";
 685                 }
 686                 pretty_max_size = str(size);
 687                 pretty_max_size += suffix;
 688                 break;
 689             }
 690             cerr << PROG_NAME": bad max size '" << optarg << "'" << endl;
 691             return 1;
 692         }
 693         case OPT_OPENDIR_SLEEP: {
 694             // Don't want negative numbers, infinity, NaN, or hex numbers.
 695             char * p = optarg;
 696             if (C_isdigit(p[0]) && (p[1] | 32) != 'x') {
 697                 sleep_before_opendir = strtod(p, &p);
 698                 if (*p == '\0')
 699                     break;
 700             }
 701             cerr << PROG_NAME": bad --opendir-sleep argument: "
 702                  "'" << optarg << "'" << endl;
 703             return 1;
 704         }
 705         case OPT_SAMPLE:
 706             if (strcmp(optarg, "description") == 0) {
 707                 description_as_sample = true;
 708             } else if (strcmp(optarg, "body") == 0) {
 709                 description_as_sample = false;
 710             } else {
 711                 cerr << "Invalid --sample value '" << optarg << "'\n"
 712                         "Valid values are body and description." << endl;
 713                 return 1;
 714             }
 715             break;
 716         case 'C':
 717             use_ctime = true;
 718             break;
 719         case OPT_DATE_TERMS:
 720             date_terms = true;
 721             break;
 722         case OPT_NO_DATE_TERMS:
 723             // Ignored for compatibility with Omega 1.4.0.
 724             break;
 725         case 'G': {
 726             char * s = strrchr(optarg, ':');
 727             if (s == NULL) {
 728                 cerr << "Invalid MIME mapping '" << optarg << "'\n"
 729                         "Should be of the form GLOB:TYPE, e.g. *~:ignore"
 730                      << endl;
 731                 return 1;
 732             }
 733 #ifndef HAVE_FNMATCH
 734             cerr << "--mime-type-match isn't supported in this build because "
 735                     "the fnmatch() function wasn't found at configure time."
 736                  << endl;
 737             return 1;
 738 #else
 739             if (s == optarg) {
 740                 cerr << "--mime-type-match with an empty pattern can never "
 741                         "match." << endl;
 742                 return 1;
 743             }
 744             if (memchr(optarg, '/', s - optarg)) {
 745                 cerr << "--mime-type-match only matches against the leaf "
 746                         "filename so a pattern containing '/' can never match."
 747                      << endl;
 748                 return 1;
 749             }
 750             const char* type = s + 1;
 751             if (*type == '\0') {
 752                 cerr << "--mime-type-match doesn't support an empty MIME type"
 753                      << endl;
 754                 return 1;
 755             }
 756             *s = '\0';
 757             mime_patterns.emplace_back(optarg, type);
 758             break;
 759 #endif
 760         }
 761         case ':': // missing param
 762             return 1;
 763         case '?': // unknown option: FIXME -> char
 764             return 1;
 765         }
 766     }
 767
 768     if (dbpath.empty()) {
 769         cerr << PROG_NAME": you must specify a database with --db." << endl;
 770         return 1;
 771     }
 772     if (baseurl.empty()) {
 773         cerr << PROG_NAME": --url not specified, assuming '/'." << endl;
 774     }
 775     // baseurl must end in a '/'.
 776     if (!endswith(baseurl, '/')) {
 777         baseurl += '/';
 778     }
 779
 780     // Site term (omits the trailing slash):
 781     site_term = "J";
 782     site_term.append(baseurl, 0, baseurl.size() - 1);
 783     if (site_term.size() > MAX_SAFE_TERM_LENGTH)
 784         site_term = hash_long_term(site_term, MAX_SAFE_TERM_LENGTH);
 785
 786     // Host term, if the URL contains a hostname (omits any port number):
 787     string::size_type j;
 788     j = find_if(baseurl.begin(), baseurl.end(), p_notalnum) - baseurl.begin();
 789     if (j > 0 && baseurl.substr(j, 3) == "://" && j + 3 < baseurl.size()) {
 790         j += 3;
 791         // We must find a '/' - we ensured baseurl ended with a '/' above.
 792         string::size_type k = baseurl.find('/', j);
 793         url_start_path.assign(baseurl, k, string::npos);
 794         string::const_iterator l;
 795         l = find(baseurl.begin() + j, baseurl.begin() + k, ':');
 796         string::size_type host_len = l - baseurl.begin() - j;
 797         host_term = "H";
 798         host_term.append(baseurl, j, host_len);
 799         // DNS hostname limit is 253.
 800         if (host_term.size() > MAX_SAFE_TERM_LENGTH)
 801             host_term = hash_long_term(host_term, MAX_SAFE_TERM_LENGTH);
 802     } else {
 803         url_start_path = baseurl;
 804     }
 805
 806     if (optind >= argc || optind + 2 < argc) {
 807         cerr << PROG_NAME": you must specify a directory to index.\n"
 808 "Do this either as a single directory (corresponding to the base URL)\n"
 809 "or two directories - the first corresponding to the base URL and the second\n"
 810 "a subdirectory of that to index." << endl;
 811         return 1;
 812     }
 813
 814     root = argv[optind];
 815     if (root.empty()) {
 816         cerr << PROG_NAME": start directory can not be empty string." << endl;
 817         return 1;
 818     }
 819     if (!endswith(root, '/')) {
 820         root += '/';
 821     }
 822     if (optind + 2 == argc) {
 823         string start_url = argv[optind + 1];
 824         if (startswith(start_url, '/')) {
 825             // Make relative to root.
 826             if (!startswith(start_url, root)) {
 827                 cerr << PROG_NAME": '" << argv[optind + 1] << "' "
 828                     "is not a subdirectory of '" << argv[optind] << "'."
 829                      << endl;
 830                 return 1;
 831             }
 832             start_url.erase(0, root.size());
 833         }
 834         if (!endswith(start_url, '/')) {
 835             start_url += '/';
 836         }
 837         root += start_url;
 838         url_encode_path(baseurl, start_url);
 839     }
 840
 841     int exitcode = 1;
 842     try {
 843         index_init(dbpath, stemmer, root, site_term, host_term, empty_body,
 844                    (skip_duplicates ? DUP_SKIP : DUP_CHECK_LAZILY),
 845                    sample_size, title_size, max_ext_len,
 846                    overwrite, retry_failed, delete_removed_documents, verbose,
 847                    use_ctime, spelling, ignore_exclusions,
 848                    description_as_sample, date_terms);
 849         index_directory(root, baseurl, depth_limit, mime_map);
 850         index_handle_deletion();
 851         index_commit();
 852         exitcode = 0;
 853     } catch (const CommitAndExit &e) {
 854         cout << "Exception: " << e.what() << endl;
 855         cout << "Committing pending changes..." << endl;
 856         index_commit();
 857     } catch (const Xapian::Error &e) {
 858         cout << "Exception: " << e.get_description() << endl;
 859     } catch (const exception &e) {
 860         cout << "Exception: " << e.what() << endl;
 861     } catch (const string &s) {
 862         cout << "Exception: " << s << endl;
 863     } catch (const char *s) {
 864         cout << "Exception: " << s << endl;
 865     } catch (...) {
 866         cout << "Caught unknown exception" << endl;
 867     }
 868
 869     index_done();
 870
 871     return exitcode;
 872 }