Fix tg_termpos1 for 64-bit termpos
[xapian.git] / xapian-applications / omega / omindex.cc
blob77bed29226d1c3e02317b36e7f0139be436403f5
1 /** @file
2 * @brief index static documents into the omega db
3 */
4 /* Copyright 1999,2000,2001 BrightStation PLC
5 * Copyright 2001,2005 James Aylett
6 * Copyright 2001,2002 Ananova Ltd
7 * Copyright 2002-2022 Olly Betts
8 * Copyright 2009 Frank J Bruzzaniti
9 * Copyright 2012 Mihai Bivol
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public License as
13 * published by the Free Software Foundation; either version 2 of the
14 * License, or (at your option) any later version.
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
24 * USA
27 #include <config.h>
29 #include <algorithm>
30 #include <fstream>
31 #include <iostream>
32 #include <string>
33 #include <map>
35 #include <sys/types.h>
36 #include "safeunistd.h"
37 #include <cerrno>
38 #include <cstdio>
39 #include <cstdlib>
40 #include <cstring>
41 #include "safefcntl.h"
43 #ifdef HAVE_FNMATCH
44 # include <fnmatch.h>
45 #endif
47 #include <xapian.h>
49 #include "commonhelp.h"
50 #include "diritor.h"
51 #include "hashterm.h"
52 #include "index_file.h"
53 #include "mime.h"
54 #include "parseint.h"
55 #include "pkglibbindir.h"
56 #include "realtime.h"
57 #include "str.h"
58 #include "stringutils.h"
59 #include "urlencode.h"
60 #include "worker.h"
62 #include "gnu_getopt.h"
64 using namespace std;
// Program name and one-line description; both appear in the --help output
// and PROG_NAME also prefixes several diagnostics.
66 #define PROG_NAME "omindex"
67 #define PROG_DESC "Index static website data via the filesystem"
// Defaults for the title and sample size limits (see --title-size and
// --sample-size).
69 #define TITLE_SIZE 128
70 #define SAMPLE_SIZE 512
// Set by --follow; passed to each DirectoryIterator.
72 static bool follow_symlinks = false;
// Set by --max-size; files larger than this are skipped.  0 means no limit.
73 static off_t max_size = 0;
// Human-readable form of max_size, used in the "Larger than size limit"
// skip message.
74 static std::string pretty_max_size;
// Set by --verbose.
75 static bool verbose = false;
// Set by --opendir-sleep: seconds to sleep before opening each directory.
76 static double sleep_before_opendir = 0;
// Set by --date-terms; passed through to index_init().
77 static bool date_terms = false;
// Filesystem directory being indexed (always ends with '/').
79 static string root;
// Path part of the base URL; used as the start of each document's "P" term.
80 static string url_start_path;
82 #ifdef HAVE_FNMATCH
// (glob pattern, MIME type) pairs added by --mime-type-match, in the order
// given on the command line.  The pointers refer into argv.
83 static vector<pair<const char*, const char*>> mime_patterns;
84 #endif
86 static inline bool
87 p_notalnum(unsigned int c)
89 return !C_isalnum(c);
92 static void
93 index_file(const string &file, const string &url, DirectoryIterator & d,
94 map<string, string>& mime_map)
96 string urlterm("U");
97 urlterm += url;
99 if (urlterm.length() > MAX_SAFE_TERM_LENGTH)
100 urlterm = hash_long_term(urlterm, MAX_SAFE_TERM_LENGTH);
102 const char* leafname = d.leafname();
104 string mimetype;
105 #ifdef HAVE_FNMATCH
106 for (auto&& i : mime_patterns) {
107 if (fnmatch(i.first, leafname, 0) == 0) {
108 if (strcmp(i.second, "ignore") == 0)
109 return;
110 if (strcmp(i.second, "skip") == 0) {
111 string m = "Leafname '";
112 m += leafname;
113 m += "' matches pattern: ";
114 m += i.first;
115 skip(urlterm, file.substr(root.size()), m,
116 d.get_size(), d.get_mtime());
117 return;
119 mimetype = i.second;
120 break;
123 #endif
125 string ext;
126 const char * dot_ptr = strrchr(leafname, '.');
127 if (dot_ptr) {
128 ext.assign(dot_ptr + 1);
129 if (ext.size() > max_ext_len)
130 ext.resize(0);
133 if (mimetype.empty()) {
134 mimetype = mimetype_from_ext(mime_map, ext);
135 if (mimetype == "ignore") {
136 // Remove any existing failed entry for this file.
137 index_remove_failed_entry(urlterm);
138 return;
139 } else if (mimetype == "skip") {
140 // Ignore mimetype, skipped mimetype should not be quietly ignored.
141 string m = "skipping extension '";
142 m += ext;
143 m += "'";
144 skip(urlterm, file.substr(root.size()), m,
145 d.get_size(), d.get_mtime());
146 return;
150 // Check the file size.
151 off_t size = d.get_size();
152 if (size == 0) {
153 skip(urlterm, file.substr(root.size()), "Zero-sized file",
154 size, d.get_mtime(), SKIP_VERBOSE_ONLY);
155 return;
158 if (max_size > 0 && size > max_size) {
159 skip(urlterm, file.substr(root.size()),
160 "Larger than size limit of " + pretty_max_size,
161 size, d.get_mtime(),
162 SKIP_VERBOSE_ONLY);
163 return;
166 string path_term("P");
167 path_term += url_start_path;
168 path_term.append(file, root.size(), string::npos);
170 index_mimetype(file, urlterm, url, ext, mimetype, d, path_term, string());
173 static void
174 index_directory(const string &path, const string &url_, size_t depth_limit,
175 map<string, string>& mime_map)
177 if (verbose)
178 cout << "[Entering directory \"" << path.substr(root.size()) << "\"]"
179 << endl;
181 DirectoryIterator d(follow_symlinks);
182 try {
183 // Crude workaround for MS-DFS share misbehaviour.
184 if (sleep_before_opendir > 0.0)
185 RealTime::sleep(RealTime::now() + sleep_before_opendir);
187 d.start(path);
189 while (d.next()) {
190 string url = url_;
191 url_encode(url, d.leafname());
192 string file = path;
193 file += d.leafname();
195 try {
196 switch (d.get_type()) {
197 case DirectoryIterator::DIRECTORY: {
198 size_t new_limit = depth_limit;
199 if (new_limit) {
200 if (--new_limit == 0) continue;
202 url += '/';
203 file += '/';
204 index_directory(file, url, new_limit, mime_map);
205 break;
207 case DirectoryIterator::REGULAR_FILE:
208 index_file(file, url, d, mime_map);
209 break;
210 default:
211 skip("U" + url, file.substr(root.size()), "Not a regular file",
212 d.get_size(), d.get_mtime(),
213 SKIP_VERBOSE_ONLY | SKIP_SHOW_FILENAME);
215 } catch (const FileNotFound & e) {
216 skip("U" + url, file.substr(root.size()), "File removed during indexing",
217 d.get_size(), d.get_mtime(),
218 /*SKIP_VERBOSE_ONLY |*/ SKIP_SHOW_FILENAME);
219 } catch (const std::string & error) {
220 skip("U" + url, file.substr(root.size()), error,
221 d.get_size(), d.get_mtime(), SKIP_SHOW_FILENAME);
224 } catch (const FileNotFound&) {
225 if (verbose)
226 cout << "Directory \"" << path.substr(root.size()) << "\" "
227 "deleted during indexing" << endl;
228 } catch (const std::string & error) {
229 cout << error << " - skipping directory "
230 "\"" << path.substr(root.size()) << "\"" << endl;
234 static off_t
235 parse_size(char* p)
237 // Don't want negative numbers, infinity, NaN, or hex numbers.
238 if (C_isdigit(p[0]) && (p[1] | 32) != 'x') {
239 double arg = strtod(p, &p);
240 switch (*p) {
241 case '\0':
242 break;
243 case 'k': case 'K':
244 arg *= 1024;
245 ++p;
246 break;
247 case 'm': case 'M':
248 arg *= (1024 * 1024);
249 ++p;
250 break;
251 case 'g': case 'G':
252 arg *= (1024 * 1024 * 1024);
253 ++p;
254 break;
256 if (*p == '\0') {
257 return off_t(arg);
260 return -1;
263 static bool
264 parse_filter_rule(const char* rule, map<string, string>& mime_map)
266 const char* s = strchr(rule, ':');
267 if (s == NULL || s[1] == '\0') {
268 cerr << "Invalid filter mapping '" << rule << "'\n"
269 "Should be of the form TYPE:COMMAND or TYPE1,TYPE2:COMMAND or "
270 "TYPE,EXT:COMMAND\n"
271 "e.g. 'application/octet-stream:strings -n8'"
272 << endl;
273 return false;
276 const char* c = static_cast<const char*>(memchr(rule, ',', s - rule));
277 string output_type, output_charset;
278 if (c) {
279 // Filter produces a specified content-type.
280 ++c;
281 const char* c2 = static_cast<const char *>(memchr(c, ',', s - c));
282 if (c2) {
283 output_type.assign(c, c2 - c);
284 ++c2;
285 output_charset.assign(c2, s - c2);
286 } else {
287 output_type.assign(c, s - c);
289 --c;
290 if (output_type.find('/') == string::npos) {
291 auto m = mime_map.find(output_type);
292 if (m != mime_map.end()) {
293 output_type = m->second;
294 } else {
295 const char* r = built_in_mime_map(output_type);
296 if (r) output_type = r;
299 if (output_type != "text/html" &&
300 output_type != "text/plain" &&
301 output_type != "image/svg+xml") {
302 cerr << "Currently only output types 'image/svg+xml', "
303 "'text/html' and 'text/plain' are supported."
304 << endl;
305 return false;
307 } else {
308 c = s;
311 const char* cmd = s + 1;
312 unsigned flags = 0;
313 if (cmd[0] == '|') {
314 flags |= Filter::PIPE_IN;
315 ++cmd;
316 // FIXME: Do we need a way to set PIPE_DEV_STDIN and SEEK_DEV_STDIN?
318 // PIPE_DEV_STDIN doesn't seem to offer much over |foo2txt /dev/stdin
319 // for user-specified filters (internally it provides a way to
320 // gracefully handle platforms without /dev/stdin).
322 // SEEK_DEV_STDIN isn't currently easily approximated though.
324 // Analyse the command string to decide if it needs a shell.
325 if (command_needs_shell(cmd))
326 flags |= Filter::USE_SHELL;
327 index_command(string(rule, c - rule),
328 Filter(string(cmd), output_type, output_charset, flags));
330 return true;
333 static bool
334 parse_worker_rule(const char* rule)
336 const char* s = strchr(rule, ':');
337 if (s == NULL || s[1] == '\0') {
338 cerr << "Invalid worker mapping '" << rule << "'\n"
339 "Should be of the form TYPE:WORKER\n"
340 "e.g. 'application/msword:omindex_libreofficekit\n";
341 return false;
344 index_library(string(rule, s - rule), new Worker(string(s + 1)));
345 return true;
349 main(int argc, char **argv)
351 // If overwrite is true, the database will be created anew even if it
352 // already exists.
353 bool overwrite = false;
354 // If delete_removed_documents is true, delete any documents we don't see.
355 bool delete_removed_documents = true;
356 // Retry files which we failed to index on a previous run?
357 bool retry_failed = false;
358 bool use_ctime = false;
359 bool spelling = false;
360 bool skip_duplicates = false;
361 bool ignore_exclusions = false;
362 bool description_as_sample = false;
363 string baseurl;
364 size_t depth_limit = 0;
365 size_t title_size = TITLE_SIZE;
366 size_t sample_size = SAMPLE_SIZE;
367 empty_body_type empty_body = EMPTY_BODY_WARN;
368 string site_term, host_term;
369 Xapian::Stem stemmer("english");
371 enum {
372 OPT_OPENDIR_SLEEP = 256,
373 OPT_SAMPLE,
374 OPT_DATE_TERMS,
375 OPT_NO_DATE_TERMS,
376 OPT_READ_FILTERS,
377 OPT_READ_WORKERS
379 constexpr auto NO_ARG = no_argument;
380 constexpr auto REQ_ARG = required_argument;
381 static const struct option longopts[] = {
382 { "help", NO_ARG, NULL, 'h' },
383 { "version", NO_ARG, NULL, 'V' },
384 { "overwrite", NO_ARG, NULL, 'o' },
385 { "duplicates", REQ_ARG, NULL, 'd' },
386 { "no-delete", NO_ARG, NULL, 'p' },
387 { "db", REQ_ARG, NULL, 'D' },
388 { "url", REQ_ARG, NULL, 'U' },
389 { "mime-type", REQ_ARG, NULL, 'M' },
390 { "mime-type-match", REQ_ARG, NULL, 'G' },
391 { "filter", REQ_ARG, NULL, 'F' },
392 { "worker", REQ_ARG, NULL, 'W' },
393 { "read-filters", REQ_ARG, NULL, OPT_READ_FILTERS },
394 { "read-workers", REQ_ARG, NULL, OPT_READ_WORKERS },
395 { "depth-limit", REQ_ARG, NULL, 'l' },
396 { "follow", NO_ARG, NULL, 'f' },
397 { "ignore-exclusions", NO_ARG, NULL, 'i' },
398 { "stemmer", REQ_ARG, NULL, 's' },
399 { "spelling", NO_ARG, NULL, 'S' },
400 { "verbose", NO_ARG, NULL, 'v' },
401 { "empty-docs", REQ_ARG, NULL, 'e' },
402 { "max-size", REQ_ARG, NULL, 'm' },
403 { "sample", REQ_ARG, NULL, OPT_SAMPLE },
404 { "sample-size", REQ_ARG, NULL, 'E' },
405 { "title-size", REQ_ARG, NULL, 'T' },
406 { "retry-failed", NO_ARG, NULL, 'R' },
407 { "opendir-sleep", REQ_ARG, NULL, OPT_OPENDIR_SLEEP },
408 { "track-ctime", NO_ARG, NULL, 'C' },
409 { "date-terms", NO_ARG, NULL, OPT_DATE_TERMS },
410 { "no-date-terms", NO_ARG, NULL, OPT_NO_DATE_TERMS },
411 { 0, 0, NULL, 0 }
414 map<string, string> mime_map;
416 index_add_default_filters();
417 index_add_default_libraries();
419 if (argc == 2 && strcmp(argv[1], "-v") == 0) {
420 // -v was the short option for --version in 1.2.3 and earlier, but
421 // now it is short for --verbose (for consistency with scriptindex)
422 // so if "-v" is the only option, translate it to "--version" for
423 // backwards compatibility.
424 argv[1] = const_cast<char *>("--version");
427 string dbpath;
428 int getopt_ret;
429 while ((getopt_ret = gnu_getopt_long(argc, argv,
430 "hvd:D:U:M:G:F:W:l:s:pfRSVe:im:E:T:C",
431 longopts, NULL)) != -1) {
432 switch (getopt_ret) {
433 case 'h': {
434 cout << PROG_NAME " - " PROG_DESC "\n\n"
435 "Usage: " PROG_NAME " [OPTIONS] --db DATABASE [BASEDIR] DIRECTORY\n"
436 "\n"
437 "DIRECTORY is the directory to start indexing from.\n"
438 "\n"
439 "BASEDIR is the directory corresponding to URL (default: DIRECTORY).\n"
440 "\n"
441 "Options:\n"
442 " -d, --duplicates=ARG set duplicate handling: ARG can be 'ignore' or\n"
443 " 'replace' (default: replace)\n"
444 " -p, --no-delete skip the deletion of documents corresponding to\n"
445 " deleted files\n"
446 " -e, --empty-docs=ARG how to handle documents we extract no text from:\n"
447 " ARG can be index, warn (issue a diagnostic and\n"
448 " index), or skip. (default: warn)\n"
449 " -D, --db=DATABASE path to database to use\n"
450 " -U, --url=URL base url BASEDIR corresponds to (default: /)\n"
451 " -M, --mime-type=EXT:TYPE assume any file with extension EXT has MIME\n"
452 " Content-Type TYPE, instead of using libmagic\n"
453 " (empty TYPE removes any existing mapping for EXT;\n"
454 " other special TYPE values: 'ignore' and 'skip')\n"
455 " -G, --mime-type-match=GLOB:TYPE\n"
456 " assume any file with leaf name matching shell\n"
457 " wildcard pattern GLOB has MIME Content-Type TYPE\n"
458 " (special TYPE values: 'ignore' and 'skip')\n"
459 " -F, --filter=M[,[T][,C]]:CMD\n"
460 " process files with MIME Content-Type M using\n"
461 " command CMD, which produces output (on stdout or\n"
462 " in a temporary file) with format T (Content-Type\n"
463 " or file extension; currently txt (default), html\n"
464 " or svg) in character encoding C (default: UTF-8).\n"
465 " E.g. -Fapplication/octet-stream:'|strings -n8'\n"
466 " or -Ftext/x-foo,,utf-16:'foo2utf16 %f %t'\n"
467 " -W, --worker=TYPE:WORKER process files with MIME Content-Type TYPE using\n"
468 " worker sub-process WORKER. WORKER is the name of\n"
469 " the program to run to start the worker. If it has\n"
470 " no path then it's looked for in pkglibbindir (which\n"
471 " can be overridden by setting environment variable\n"
472 " XAPIAN_OMEGA_PKGLIBBINDIR). This invocation will\n"
473 " look in: " << get_pkglibbindir() << "\n"
474 " --read-filters=FILE bulk-load --filter arguments from FILE, which\n"
475 " should contain one such argument per line (e.g.\n"
476 " text/x-bar:bar2txt --utf8). Lines starting with #\n"
477 " are treated as comments and ignored.\n"
478 " --read-workers=FILE bulk-load --worker arguments from FILE, which\n"
479 " should contain one such argument per line (e.g.\n"
480 " text/x-bar:omindex_libbar). Lines starting with #\n"
481 " are treated as comments and ignored.\n"
482 " -l, --depth-limit=LIMIT set recursion limit (0 = unlimited)\n"
483 " -f, --follow follow symbolic links\n"
484 " -i, --ignore-exclusions ignore meta robots tags and similar exclusions\n"
485 " -S, --spelling index data for spelling correction\n"
486 " -m, --max-size=N[SUFFIX] maximum size of file to index (in bytes or with a\n"
487 " suffix of 'K'/'k', 'M'/'m', 'G'/'g')\n"
488 " (default: unlimited)\n"
489 " --sample=SOURCE what to use for the stored sample of text for\n"
490 " HTML documents - SOURCE can be 'body' or\n"
491 " 'description' (default: 'body')\n"
492 " -E, --sample-size=SIZE maximum size for the document text sample\n"
493 " (supports the same formats as --max-size).\n"
494 " (default: " STRINGIZE(SAMPLE_SIZE) ")\n"
495 " -T, --title-size=SIZE maximum size for the document title\n"
496 " (supports the same formats as --max-size).\n"
497 " (default: " STRINGIZE(TITLE_SIZE) ")\n"
498 " -R, --retry-failed retry files which omindex failed to extract text\n"
499 " from on a previous run\n"
500 " --opendir-sleep=SECS sleep for SECS seconds before opening each\n"
501 " directory - sleeping for 2 seconds seems to\n"
502 " reliably work around problems with indexing files\n"
503 " on Microsoft DFS shares.\n"
504 " -C, --track-ctime track each file's ctime so we can detect changes\n"
505 " to ownership or permissions.\n"
506 " --date-terms index D, M and Y prefixed terms to support date\n"
507 " range filtering using terms (we now recommend\n"
508 " using a value slot for this instead).\n"
509 " --no-date-terms ignored for compatibility with Omega 1.4.x.\n"
510 " -v, --verbose show more information about what is happening\n"
511 " --overwrite create the database anew (the default is to update\n"
512 " if the database already exists)" << endl;
513 print_stemmer_help(" ");
514 print_help_and_version_help(" ");
515 return 0;
517 case 'V':
518 print_package_info(PROG_NAME);
519 return 0;
520 case 'd': // how shall we handle duplicate documents?
521 switch (optarg[0]) {
522 case 'i':
523 skip_duplicates = true;
524 break;
525 case 'r':
526 skip_duplicates = false;
527 break;
529 break;
530 case 'e':
531 if (strcmp(optarg, "index") == 0) {
532 empty_body = EMPTY_BODY_INDEX;
533 } else if (strcmp(optarg, "warn") == 0) {
534 empty_body = EMPTY_BODY_WARN;
535 } else if (strcmp(optarg, "skip") == 0) {
536 empty_body = EMPTY_BODY_SKIP;
537 } else {
538 cerr << "Invalid --empty-docs value '" << optarg << "'\n"
539 "Valid values are index, warn, and skip." << endl;
540 return 1;
542 break;
543 case 'p': // Keep documents even if the files have been removed.
544 delete_removed_documents = false;
545 break;
546 case 'l': { // Set recursion limit
547 int arg;
548 if (!parse_signed(optarg, arg)) {
549 throw "Recursion limit must be an integer";
551 if (arg < 0) arg = 0;
552 depth_limit = size_t(arg);
553 break;
555 case 'f': // Turn on following of symlinks
556 follow_symlinks = true;
557 break;
558 case 'M': {
559 const char * s = strrchr(optarg, ':');
560 if (s == NULL) {
561 cerr << "Invalid MIME mapping '" << optarg << "'\n"
562 "Should be of the form EXT:TYPE, e.g. txt:text/plain\n"
563 "(or txt: to delete a default mapping)" << endl;
564 return 1;
567 // -Mtxt: results in an empty string, which effectively removes the
568 // default mapping for .txt files.
569 size_t ext_len = s - optarg;
570 mime_map[string(optarg, ext_len)] = string(s + 1);
571 max_ext_len = max(max_ext_len, ext_len);
572 break;
574 case 'F':
575 if (!parse_filter_rule(optarg, mime_map))
576 return 1;
577 break;
578 case 'W':
579 if (!parse_worker_rule(optarg))
580 return 1;
581 break;
582 case OPT_READ_FILTERS: {
583 ifstream stream(optarg);
584 if (!stream) {
585 cerr << "Unable to open filter file '" << optarg << "' "
586 "(" << strerror(errno) << ')' << endl;
587 return 1;
589 string rule;
590 bool all_ok = true;
591 while (getline(stream, rule)) {
592 if (startswith(rule, '#')) continue;
593 if (!parse_filter_rule(rule.c_str(), mime_map))
594 all_ok = false;
596 if (!all_ok)
597 return 1;
598 break;
600 case OPT_READ_WORKERS: {
601 ifstream stream(optarg);
602 if (!stream) {
603 cerr << "Unable to open worker file '" << optarg << "' "
604 "(" << strerror(errno) << ')' << endl;
605 return 1;
607 string rule;
608 bool all_ok = true;
609 while (getline(stream, rule)) {
610 if (startswith(rule, '#')) continue;
611 if (!parse_worker_rule(rule.c_str()))
612 all_ok = false;
614 if (!all_ok)
615 return 1;
616 break;
618 case 'D':
619 dbpath = optarg;
620 break;
621 case 'U':
622 baseurl = optarg;
623 break;
624 case 'o': // --overwrite
625 overwrite = true;
626 break;
627 case 'i':
628 ignore_exclusions = true;
629 break;
630 case 'R': // --retry-failed
631 retry_failed = true;
632 break;
633 case 's':
634 try {
635 stemmer = Xapian::Stem(optarg);
636 } catch (const Xapian::InvalidArgumentError &) {
637 cerr << "Unknown stemming language '" << optarg << "'.\n"
638 "Available language names are: "
639 << Xapian::Stem::get_available_languages() << endl;
640 return 1;
642 break;
643 case 'S':
644 spelling = true;
645 break;
646 case 'v':
647 verbose = true;
648 break;
649 case 'E': {
650 off_t arg = parse_size(optarg);
651 if (arg >= 0) {
652 sample_size = size_t(arg);
653 break;
655 cerr << PROG_NAME": bad sample size '" << optarg << "'" << endl;
656 return 1;
658 case 'T': {
659 off_t arg = parse_size(optarg);
660 if (arg >= 0) {
661 title_size = size_t(arg);
662 break;
664 cerr << PROG_NAME": bad title size '" << optarg << "'" << endl;
665 return 1;
667 case 'm': {
668 off_t size = parse_size(optarg);
669 if (size >= 0) {
670 max_size = size;
671 const char * suffix;
672 // Set lsb to the lowest set bit in max_size.
673 off_t lsb = max_size & -max_size;
674 if (lsb >= off_t(1L << 30)) {
675 size >>= 30;
676 suffix = "GB";
677 } else if (lsb >= off_t(1L << 20)) {
678 size >>= 20;
679 suffix = "MB";
680 } else if (lsb >= off_t(1L << 10)) {
681 size >>= 10;
682 suffix = "KB";
683 } else {
684 suffix = "B";
686 pretty_max_size = str(size);
687 pretty_max_size += suffix;
688 break;
690 cerr << PROG_NAME": bad max size '" << optarg << "'" << endl;
691 return 1;
693 case OPT_OPENDIR_SLEEP: {
694 // Don't want negative numbers, infinity, NaN, or hex numbers.
695 char * p = optarg;
696 if (C_isdigit(p[0]) && (p[1] | 32) != 'x') {
697 sleep_before_opendir = strtod(p, &p);
698 if (*p == '\0')
699 break;
701 cerr << PROG_NAME": bad --opendir-sleep argument: "
702 "'" << optarg << "'" << endl;
703 return 1;
705 case OPT_SAMPLE:
706 if (strcmp(optarg, "description") == 0) {
707 description_as_sample = true;
708 } else if (strcmp(optarg, "body") == 0) {
709 description_as_sample = false;
710 } else {
711 cerr << "Invalid --sample value '" << optarg << "'\n"
712 "Valid values are body and description." << endl;
713 return 1;
715 break;
716 case 'C':
717 use_ctime = true;
718 break;
719 case OPT_DATE_TERMS:
720 date_terms = true;
721 break;
722 case OPT_NO_DATE_TERMS:
723 // Ignored for compatibility with Omega 1.4.0.
724 break;
725 case 'G': {
726 char * s = strrchr(optarg, ':');
727 if (s == NULL) {
728 cerr << "Invalid MIME mapping '" << optarg << "'\n"
729 "Should be of the form GLOB:TYPE, e.g. *~:ignore"
730 << endl;
731 return 1;
733 #ifndef HAVE_FNMATCH
734 cerr << "--mime-type-match isn't supported in this build because "
735 "the fnmatch() function wasn't found at configure time."
736 << endl;
737 return 1;
738 #else
739 if (s == optarg) {
740 cerr << "--mime-type-match with an empty pattern can never "
741 "match." << endl;
742 return 1;
744 if (memchr(optarg, '/', s - optarg)) {
745 cerr << "--mime-type-match only matches against the leaf "
746 "filename so a pattern containing '/' can never match."
747 << endl;
748 return 1;
750 const char* type = s + 1;
751 if (*type == '\0') {
752 cerr << "--mime-type-match doesn't support an empty MIME type"
753 << endl;
754 return 1;
756 *s = '\0';
757 mime_patterns.emplace_back(optarg, type);
758 break;
759 #endif
761 case ':': // missing param
762 return 1;
763 case '?': // unknown option: FIXME -> char
764 return 1;
768 if (dbpath.empty()) {
769 cerr << PROG_NAME": you must specify a database with --db." << endl;
770 return 1;
772 if (baseurl.empty()) {
773 cerr << PROG_NAME": --url not specified, assuming '/'." << endl;
775 // baseurl must end in a '/'.
776 if (!endswith(baseurl, '/')) {
777 baseurl += '/';
780 // Site term (omits the trailing slash):
781 site_term = "J";
782 site_term.append(baseurl, 0, baseurl.size() - 1);
783 if (site_term.size() > MAX_SAFE_TERM_LENGTH)
784 site_term = hash_long_term(site_term, MAX_SAFE_TERM_LENGTH);
786 // Host term, if the URL contains a hostname (omits any port number):
787 string::size_type j;
788 j = find_if(baseurl.begin(), baseurl.end(), p_notalnum) - baseurl.begin();
789 if (j > 0 && baseurl.substr(j, 3) == "://" && j + 3 < baseurl.size()) {
790 j += 3;
791 // We must find a '/' - we ensured baseurl ended with a '/' above.
792 string::size_type k = baseurl.find('/', j);
793 url_start_path.assign(baseurl, k, string::npos);
794 string::const_iterator l;
795 l = find(baseurl.begin() + j, baseurl.begin() + k, ':');
796 string::size_type host_len = l - baseurl.begin() - j;
797 host_term = "H";
798 host_term.append(baseurl, j, host_len);
799 // DNS hostname limit is 253.
800 if (host_term.size() > MAX_SAFE_TERM_LENGTH)
801 host_term = hash_long_term(host_term, MAX_SAFE_TERM_LENGTH);
802 } else {
803 url_start_path = baseurl;
806 if (optind >= argc || optind + 2 < argc) {
807 cerr << PROG_NAME": you must specify a directory to index.\n"
808 "Do this either as a single directory (corresponding to the base URL)\n"
809 "or two directories - the first corresponding to the base URL and the second\n"
810 "a subdirectory of that to index." << endl;
811 return 1;
814 root = argv[optind];
815 if (root.empty()) {
816 cerr << PROG_NAME": start directory can not be empty string." << endl;
817 return 1;
819 if (!endswith(root, '/')) {
820 root += '/';
822 if (optind + 2 == argc) {
823 string start_url = argv[optind + 1];
824 if (startswith(start_url, '/')) {
825 // Make relative to root.
826 if (!startswith(start_url, root)) {
827 cerr << PROG_NAME": '" << argv[optind + 1] << "' "
828 "is not a subdirectory of '" << argv[optind] << "'."
829 << endl;
830 return 1;
832 start_url.erase(0, root.size());
834 if (!endswith(start_url, '/')) {
835 start_url += '/';
837 root += start_url;
838 url_encode_path(baseurl, start_url);
841 int exitcode = 1;
842 try {
843 index_init(dbpath, stemmer, root, site_term, host_term, empty_body,
844 (skip_duplicates ? DUP_SKIP : DUP_CHECK_LAZILY),
845 sample_size, title_size, max_ext_len,
846 overwrite, retry_failed, delete_removed_documents, verbose,
847 use_ctime, spelling, ignore_exclusions,
848 description_as_sample, date_terms);
849 index_directory(root, baseurl, depth_limit, mime_map);
850 index_handle_deletion();
851 index_commit();
852 exitcode = 0;
853 } catch (const CommitAndExit &e) {
854 cout << "Exception: " << e.what() << endl;
855 cout << "Committing pending changes..." << endl;
856 index_commit();
857 } catch (const Xapian::Error &e) {
858 cout << "Exception: " << e.get_description() << endl;
859 } catch (const exception &e) {
860 cout << "Exception: " << e.what() << endl;
861 } catch (const string &s) {
862 cout << "Exception: " << s << endl;
863 } catch (const char *s) {
864 cout << "Exception: " << s << endl;
865 } catch (...) {
866 cout << "Caught unknown exception" << endl;
869 index_done();
871 return exitcode;