2 * @brief index static documents into the omega db
4 /* Copyright 1999,2000,2001 BrightStation PLC
5 * Copyright 2001,2005 James Aylett
6 * Copyright 2001,2002 Ananova Ltd
7 * Copyright 2002-2022 Olly Betts
8 * Copyright 2009 Frank J Bruzzaniti
9 * Copyright 2012 Mihai Bivol
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public License as
13 * published by the Free Software Foundation; either version 2 of the
14 * License, or (at your option) any later version.
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
35 #include <sys/types.h>
36 #include "safeunistd.h"
41 #include "safefcntl.h"
49 #include "commonhelp.h"
52 #include "index_file.h"
55 #include "pkglibbindir.h"
58 #include "stringutils.h"
59 #include "urlencode.h"
62 #include "gnu_getopt.h"
66 #define PROG_NAME "omindex"
67 #define PROG_DESC "Index static website data via the filesystem"
69 #define TITLE_SIZE 128
70 #define SAMPLE_SIZE 512
72 static bool follow_symlinks
= false;
73 static off_t max_size
= 0;
74 static std::string pretty_max_size
;
75 static bool verbose
= false;
76 static double sleep_before_opendir
= 0;
77 static bool date_terms
= false;
80 static string url_start_path
;
83 static vector
<pair
<const char*, const char*>> mime_patterns
;
87 p_notalnum(unsigned int c
)
93 index_file(const string
&file
, const string
&url
, DirectoryIterator
& d
,
94 map
<string
, string
>& mime_map
)
99 if (urlterm
.length() > MAX_SAFE_TERM_LENGTH
)
100 urlterm
= hash_long_term(urlterm
, MAX_SAFE_TERM_LENGTH
);
102 const char* leafname
= d
.leafname();
106 for (auto&& i
: mime_patterns
) {
107 if (fnmatch(i
.first
, leafname
, 0) == 0) {
108 if (strcmp(i
.second
, "ignore") == 0)
110 if (strcmp(i
.second
, "skip") == 0) {
111 string m
= "Leafname '";
113 m
+= "' matches pattern: ";
115 skip(urlterm
, file
.substr(root
.size()), m
,
116 d
.get_size(), d
.get_mtime());
126 const char * dot_ptr
= strrchr(leafname
, '.');
128 ext
.assign(dot_ptr
+ 1);
129 if (ext
.size() > max_ext_len
)
133 if (mimetype
.empty()) {
134 mimetype
= mimetype_from_ext(mime_map
, ext
);
135 if (mimetype
== "ignore") {
136 // Remove any existing failed entry for this file.
137 index_remove_failed_entry(urlterm
);
139 } else if (mimetype
== "skip") {
140 // "skip" pseudo-type: unlike "ignore", the file is reported as skipped rather than quietly ignored.
141 string m
= "skipping extension '";
144 skip(urlterm
, file
.substr(root
.size()), m
,
145 d
.get_size(), d
.get_mtime());
150 // Check the file size.
151 off_t size
= d
.get_size();
153 skip(urlterm
, file
.substr(root
.size()), "Zero-sized file",
154 size
, d
.get_mtime(), SKIP_VERBOSE_ONLY
);
158 if (max_size
> 0 && size
> max_size
) {
159 skip(urlterm
, file
.substr(root
.size()),
160 "Larger than size limit of " + pretty_max_size
,
166 string
path_term("P");
167 path_term
+= url_start_path
;
168 path_term
.append(file
, root
.size(), string::npos
);
170 index_mimetype(file
, urlterm
, url
, ext
, mimetype
, d
, path_term
, string());
174 index_directory(const string
&path
, const string
&url_
, size_t depth_limit
,
175 map
<string
, string
>& mime_map
)
178 cout
<< "[Entering directory \"" << path
.substr(root
.size()) << "\"]"
181 DirectoryIterator
d(follow_symlinks
);
183 // Crude workaround for MS-DFS share misbehaviour.
184 if (sleep_before_opendir
> 0.0)
185 RealTime::sleep(RealTime::now() + sleep_before_opendir
);
191 url_encode(url
, d
.leafname());
193 file
+= d
.leafname();
196 switch (d
.get_type()) {
197 case DirectoryIterator::DIRECTORY
: {
198 size_t new_limit
= depth_limit
;
200 if (--new_limit
== 0) continue;
204 index_directory(file
, url
, new_limit
, mime_map
);
207 case DirectoryIterator::REGULAR_FILE
:
208 index_file(file
, url
, d
, mime_map
);
211 skip("U" + url
, file
.substr(root
.size()), "Not a regular file",
212 d
.get_size(), d
.get_mtime(),
213 SKIP_VERBOSE_ONLY
| SKIP_SHOW_FILENAME
);
215 } catch (const FileNotFound
& e
) {
216 skip("U" + url
, file
.substr(root
.size()), "File removed during indexing",
217 d
.get_size(), d
.get_mtime(),
218 /*SKIP_VERBOSE_ONLY |*/ SKIP_SHOW_FILENAME
);
219 } catch (const std::string
& error
) {
220 skip("U" + url
, file
.substr(root
.size()), error
,
221 d
.get_size(), d
.get_mtime(), SKIP_SHOW_FILENAME
);
224 } catch (const FileNotFound
&) {
226 cout
<< "Directory \"" << path
.substr(root
.size()) << "\" "
227 "deleted during indexing" << endl
;
228 } catch (const std::string
& error
) {
229 cout
<< error
<< " - skipping directory "
230 "\"" << path
.substr(root
.size()) << "\"" << endl
;
237 // Don't want negative numbers, infinity, NaN, or hex numbers.
238 if (C_isdigit(p
[0]) && (p
[1] | 32) != 'x') {
239 double arg
= strtod(p
, &p
);
248 arg
*= (1024 * 1024);
252 arg
*= (1024 * 1024 * 1024);
264 parse_filter_rule(const char* rule
, map
<string
, string
>& mime_map
)
266 const char* s
= strchr(rule
, ':');
267 if (s
== NULL
|| s
[1] == '\0') {
268 cerr
<< "Invalid filter mapping '" << rule
<< "'\n"
269 "Should be of the form TYPE:COMMAND or TYPE1,TYPE2:COMMAND or "
271 "e.g. 'application/octet-stream:strings -n8'"
276 const char* c
= static_cast<const char*>(memchr(rule
, ',', s
- rule
));
277 string output_type
, output_charset
;
279 // Filter produces a specified content-type.
281 const char* c2
= static_cast<const char *>(memchr(c
, ',', s
- c
));
283 output_type
.assign(c
, c2
- c
);
285 output_charset
.assign(c2
, s
- c2
);
287 output_type
.assign(c
, s
- c
);
290 if (output_type
.find('/') == string::npos
) {
291 auto m
= mime_map
.find(output_type
);
292 if (m
!= mime_map
.end()) {
293 output_type
= m
->second
;
295 const char* r
= built_in_mime_map(output_type
);
296 if (r
) output_type
= r
;
299 if (output_type
!= "text/html" &&
300 output_type
!= "text/plain" &&
301 output_type
!= "image/svg+xml") {
302 cerr
<< "Currently only output types 'image/svg+xml', "
303 "'text/html' and 'text/plain' are supported."
311 const char* cmd
= s
+ 1;
314 flags
|= Filter::PIPE_IN
;
316 // FIXME: Do we need a way to set PIPE_DEV_STDIN and SEEK_DEV_STDIN?
318 // PIPE_DEV_STDIN doesn't seem to offer much over |foo2txt /dev/stdin
319 // for user-specified filters (internally it provides a way to
320 // gracefully handle platforms without /dev/stdin).
322 // SEEK_DEV_STDIN isn't currently easily approximated though.
324 // Analyse the command string to decide if it needs a shell.
325 if (command_needs_shell(cmd
))
326 flags
|= Filter::USE_SHELL
;
327 index_command(string(rule
, c
- rule
),
328 Filter(string(cmd
), output_type
, output_charset
, flags
));
334 parse_worker_rule(const char* rule
)
336 const char* s
= strchr(rule
, ':');
337 if (s
== NULL
|| s
[1] == '\0') {
338 cerr
<< "Invalid worker mapping '" << rule
<< "'\n"
339 "Should be of the form TYPE:WORKER\n"
340 "e.g. 'application/msword:omindex_libreofficekit\n";
344 index_library(string(rule
, s
- rule
), new Worker(string(s
+ 1)));
349 main(int argc
, char **argv
)
351 // If overwrite is true, the database will be created anew even if it
353 bool overwrite
= false;
354 // If delete_removed_documents is true, delete any documents we don't see.
355 bool delete_removed_documents
= true;
356 // Retry files which we failed to index on a previous run?
357 bool retry_failed
= false;
358 bool use_ctime
= false;
359 bool spelling
= false;
360 bool skip_duplicates
= false;
361 bool ignore_exclusions
= false;
362 bool description_as_sample
= false;
364 size_t depth_limit
= 0;
365 size_t title_size
= TITLE_SIZE
;
366 size_t sample_size
= SAMPLE_SIZE
;
367 empty_body_type empty_body
= EMPTY_BODY_WARN
;
368 string site_term
, host_term
;
369 Xapian::Stem
stemmer("english");
372 OPT_OPENDIR_SLEEP
= 256,
379 constexpr auto NO_ARG
= no_argument
;
380 constexpr auto REQ_ARG
= required_argument
;
381 static const struct option longopts
[] = {
382 { "help", NO_ARG
, NULL
, 'h' },
383 { "version", NO_ARG
, NULL
, 'V' },
384 { "overwrite", NO_ARG
, NULL
, 'o' },
385 { "duplicates", REQ_ARG
, NULL
, 'd' },
386 { "no-delete", NO_ARG
, NULL
, 'p' },
387 { "db", REQ_ARG
, NULL
, 'D' },
388 { "url", REQ_ARG
, NULL
, 'U' },
389 { "mime-type", REQ_ARG
, NULL
, 'M' },
390 { "mime-type-match", REQ_ARG
, NULL
, 'G' },
391 { "filter", REQ_ARG
, NULL
, 'F' },
392 { "worker", REQ_ARG
, NULL
, 'W' },
393 { "read-filters", REQ_ARG
, NULL
, OPT_READ_FILTERS
},
394 { "read-workers", REQ_ARG
, NULL
, OPT_READ_WORKERS
},
395 { "depth-limit", REQ_ARG
, NULL
, 'l' },
396 { "follow", NO_ARG
, NULL
, 'f' },
397 { "ignore-exclusions", NO_ARG
, NULL
, 'i' },
398 { "stemmer", REQ_ARG
, NULL
, 's' },
399 { "spelling", NO_ARG
, NULL
, 'S' },
400 { "verbose", NO_ARG
, NULL
, 'v' },
401 { "empty-docs", REQ_ARG
, NULL
, 'e' },
402 { "max-size", REQ_ARG
, NULL
, 'm' },
403 { "sample", REQ_ARG
, NULL
, OPT_SAMPLE
},
404 { "sample-size", REQ_ARG
, NULL
, 'E' },
405 { "title-size", REQ_ARG
, NULL
, 'T' },
406 { "retry-failed", NO_ARG
, NULL
, 'R' },
407 { "opendir-sleep", REQ_ARG
, NULL
, OPT_OPENDIR_SLEEP
},
408 { "track-ctime", NO_ARG
, NULL
, 'C' },
409 { "date-terms", NO_ARG
, NULL
, OPT_DATE_TERMS
},
410 { "no-date-terms", NO_ARG
, NULL
, OPT_NO_DATE_TERMS
},
414 map
<string
, string
> mime_map
;
416 index_add_default_filters();
417 index_add_default_libraries();
419 if (argc
== 2 && strcmp(argv
[1], "-v") == 0) {
420 // -v was the short option for --version in 1.2.3 and earlier, but
421 // now it is short for --verbose (for consistency with scriptindex)
422 // so if "-v" is the only option, translate it to "--version" for
423 // backwards compatibility.
424 argv
[1] = const_cast<char *>("--version");
429 while ((getopt_ret
= gnu_getopt_long(argc
, argv
,
430 "hvd:D:U:M:G:F:W:l:s:pfRSVe:im:E:T:C",
431 longopts
, NULL
)) != -1) {
432 switch (getopt_ret
) {
434 cout
<< PROG_NAME
" - " PROG_DESC
"\n\n"
435 "Usage: " PROG_NAME
" [OPTIONS] --db DATABASE [BASEDIR] DIRECTORY\n"
437 "DIRECTORY is the directory to start indexing from.\n"
439 "BASEDIR is the directory corresponding to URL (default: DIRECTORY).\n"
442 " -d, --duplicates=ARG set duplicate handling: ARG can be 'ignore' or\n"
443 " 'replace' (default: replace)\n"
444 " -p, --no-delete skip the deletion of documents corresponding to\n"
446 " -e, --empty-docs=ARG how to handle documents we extract no text from:\n"
447 " ARG can be index, warn (issue a diagnostic and\n"
448 " index), or skip. (default: warn)\n"
449 " -D, --db=DATABASE path to database to use\n"
450 " -U, --url=URL base url BASEDIR corresponds to (default: /)\n"
451 " -M, --mime-type=EXT:TYPE assume any file with extension EXT has MIME\n"
452 " Content-Type TYPE, instead of using libmagic\n"
453 " (empty TYPE removes any existing mapping for EXT;\n"
454 " other special TYPE values: 'ignore' and 'skip')\n"
455 " -G, --mime-type-match=GLOB:TYPE\n"
456 " assume any file with leaf name matching shell\n"
457 " wildcard pattern GLOB has MIME Content-Type TYPE\n"
458 " (special TYPE values: 'ignore' and 'skip')\n"
459 " -F, --filter=M[,[T][,C]]:CMD\n"
460 " process files with MIME Content-Type M using\n"
461 " command CMD, which produces output (on stdout or\n"
462 " in a temporary file) with format T (Content-Type\n"
463 " or file extension; currently txt (default), html\n"
464 " or svg) in character encoding C (default: UTF-8).\n"
465 " E.g. -Fapplication/octet-stream:'|strings -n8'\n"
466 " or -Ftext/x-foo,,utf-16:'foo2utf16 %f %t'\n"
467 " -W, --worker=TYPE:WORKER process files with MIME Content-Type TYPE using\n"
468 " worker sub-process WORKER. WORKER is the name of\n"
469 " the program to run to start the worker. If it has\n"
470 " no path then it's looked for in pkglibbindir (which\n"
471 " can be overridden by setting environment variable\n"
472 " XAPIAN_OMEGA_PKGLIBBINDIR). This invocation will\n"
473 " look in: " << get_pkglibbindir() << "\n"
474 " --read-filters=FILE bulk-load --filter arguments from FILE, which\n"
475 " should contain one such argument per line (e.g.\n"
476 " text/x-bar:bar2txt --utf8). Lines starting with #\n"
477 " are treated as comments and ignored.\n"
478 " --read-workers=FILE bulk-load --worker arguments from FILE, which\n"
479 " should contain one such argument per line (e.g.\n"
480 " text/x-bar:omindex_libbar). Lines starting with #\n"
481 " are treated as comments and ignored.\n"
482 " -l, --depth-limit=LIMIT set recursion limit (0 = unlimited)\n"
483 " -f, --follow follow symbolic links\n"
484 " -i, --ignore-exclusions ignore meta robots tags and similar exclusions\n"
485 " -S, --spelling index data for spelling correction\n"
486 " -m, --max-size=N[SUFFIX] maximum size of file to index (in bytes or with a\n"
487 " suffix of 'K'/'k', 'M'/'m', 'G'/'g')\n"
488 " (default: unlimited)\n"
489 " --sample=SOURCE what to use for the stored sample of text for\n"
490 " HTML documents - SOURCE can be 'body' or\n"
491 " 'description' (default: 'body')\n"
492 " -E, --sample-size=SIZE maximum size for the document text sample\n"
493 " (supports the same formats as --max-size).\n"
494 " (default: " STRINGIZE(SAMPLE_SIZE
) ")\n"
495 " -T, --title-size=SIZE maximum size for the document title\n"
496 " (supports the same formats as --max-size).\n"
497 " (default: " STRINGIZE(TITLE_SIZE
) ")\n"
498 " -R, --retry-failed retry files which omindex failed to extract text\n"
499 " from on a previous run\n"
500 " --opendir-sleep=SECS sleep for SECS seconds before opening each\n"
501 " directory - sleeping for 2 seconds seems to\n"
502 " reliably work around problems with indexing files\n"
503 " on Microsoft DFS shares.\n"
504 " -C, --track-ctime track each file's ctime so we can detect changes\n"
505 " to ownership or permissions.\n"
506 " --date-terms index D, M and Y prefixed terms to support date\n"
507 " range filtering using terms (we now recommend\n"
508 " using a value slot for this instead).\n"
509 " --no-date-terms ignored for compatibility with Omega 1.4.x.\n"
510 " -v, --verbose show more information about what is happening\n"
511 " --overwrite create the database anew (the default is to update\n"
512 " if the database already exists)" << endl
;
513 print_stemmer_help(" ");
514 print_help_and_version_help(" ");
518 print_package_info(PROG_NAME
);
520 case 'd': // how shall we handle duplicate documents?
523 skip_duplicates
= true;
526 skip_duplicates
= false;
531 if (strcmp(optarg
, "index") == 0) {
532 empty_body
= EMPTY_BODY_INDEX
;
533 } else if (strcmp(optarg
, "warn") == 0) {
534 empty_body
= EMPTY_BODY_WARN
;
535 } else if (strcmp(optarg
, "skip") == 0) {
536 empty_body
= EMPTY_BODY_SKIP
;
538 cerr
<< "Invalid --empty-docs value '" << optarg
<< "'\n"
539 "Valid values are index, warn, and skip." << endl
;
543 case 'p': // Keep documents even if the files have been removed.
544 delete_removed_documents
= false;
546 case 'l': { // Set recursion limit
548 if (!parse_signed(optarg
, arg
)) {
549 throw "Recursion limit must be an integer";
551 if (arg
< 0) arg
= 0;
552 depth_limit
= size_t(arg
);
555 case 'f': // Turn on following of symlinks
556 follow_symlinks
= true;
559 const char * s
= strrchr(optarg
, ':');
561 cerr
<< "Invalid MIME mapping '" << optarg
<< "'\n"
562 "Should be of the form EXT:TYPE, e.g. txt:text/plain\n"
563 "(or txt: to delete a default mapping)" << endl
;
567 // -Mtxt: results in an empty string, which effectively removes the
568 // default mapping for .txt files.
569 size_t ext_len
= s
- optarg
;
570 mime_map
[string(optarg
, ext_len
)] = string(s
+ 1);
571 max_ext_len
= max(max_ext_len
, ext_len
);
575 if (!parse_filter_rule(optarg
, mime_map
))
579 if (!parse_worker_rule(optarg
))
582 case OPT_READ_FILTERS
: {
583 ifstream
stream(optarg
);
585 cerr
<< "Unable to open filter file '" << optarg
<< "' "
586 "(" << strerror(errno
) << ')' << endl
;
591 while (getline(stream
, rule
)) {
592 if (startswith(rule
, '#')) continue;
593 if (!parse_filter_rule(rule
.c_str(), mime_map
))
600 case OPT_READ_WORKERS
: {
601 ifstream
stream(optarg
);
603 cerr
<< "Unable to open worker file '" << optarg
<< "' "
604 "(" << strerror(errno
) << ')' << endl
;
609 while (getline(stream
, rule
)) {
610 if (startswith(rule
, '#')) continue;
611 if (!parse_worker_rule(rule
.c_str()))
624 case 'o': // --overwrite
628 ignore_exclusions
= true;
630 case 'R': // --retry-failed
635 stemmer
= Xapian::Stem(optarg
);
636 } catch (const Xapian::InvalidArgumentError
&) {
637 cerr
<< "Unknown stemming language '" << optarg
<< "'.\n"
638 "Available language names are: "
639 << Xapian::Stem::get_available_languages() << endl
;
650 off_t arg
= parse_size(optarg
);
652 sample_size
= size_t(arg
);
655 cerr
<< PROG_NAME
": bad sample size '" << optarg
<< "'" << endl
;
659 off_t arg
= parse_size(optarg
);
661 title_size
= size_t(arg
);
664 cerr
<< PROG_NAME
": bad title size '" << optarg
<< "'" << endl
;
668 off_t size
= parse_size(optarg
);
672 // Set lsb to the lowest set bit in max_size.
673 off_t lsb
= max_size
& -max_size
;
674 if (lsb
>= off_t(1L << 30)) {
677 } else if (lsb
>= off_t(1L << 20)) {
680 } else if (lsb
>= off_t(1L << 10)) {
686 pretty_max_size
= str(size
);
687 pretty_max_size
+= suffix
;
690 cerr
<< PROG_NAME
": bad max size '" << optarg
<< "'" << endl
;
693 case OPT_OPENDIR_SLEEP
: {
694 // Don't want negative numbers, infinity, NaN, or hex numbers.
696 if (C_isdigit(p
[0]) && (p
[1] | 32) != 'x') {
697 sleep_before_opendir
= strtod(p
, &p
);
701 cerr
<< PROG_NAME
": bad --opendir-sleep argument: "
702 "'" << optarg
<< "'" << endl
;
706 if (strcmp(optarg
, "description") == 0) {
707 description_as_sample
= true;
708 } else if (strcmp(optarg
, "body") == 0) {
709 description_as_sample
= false;
711 cerr
<< "Invalid --sample value '" << optarg
<< "'\n"
712 "Valid values are body and description." << endl
;
722 case OPT_NO_DATE_TERMS
:
723 // Ignored for compatibility with Omega 1.4.x.
726 char * s
= strrchr(optarg
, ':');
728 cerr
<< "Invalid MIME mapping '" << optarg
<< "'\n"
729 "Should be of the form GLOB:TYPE, e.g. *~:ignore"
734 cerr
<< "--mime-type-match isn't supported in this build because "
735 "the fnmatch() function wasn't found at configure time."
740 cerr
<< "--mime-type-match with an empty pattern can never "
744 if (memchr(optarg
, '/', s
- optarg
)) {
745 cerr
<< "--mime-type-match only matches against the leaf "
746 "filename so a pattern containing '/' can never match."
750 const char* type
= s
+ 1;
752 cerr
<< "--mime-type-match doesn't support an empty MIME type"
757 mime_patterns
.emplace_back(optarg
, type
);
761 case ':': // missing param
763 case '?': // unknown option: FIXME -> char
768 if (dbpath
.empty()) {
769 cerr
<< PROG_NAME
": you must specify a database with --db." << endl
;
772 if (baseurl
.empty()) {
773 cerr
<< PROG_NAME
": --url not specified, assuming '/'." << endl
;
775 // baseurl must end in a '/'.
776 if (!endswith(baseurl
, '/')) {
780 // Site term (omits the trailing slash):
782 site_term
.append(baseurl
, 0, baseurl
.size() - 1);
783 if (site_term
.size() > MAX_SAFE_TERM_LENGTH
)
784 site_term
= hash_long_term(site_term
, MAX_SAFE_TERM_LENGTH
);
786 // Host term, if the URL contains a hostname (omits any port number):
788 j
= find_if(baseurl
.begin(), baseurl
.end(), p_notalnum
) - baseurl
.begin();
789 if (j
> 0 && baseurl
.substr(j
, 3) == "://" && j
+ 3 < baseurl
.size()) {
791 // We must find a '/' - we ensured baseurl ended with a '/' above.
792 string::size_type k
= baseurl
.find('/', j
);
793 url_start_path
.assign(baseurl
, k
, string::npos
);
794 string::const_iterator l
;
795 l
= find(baseurl
.begin() + j
, baseurl
.begin() + k
, ':');
796 string::size_type host_len
= l
- baseurl
.begin() - j
;
798 host_term
.append(baseurl
, j
, host_len
);
799 // DNS hostname limit is 253.
800 if (host_term
.size() > MAX_SAFE_TERM_LENGTH
)
801 host_term
= hash_long_term(host_term
, MAX_SAFE_TERM_LENGTH
);
803 url_start_path
= baseurl
;
806 if (optind
>= argc
|| optind
+ 2 < argc
) {
807 cerr
<< PROG_NAME
": you must specify a directory to index.\n"
808 "Do this either as a single directory (corresponding to the base URL)\n"
809 "or two directories - the first corresponding to the base URL and the second\n"
810 "a subdirectory of that to index." << endl
;
816 cerr
<< PROG_NAME
": start directory can not be empty string." << endl
;
819 if (!endswith(root
, '/')) {
822 if (optind
+ 2 == argc
) {
823 string start_url
= argv
[optind
+ 1];
824 if (startswith(start_url
, '/')) {
825 // Make relative to root.
826 if (!startswith(start_url
, root
)) {
827 cerr
<< PROG_NAME
": '" << argv
[optind
+ 1] << "' "
828 "is not a subdirectory of '" << argv
[optind
] << "'."
832 start_url
.erase(0, root
.size());
834 if (!endswith(start_url
, '/')) {
838 url_encode_path(baseurl
, start_url
);
843 index_init(dbpath
, stemmer
, root
, site_term
, host_term
, empty_body
,
844 (skip_duplicates
? DUP_SKIP
: DUP_CHECK_LAZILY
),
845 sample_size
, title_size
, max_ext_len
,
846 overwrite
, retry_failed
, delete_removed_documents
, verbose
,
847 use_ctime
, spelling
, ignore_exclusions
,
848 description_as_sample
, date_terms
);
849 index_directory(root
, baseurl
, depth_limit
, mime_map
);
850 index_handle_deletion();
853 } catch (const CommitAndExit
&e
) {
854 cout
<< "Exception: " << e
.what() << endl
;
855 cout
<< "Committing pending changes..." << endl
;
857 } catch (const Xapian::Error
&e
) {
858 cout
<< "Exception: " << e
.get_description() << endl
;
859 } catch (const exception
&e
) {
860 cout
<< "Exception: " << e
.what() << endl
;
861 } catch (const string
&s
) {
862 cout
<< "Exception: " << s
<< endl
;
863 } catch (const char *s
) {
864 cout
<< "Exception: " << s
<< endl
;
866 cout
<< "Caught unknown exception" << endl
;