Process an incomplete final line from a dump file
[xapian.git] / xapian-applications / omega / omindex.cc
blob46b459a0b1c276455f4a537cbefb70cfdb274ad6
1 /** @file
2 * @brief index static documents into the omega db
3 */
4 /* Copyright 1999,2000,2001 BrightStation PLC
5 * Copyright 2001,2005 James Aylett
6 * Copyright 2001,2002 Ananova Ltd
7 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2017,2018 Olly Betts
8 * Copyright 2009 Frank J Bruzzaniti
9 * Copyright 2012 Mihai Bivol
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public License as
13 * published by the Free Software Foundation; either version 2 of the
14 * License, or (at your option) any later version.
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
24 * USA
27 #include <config.h>
29 #include <algorithm>
30 #include <fstream>
31 #include <iostream>
32 #include <string>
33 #include <map>
35 #include <sys/types.h>
36 #include "safeunistd.h"
37 #include <cerrno>
38 #include <cstdio>
39 #include <cstdlib>
40 #include <cstring>
41 #include "safefcntl.h"
43 #ifdef HAVE_FNMATCH
44 # include <fnmatch.h>
45 #endif
47 #include <xapian.h>
49 #include "commonhelp.h"
50 #include "diritor.h"
51 #include "hashterm.h"
52 #include "index_file.h"
53 #include "mime.h"
54 #include "realtime.h"
55 #include "str.h"
56 #include "stringutils.h"
57 #include "urlencode.h"
59 #include "gnu_getopt.h"
61 using namespace std;
// Program name and one-line description; both appear in the --help output and
// PROG_NAME also prefixes diagnostics written to stderr by main().
63 #define PROG_NAME "omindex"
64 #define PROG_DESC "Index static website data via the filesystem"
// Defaults main() uses for title_size and sample_size (overridden by -T / -E).
66 #define TITLE_SIZE 128
67 #define SAMPLE_SIZE 512
// Passed to the DirectoryIterator constructor; set to true by -f/--follow.
69 static bool follow_symlinks = false;
// Size limit from -m/--max-size; index_file() skips files larger than this
// when it is greater than zero (0 means no limit is applied).
70 static off_t max_size = 0;
// Human-readable form of max_size (e.g. with a KB/MB/GB suffix), built by
// main() and used in the "Larger than size limit of" skip message.
71 static std::string pretty_max_size;
// Set by -v/--verbose; enables the progress messages written to stdout.
72 static bool verbose = false;
// Value of --opendir-sleep; when > 0, index_directory() sleeps this long
// before starting to read each directory.
73 static double sleep_before_opendir = 0;
// Directory indexing starts from, always ending in '/'.  Its length is used
// to strip the leading part of file paths in messages and path terms.
75 static string root;
// Path component of the base URL (the whole base URL if it has no
// "scheme://host" part); prefixed to the relative path to form "P" terms.
76 static string url_start_path;
78 #ifdef HAVE_FNMATCH
// (GLOB, TYPE) pairs from -G/--mime-type-match, matched against each leaf
// name by index_file().  The pointers refer into the argv strings.
79 static vector<pair<const char*, const char*>> mime_patterns;
80 #endif
82 static inline bool
83 p_notalnum(unsigned int c)
85 return !C_isalnum(c);
88 static void
89 index_file(const string &file, const string &url, DirectoryIterator & d,
90 map<string, string>& mime_map)
92 string urlterm("U");
93 urlterm += url;
95 if (urlterm.length() > MAX_SAFE_TERM_LENGTH)
96 urlterm = hash_long_term(urlterm, MAX_SAFE_TERM_LENGTH);
98 const char* leafname = d.leafname();
100 string mimetype;
101 #ifdef HAVE_FNMATCH
102 for (auto&& i : mime_patterns) {
103 if (fnmatch(i.first, leafname, 0) == 0) {
104 if (strcmp(i.second, "ignore") == 0)
105 return;
106 if (strcmp(i.second, "skip") == 0) {
107 string m = "Leafname '";
108 m += leafname;
109 m += "' matches pattern: ";
110 m += i.first;
111 skip(urlterm, file.substr(root.size()), m,
112 d.get_size(), d.get_mtime());
113 return;
115 mimetype = i.second;
116 break;
119 #endif
121 string ext;
122 const char * dot_ptr = strrchr(leafname, '.');
123 if (dot_ptr) {
124 ext.assign(dot_ptr + 1);
125 if (ext.size() > max_ext_len)
126 ext.resize(0);
129 if (mimetype.empty()) {
130 mimetype = mimetype_from_ext(mime_map, ext);
131 if (mimetype == "ignore") {
132 // Remove any existing failed entry for this file.
133 index_remove_failed_entry(urlterm);
134 return;
135 } else if (mimetype == "skip") {
136 // Ignore mimetype, skipped mimetype should not be quietly ignored.
137 string m = "skipping extension '";
138 m += ext;
139 m += "'";
140 skip(urlterm, file.substr(root.size()), m,
141 d.get_size(), d.get_mtime());
142 return;
146 // Check the file size.
147 off_t size = d.get_size();
148 if (size == 0) {
149 skip(urlterm, file.substr(root.size()), "Zero-sized file",
150 size, d.get_mtime(), SKIP_VERBOSE_ONLY);
151 return;
154 if (max_size > 0 && size > max_size) {
155 skip(urlterm, file.substr(root.size()),
156 "Larger than size limit of " + pretty_max_size,
157 size, d.get_mtime(),
158 SKIP_VERBOSE_ONLY);
159 return;
162 // If we didn't get the mime type from the extension, call libmagic to get
163 // it.
164 if (mimetype.empty()) {
165 mimetype = d.get_magic_mimetype();
166 if (mimetype.empty()) {
167 skip(urlterm, file.substr(root.size()), "Unknown extension and unrecognised format",
168 d.get_size(), d.get_mtime(), SKIP_SHOW_FILENAME);
169 return;
173 if (verbose)
174 cout << "Indexing \"" << file.substr(root.size()) << "\" as "
175 << mimetype << " ... ";
177 Xapian::Document new_doc;
179 // Use `file` as the basis, as we don't want URL encoding in these terms,
180 // but need to switch over the initial part so we get `/~olly/foo/bar` not
181 // `/home/olly/public_html/foo/bar`.
182 string path_term("P");
183 path_term += url_start_path;
184 path_term.append(file, root.size(), string::npos);
186 size_t i;
187 while ((i = path_term.rfind('/')) > 1 && i != string::npos) {
188 path_term.resize(i);
189 if (path_term.length() > MAX_SAFE_TERM_LENGTH) {
190 new_doc.add_boolean_term(hash_long_term(path_term, MAX_SAFE_TERM_LENGTH));
191 } else {
192 new_doc.add_boolean_term(path_term);
196 index_mimetype(file, urlterm, url, ext, mimetype, d, new_doc, string());
199 static void
200 index_directory(const string &path, const string &url_, size_t depth_limit,
201 map<string, string>& mime_map)
203 if (verbose)
204 cout << "[Entering directory \"" << path.substr(root.size()) << "\"]"
205 << endl;
207 DirectoryIterator d(follow_symlinks);
208 try {
209 // Crude workaround for MS-DFS share misbehaviour.
210 if (sleep_before_opendir > 0.0)
211 RealTime::sleep(RealTime::now() + sleep_before_opendir);
213 d.start(path);
215 while (d.next()) {
216 string url = url_;
217 url_encode(url, d.leafname());
218 string file = path;
219 file += d.leafname();
221 try {
222 switch (d.get_type()) {
223 case DirectoryIterator::DIRECTORY: {
224 size_t new_limit = depth_limit;
225 if (new_limit) {
226 if (--new_limit == 0) continue;
228 url += '/';
229 file += '/';
230 index_directory(file, url, new_limit, mime_map);
231 break;
233 case DirectoryIterator::REGULAR_FILE:
234 index_file(file, url, d, mime_map);
235 break;
236 default:
237 skip("U" + url, file.substr(root.size()), "Not a regular file",
238 d.get_size(), d.get_mtime(),
239 SKIP_VERBOSE_ONLY | SKIP_SHOW_FILENAME);
241 } catch (const FileNotFound & e) {
242 skip("U" + url, file.substr(root.size()), "File removed during indexing",
243 d.get_size(), d.get_mtime(),
244 /*SKIP_VERBOSE_ONLY |*/ SKIP_SHOW_FILENAME);
245 } catch (const std::string & error) {
246 skip("U" + url, file.substr(root.size()), error,
247 d.get_size(), d.get_mtime(), SKIP_SHOW_FILENAME);
250 } catch (const FileNotFound&) {
251 if (verbose)
252 cout << "Directory \"" << path.substr(root.size()) << "\" "
253 "deleted during indexing" << endl;
254 } catch (const std::string & error) {
255 cout << error << " - skipping directory "
256 "\"" << path.substr(root.size()) << "\"" << endl;
/** Parse a size given as a number with an optional K, M or G suffix.
 *
 *  The number is parsed with strtod(), so fractional values such as "1.5k"
 *  are accepted.  The suffix is case-insensitive and multiplies by 1024,
 *  1024^2 or 1024^3 respectively.
 *
 *  @param p  NUL-terminated string to parse (not modified).
 *
 *  @return  The size in bytes (truncated towards zero), or -1 if @a p isn't
 *           a valid size or the value is too large to represent as an off_t.
 */
static off_t
parse_size(char* p)
{
    // Don't want negative numbers, infinity, NaN, or hex numbers, so insist
    // on a leading decimal digit which isn't the start of a "0x" prefix.
    const bool starts_with_digit = (p[0] >= '0' && p[0] <= '9');
    if (!starts_with_digit || (p[1] | 32) == 'x')
        return -1;

    double arg = strtod(p, &p);
    switch (*p) {
        case '\0':
            break;
        case 'k': case 'K':
            arg *= 1024;
            ++p;
            break;
        case 'm': case 'M':
            arg *= (1024 * 1024);
            ++p;
            break;
        case 'g': case 'G':
            arg *= (1024 * 1024 * 1024);
            ++p;
            break;
    }

    // Anything left over (e.g. "10kb" or an unknown suffix) is an error.
    if (*p != '\0')
        return -1;

    // Converting a double which doesn't fit to an integer type is undefined
    // behaviour, so reject values of 2^(bits-1) or more.  That power of two
    // is exactly representable as a double.  This also rejects the infinity
    // strtod() returns on overflow (e.g. for "1e999").
    const double limit =
        static_cast<double>(off_t(1) << (sizeof(off_t) * 8 - 2)) * 2.0;
    if (!(arg < limit))
        return -1;

    return off_t(arg);
}
289 static bool
290 parse_filter_rule(const char* rule, map<string, string>& mime_map)
292 const char* s = strchr(rule, ':');
293 if (s == NULL || s[1] == '\0') {
294 cerr << "Invalid filter mapping '" << rule << "'\n"
295 "Should be of the form TYPE:COMMAND or TYPE1,TYPE2:COMMAND or "
296 "TYPE,EXT:COMMAND\n"
297 "e.g. 'application/octet-stream:strings -n8'"
298 << endl;
299 return false;
302 const char* c = static_cast<const char*>(memchr(rule, ',', s - rule));
303 string output_type, output_charset;
304 if (c) {
305 // Filter produces a specified content-type.
306 ++c;
307 const char* c2 = static_cast<const char *>(memchr(c, ',', s - c));
308 if (c2) {
309 output_type.assign(c, c2 - c);
310 ++c2;
311 output_charset.assign(c2, s - c2);
312 } else {
313 output_type.assign(c, s - c);
315 --c;
316 if (output_type.find('/') == string::npos) {
317 auto m = mime_map.find(output_type);
318 if (m != mime_map.end()) {
319 output_type = m->second;
320 } else {
321 const char* r = built_in_mime_map(output_type);
322 if (r) output_type = r;
325 if (output_type != "text/html" &&
326 output_type != "text/plain" &&
327 output_type != "image/svg+xml") {
328 cerr << "Currently only output types 'image/svg+xml', "
329 "'text/html' and 'text/plain' are supported."
330 << endl;
331 return false;
333 } else {
334 c = s;
337 const char* cmd = s + 1;
338 // Analyse the command string to decide if it needs a shell.
339 bool use_shell = command_needs_shell(cmd);
340 index_command(string(rule, c - rule),
341 Filter(string(cmd), output_type, output_charset, use_shell));
343 return true;
347 main(int argc, char **argv)
349 // If overwrite is true, the database will be created anew even if it
350 // already exists.
351 bool overwrite = false;
352 // If delete_removed_documents is true, delete any documents we don't see.
353 bool delete_removed_documents = true;
354 // Retry files which we failed to index on a previous run?
355 bool retry_failed = false;
356 bool use_ctime = false;
357 bool spelling = false;
358 bool skip_duplicates = false;
359 bool ignore_exclusions = false;
360 bool description_as_sample = false;
361 string baseurl;
362 size_t depth_limit = 0;
363 size_t title_size = TITLE_SIZE;
364 size_t sample_size = SAMPLE_SIZE;
365 empty_body_type empty_body = EMPTY_BODY_WARN;
366 string site_term, host_term;
367 Xapian::Stem stemmer("english");
369 enum {
370 OPT_OPENDIR_SLEEP = 256,
371 OPT_SAMPLE,
372 OPT_READ_FILTERS
374 constexpr auto NO_ARG = no_argument;
375 constexpr auto REQ_ARG = required_argument;
376 static const struct option longopts[] = {
377 { "help", NO_ARG, NULL, 'h' },
378 { "version", NO_ARG, NULL, 'V' },
379 { "overwrite", NO_ARG, NULL, 'o' },
380 { "duplicates", REQ_ARG, NULL, 'd' },
381 { "no-delete", NO_ARG, NULL, 'p' },
382 { "preserve-nonduplicates", NO_ARG, NULL, 'p' },
383 { "db", REQ_ARG, NULL, 'D' },
384 { "url", REQ_ARG, NULL, 'U' },
385 { "mime-type", REQ_ARG, NULL, 'M' },
386 { "mime-type-match", REQ_ARG, NULL, 'G' },
387 { "filter", REQ_ARG, NULL, 'F' },
388 { "read-filters", REQ_ARG, NULL, OPT_READ_FILTERS },
389 { "depth-limit", REQ_ARG, NULL, 'l' },
390 { "follow", NO_ARG, NULL, 'f' },
391 { "ignore-exclusions", NO_ARG, NULL, 'i' },
392 { "stemmer", REQ_ARG, NULL, 's' },
393 { "spelling", NO_ARG, NULL, 'S' },
394 { "verbose", NO_ARG, NULL, 'v' },
395 { "empty-docs", REQ_ARG, NULL, 'e' },
396 { "max-size", REQ_ARG, NULL, 'm' },
397 { "sample", REQ_ARG, NULL, OPT_SAMPLE },
398 { "sample-size", REQ_ARG, NULL, 'E' },
399 { "title-size", REQ_ARG, NULL, 'T' },
400 { "retry-failed", NO_ARG, NULL, 'R' },
401 { "opendir-sleep", REQ_ARG, NULL, OPT_OPENDIR_SLEEP },
402 { "track-ctime", NO_ARG, NULL, 'C' },
403 { 0, 0, NULL, 0 }
406 map<string, string> mime_map;
408 index_add_default_filters();
410 if (argc == 2 && strcmp(argv[1], "-v") == 0) {
411 // -v was the short option for --version in 1.2.3 and earlier, but
412 // now it is short for --verbose (for consistency with scriptindex)
413 // so if "-v" is the only option, translate it to "--version" for
414 // backwards compatibility.
415 argv[1] = const_cast<char *>("--version");
418 string dbpath;
419 int getopt_ret;
420 while ((getopt_ret = gnu_getopt_long(argc, argv, "hvd:D:U:M:F:l:s:pfRSVe:im:E:T:",
421 longopts, NULL)) != -1) {
422 switch (getopt_ret) {
423 case 'h': {
424 cout << PROG_NAME " - " PROG_DESC "\n\n"
425 "Usage: " PROG_NAME " [OPTIONS] --db DATABASE [BASEDIR] DIRECTORY\n"
426 "\n"
427 "DIRECTORY is the directory to start indexing from.\n"
428 "\n"
429 "BASEDIR is the directory corresponding to URL (default: DIRECTORY).\n"
430 "\n"
431 "Options:\n"
432 " -d, --duplicates=ARG set duplicate handling: ARG can be 'ignore' or\n"
433 " 'replace' (default: replace)\n"
434 " -p, --no-delete skip the deletion of documents corresponding to\n"
435 " deleted files (--preserve-nonduplicates is a\n"
436 " deprecated alias for --no-delete)\n"
437 " -e, --empty-docs=ARG how to handle documents we extract no text from:\n"
438 " ARG can be index, warn (issue a diagnostic and\n"
439 " index), or skip. (default: warn)\n"
440 " -D, --db=DATABASE path to database to use\n"
441 " -U, --url=URL base url BASEDIR corresponds to (default: /)\n"
442 " -M, --mime-type=EXT:TYPE assume any file with extension EXT has MIME\n"
443 " Content-Type TYPE, instead of using libmagic\n"
444 " (empty TYPE removes any existing mapping for EXT;\n"
445 " other special TYPE values: 'ignore' and 'skip')\n"
446 " -G, --mime-type-match=GLOB:TYPE\n"
447 " assume any file with leaf name matching shell\n"
448 " wildcard pattern GLOB has MIME Content-Type TYPE\n"
449 " (special TYPE values: 'ignore' and 'skip')\n"
450 " -F, --filter=M[,[T][,C]]:CMD\n"
451 " process files with MIME Content-Type M using\n"
452 " command CMD, which produces output (on stdout or\n"
453 " in a temporary file) with format T (Content-Type\n"
454 " or file extension; currently txt (default), html\n"
455 " or svg) in character encoding C (default: UTF-8).\n"
456 " E.g. -Fapplication/octet-stream:'strings -n8'\n"
457 " or -Ftext/x-foo,,utf-16:'foo2utf16 %f %t'\n"
458 " --read-filters=FILE bulk-load --filter arguments from FILE, which\n"
459 " should contain one such argument per line (e.g.\n"
460 " text/x-bar:bar2txt --utf8). Lines starting with #\n"
461 " are treated as comments and ignored.\n"
462 " -l, --depth-limit=LIMIT set recursion limit (0 = unlimited)\n"
463 " -f, --follow follow symbolic links\n"
464 " -i, --ignore-exclusions ignore meta robots tags and similar exclusions\n"
465 " -S, --spelling index data for spelling correction\n"
466 " -m, --max-size maximum size of file to index (in bytes or with a\n"
467 " suffix of 'K'/'k', 'M'/'m', 'G'/'g')\n"
468 " (default: unlimited)\n"
469 " --sample=SOURCE what to use for the stored sample of text for\n"
470 " HTML documents - SOURCE can be 'body' or\n"
471 " 'description' (default: 'body')\n"
472 " -E, --sample-size=SIZE maximum size for the document text sample\n"
473 " (supports the same formats as --max-size).\n"
474 " (default: " STRINGIZE(SAMPLE_SIZE) ")\n"
475 " -T, --title-size=SIZE maximum size for the document title\n"
476 " (supports the same formats as --max-size).\n"
477 " (default: " STRINGIZE(TITLE_SIZE) ")\n"
478 " -R, --retry-failed retry files which omindex failed to extract text\n"
479 " from on a previous run\n"
480 " --opendir-sleep=SECS sleep for SECS seconds before opening each\n"
481 " directory - sleeping for 2 seconds seems to\n"
482 " reliably work around problems with indexing files\n"
483 " on Microsoft DFS shares.\n"
484 " -C, --track-ctime track each file's ctime so we can detect changes\n"
485 " to ownership or permissions.\n"
486 " -v, --verbose show more information about what is happening\n"
487 " --overwrite create the database anew (the default is to update\n"
488 " if the database already exists)" << endl;
489 print_stemmer_help(" ");
490 print_help_and_version_help(" ");
491 return 0;
493 case 'V':
494 print_package_info(PROG_NAME);
495 return 0;
496 case 'd': // how shall we handle duplicate documents?
497 switch (optarg[0]) {
498 case 'i':
499 skip_duplicates = true;
500 break;
501 case 'r':
502 skip_duplicates = false;
503 break;
505 break;
506 case 'e':
507 if (strcmp(optarg, "index") == 0) {
508 empty_body = EMPTY_BODY_INDEX;
509 } else if (strcmp(optarg, "warn") == 0) {
510 empty_body = EMPTY_BODY_WARN;
511 } else if (strcmp(optarg, "skip") == 0) {
512 empty_body = EMPTY_BODY_SKIP;
513 } else {
514 cerr << "Invalid --empty-docs value '" << optarg << "'\n"
515 "Valid values are index, warn, and skip." << endl;
516 return 1;
518 break;
519 case 'p': // Keep documents even if the files have been removed.
520 delete_removed_documents = false;
521 break;
522 case 'l': { // Set recursion limit
523 int arg = atoi(optarg);
524 if (arg < 0) arg = 0;
525 depth_limit = size_t(arg);
526 break;
528 case 'f': // Turn on following of symlinks
529 follow_symlinks = true;
530 break;
531 case 'M': {
532 const char * s = strrchr(optarg, ':');
533 if (s == NULL) {
534 cerr << "Invalid MIME mapping '" << optarg << "'\n"
535 "Should be of the form EXT:TYPE, e.g. txt:text/plain\n"
536 "(or txt: to delete a default mapping)" << endl;
537 return 1;
540 // -Mtxt: results in an empty string, which effectively removes the
541 // default mapping for .txt files.
542 mime_map[string(optarg, s - optarg)] = string(s + 1);
543 max_ext_len = max(max_ext_len, strlen(s + 1));
544 break;
546 case 'F':
547 if (!parse_filter_rule(optarg, mime_map))
548 return 1;
549 break;
550 case OPT_READ_FILTERS: {
551 ifstream stream(optarg);
552 if (!stream) {
553 cerr << "Unable to open filter file '" << optarg << "' "
554 "(" << strerror(errno) << ')' << endl;
555 return 1;
557 string rule;
558 bool all_ok = true;
559 while (getline(stream, rule)) {
560 if (startswith(rule, '#')) continue;
561 if (!parse_filter_rule(rule.c_str(), mime_map))
562 all_ok = false;
564 if (!all_ok)
565 return 1;
566 break;
568 case 'D':
569 dbpath = optarg;
570 break;
571 case 'U':
572 baseurl = optarg;
573 break;
574 case 'o': // --overwrite
575 overwrite = true;
576 break;
577 case 'i':
578 ignore_exclusions = true;
579 break;
580 case 'R': // --retry-failed
581 retry_failed = true;
582 break;
583 case 's':
584 try {
585 stemmer = Xapian::Stem(optarg);
586 } catch (const Xapian::InvalidArgumentError &) {
587 cerr << "Unknown stemming language '" << optarg << "'.\n"
588 "Available language names are: "
589 << Xapian::Stem::get_available_languages() << endl;
590 return 1;
592 break;
593 case 'S':
594 spelling = true;
595 break;
596 case 'v':
597 verbose = true;
598 break;
599 case 'E': {
600 off_t arg = parse_size(optarg);
601 if (arg >= 0) {
602 sample_size = size_t(arg);
603 break;
605 cerr << PROG_NAME": bad sample size '" << optarg << "'" << endl;
606 return 1;
608 case 'T': {
609 off_t arg = parse_size(optarg);
610 if (arg >= 0) {
611 title_size = size_t(arg);
612 break;
614 cerr << PROG_NAME": bad title size '" << optarg << "'" << endl;
615 return 1;
617 case 'm': {
618 off_t size = parse_size(optarg);
619 if (size >= 0) {
620 max_size = size;
621 const char * suffix;
622 // Set lsb to the lowest set bit in max_size.
623 off_t lsb = max_size & -max_size;
624 if (lsb >= off_t(1L << 30)) {
625 size >>= 30;
626 suffix = "GB";
627 } else if (lsb >= off_t(1L << 20)) {
628 size >>= 20;
629 suffix = "MB";
630 } else if (lsb >= off_t(1L << 10)) {
631 size >>= 10;
632 suffix = "KB";
633 } else {
634 suffix = "B";
636 pretty_max_size = str(size);
637 pretty_max_size += suffix;
638 break;
640 cerr << PROG_NAME": bad max size '" << optarg << "'" << endl;
641 return 1;
643 case OPT_OPENDIR_SLEEP: {
644 // Don't want negative numbers, infinity, NaN, or hex numbers.
645 char * p = optarg;
646 if (C_isdigit(p[0]) && (p[1] | 32) != 'x') {
647 sleep_before_opendir = strtod(p, &p);
648 if (*p == '\0')
649 break;
651 cerr << PROG_NAME": bad --opendir-sleep argument: "
652 "'" << optarg << "'" << endl;
653 return 1;
655 case OPT_SAMPLE:
656 if (strcmp(optarg, "description") == 0) {
657 description_as_sample = true;
658 } else if (strcmp(optarg, "body") == 0) {
659 description_as_sample = false;
660 } else {
661 cerr << "Invalid --sample value '" << optarg << "'\n"
662 "Valid values are body and description." << endl;
663 return 1;
665 break;
666 case 'C':
667 use_ctime = true;
668 break;
669 case 'G': {
670 char * s = strrchr(optarg, ':');
671 if (s == NULL) {
672 cerr << "Invalid MIME mapping '" << optarg << "'\n"
673 "Should be of the form GLOB:TYPE, e.g. *~:ignore"
674 << endl;
675 return 1;
677 #ifndef HAVE_FNMATCH
678 cerr << "--mime-type-match isn't supported in this build because "
679 "the fnmatch() function wasn't found at configure time."
680 << endl;
681 return 1;
682 #else
683 if (s == optarg) {
684 cerr << "--mime-type-match with an empty pattern can never "
685 "match." << endl;
686 return 1;
688 if (memchr(optarg, '/', s - optarg)) {
689 cerr << "--mime-type-match only matches against the leaf "
690 "filename so a pattern containing '/' can never match."
691 << endl;
692 return 1;
694 const char* type = s + 1;
695 if (*type == '\0') {
696 cerr << "--mime-type-match doesn't support an empty MIME type"
697 << endl;
698 return 1;
700 *s = '\0';
701 mime_patterns.emplace_back(optarg, type);
702 break;
703 #endif
705 case ':': // missing param
706 return 1;
707 case '?': // unknown option: FIXME -> char
708 return 1;
712 if (dbpath.empty()) {
713 cerr << PROG_NAME": you must specify a database with --db." << endl;
714 return 1;
716 if (baseurl.empty()) {
717 cerr << PROG_NAME": --url not specified, assuming '/'." << endl;
719 // baseurl must end in a '/'.
720 if (!endswith(baseurl, '/')) {
721 baseurl += '/';
724 // Site term (omits the trailing slash):
725 site_term = "J";
726 site_term.append(baseurl, 0, baseurl.size() - 1);
727 if (site_term.size() > MAX_SAFE_TERM_LENGTH)
728 site_term = hash_long_term(site_term, MAX_SAFE_TERM_LENGTH);
730 // Host term, if the URL contains a hostname (omits any port number):
731 string::size_type j;
732 j = find_if(baseurl.begin(), baseurl.end(), p_notalnum) - baseurl.begin();
733 if (j > 0 && baseurl.substr(j, 3) == "://" && j + 3 < baseurl.size()) {
734 j += 3;
735 // We must find a '/' - we ensured baseurl ended with a '/' above.
736 string::size_type k = baseurl.find('/', j);
737 url_start_path.assign(baseurl, k, string::npos);
738 string::const_iterator l;
739 l = find(baseurl.begin() + j, baseurl.begin() + k, ':');
740 string::size_type host_len = l - baseurl.begin() - j;
741 host_term = "H";
742 host_term.append(baseurl, j, host_len);
743 // DNS hostname limit is 253.
744 if (host_term.size() > MAX_SAFE_TERM_LENGTH)
745 host_term = hash_long_term(host_term, MAX_SAFE_TERM_LENGTH);
746 } else {
747 url_start_path = baseurl;
750 if (optind >= argc || optind + 2 < argc) {
751 cerr << PROG_NAME": you must specify a directory to index.\n"
752 "Do this either as a single directory (corresponding to the base URL)\n"
753 "or two directories - the first corresponding to the base URL and the second\n"
754 "a subdirectory of that to index." << endl;
755 return 1;
758 root = argv[optind];
759 if (root.empty()) {
760 cerr << PROG_NAME": start directory can not be empty." << endl;
761 return 1;
763 if (!endswith(root, '/')) {
764 root += '/';
766 if (optind + 2 == argc) {
767 string start_url = argv[optind + 1];
768 if (startswith(start_url, '/')) {
769 // Make relative to root.
770 if (!startswith(start_url, root)) {
771 cerr << PROG_NAME": '" << argv[optind + 1] << "' "
772 "is not a subdirectory of '" << argv[optind] << "'."
773 << endl;
774 return 1;
776 start_url.erase(0, root.size());
778 if (!endswith(start_url, '/')) {
779 start_url += '/';
781 root += start_url;
782 url_encode_path(baseurl, start_url);
785 int exitcode = 1;
786 try {
787 index_init(dbpath, stemmer, root, site_term, host_term, empty_body,
788 (skip_duplicates ? DUP_SKIP : DUP_CHECK_LAZILY),
789 sample_size, title_size, max_ext_len,
790 overwrite, retry_failed, delete_removed_documents, verbose,
791 use_ctime, spelling, ignore_exclusions,
792 description_as_sample);
793 index_directory(root, baseurl, depth_limit, mime_map);
794 index_handle_deletion();
795 index_commit();
796 exitcode = 0;
797 } catch (const CommitAndExit &e) {
798 cout << "Exception: " << e.what() << endl;
799 cout << "Committing pending changes..." << endl;
800 index_commit();
801 } catch (const Xapian::Error &e) {
802 cout << "Exception: " << e.get_description() << endl;
803 } catch (const exception &e) {
804 cout << "Exception: " << e.what() << endl;
805 } catch (const string &s) {
806 cout << "Exception: " << s << endl;
807 } catch (const char *s) {
808 cout << "Exception: " << s << endl;
809 } catch (...) {
810 cout << "Caught unknown exception" << endl;
813 index_done();
815 return exitcode;