xapian.git / xapian-applications / omega / index_file.cc

/** @file
 * @brief Handle indexing a document from a file
 */
/* Copyright 1999,2000,2001 BrightStation PLC
 * Copyright 2001,2005 James Aylett
 * Copyright 2001,2002 Ananova Ltd
 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018 Olly Betts
 * Copyright 2009 Frank J Bruzzaniti
 * Copyright 2012 Mihai Bivol
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
 * USA
 */

#include <config.h>

#include "index_file.h"

#include <algorithm>
#include <iostream>
#include <limits>
#include <string>
#include <map>
#include <vector>

#include <sys/types.h>
#include "safeunistd.h"
#include <cerrno>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include "safefcntl.h"
#include <ctime>

#include <xapian.h>

#include "append_filename_arg.h"
#include "atomparse.h"
#include "diritor.h"
#include "failed.h"
#include "md5wrap.h"
#include "metaxmlparse.h"
#include "mimemap.h"
#include "msxmlparse.h"
#include "myhtmlparse.h"
#include "opendocparse.h"
#include "pkglibbindir.h"
#include "runfilter.h"
#include "sample.h"
#include "str.h"
#include "stringutils.h"
#include "svgparse.h"
#include "tmpdir.h"
#include "utf8convert.h"
#include "utils.h"
#include "values.h"
#include "xmlparse.h"
#include "xlsxparse.h"
#include "xpsxmlparse.h"

using namespace std;

static Xapian::WritableDatabase db;
static Xapian::TermGenerator indexer;

static Xapian::doccount old_docs_not_seen;
static Xapian::docid old_lastdocid;
static vector<bool> updated;

static bool verbose;
static bool retry_failed;
static bool use_ctime;
static dup_action_type dup_action;
static bool ignore_exclusions;
static bool description_as_sample;

static time_t last_altered_max;
static size_t sample_size;
static size_t title_size;
static size_t max_ext_len;

static empty_body_type empty_body;

static string root;
static string site_term, host_term;

static Failed failed;

map<string, Filter> commands;

static void
mark_as_seen(Xapian::docid did)
{
    if (usual(did < updated.size() && !updated[did])) {
        updated[did] = true;
        --old_docs_not_seen;
    }
}

void
skip(const string & urlterm, const string & context, const string & msg,
     off_t size, time_t last_mod, unsigned flags)
{
    failed.add(urlterm, last_mod, size);

    if (!verbose || (flags & SKIP_SHOW_FILENAME)) {
        if (!verbose && (flags & SKIP_VERBOSE_ONLY)) return;
        cout << context << ": ";
    }

    cout << "Skipping - " << msg << endl;
}
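
// For illustration (an assumed example, not output from a real run): with
// verbose off and no special flags, a failing word-processor filter on a
// file "docs/report.doc" under the document root would print:
//
//   docs/report.doc: Skipping - "antiword -mUTF-8.txt" failed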

static void
skip_cmd_failed(const string & urlterm, const string & context, const string & cmd,
                off_t size, time_t last_mod)
{
    skip(urlterm, context, "\"" + cmd + "\" failed", size, last_mod);
}

static void
skip_meta_tag(const string & urlterm, const string & context,
              off_t size, time_t last_mod)
{
    skip(urlterm, context, "indexing disallowed by meta tag", size, last_mod);
}

static void
skip_unknown_mimetype(const string & urlterm, const string & context,
                      const string & mimetype, off_t size, time_t last_mod)
{
    skip(urlterm, context, "unknown MIME type '" + mimetype + "'", size, last_mod);
}

void
index_add_default_filters()
{
    index_command("application/msword", Filter("antiword -mUTF-8.txt", false));
    index_command("application/vnd.ms-excel",
                  Filter("xls2csv -c' ' -q0 -dutf-8", false));
    index_command("application/vnd.ms-powerpoint",
                  Filter("catppt -dutf-8", false));
    // Looking at the source of wpd2html and wpd2text I think both output
    // UTF-8, but it's hard to be sure without sample Unicode .wpd files
    // as they don't seem to be at all well documented.
    index_command("application/vnd.wordperfect", Filter("wpd2text", false));
    // wps2text produces UTF-8 output from the sample files I've tested.
    index_command("application/vnd.ms-works", Filter("wps2text", false));
    // Output is UTF-8 according to "man djvutxt".  Generally this seems to
    // be true, though some examples from djvu.org generate isolated byte
    // 0x95 in a context which suggests it might be intended to be a bullet
    // (as it is in CP1252).
    index_command("image/vnd.djvu", Filter("djvutxt", false));
    index_command("text/markdown", Filter("markdown", "text/html", false));
    // The --text option unhelpfully converts all non-ASCII characters to "?"
    // so we use --html instead, which produces HTML entities.  The --nopict
    // option suppresses exporting picture files as pictNNNN.wmf in the current
    // directory.  Note that this option was ignored in some older versions,
    // but it was fixed in unrtf 0.20.4.
    index_command("application/rtf",
                  Filter("unrtf --nopict --html 2>/dev/null", "text/html",
                         false));
    index_command("text/rtf",
                  Filter("unrtf --nopict --html 2>/dev/null", "text/html",
                         false));
    index_command("text/x-rst", Filter("rst2html", "text/html", false));
    index_command("application/x-mspublisher",
                  Filter("pub2xhtml", "text/html", false));
    index_command("application/vnd.ms-outlook",
                  Filter(get_pkglibbindir() + "/outlookmsg2html", "text/html",
                         false));
    index_command("application/vnd.ms-visio.drawing",
                  Filter("vsd2xhtml", "image/svg+xml", false));
    index_command("application/vnd.ms-visio.stencil",
                  Filter("vsd2xhtml", "image/svg+xml", false));
    index_command("application/vnd.ms-visio.template",
                  Filter("vsd2xhtml", "image/svg+xml", false));
    index_command("application/vnd.visio",
                  Filter("vsd2xhtml", "image/svg+xml", false));
    // pod2text's output character set doesn't seem to be documented, but from
    // inspecting the source it looks like it's probably iso-8859-1.  We need
    // to pass "--errors=stderr" or else minor POD formatting errors cause a
    // file not to be indexed.
    index_command("text/x-perl",
                  Filter("pod2text --errors=stderr",
                         "text/plain", "iso-8859-1", false));
    // FIXME: -e0 means "UTF-8", but that results in "fi", "ff", "ffi", etc
    // appearing as single ligatures.  For European languages, it's actually
    // better to use -e2 (ISO-8859-1) and then convert, so let's do that for
    // now until we handle Unicode "compatibility decompositions".
    index_command("application/x-dvi",
                  Filter("catdvi -e2 -s", "text/plain", "iso-8859-1", false));
    // Simplistic - ought to look in index.rdf files for filename and character
    // set.
    index_command("application/x-maff",
                  Filter("unzip -p %f '*/*.*htm*'", "text/html", "iso-8859-1",
                         false));
    index_command("application/x-mimearchive",
                  Filter(get_pkglibbindir() + "/mhtml2html", "text/html",
                         false));
    index_command("message/news",
                  Filter(get_pkglibbindir() + "/rfc822tohtml", "text/html",
                         false));
    index_command("message/rfc822",
                  Filter(get_pkglibbindir() + "/rfc822tohtml", "text/html",
                         false));
    index_command("text/vcard",
                  Filter(get_pkglibbindir() + "/vcard2text", false));
    index_command("application/vnd.apple.keynote",
                  Filter("key2text", false));
    index_command("application/vnd.apple.numbers",
                  Filter("numbers2text", false));
    index_command("application/vnd.apple.pages",
                  Filter("pages2text", false));
}
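
// For illustration, extra filters can be registered in the same way.  The
// MIME type and command here are hypothetical:
//
//   index_command("application/x-foo", Filter("foo2text", false));
//
// and, as the entries above show, an output MIME type and character set may
// also be given, e.g.
//
//   index_command("application/x-bar",
//                 Filter("bar2text", "text/plain", "iso-8859-1", false));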

void
index_init(const string & dbpath, const Xapian::Stem & stemmer,
           const string & root_, const string & site_term_,
           const string & host_term_,
           empty_body_type empty_body_, dup_action_type dup_action_,
           size_t sample_size_, size_t title_size_, size_t max_ext_len_,
           bool overwrite, bool retry_failed_,
           bool delete_removed_documents, bool verbose_, bool use_ctime_,
           bool spelling, bool ignore_exclusions_, bool description_as_sample_)
{
    root = root_;
    site_term = site_term_;
    host_term = host_term_;
    empty_body = empty_body_;
    dup_action = dup_action_;
    sample_size = sample_size_;
    title_size = title_size_;
    max_ext_len = max_ext_len_;
    verbose = verbose_;
    use_ctime = use_ctime_;
    ignore_exclusions = ignore_exclusions_;
    description_as_sample = description_as_sample_;

    if (!overwrite) {
        db = Xapian::WritableDatabase(dbpath, Xapian::DB_CREATE_OR_OPEN);
        old_docs_not_seen = db.get_doccount();
        // Handle an initially empty database exactly the same way as when
        // overwrite is true.
        if (old_docs_not_seen != 0) {
            old_lastdocid = db.get_lastdocid();
            if (delete_removed_documents) {
                // + 1 so that old_lastdocid is a valid subscript.
                updated.resize(old_lastdocid + 1);
            }
            Xapian::valueno slot = use_ctime ? VALUE_CTIME : VALUE_LASTMOD;
            string ubound = db.get_value_upper_bound(slot);
            if (!ubound.empty())
                last_altered_max = binary_string_to_int(ubound);
        }
    } else {
        db = Xapian::WritableDatabase(dbpath, Xapian::DB_CREATE_OR_OVERWRITE);
    }

    if (spelling) {
        indexer.set_database(db);
        indexer.set_flags(indexer.FLAG_SPELLING);
    }
    indexer.set_stemmer(stemmer);

    runfilter_init();

    failed.init(db);

    if (overwrite) {
        // There are no failures to retry, so setting this flag doesn't
        // change the outcome, but does mean we avoid the overhead of
        // checking for a previous failure.
        retry_failed = true;
    } else if (retry_failed_) {
        failed.clear();
        retry_failed = true;
    } else {
        // If there are no existing failures, setting this flag doesn't
        // change the outcome, but does mean we avoid the overhead of
        // checking for a previous failure.
        retry_failed = failed.empty();
    }
}

static void
parse_pdfinfo_field(const char * p, const char * end, string & out, const char * field, size_t len)
{
    if (size_t(end - p) > len && memcmp(p, field, len) == 0) {
        p += len;
        while (p != end && *p == ' ')
            ++p;
        if (p != end && (end[-1] != '\r' || --end != p))
            out.assign(p, end - p);
    }
}

#define PARSE_PDFINFO_FIELD(P, END, OUT, FIELD) \
    parse_pdfinfo_field((P), (END), (OUT), FIELD":", CONST_STRLEN(FIELD) + 1)
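
// For example, PARSE_PDFINFO_FIELD(start, eol, author, "Author") expands to
//
//   parse_pdfinfo_field((start), (eol), (author), "Author:",
//                       CONST_STRLEN("Author") + 1);
//
// i.e. the trailing colon is included in the matched prefix via the "+ 1".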

static void
parse_pdf_metainfo(const string& pdfinfo, string &author, string &title,
                   string &keywords, string &topic, int& pages)
{
    const char * p = pdfinfo.data();
    const char * end = p + pdfinfo.size();
    while (p != end) {
        const char * start = p;
        p = static_cast<const char *>(memchr(p, '\n', end - p));
        const char * eol;
        if (p) {
            eol = p;
            ++p;
        } else {
            p = eol = end;
        }
        switch (*start) {
            case 'A':
                PARSE_PDFINFO_FIELD(start, eol, author, "Author");
                break;
            case 'K':
                PARSE_PDFINFO_FIELD(start, eol, keywords, "Keywords");
                break;
            case 'P': {
                string s;
                PARSE_PDFINFO_FIELD(start, eol, s, "Pages");
                if (!s.empty())
                    pages = atoi(s.c_str());
                break;
            }
            case 'S':
                PARSE_PDFINFO_FIELD(start, eol, topic, "Subject");
                break;
            case 'T':
                PARSE_PDFINFO_FIELD(start, eol, title, "Title");
                break;
        }
    }
}
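
// For illustration, pdfinfo emits line-based "Field: value" pairs, along
// the lines of:
//
//   Title:          Annual Report
//   Author:         A. N. Other
//   Pages:          42
//
// so the switch on the first character above is just a cheap pre-filter
// before the full field-name comparison in parse_pdfinfo_field().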

static void
get_pdf_metainfo(int fd, string &author, string &title,
                 string &keywords, string &topic, int& pages)
{
    try {
        string pdfinfo;
        run_filter(fd, "pdfinfo -enc UTF-8 -", false, &pdfinfo);
        parse_pdf_metainfo(pdfinfo, author, title, keywords, topic, pages);
    } catch (const ReadError&) {
        // It's probably best to index the document even if pdfinfo fails.
    }
}

static void
get_pdf_metainfo(const string& file, string &author, string &title,
                 string &keywords, string &topic, int& pages)
{
    try {
        string cmd = "pdfinfo -enc UTF-8";
        append_filename_argument(cmd, file);
        parse_pdf_metainfo(stdout_to_string(cmd, false),
                           author, title, keywords, topic, pages);
    } catch (const ReadError&) {
        // It's probably best to index the document even if pdfinfo fails.
    }
}

static void
generate_sample_from_csv(const string & csv_data, string & sample)
{
    // Add 3 to allow for a 4 byte utf-8 sequence being appended when
    // output is sample_size - 1 bytes long.  Use csv_data.size() if smaller
    // since the user might reasonably set sample_size really high.
    sample.reserve(min(sample_size + 3, csv_data.size()));
    size_t last_word_end = 0;
    bool in_space = true;
    bool in_quotes = false;
    for (Xapian::Utf8Iterator i(csv_data); i != Xapian::Utf8Iterator(); ++i) {
        unsigned ch = *i;

        if (!in_quotes) {
            // If not already in double quotes, '"' starts quoting and
            // ',' starts a new field.
            if (ch == '"') {
                in_quotes = true;
                continue;
            }
            if (ch == ',')
                ch = ' ';
        } else if (ch == '"') {
            // In double quotes, '"' either ends double quotes, or
            // if followed by another '"', means a literal '"'.
            if (++i == Xapian::Utf8Iterator())
                break;
            ch = *i;
            if (ch != '"') {
                in_quotes = false;
                if (ch == ',')
                    ch = ' ';
            }
        }

        if (ch <= ' ' || ch == 0xa0) {
            // FIXME: if all the whitespace characters between two
            // words are 0xa0 (non-breaking space) then perhaps we
            // should output 0xa0.
            if (in_space)
                continue;
            last_word_end = sample.size();
            sample += ' ';
            in_space = true;
        } else {
            Xapian::Unicode::append_utf8(sample, ch);
            in_space = false;
        }

        if (sample.size() >= sample_size) {
            // Need to truncate sample.
            if (last_word_end <= sample_size / 2) {
                // Monster word!  We'll have to just split it.
                sample.replace(sample_size - 3, string::npos, "...", 3);
            } else {
                sample.replace(last_word_end, string::npos, " ...", 4);
            }
            break;
        }
    }
}
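
// For illustration: with a large enough sample_size, the CSV row
//
//   a,"b,c",d
//
// yields the sample text "a b,c d" - unquoted commas become spaces, quoted
// commas are kept, and a doubled quote inside a quoted field would yield a
// literal '"'.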

static bool
index_check_existing(const string & urlterm, time_t last_altered,
                     Xapian::docid & did)
{
    switch (dup_action) {
        case DUP_SKIP: {
            Xapian::PostingIterator p = db.postlist_begin(urlterm);
            if (p != db.postlist_end(urlterm)) {
                if (verbose)
                    cout << "already indexed, not updating" << endl;
                did = *p;
                mark_as_seen(did);
                return true;
            }
            break;
        }
        case DUP_CHECK_LAZILY: {
            // If last_altered > last_altered_max, we know for sure that the
            // file is new or updated.
            if (last_altered > last_altered_max) {
                return false;
            }

            Xapian::PostingIterator p = db.postlist_begin(urlterm);
            if (p != db.postlist_end(urlterm)) {
                did = *p;
                Xapian::Document doc = db.get_document(did);
                Xapian::valueno slot = use_ctime ? VALUE_CTIME : VALUE_LASTMOD;
                string value = doc.get_value(slot);
                time_t old_last_altered = binary_string_to_int(value);
                if (last_altered <= old_last_altered) {
                    if (verbose)
                        cout << "already indexed" << endl;
                    // The docid should be in updated - the only valid
                    // exception is if the URL was long and hashed to the
                    // same URL as an existing document indexed in the same
                    // batch.
                    mark_as_seen(did);
                    return true;
                }
            }
            break;
        }
    }
    return false;
}

void
index_remove_failed_entry(const string& urlterm)
{
    failed.del(urlterm);
}

void
index_add_document(const string & urlterm, time_t last_altered,
                   Xapian::docid did, const Xapian::Document & doc)
{
    if (dup_action != DUP_SKIP) {
        // If this document has already been indexed, update the existing
        // entry.
        if (did) {
            // We already found out the document id above.
            db.replace_document(did, doc);
        } else if (last_altered <= last_altered_max) {
            // We checked for the UID term and didn't find it.
            did = db.add_document(doc);
        } else {
            did = db.replace_document(urlterm, doc);
        }
        mark_as_seen(did);
        if (verbose) {
            if (did <= old_lastdocid) {
                cout << "updated" << endl;
            } else {
                cout << "added" << endl;
            }
        }
    } else {
        // If this were a duplicate, we'd have skipped it above.
        db.add_document(doc);
        if (verbose)
            cout << "added" << endl;
    }
}

void
index_mimetype(const string & file, const string & urlterm, const string & url,
               const string & ext,
               const string &mimetype, DirectoryIterator &d,
               Xapian::Document & newdocument,
               string record)
{
    string context(file, root.size(), string::npos);

    // FIXME: We could be cleverer here and check mtime too when use_ctime is
    // set - if the ctime has changed but the mtime is unchanged, we can just
    // update the existing Document and avoid having to re-extract text, etc.
    time_t last_altered = use_ctime ? d.get_ctime() : d.get_mtime();

    Xapian::docid did = 0;
    if (index_check_existing(urlterm, last_altered, did))
        return;

    if (!retry_failed) {
        // We only store and check the mtime (last modified) - a change to the
        // metadata won't generally cause a previous failure to now work
        // (FIXME: except permissions).
        time_t failed_last_mod;
        off_t failed_size;
        if (failed.contains(urlterm, failed_last_mod, failed_size)) {
            if (d.get_mtime() <= failed_last_mod &&
                d.get_size() == failed_size) {
                if (verbose)
                    cout << "failed to extract text on earlier run" << endl;
                return;
            }
            // The file has changed, so remove the entry for it.  If it fails
            // again on this attempt, we'll add a new one.
            failed.del(urlterm);
        }
    }

    if (verbose) cout << flush;

    string author, title, sample, keywords, topic, dump;
    string md5;
    time_t created = time_t(-1);
    int pages = -1;

    map<string, Filter>::const_iterator cmd_it = commands.find(mimetype);
    if (cmd_it == commands.end()) {
        size_t slash = mimetype.find('/');
        if (slash != string::npos) {
            string wildtype(mimetype, 0, slash + 2);
            wildtype[slash + 1] = '*';
            cmd_it = commands.find(wildtype);
            if (cmd_it == commands.end()) {
                cmd_it = commands.find("*/*");
            }
        }
        if (cmd_it == commands.end()) {
            cmd_it = commands.find("*");
        }
    }
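
    // For example, for a hypothetical MIME type "audio/mpeg" the lookup
    // order here is: "audio/mpeg", then "audio/*", then "*/*", then "*".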

    try {
        if (cmd_it != commands.end()) {
            // Easy "run a command and read text or HTML from stdout or a
            // temporary file" cases.
            auto& filter = cmd_it->second;
            string cmd = filter.cmd;
            if (cmd.empty()) {
                skip(urlterm, context, "required filter not installed",
                     d.get_size(), d.get_mtime(), SKIP_VERBOSE_ONLY);
                return;
            }
            if (cmd == "false") {
                // Allow setting 'false' as a filter to mean that a MIME type
                // should be quietly ignored.
                string m = "ignoring MIME type '";
                m += cmd_it->first;
                m += "'";
                skip(urlterm, context, m, d.get_size(), d.get_mtime(),
                     SKIP_VERBOSE_ONLY);
                return;
            }
            bool use_shell = filter.use_shell();
            bool substituted = false;
            string tmpout;
            size_t pcent = 0;
            while (true) {
                pcent = cmd.find('%', pcent);
                if (pcent >= cmd.size() - 1)
                    break;
                switch (cmd[pcent + 1]) {
                    case '%': // %% -> %.
                        cmd.erase(++pcent, 1);
                        break;
                    case 'f': { // %f -> escaped filename.
                        substituted = true;
                        string tail(cmd, pcent + 2);
                        cmd.resize(pcent);
                        // Suppress the space append_filename_argument()
                        // usually adds before the argument - the command
                        // string either includes one, or won't expect one
                        // (e.g. --input=%f).
                        append_filename_argument(cmd, file, false);
                        pcent = cmd.size();
                        cmd += tail;
                        break;
                    }
                    case 't': { // %t -> temporary output file.
                        if (tmpout.empty()) {
                            // Use a temporary file with a suitable extension
                            // in case the command cares, and for more helpful
                            // error messages from the command.
                            if (filter.output_type == "text/html") {
                                tmpout = get_tmpfile("tmp.html");
                            } else if (filter.output_type == "image/svg+xml") {
                                tmpout = get_tmpfile("tmp.svg");
                            } else {
                                tmpout = get_tmpfile("tmp.txt");
                            }
                        }
                        substituted = true;
                        string tail(cmd, pcent + 2);
                        cmd.resize(pcent);
                        // Suppress the space append_filename_argument()
                        // usually adds before the argument - the command
                        // string either includes one, or won't expect one
                        // (e.g. --output=%t).
                        append_filename_argument(cmd, tmpout, false);
                        pcent = cmd.size();
                        cmd += tail;
                        break;
                    }
                    default:
                        // Leave anything else alone for now.
                        pcent += 2;
                        break;
                }
            }
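
            // For example, the stock MAFF filter command
            //
            //   unzip -p %f '*/*.*htm*'
            //
            // run on a file at the made-up path "/srv/docs/page.maff" would
            // be rewritten to something along the lines of:
            //
            //   unzip -p '/srv/docs/page.maff' '*/*.*htm*'
            //
            // with the filename escaped by append_filename_argument().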

            if (!substituted && cmd != "true") {
                // If no %f, append the filename to the command.
                append_filename_argument(cmd, file);
            }

            try {
                if (!tmpout.empty()) {
                    // Output in temporary file.
                    run_filter(cmd, use_shell);
                    if (!load_file(tmpout, dump, NOCACHE)) {
                        throw ReadError("Couldn't read output file");
                    }
                    unlink(tmpout.c_str());
                } else if (cmd == "true") {
                    // Ignore the file's contents, just index metadata from the
                    // filing system.
                } else {
                    // Output on stdout.
                    run_filter(cmd, use_shell, &dump);
                }
                const string & charset = filter.output_charset;
                if (filter.output_type == "text/html") {
                    MyHtmlParser p;
                    p.ignore_metarobots();
                    p.description_as_sample = description_as_sample;
                    try {
                        p.parse_html(dump, charset, false);
                    } catch (const string & newcharset) {
                        p.reset();
                        p.ignore_metarobots();
                        p.description_as_sample = description_as_sample;
                        p.parse_html(dump, newcharset, true);
                    } catch (const ReadError&) {
                        skip_cmd_failed(urlterm, context, cmd,
                                        d.get_size(), d.get_mtime());
                        return;
                    }
                    dump = p.dump;
                    title = p.title;
                    keywords = p.keywords;
                    topic = p.topic;
                    sample = p.sample;
                    author = p.author;
                    created = p.created;
                } else if (filter.output_type == "image/svg+xml") {
                    SvgParser svgparser;
                    svgparser.parse(dump);
                    dump = svgparser.dump;
                    title = svgparser.title;
                    keywords = svgparser.keywords;
                    // FIXME: topic = svgparser.topic;
                    author = svgparser.author;
                } else if (!charset.empty()) {
                    convert_to_utf8(dump, charset);
                }
            } catch (const ReadError&) {
                skip_cmd_failed(urlterm, context, cmd,
                                d.get_size(), d.get_mtime());
                return;
            }
        } else if (mimetype == "text/html" || mimetype == "text/x-php") {
            const string & text = d.file_to_string();
            MyHtmlParser p;
            if (ignore_exclusions) p.ignore_metarobots();
            p.description_as_sample = description_as_sample;
            try {
                // Default HTML character set is latin 1, though not specifying
                // one is deprecated these days.
                p.parse_html(text, "iso-8859-1", false);
            } catch (const string & newcharset) {
                p.reset();
                if (ignore_exclusions) p.ignore_metarobots();
                p.description_as_sample = description_as_sample;
                p.parse_html(text, newcharset, true);
            }
            if (!p.indexing_allowed) {
                skip_meta_tag(urlterm, context,
                              d.get_size(), d.get_mtime());
                return;
            }
            dump = p.dump;
            title = p.title;
            keywords = p.keywords;
            topic = p.topic;
            sample = p.sample;
            author = p.author;
            created = p.created;
            md5_string(text, md5);
        } else if (mimetype == "text/plain") {
            // Currently we assume that text files are UTF-8 unless they have a
            // byte-order mark.
            dump = d.file_to_string();
            md5_string(dump, md5);

            // Look for Byte-Order Mark (BOM).
            if (startswith(dump, "\xfe\xff") || startswith(dump, "\xff\xfe")) {
                // UTF-16 in big-endian/little-endian order - we just convert
                // it as "UTF-16" and let the conversion handle the BOM as that
                // way we avoid the copying overhead of erasing 2 bytes from
                // the start of dump.
                convert_to_utf8(dump, "UTF-16");
            } else if (startswith(dump, "\xef\xbb\xbf")) {
                // UTF-8 with stupid Windows not-the-byte-order mark.
                dump.erase(0, 3);
            } else {
                // FIXME: What charset is the file?  Look at contents?
            }
        } else if (mimetype == "application/pdf") {
            const char* cmd = "pdftotext -enc UTF-8 - -";
            try {
                run_filter(d.get_fd(), cmd, false, &dump);
            } catch (const ReadError&) {
                skip_cmd_failed(urlterm, context, cmd,
                                d.get_size(), d.get_mtime());
                return;
            }
            get_pdf_metainfo(d.get_fd(), author, title, keywords, topic, pages);
        } else if (mimetype == "application/postscript") {
            // There simply doesn't seem to be a Unicode capable PostScript to
            // text converter (e.g. pstotext always outputs ISO-8859-1).  The
            // only solution seems to be to convert via PDF using ps2pdf and
            // then pdftotext.  This gives plausible looking UTF-8 output for
            // some Chinese PostScript files I found using Google.  It also has
            // the benefit of allowing us to extract meta information from
            // PostScript files.
            string tmpfile = get_tmpfile("tmp.pdf");
            if (tmpfile.empty()) {
                // FIXME: should this be fatal?  Or disable indexing postscript?
                string msg = "Couldn't create temporary directory (";
                msg += strerror(errno);
                msg += ")";
                skip(urlterm, context, msg,
                     d.get_size(), d.get_mtime());
                return;
            }
            string cmd = "ps2pdf -";
            append_filename_argument(cmd, tmpfile);
            try {
                run_filter(d.get_fd(), cmd, false);
                cmd = "pdftotext -enc UTF-8";
                append_filename_argument(cmd, tmpfile);
                cmd += " -";
                run_filter(cmd, false, &dump);
            } catch (const ReadError&) {
                skip_cmd_failed(urlterm, context, cmd,
                                d.get_size(), d.get_mtime());
                unlink(tmpfile.c_str());
                return;
            } catch (...) {
                unlink(tmpfile.c_str());
                throw;
            }
            try {
                get_pdf_metainfo(tmpfile, author, title, keywords, topic,
                                 pages);
            } catch (...) {
                unlink(tmpfile.c_str());
                throw;
            }
            unlink(tmpfile.c_str());
        } else if (startswith(mimetype, "application/vnd.sun.xml.") ||
                   startswith(mimetype, "application/vnd.oasis.opendocument."))
        {
            // Inspired by http://mjr.towers.org.uk/comp/sxw2text
            string cmd = "unzip -p";
            append_filename_argument(cmd, file);
            cmd += " content.xml ; unzip -p";
            append_filename_argument(cmd, file);
            cmd += " styles.xml";
            try {
                OpenDocParser parser;
                parser.parse(stdout_to_string(cmd, true));
                dump = parser.dump;
            } catch (const ReadError&) {
                skip_cmd_failed(urlterm, context, cmd,
                                d.get_size(), d.get_mtime());
                return;
            }

            cmd = "unzip -p";
            append_filename_argument(cmd, file);
            cmd += " meta.xml";
            try {
                MetaXmlParser metaxmlparser;
                metaxmlparser.parse(stdout_to_string(cmd, false));
                title = metaxmlparser.title;
                keywords = metaxmlparser.keywords;
                // FIXME: topic = metaxmlparser.topic;
                sample = metaxmlparser.sample;
                author = metaxmlparser.author;
            } catch (const ReadError&) {
                // It's probably best to index the document even if this fails.
            }
        } else if (startswith(mimetype, "application/vnd.openxmlformats-officedocument.")) {
            const char * args = NULL;
            string tail(mimetype, 46);
            if (startswith(tail, "wordprocessingml.")) {
                // unzip returns exit code 11 if a file to extract wasn't found
                // which we want to ignore, because there may be no headers or
                // no footers.
                args = " word/document.xml 'word/header*.xml' 'word/footer*.xml' 2>/dev/null";
            } else if (startswith(tail, "spreadsheetml.")) {
                // Extract the shared string table first, so our parser can
                // grab those ready for parsing the sheets which will reference
                // the shared strings.
                string cmd = "unzip -p";
                append_filename_argument(cmd, file);
                cmd += " xl/styles.xml xl/workbook.xml xl/sharedStrings.xml ; unzip -p";
                append_filename_argument(cmd, file);
                cmd += " xl/worksheets/sheet\\*.xml";
                try {
                    XlsxParser parser;
                    parser.parse(stdout_to_string(cmd, true));
                    dump = parser.dump;
                } catch (const ReadError&) {
                    skip_cmd_failed(urlterm, context, cmd,
                                    d.get_size(), d.get_mtime());
                    return;
                }
            } else if (startswith(tail, "presentationml.")) {
                // unzip returns exit code 11 if a file to extract wasn't found
                // which we want to ignore, because there may be no notesSlides
                // or comments.
                args = " 'ppt/slides/slide*.xml' 'ppt/notesSlides/notesSlide*.xml' 'ppt/comments/comment*.xml' 2>/dev/null";
            } else {
                // Don't know how to index this type.
                skip_unknown_mimetype(urlterm, context, mimetype,
                                      d.get_size(), d.get_mtime());
                return;
            }

            if (args) {
                string cmd = "unzip -p";
                append_filename_argument(cmd, file);
                cmd += args;
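
                // For illustration, for a .docx file at the made-up path
                // "/srv/docs/a.docx" the command built here is roughly
                // (one line in practice, filename escaping as done by
                // append_filename_argument()):
                //
                //   unzip -p '/srv/docs/a.docx' word/document.xml
                //       'word/header*.xml' 'word/footer*.xml' 2>/dev/null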
                try {
                    MSXmlParser xmlparser;
                    // Treat exit status 11 from unzip as success - this is
                    // what we get if one of the listed filenames to extract
                    // doesn't match anything in the zip file.
                    xmlparser.parse_xml(stdout_to_string(cmd, false, 11));
                    dump = xmlparser.dump;
                } catch (const ReadError&) {
                    skip_cmd_failed(urlterm, context, cmd,
                                    d.get_size(), d.get_mtime());
                    return;
                }
            }

            string cmd = "unzip -p";
            append_filename_argument(cmd, file);
            cmd += " docProps/core.xml";
            try {
                MetaXmlParser metaxmlparser;
                metaxmlparser.parse(stdout_to_string(cmd, false));
                title = metaxmlparser.title;
                keywords = metaxmlparser.keywords;
                // FIXME: topic = metaxmlparser.topic;
                sample = metaxmlparser.sample;
                author = metaxmlparser.author;
            } catch (const ReadError&) {
                // It's probably best to index the document even if this fails.
            }
        } else if (mimetype == "application/x-abiword") {
            // FIXME: Implement support for metadata.
            XmlParser xmlparser;
            const string & text = d.file_to_string();
            xmlparser.parse_xml(text);
            dump = xmlparser.dump;
            md5_string(text, md5);
        } else if (mimetype == "application/x-abiword-compressed") {
            // FIXME: Implement support for metadata.
            XmlParser xmlparser;
            xmlparser.parse_xml(d.gzfile_to_string());
            dump = xmlparser.dump;
        } else if (mimetype == "application/oxps" ||
                   mimetype == "application/vnd.ms-xpsdocument") {
            string cmd = "unzip -p";
            append_filename_argument(cmd, file);
            cmd += " 'Documents/1/Pages/*.fpage'";
            try {
                XpsXmlParser xpsparser;
                run_filter(cmd, false, &dump);
                xpsparser.parse(dump);
                dump = xpsparser.dump;
            } catch (const ReadError&) {
                skip_cmd_failed(urlterm, context, cmd,
                                d.get_size(), d.get_mtime());
                return;
            }

            cmd = "unzip -p";
            append_filename_argument(cmd, file);
            cmd += " docProps/core.xml";
            try {
                MetaXmlParser metaparser;
                metaparser.parse(stdout_to_string(cmd, false));
                title = metaparser.title;
                keywords = metaparser.keywords;
                // FIXME: topic = metaparser.topic;
                sample = metaparser.sample;
                author = metaparser.author;
            } catch (const ReadError&) {
                // Ignore errors as not all XPS files contain this file.
            }
        } else if (mimetype == "text/csv") {
            // Currently we assume that text files are UTF-8 unless they have a
            // byte-order mark.
            dump = d.file_to_string();
            md5_string(dump, md5);

            // Look for Byte-Order Mark (BOM).
            if (startswith(dump, "\xfe\xff") || startswith(dump, "\xff\xfe")) {
                // UTF-16 in big-endian/little-endian order - we just convert
                // it as "UTF-16" and let the conversion handle the BOM as that
                // way we avoid the copying overhead of erasing 2 bytes from
                // the start of dump.
                convert_to_utf8(dump, "UTF-16");
            } else if (startswith(dump, "\xef\xbb\xbf")) {
                // UTF-8 with stupid Windows not-the-byte-order mark.
                dump.erase(0, 3);
            } else {
                // FIXME: What charset is the file?  Look at contents?
            }

            generate_sample_from_csv(dump, sample);
        } else if (mimetype == "image/svg+xml") {
            SvgParser svgparser;
            const string & text = d.file_to_string();
            md5_string(text, md5);
            svgparser.parse(text);
            dump = svgparser.dump;
            title = svgparser.title;
            keywords = svgparser.keywords;
            // FIXME: topic = svgparser.topic;
            author = svgparser.author;
        } else if (mimetype == "application/vnd.debian.binary-package" ||
                   mimetype == "application/x-debian-package") {
            const char* cmd = "dpkg-deb -f - Description";
            string desc;
            run_filter(d.get_fd(), cmd, false, &desc);
            // First line is short description, which we use as the title.
            string::size_type idx = desc.find('\n');
            title.assign(desc, 0, idx);
            if (idx != string::npos) {
                dump.assign(desc, idx + 1, string::npos);
            }
        } else if (mimetype == "application/x-redhat-package-manager" ||
                   mimetype == "application/x-rpm") {
            string cmd("rpm -q --qf '%{SUMMARY}\\n%{DESCRIPTION}' -p");
            append_filename_argument(cmd, file);
            string desc;
            run_filter(cmd, false, &desc);
            // First line is summary, which we use as the title.
            string::size_type idx = desc.find('\n');
            title.assign(desc, 0, idx);
            if (idx != string::npos) {
                dump.assign(desc, idx + 1, string::npos);
            }
        } else if (mimetype == "application/atom+xml") {
            AtomParser atomparser;
            const string & text = d.file_to_string();
            md5_string(text, md5);
            atomparser.parse(text);
            dump = atomparser.dump;
            title = atomparser.title;
            keywords = atomparser.keywords;
            // FIXME: topic = atomparser.topic;
            author = atomparser.author;
        } else {
            // Don't know how to index this type.
            skip_unknown_mimetype(urlterm, context, mimetype,
                                  d.get_size(), d.get_mtime());
            return;
        }

        // Compute the MD5 of the file if we haven't already.
        if (md5.empty() && !d.md5(md5)) {
            if (errno == ENOENT || errno == ENOTDIR) {
                skip(urlterm, context, "File removed during indexing",
                     d.get_size(), d.get_mtime(),
                     SKIP_VERBOSE_ONLY | SKIP_SHOW_FILENAME);
            } else {
                skip(urlterm, context,
                     "failed to read file to calculate MD5 checksum",
                     d.get_size(), d.get_mtime());
            }
            return;
        }

        // Remove any trailing formfeeds, so they don't count when we check
        // whether we extracted any text (e.g. pdftotext outputs a formfeed
        // between each page, even for blank pages).
        //
        // If dump contains only formfeeds, then trim_end will be string::npos
        // and ++trim_end will be 0, which is the correct new size.
        string::size_type trim_end = dump.find_last_not_of('\f');
        if (++trim_end != dump.size())
            dump.resize(trim_end);

        if (dump.empty()) {
            switch (empty_body) {
                case EMPTY_BODY_INDEX:
                    break;
                case EMPTY_BODY_WARN:
                    cout << "no text extracted from document body, "
                            "but indexing metadata anyway" << endl;
                    break;
                case EMPTY_BODY_SKIP:
                    skip(urlterm, context,
                         "no text extracted from document body",
                         d.get_size(), d.get_mtime());
                    return;
            }
        }

        // Produce a sample
        if (sample.empty()) {
            sample = generate_sample(dump, sample_size, "...", " ...");
        } else {
            sample = generate_sample(sample, sample_size, "...", " ...");
        }

        // Put the data in the document
        if (record.empty()) {
            record = "url=";
        } else {
            record += "\nurl=";
        }
        record += url;
        record += "\nsample=";
        record += sample;
        if (!title.empty()) {
            record += "\ncaption=";
            record += generate_sample(title, title_size, "...", " ...");
        }
        if (!author.empty()) {
            record += "\nauthor=";
            record += author;
        }
        record += "\ntype=";
        record += mimetype;
        time_t mtime = d.get_mtime();
        if (mtime != static_cast<time_t>(-1)) {
            record += "\nmodtime=";
            record += str(mtime);
        }
        if (created != static_cast<time_t>(-1)) {
            record += "\ncreated=";
            record += str(created);
        }
        if (pages >= 0) {
            record += "\npages=";
            record += str(pages);
        }
        off_t size = d.get_size();
        record += "\nsize=";
        record += str(size);
        newdocument.set_data(record);

        // Index the title, document text, keywords and topic.
        indexer.set_document(newdocument);
        if (!title.empty()) {
            indexer.index_text(title, 5, "S");
            indexer.increase_termpos(100);
        }
        if (!dump.empty()) {
            indexer.index_text(dump);
        }
        if (!keywords.empty()) {
            indexer.increase_termpos(100);
            indexer.index_text(keywords);
        }
        if (!topic.empty()) {
            indexer.increase_termpos(100);
            indexer.index_text(topic, 1, "B");
        }

        // Index the leafname of the file.
        {
            indexer.increase_termpos(100);
            string leaf = d.leafname();
            string::size_type dot = leaf.find_last_of('.');
            if (dot != string::npos && leaf.size() - dot - 1 <= max_ext_len)
                leaf.resize(dot);
            indexer.index_text(leaf, 1, "F");

            // Also index with underscores and ampersands replaced by spaces.
            bool modified = false;
            string::size_type rep = 0;
            while ((rep = leaf.find_first_of("_&", rep)) != string::npos) {
                leaf[rep++] = ' ';
                modified = true;
            }
            if (modified) {
                indexer.increase_termpos(100);
                indexer.index_text(leaf, 1, "F");
            }
        }

        if (!author.empty()) {
            indexer.increase_termpos(100);
            indexer.index_text(author, 1, "A");
        }

        // mimeType:
        newdocument.add_boolean_term("T" + mimetype);

        newdocument.add_boolean_term(site_term);

        if (!host_term.empty())
            newdocument.add_boolean_term(host_term);

        struct tm *tm = localtime(&mtime);
        string date_term = "D" +
            date_to_string(tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday);
        newdocument.add_boolean_term(date_term); // Date (YYYYMMDD)
        date_term.resize(7);
        date_term[0] = 'M';
        newdocument.add_boolean_term(date_term); // Month (YYYYMM)
        date_term.resize(5);
        date_term[0] = 'Y';
        newdocument.add_boolean_term(date_term); // Year (YYYY)
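
        // For example, an mtime falling on 2016-04-26 local time yields the
        // boolean terms D20160426, M201604 and Y2016.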

        newdocument.add_boolean_term(urlterm); // Url

        // Add mtime as a value to allow "sort by date".
        newdocument.add_value(VALUE_LASTMOD,
                              int_to_binary_string(uint32_t(mtime)));
        if (use_ctime) {
            // Add ctime as a value to track modifications.
            time_t ctime = d.get_ctime();
            newdocument.add_value(VALUE_CTIME,
                                  int_to_binary_string(uint32_t(ctime)));
        }

        // Add MD5 as a value to allow duplicate documents to be collapsed
        // together.
        newdocument.add_value(VALUE_MD5, md5);

        // Add the file size as a value to allow "sort by size" and size
        // ranges.
        newdocument.add_value(VALUE_SIZE,
                              Xapian::sortable_serialise(size));

        bool inc_tag_added = false;
        if (d.is_other_readable()) {
            inc_tag_added = true;
            newdocument.add_boolean_term("I*");
        } else if (d.is_group_readable()) {
            const char * group = d.get_group();
            if (group) {
                newdocument.add_boolean_term(string("I#") + group);
            }
        }
        const char * owner = d.get_owner();
        if (owner) {
            newdocument.add_boolean_term(string("O") + owner);
            if (!inc_tag_added && d.is_owner_readable())
                newdocument.add_boolean_term(string("I@") + owner);
        }

        string ext_term("E");
        for (string::const_iterator i = ext.begin(); i != ext.end(); ++i) {
            char ch = *i;
            if (ch >= 'A' && ch <= 'Z')
                ch |= 32;
            ext_term += ch;
        }
        newdocument.add_boolean_term(ext_term);
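
        // For example, ext "PDF" yields the boolean term "Epdf" (ASCII
        // upper-case letters are lower-cased by the "|= 32" above).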

        index_add_document(urlterm, last_altered, did, newdocument);
    } catch (const ReadError&) {
        skip(urlterm, context, string("can't read file: ") + strerror(errno),
             d.get_size(), d.get_mtime());
    } catch (const NoSuchFilter&) {
        string filter_entry;
        if (cmd_it != commands.end()) {
            filter_entry = cmd_it->first;
        } else {
            filter_entry = mimetype;
        }
        string m = "Filter for \"";
        m += filter_entry;
        m += "\" not installed";
        skip(urlterm, context, m, d.get_size(), d.get_mtime());
        commands[filter_entry] = Filter();
    } catch (const FileNotFound&) {
        skip(urlterm, context, "File removed during indexing",
             d.get_size(), d.get_mtime(),
             SKIP_VERBOSE_ONLY | SKIP_SHOW_FILENAME);
    } catch (const std::string & error) {
        skip(urlterm, context, error, d.get_size(), d.get_mtime());
    } catch (const std::bad_alloc&) {
        // Attempt to flag the file as failed and commit changes, though that
        // might fail too if we're low on memory rather than being asked to
        // allocate a ludicrous amount.
        skip(urlterm, context, "Out of memory trying to extract text from file",
             d.get_size(), d.get_mtime(),
             SKIP_SHOW_FILENAME);
        throw CommitAndExit("Caught std::bad_alloc", "");
    }
}

void
index_handle_deletion()
{
    if (updated.empty() || old_docs_not_seen == 0) return;

    if (verbose) {
        cout << "Deleting " << old_docs_not_seen
             << " old documents which weren't found" << endl;
    }
    Xapian::PostingIterator alldocs = db.postlist_begin(string());
    Xapian::docid did = *alldocs;
    while (did < updated.size()) {
        if (!updated[did]) {
            alldocs.skip_to(did);
            if (alldocs == db.postlist_end(string()))
                break;
            if (*alldocs != did) {
                // Document #did didn't exist before we started.
                did = *alldocs;
                continue;
            }
            db.delete_document(did);
            if (--old_docs_not_seen == 0)
                break;
        }
        ++did;
    }
}

void
index_commit()
{
    db.commit();
}

void
index_done()
{
    // If we created a temporary directory then delete it.
    remove_tmpdir();
}