/** @file
 * @brief Handle indexing a document from a file
 */
/* Copyright 1999,2000,2001 BrightStation PLC
 * Copyright 2001,2005 James Aylett
 * Copyright 2001,2002 Ananova Ltd
 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018 Olly Betts
 * Copyright 2009 Frank J Bruzzaniti
 * Copyright 2012 Mihai Bivol
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
 * USA
 */
#include "index_file.h"

#include <sys/types.h>
#include "safeunistd.h"
#include "safefcntl.h"

#include "append_filename_arg.h"
#include "atomparse.h"
#include "metaxmlparse.h"
#include "msxmlparse.h"
#include "myhtmlparse.h"
#include "opendocparse.h"
#include "pkglibbindir.h"
#include "runfilter.h"
#include "stringutils.h"
#include "utf8convert.h"
#include "xlsxparse.h"
#include "xpsxmlparse.h"

using namespace std;
static Xapian::WritableDatabase db;
static Xapian::TermGenerator indexer;

static Xapian::doccount old_docs_not_seen;
static Xapian::docid old_lastdocid;
static vector<bool> updated;

static bool retry_failed;
static bool use_ctime;
static dup_action_type dup_action;
static bool ignore_exclusions;
static bool description_as_sample;

static time_t last_altered_max;
static size_t sample_size;
static size_t title_size;
static size_t max_ext_len;

static empty_body_type empty_body;

static string site_term, host_term;

map<string, Filter> commands;
static void
mark_as_seen(Xapian::docid did)
{
    if (usual(did < updated.size() && !updated[did])) {
        updated[did] = true;
        --old_docs_not_seen;
    }
}
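// Report a skipped file and record it in the failures list, so an unchanged
// file which failed before isn't retried on the next run.  With
// SKIP_VERBOSE_ONLY the message is only shown when running verbosely; with
// SKIP_SHOW_FILENAME the "context" (the path below the document root) is
// shown even when not running verbosely.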
void
skip(const string& urlterm, const string& context, const string& msg,
     off_t size, time_t last_mod, unsigned flags)
{
    failed.add(urlterm, last_mod, size);

    if (!verbose || (flags & SKIP_SHOW_FILENAME)) {
        if (!verbose && (flags & SKIP_VERBOSE_ONLY)) return;
        cout << context << ": ";
    }

    cout << "Skipping - " << msg << endl;
}
static void
skip_cmd_failed(const string& urlterm, const string& context,
                const string& cmd, off_t size, time_t last_mod)
{
    skip(urlterm, context, "\"" + cmd + "\" failed", size, last_mod);
}
static void
skip_meta_tag(const string& urlterm, const string& context,
              off_t size, time_t last_mod)
{
    skip(urlterm, context, "indexing disallowed by meta tag", size, last_mod);
}
static void
skip_unknown_mimetype(const string& urlterm, const string& context,
                      const string& mimetype, off_t size, time_t last_mod)
{
    skip(urlterm, context, "unknown MIME type '" + mimetype + "'", size,
         last_mod);
}
void
index_add_default_filters()
{
    index_command("application/msword", Filter("antiword -mUTF-8.txt", false));
    index_command("application/vnd.ms-excel",
                  Filter("xls2csv -c' ' -q0 -dutf-8", false));
    index_command("application/vnd.ms-powerpoint",
                  Filter("catppt -dutf-8", false));
    // Looking at the source of wpd2html and wpd2text I think both output
    // UTF-8, but it's hard to be sure without sample Unicode .wpd files
    // as they don't seem to be at all well documented.
    index_command("application/vnd.wordperfect", Filter("wpd2text", false));
    // wps2text produces UTF-8 output from the sample files I've tested.
    index_command("application/vnd.ms-works", Filter("wps2text", false));
    // Output is UTF-8 according to "man djvutxt".  Generally this seems to
    // be true, though some examples from djvu.org generate isolated byte
    // 0x95 in a context which suggests it might be intended to be a bullet
    // (as it is in CP1252).
    index_command("image/vnd.djvu", Filter("djvutxt", false));
    index_command("text/markdown", Filter("markdown", "text/html", false));
    // The --text option unhelpfully converts all non-ASCII characters to "?"
    // so we use --html instead, which produces HTML entities.  The --nopict
    // option suppresses exporting picture files as pictNNNN.wmf in the current
    // directory.  Note that this option was ignored in some older versions,
    // but it was fixed in unrtf 0.20.4.
    index_command("application/rtf",
                  Filter("unrtf --nopict --html 2>/dev/null", "text/html",
                         false));
    index_command("text/rtf",
                  Filter("unrtf --nopict --html 2>/dev/null", "text/html",
                         false));
    index_command("text/x-rst", Filter("rst2html", "text/html", false));
    index_command("application/x-mspublisher",
                  Filter("pub2xhtml", "text/html", false));
    index_command("application/vnd.ms-outlook",
                  Filter(get_pkglibbindir() + "/outlookmsg2html", "text/html",
                         false));
    index_command("application/vnd.ms-visio.drawing",
                  Filter("vsd2xhtml", "image/svg+xml", false));
    index_command("application/vnd.ms-visio.stencil",
                  Filter("vsd2xhtml", "image/svg+xml", false));
    index_command("application/vnd.ms-visio.template",
                  Filter("vsd2xhtml", "image/svg+xml", false));
    index_command("application/vnd.visio",
                  Filter("vsd2xhtml", "image/svg+xml", false));
    // pod2text's output character set doesn't seem to be documented, but from
    // inspecting the source it looks like it's probably iso-8859-1.  We need
    // to pass "--errors=stderr" or else minor POD formatting errors cause a
    // file not to be indexed.
    index_command("text/x-perl",
                  Filter("pod2text --errors=stderr",
                         "text/plain", "iso-8859-1", false));
    // FIXME: -e0 means "UTF-8", but that results in "fi", "ff", "ffi", etc
    // appearing as single ligatures.  For European languages, it's actually
    // better to use -e2 (ISO-8859-1) and then convert, so let's do that for
    // now until we handle Unicode "compatibility decompositions".
    index_command("application/x-dvi",
                  Filter("catdvi -e2 -s", "text/plain", "iso-8859-1", false));
    // Simplistic - ought to look in index.rdf files for filename and
    // character set.
    index_command("application/x-maff",
                  Filter("unzip -p %f '*/*.*htm*'", "text/html", "iso-8859-1",
                         false));
    index_command("application/x-mimearchive",
                  Filter(get_pkglibbindir() + "/mhtml2html", "text/html",
                         false));
    index_command("message/news",
                  Filter(get_pkglibbindir() + "/rfc822tohtml", "text/html",
                         false));
    index_command("message/rfc822",
                  Filter(get_pkglibbindir() + "/rfc822tohtml", "text/html",
                         false));
    index_command("text/vcard",
                  Filter(get_pkglibbindir() + "/vcard2text", false));
    index_command("application/vnd.apple.keynote",
                  Filter("key2text", false));
    index_command("application/vnd.apple.numbers",
                  Filter("numbers2text", false));
    index_command("application/vnd.apple.pages",
                  Filter("pages2text", false));
}
void
index_init(const string& dbpath, const Xapian::Stem& stemmer,
           const string& root_, const string& site_term_,
           const string& host_term_,
           empty_body_type empty_body_, dup_action_type dup_action_,
           size_t sample_size_, size_t title_size_, size_t max_ext_len_,
           bool overwrite, bool retry_failed_,
           bool delete_removed_documents, bool verbose_, bool use_ctime_,
           bool spelling, bool ignore_exclusions_, bool description_as_sample_)
{
    root = root_;
    site_term = site_term_;
    host_term = host_term_;
    empty_body = empty_body_;
    dup_action = dup_action_;
    sample_size = sample_size_;
    title_size = title_size_;
    max_ext_len = max_ext_len_;
    verbose = verbose_;
    use_ctime = use_ctime_;
    ignore_exclusions = ignore_exclusions_;
    description_as_sample = description_as_sample_;

    if (!overwrite) {
        db = Xapian::WritableDatabase(dbpath, Xapian::DB_CREATE_OR_OPEN);
        old_docs_not_seen = db.get_doccount();
        // Handle an initially empty database exactly the same way as when
        // overwrite is true.
        if (old_docs_not_seen != 0) {
            old_lastdocid = db.get_lastdocid();
            if (delete_removed_documents) {
                // + 1 so that old_lastdocid is a valid subscript.
                updated.resize(old_lastdocid + 1);
            }
            Xapian::valueno slot = use_ctime ? VALUE_CTIME : VALUE_LASTMOD;
            string ubound = db.get_value_upper_bound(slot);
            if (!ubound.empty())
                last_altered_max = binary_string_to_int(ubound);
        }
    } else {
        db = Xapian::WritableDatabase(dbpath, Xapian::DB_CREATE_OR_OVERWRITE);
    }

    if (spelling) {
        indexer.set_database(db);
        indexer.set_flags(indexer.FLAG_SPELLING);
    }
    indexer.set_stemmer(stemmer);

    if (overwrite) {
        // There are no failures to retry, so setting this flag doesn't
        // change the outcome, but does mean we avoid the overhead of
        // checking for a previous failure.
        retry_failed = true;
    } else if (retry_failed_) {
        failed.clear();
        retry_failed = true;
    } else {
        // If there are no existing failures, setting this flag doesn't
        // change the outcome, but does mean we avoid the overhead of
        // checking for a previous failure.
        retry_failed = failed.empty();
    }
}
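// Extract the value from a "Field: value" line of pdfinfo output.  p and end
// delimit one line (excluding the newline); if the line starts with the given
// prefix (the field name plus ':', as passed by PARSE_PDFINFO_FIELD below),
// the value - with leading spaces and any trailing '\r' removed - is
// assigned to out.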
static void
parse_pdfinfo_field(const char* p, const char* end, string& out,
                    const char* field, size_t len)
{
    if (size_t(end - p) > len && memcmp(p, field, len) == 0) {
        p += len;
        while (p != end && *p == ' ')
            ++p;
        if (p != end && (end[-1] != '\r' || --end != p))
            out.assign(p, end - p);
    }
}

#define PARSE_PDFINFO_FIELD(P, END, OUT, FIELD) \
    parse_pdfinfo_field((P), (END), (OUT), FIELD":", CONST_STRLEN(FIELD) + 1)
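// For example, PARSE_PDFINFO_FIELD(start, eol, author, "Author") expands to:
//
//     parse_pdfinfo_field((start), (eol), (author), "Author:",
//                         CONST_STRLEN("Author") + 1)
//
// i.e. it matches the 7 byte prefix "Author:" and assigns the rest of the
// line to author.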
static void
parse_pdf_metainfo(const string& pdfinfo, string& author, string& title,
                   string& keywords, string& topic, int& pages)
{
    const char* p = pdfinfo.data();
    const char* end = p + pdfinfo.size();
    while (p != end) {
        const char* start = p;
        p = static_cast<const char*>(memchr(p, '\n', end - p));
        const char* eol;
        if (p) {
            eol = p;
            ++p;
        } else {
            p = eol = end;
        }
        switch (*start) {
            case 'A':
                PARSE_PDFINFO_FIELD(start, eol, author, "Author");
                break;
            case 'K':
                PARSE_PDFINFO_FIELD(start, eol, keywords, "Keywords");
                break;
            case 'P': {
                string s;
                PARSE_PDFINFO_FIELD(start, eol, s, "Pages");
                if (!s.empty())
                    pages = atoi(s.c_str());
                break;
            }
            case 'S':
                PARSE_PDFINFO_FIELD(start, eol, topic, "Subject");
                break;
            case 'T':
                PARSE_PDFINFO_FIELD(start, eol, title, "Title");
                break;
        }
    }
}
static void
get_pdf_metainfo(int fd, string& author, string& title,
                 string& keywords, string& topic, int& pages)
{
    try {
        string pdfinfo;
        run_filter(fd, "pdfinfo -enc UTF-8 -", false, &pdfinfo);
        parse_pdf_metainfo(pdfinfo, author, title, keywords, topic, pages);
    } catch (const ReadError&) {
        // It's probably best to index the document even if pdfinfo fails.
    }
}
static void
get_pdf_metainfo(const string& file, string& author, string& title,
                 string& keywords, string& topic, int& pages)
{
    try {
        string cmd = "pdfinfo -enc UTF-8";
        append_filename_argument(cmd, file);
        parse_pdf_metainfo(stdout_to_string(cmd, false),
                           author, title, keywords, topic, pages);
    } catch (const ReadError&) {
        // It's probably best to index the document even if pdfinfo fails.
    }
}
static void
generate_sample_from_csv(const string& csv_data, string& sample)
{
    // Add 3 to allow for a 4 byte utf-8 sequence being appended when
    // output is sample_size - 1 bytes long.  Use csv_data.size() if smaller
    // since the user might reasonably set sample_size really high.
    sample.reserve(min(sample_size + 3, csv_data.size()));
    size_t last_word_end = 0;
    bool in_space = true;
    bool in_quotes = false;
    for (Xapian::Utf8Iterator i(csv_data); i != Xapian::Utf8Iterator(); ++i) {
        unsigned ch = *i;

        if (!in_quotes) {
            // If not already in double quotes, '"' starts quoting and
            // ',' starts a new field.
            if (ch == '"') {
                in_quotes = true;
                continue;
            }
            if (ch == ',')
                ch = ' ';
        } else if (ch == '"') {
            // In double quotes, '"' either ends double quotes, or
            // if followed by another '"', means a literal '"'.
            if (++i == Xapian::Utf8Iterator())
                break;
            ch = *i;
            if (ch != '"') {
                in_quotes = false;
                if (ch == ',')
                    ch = ' ';
            }
        }

        if (ch <= ' ' || ch == 0xa0) {
            // FIXME: if all the whitespace characters between two
            // words are 0xa0 (non-breaking space) then perhaps we
            // should output 0xa0.
            if (!in_space) {
                in_space = true;
                last_word_end = sample.size();
                sample += ' ';
            }
        } else {
            in_space = false;
            Xapian::Unicode::append_utf8(sample, ch);
        }

        if (sample.size() >= sample_size) {
            // Need to truncate sample.
            if (last_word_end <= sample_size / 2) {
                // Monster word!  We'll have to just split it.
                sample.replace(sample_size - 3, string::npos, "...", 3);
            } else {
                sample.replace(last_word_end, string::npos, " ...", 4);
            }
            break;
        }
    }
}
static bool
index_check_existing(const string& urlterm, time_t last_altered,
                     Xapian::docid& did)
{
    switch (dup_action) {
        case DUP_SKIP: {
            Xapian::PostingIterator p = db.postlist_begin(urlterm);
            if (p != db.postlist_end(urlterm)) {
                if (verbose)
                    cout << "already indexed, not updating" << endl;
                did = *p;
                mark_as_seen(did);
                return true;
            }
            break;
        }
        case DUP_CHECK_LAZILY: {
            // If last_altered > last_altered_max, we know for sure that the
            // file is new or updated.
            if (last_altered > last_altered_max) {
                return false;
            }

            Xapian::PostingIterator p = db.postlist_begin(urlterm);
            if (p != db.postlist_end(urlterm)) {
                did = *p;
                Xapian::Document doc = db.get_document(did);
                Xapian::valueno slot = use_ctime ? VALUE_CTIME : VALUE_LASTMOD;
                string value = doc.get_value(slot);
                time_t old_last_altered = binary_string_to_int(value);
                if (last_altered <= old_last_altered) {
                    if (verbose)
                        cout << "already indexed" << endl;
                    // The docid should be in updated - the only valid
                    // exception is if the URL was long and hashed to the
                    // same URL as an existing document indexed in the same
                    // batch.
                    mark_as_seen(did);
                    return true;
                }
            }
            break;
        }
    }
    return false;
}
void
index_remove_failed_entry(const string& urlterm)
{
    failed.del(urlterm);
}
void
index_add_document(const string& urlterm, time_t last_altered,
                   Xapian::docid did, const Xapian::Document& doc)
{
    if (dup_action != DUP_SKIP) {
        // If this document has already been indexed, update the existing
        // entry.
        if (did) {
            // We already found out the document id above.
            db.replace_document(did, doc);
        } else if (last_altered <= last_altered_max) {
            // We checked for the UID term and didn't find it.
            did = db.add_document(doc);
        } else {
            did = db.replace_document(urlterm, doc);
        }
        mark_as_seen(did);
        if (verbose) {
            if (did <= old_lastdocid) {
                cout << "updated" << endl;
            } else {
                cout << "added" << endl;
            }
        }
    } else {
        // If this were a duplicate, we'd have skipped it above.
        db.add_document(doc);
        if (verbose)
            cout << "added" << endl;
    }
}
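// Taken together with index_check_existing() above: DUP_SKIP never replaces
// an already-indexed URL, while the other modes update it - either via the
// docid found earlier, or via replace_document(urlterm, ...), which adds the
// document if no entry with that URL term exists yet.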
void
index_mimetype(const string& file, const string& urlterm, const string& url,
               const string& ext,
               const string& mimetype, DirectoryIterator& d,
               Xapian::Document& newdocument,
               string record)
{
    string context(file, root.size(), string::npos);

    // FIXME: We could be cleverer here and check mtime too when use_ctime is
    // set - if the ctime has changed but the mtime is unchanged, we can just
    // update the existing Document and avoid having to re-extract text, etc.
    time_t last_altered = use_ctime ? d.get_ctime() : d.get_mtime();

    Xapian::docid did = 0;
    if (index_check_existing(urlterm, last_altered, did))
        return;

    if (!retry_failed) {
        // We only store and check the mtime (last modified) - a change to the
        // metadata won't generally cause a previous failure to now work
        // (FIXME: except permissions).
        time_t failed_last_mod;
        off_t failed_size;
        if (failed.contains(urlterm, failed_last_mod, failed_size)) {
            if (d.get_mtime() <= failed_last_mod &&
                d.get_size() == failed_size) {
                if (verbose)
                    cout << "failed to extract text on earlier run" << endl;
                return;
            }
            // The file has changed, so remove the entry for it.  If it fails
            // again on this attempt, we'll add a new one.
            failed.del(urlterm);
        }
    }

    if (verbose) cout << flush;

    string author, title, sample, keywords, topic, dump;
    string md5;
    time_t created = time_t(-1);
    int pages = -1;

    map<string, Filter>::const_iterator cmd_it = commands.find(mimetype);
    if (cmd_it == commands.end()) {
        size_t slash = mimetype.find('/');
        if (slash != string::npos) {
            string wildtype(mimetype, 0, slash + 2);
            wildtype[slash + 1] = '*';
            cmd_it = commands.find(wildtype);
            if (cmd_it == commands.end()) {
                cmd_it = commands.find("*/*");
            }
        }
        if (cmd_it == commands.end()) {
            cmd_it = commands.find("*");
        }
    }
    try {
        if (cmd_it != commands.end()) {
            // Easy "run a command and read text or HTML from stdout or a
            // temporary file" cases.
            auto& filter = cmd_it->second;
            string cmd = filter.cmd;
            if (cmd.empty()) {
                skip(urlterm, context, "required filter not installed",
                     d.get_size(), d.get_mtime(), SKIP_VERBOSE_ONLY);
                return;
            }
            if (cmd == "false") {
                // Allow setting 'false' as a filter to mean that a MIME type
                // should be quietly ignored.
                string m = "ignoring MIME type '";
                m += cmd_it->first;
                m += "'";
                skip(urlterm, context, m, d.get_size(), d.get_mtime(),
                     SKIP_VERBOSE_ONLY);
                return;
            }
            bool use_shell = filter.use_shell();
            bool substituted = false;
            string tmpout;
            size_t pcent = 0;
            while (true) {
                pcent = cmd.find('%', pcent);
                if (pcent >= cmd.size() - 1)
                    break;
                switch (cmd[pcent + 1]) {
                    case '%': // %% -> %.
                        cmd.erase(++pcent, 1);
                        break;
                    case 'f': { // %f -> escaped filename.
                        substituted = true;
                        string tail(cmd, pcent + 2);
                        cmd.resize(pcent);
                        // Suppress the space append_filename_argument()
                        // usually adds before the argument - the command
                        // string either includes one, or won't expect one
                        // (e.g. --input=%f).
                        append_filename_argument(cmd, file, false);
                        pcent = cmd.size();
                        cmd += tail;
                        break;
                    }
                    case 't': { // %t -> temporary output file.
                        if (tmpout.empty()) {
                            // Use a temporary file with a suitable extension
                            // in case the command cares, and for more helpful
                            // error messages from the command.
                            if (filter.output_type == "text/html") {
                                tmpout = get_tmpfile("tmp.html");
                            } else if (filter.output_type == "image/svg+xml") {
                                tmpout = get_tmpfile("tmp.svg");
                            } else {
                                tmpout = get_tmpfile("tmp.txt");
                            }
                        }
                        substituted = true;
                        string tail(cmd, pcent + 2);
                        cmd.resize(pcent);
                        // Suppress the space append_filename_argument()
                        // usually adds before the argument - the command
                        // string either includes one, or won't expect one
                        // (e.g. --output=%t).
                        append_filename_argument(cmd, tmpout, false);
                        pcent = cmd.size();
                        cmd += tail;
                        break;
                    }
                    default:
                        // Leave anything else alone for now.
                        pcent += 2;
                        break;
                }
            }
            if (!substituted && cmd != "true") {
                // If no %f, append the filename to the command.
                append_filename_argument(cmd, file);
            }
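            // For example (hypothetical filename), the application/x-maff
            // filter command registered by index_add_default_filters():
            //
            //     unzip -p %f '*/*.*htm*'
            //
            // run on the file "/srv/docs/pages.maff" becomes something like:
            //
            //     unzip -p '/srv/docs/pages.maff' '*/*.*htm*'
            //
            // with the filename escaped by append_filename_argument(), while
            // a command with no %f simply gets the escaped filename appended.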
            try {
                if (!tmpout.empty()) {
                    // Output in temporary file.
                    run_filter(cmd, use_shell);
                    if (!load_file(tmpout, dump, NOCACHE)) {
                        throw ReadError("Couldn't read output file");
                    }
                    unlink(tmpout.c_str());
                } else if (cmd == "true") {
                    // Ignore the file's contents, just index metadata from the
                    // filing system.
                } else {
                    // Output on stdout.
                    run_filter(cmd, use_shell, &dump);
                }
                const string& charset = filter.output_charset;
                if (filter.output_type == "text/html") {
                    MyHtmlParser p;
                    p.ignore_metarobots();
                    p.description_as_sample = description_as_sample;
                    try {
                        p.parse_html(dump, charset, false);
                    } catch (const string& newcharset) {
                        p.reset();
                        p.ignore_metarobots();
                        p.description_as_sample = description_as_sample;
                        p.parse_html(dump, newcharset, true);
                    } catch (const ReadError&) {
                        skip_cmd_failed(urlterm, context, cmd,
                                        d.get_size(), d.get_mtime());
                        return;
                    }
                    dump = p.dump;
                    title = p.title;
                    keywords = p.keywords;
                    topic = p.topic;
                    sample = p.sample;
                    author = p.author;
                    created = p.created;
                } else if (filter.output_type == "image/svg+xml") {
                    SvgParser svgparser;
                    svgparser.parse(dump);
                    dump = svgparser.dump;
                    title = svgparser.title;
                    keywords = svgparser.keywords;
                    // FIXME: topic = svgparser.topic;
                    author = svgparser.author;
                } else if (!charset.empty()) {
                    convert_to_utf8(dump, charset);
                }
            } catch (const ReadError&) {
                skip_cmd_failed(urlterm, context, cmd,
                                d.get_size(), d.get_mtime());
                return;
            }
        } else if (mimetype == "text/html" || mimetype == "text/x-php") {
            const string& text = d.file_to_string();
            MyHtmlParser p;
            if (ignore_exclusions) p.ignore_metarobots();
            p.description_as_sample = description_as_sample;
            try {
                // Default HTML character set is latin 1, though not specifying
                // one is deprecated these days.
                p.parse_html(text, "iso-8859-1", false);
            } catch (const string& newcharset) {
                p.reset();
                if (ignore_exclusions) p.ignore_metarobots();
                p.description_as_sample = description_as_sample;
                p.parse_html(text, newcharset, true);
            }
            if (!p.indexing_allowed) {
                skip_meta_tag(urlterm, context,
                              d.get_size(), d.get_mtime());
                return;
            }
            dump = p.dump;
            title = p.title;
            keywords = p.keywords;
            topic = p.topic;
            sample = p.sample;
            author = p.author;
            created = p.created;
            md5_string(text, md5);
        } else if (mimetype == "text/plain") {
            // Currently we assume that text files are UTF-8 unless they have
            // a byte-order mark.
            dump = d.file_to_string();
            md5_string(dump, md5);

            // Look for Byte-Order Mark (BOM).
            if (startswith(dump, "\xfe\xff") || startswith(dump, "\xff\xfe")) {
                // UTF-16 in big-endian/little-endian order - we just convert
                // it as "UTF-16" and let the conversion handle the BOM as that
                // way we avoid the copying overhead of erasing 2 bytes from
                // the start of dump.
                convert_to_utf8(dump, "UTF-16");
            } else if (startswith(dump, "\xef\xbb\xbf")) {
                // UTF-8 with stupid Windows not-the-byte-order mark.
                dump.erase(0, 3);
            } else {
                // FIXME: What charset is the file?  Look at contents?
            }
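            // (The BOM byte sequences checked for above are: FE FF for
            // UTF-16BE, FF FE for UTF-16LE, and EF BB BF, which is U+FEFF
            // encoded as UTF-8.)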
        } else if (mimetype == "application/pdf") {
            const char* cmd = "pdftotext -enc UTF-8 - -";
            try {
                run_filter(d.get_fd(), cmd, false, &dump);
            } catch (const ReadError&) {
                skip_cmd_failed(urlterm, context, cmd,
                                d.get_size(), d.get_mtime());
                return;
            }
            get_pdf_metainfo(d.get_fd(), author, title, keywords, topic,
                             pages);
        } else if (mimetype == "application/postscript") {
            // There simply doesn't seem to be a Unicode capable PostScript to
            // text converter (e.g. pstotext always outputs ISO-8859-1).  The
            // only solution seems to be to convert via PDF using ps2pdf and
            // then pdftotext.  This gives plausible looking UTF-8 output for
            // some Chinese PostScript files I found using Google.  It also has
            // the benefit of allowing us to extract meta information from
            // PostScript files.
            string tmpfile = get_tmpfile("tmp.pdf");
            if (tmpfile.empty()) {
                // FIXME: should this be fatal?  Or disable indexing postscript?
                string msg = "Couldn't create temporary directory (";
                msg += strerror(errno);
                msg += ")";
                skip(urlterm, context, msg,
                     d.get_size(), d.get_mtime());
                return;
            }
            string cmd = "ps2pdf -";
            append_filename_argument(cmd, tmpfile);
            try {
                run_filter(d.get_fd(), cmd, false);
                cmd = "pdftotext -enc UTF-8";
                append_filename_argument(cmd, tmpfile);
                cmd += " -";
                run_filter(cmd, false, &dump);
            } catch (const ReadError&) {
                skip_cmd_failed(urlterm, context, cmd,
                                d.get_size(), d.get_mtime());
                unlink(tmpfile.c_str());
                return;
            } catch (...) {
                unlink(tmpfile.c_str());
                throw;
            }
            try {
                get_pdf_metainfo(tmpfile, author, title, keywords, topic,
                                 pages);
            } catch (...) {
                unlink(tmpfile.c_str());
                throw;
            }
            unlink(tmpfile.c_str());
        } else if (startswith(mimetype, "application/vnd.sun.xml.") ||
                   startswith(mimetype, "application/vnd.oasis.opendocument."))
        {
            // Inspired by http://mjr.towers.org.uk/comp/sxw2text
            string cmd = "unzip -p";
            append_filename_argument(cmd, file);
            cmd += " content.xml ; unzip -p";
            append_filename_argument(cmd, file);
            cmd += " styles.xml";
            try {
                OpenDocParser parser;
                parser.parse(stdout_to_string(cmd, true));
                dump = parser.dump;
            } catch (const ReadError&) {
                skip_cmd_failed(urlterm, context, cmd,
                                d.get_size(), d.get_mtime());
                return;
            }

            cmd = "unzip -p";
            append_filename_argument(cmd, file);
            cmd += " meta.xml";
            try {
                MetaXmlParser metaxmlparser;
                metaxmlparser.parse(stdout_to_string(cmd, false));
                title = metaxmlparser.title;
                keywords = metaxmlparser.keywords;
                // FIXME: topic = metaxmlparser.topic;
                sample = metaxmlparser.sample;
                author = metaxmlparser.author;
            } catch (const ReadError&) {
                // It's probably best to index the document even if this fails.
            }
        } else if (startswith(mimetype, "application/vnd.openxmlformats-officedocument.")) {
            const char* args = NULL;
            string tail(mimetype, 46);
            if (startswith(tail, "wordprocessingml.")) {
                // unzip returns exit code 11 if a file to extract wasn't found
                // which we want to ignore, because there may be no headers or
                // no footers.
                args = " word/document.xml 'word/header*.xml' 'word/footer*.xml' 2>/dev/null";
            } else if (startswith(tail, "spreadsheetml.")) {
                // Extract the shared string table first, so our parser can
                // grab those ready for parsing the sheets which will reference
                // the shared strings.
                string cmd = "unzip -p";
                append_filename_argument(cmd, file);
                cmd += " xl/styles.xml xl/workbook.xml xl/sharedStrings.xml ; unzip -p";
                append_filename_argument(cmd, file);
                cmd += " xl/worksheets/sheet\\*.xml";
                try {
                    XlsxParser parser;
                    parser.parse(stdout_to_string(cmd, true));
                    dump = parser.dump;
                } catch (const ReadError&) {
                    skip_cmd_failed(urlterm, context, cmd,
                                    d.get_size(), d.get_mtime());
                    return;
                }
            } else if (startswith(tail, "presentationml.")) {
                // unzip returns exit code 11 if a file to extract wasn't found
                // which we want to ignore, because there may be no notesSlides
                // or comments.
                args = " 'ppt/slides/slide*.xml' 'ppt/notesSlides/notesSlide*.xml' 'ppt/comments/comment*.xml' 2>/dev/null";
            } else {
                // Don't know how to index this type.
                skip_unknown_mimetype(urlterm, context, mimetype,
                                      d.get_size(), d.get_mtime());
                return;
            }

            if (args) {
                string cmd = "unzip -p";
                append_filename_argument(cmd, file);
                cmd += args;
                try {
                    MSXmlParser xmlparser;
                    // Treat exit status 11 from unzip as success - this is
                    // what we get if one of the listed filenames to extract
                    // doesn't match anything in the zip file.
                    xmlparser.parse_xml(stdout_to_string(cmd, false, 11));
                    dump = xmlparser.dump;
                } catch (const ReadError&) {
                    skip_cmd_failed(urlterm, context, cmd,
                                    d.get_size(), d.get_mtime());
                    return;
                }
            }

            string cmd = "unzip -p";
            append_filename_argument(cmd, file);
            cmd += " docProps/core.xml";
            try {
                MetaXmlParser metaxmlparser;
                metaxmlparser.parse(stdout_to_string(cmd, false));
                title = metaxmlparser.title;
                keywords = metaxmlparser.keywords;
                // FIXME: topic = metaxmlparser.topic;
                sample = metaxmlparser.sample;
                author = metaxmlparser.author;
            } catch (const ReadError&) {
                // It's probably best to index the document even if this fails.
            }
        } else if (mimetype == "application/x-abiword") {
            // FIXME: Implement support for metadata.
            XmlParser xmlparser;
            const string& text = d.file_to_string();
            xmlparser.parse_xml(text);
            dump = xmlparser.dump;
            md5_string(text, md5);
        } else if (mimetype == "application/x-abiword-compressed") {
            // FIXME: Implement support for metadata.
            XmlParser xmlparser;
            xmlparser.parse_xml(d.gzfile_to_string());
            dump = xmlparser.dump;
        } else if (mimetype == "application/oxps" ||
                   mimetype == "application/vnd.ms-xpsdocument") {
            string cmd = "unzip -p";
            append_filename_argument(cmd, file);
            cmd += " 'Documents/1/Pages/*.fpage'";
            try {
                XpsXmlParser xpsparser;
                run_filter(cmd, false, &dump);
                xpsparser.parse(dump);
                dump = xpsparser.dump;
            } catch (const ReadError&) {
                skip_cmd_failed(urlterm, context, cmd,
                                d.get_size(), d.get_mtime());
                return;
            }

            cmd = "unzip -p";
            append_filename_argument(cmd, file);
            cmd += " docProps/core.xml";
            try {
                MetaXmlParser metaparser;
                metaparser.parse(stdout_to_string(cmd, false));
                title = metaparser.title;
                keywords = metaparser.keywords;
                // FIXME: topic = metaparser.topic;
                sample = metaparser.sample;
                author = metaparser.author;
            } catch (const ReadError&) {
                // Ignore errors as not all XPS files contain this file.
            }
        } else if (mimetype == "text/csv") {
            // Currently we assume that text files are UTF-8 unless they have
            // a byte-order mark.
            dump = d.file_to_string();
            md5_string(dump, md5);

            // Look for Byte-Order Mark (BOM).
            if (startswith(dump, "\xfe\xff") || startswith(dump, "\xff\xfe")) {
                // UTF-16 in big-endian/little-endian order - we just convert
                // it as "UTF-16" and let the conversion handle the BOM as that
                // way we avoid the copying overhead of erasing 2 bytes from
                // the start of dump.
                convert_to_utf8(dump, "UTF-16");
            } else if (startswith(dump, "\xef\xbb\xbf")) {
                // UTF-8 with stupid Windows not-the-byte-order mark.
                dump.erase(0, 3);
            } else {
                // FIXME: What charset is the file?  Look at contents?
            }

            generate_sample_from_csv(dump, sample);
        } else if (mimetype == "image/svg+xml") {
            SvgParser svgparser;
            const string& text = d.file_to_string();
            md5_string(text, md5);
            svgparser.parse(text);
            dump = svgparser.dump;
            title = svgparser.title;
            keywords = svgparser.keywords;
            // FIXME: topic = svgparser.topic;
            author = svgparser.author;
        } else if (mimetype == "application/vnd.debian.binary-package" ||
                   mimetype == "application/x-debian-package") {
            const char* cmd = "dpkg-deb -f - Description";
            string desc;
            run_filter(d.get_fd(), cmd, false, &desc);
            // First line is short description, which we use as the title.
            string::size_type idx = desc.find('\n');
            title.assign(desc, 0, idx);
            if (idx != string::npos) {
                dump.assign(desc, idx + 1, string::npos);
            }
        } else if (mimetype == "application/x-redhat-package-manager" ||
                   mimetype == "application/x-rpm") {
            string cmd("rpm -q --qf '%{SUMMARY}\\n%{DESCRIPTION}' -p");
            append_filename_argument(cmd, file);
            string desc;
            run_filter(cmd, false, &desc);
            // First line is summary, which we use as the title.
            string::size_type idx = desc.find('\n');
            title.assign(desc, 0, idx);
            if (idx != string::npos) {
                dump.assign(desc, idx + 1, string::npos);
            }
        } else if (mimetype == "application/atom+xml") {
            AtomParser atomparser;
            const string& text = d.file_to_string();
            md5_string(text, md5);
            atomparser.parse(text);
            dump = atomparser.dump;
            title = atomparser.title;
            keywords = atomparser.keywords;
            // FIXME: topic = atomparser.topic;
            author = atomparser.author;
        } else {
            // Don't know how to index this type.
            skip_unknown_mimetype(urlterm, context, mimetype,
                                  d.get_size(), d.get_mtime());
            return;
        }
        // Compute the MD5 of the file if we haven't already.
        if (md5.empty() && !d.md5(md5)) {
            if (errno == ENOENT || errno == ENOTDIR) {
                skip(urlterm, context, "File removed during indexing",
                     d.get_size(), d.get_mtime(),
                     SKIP_VERBOSE_ONLY | SKIP_SHOW_FILENAME);
            } else {
                skip(urlterm, context,
                     "failed to read file to calculate MD5 checksum",
                     d.get_size(), d.get_mtime());
            }
            return;
        }
        // Remove any trailing formfeeds, so we don't consider them when
        // considering if we extracted any text (e.g. pdftotext outputs a
        // formfeed between each page, even for blank pages).
        //
        // If dump contains only formfeeds, then trim_end will be string::npos
        // and ++trim_end will be 0, which is the correct new size.
        string::size_type trim_end = dump.find_last_not_of('\f');
        if (++trim_end != dump.size())
            dump.resize(trim_end);
        if (dump.empty()) {
            switch (empty_body) {
                case EMPTY_BODY_INDEX:
                    break;
                case EMPTY_BODY_WARN:
                    cout << "no text extracted from document body, "
                            "but indexing metadata anyway" << endl;
                    break;
                case EMPTY_BODY_SKIP:
                    skip(urlterm, context,
                         "no text extracted from document body",
                         d.get_size(), d.get_mtime());
                    return;
            }
        }
        if (sample.empty()) {
            sample = generate_sample(dump, sample_size, "...", " ...");
        } else {
            sample = generate_sample(sample, sample_size, "...", " ...");
        }
        // Put the data in the document.
        if (record.empty()) {
            record = "url=";
            record += url;
        }
        record += "\nsample=";
        record += sample;
        if (!title.empty()) {
            record += "\ncaption=";
            record += generate_sample(title, title_size, "...", " ...");
        }
        if (!author.empty()) {
            record += "\nauthor=";
            record += author;
        }
        record += "\ntype=";
        record += mimetype;
        time_t mtime = d.get_mtime();
        if (mtime != static_cast<time_t>(-1)) {
            record += "\nmodtime=";
            record += str(mtime);
        }
        if (created != static_cast<time_t>(-1)) {
            record += "\ncreated=";
            record += str(created);
        }
        if (pages >= 0) {
            record += "\npages=";
            record += str(pages);
        }
        off_t size = d.get_size();
        record += "\nsize=";
        record += str(size);
        newdocument.set_data(record);
        // Index the title, document text, keywords and topic.
        indexer.set_document(newdocument);
        if (!title.empty()) {
            indexer.index_text(title, 5, "S");
            indexer.increase_termpos(100);
        }
        if (!dump.empty()) {
            indexer.index_text(dump);
        }
        if (!keywords.empty()) {
            indexer.increase_termpos(100);
            indexer.index_text(keywords);
        }
        if (!topic.empty()) {
            indexer.increase_termpos(100);
            indexer.index_text(topic, 1, "B");
        }
        // Index the leafname of the file.
        {
            indexer.increase_termpos(100);
            string leaf = d.leafname();
            string::size_type dot = leaf.find_last_of('.');
            if (dot != string::npos && leaf.size() - dot - 1 <= max_ext_len)
                leaf.resize(dot);
            indexer.index_text(leaf, 1, "F");

            // Also index with underscores and ampersands replaced by spaces.
            bool modified = false;
            string::size_type rep = 0;
            while ((rep = leaf.find_first_of("_&", rep)) != string::npos) {
                leaf[rep++] = ' ';
                modified = true;
            }
            if (modified) {
                indexer.increase_termpos(100);
                indexer.index_text(leaf, 1, "F");
            }
        }
        if (!author.empty()) {
            indexer.increase_termpos(100);
            indexer.index_text(author, 1, "A");
        }
        newdocument.add_boolean_term("T" + mimetype);

        newdocument.add_boolean_term(site_term);

        if (!host_term.empty())
            newdocument.add_boolean_term(host_term);

        struct tm* tm = localtime(&mtime);
        string date_term = "D" + date_to_string(tm->tm_year + 1900,
                                                tm->tm_mon + 1, tm->tm_mday);
        newdocument.add_boolean_term(date_term); // Date (YYYYMMDD)
        date_term.resize(7);
        date_term[0] = 'M';
        newdocument.add_boolean_term(date_term); // Month (YYYYMM)
        date_term.resize(5);
        date_term[0] = 'Y';
        newdocument.add_boolean_term(date_term); // Year (YYYY)
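        // So a file last modified on 2011-12-25 (illustrative date) gets the
        // boolean terms D20111225, M201112 and Y2011.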
        newdocument.add_boolean_term(urlterm); // Url

        // Add mtime as a value to allow "sort by date".
        newdocument.add_value(VALUE_LASTMOD,
                              int_to_binary_string(uint32_t(mtime)));
        if (use_ctime) {
            // Add ctime as a value to track modifications.
            time_t ctime = d.get_ctime();
            newdocument.add_value(VALUE_CTIME,
                                  int_to_binary_string(uint32_t(ctime)));
        }

        // Add MD5 as a value to allow duplicate documents to be collapsed
        // together.
        newdocument.add_value(VALUE_MD5, md5);

        // Add the file size as a value to allow "sort by size" and size
        // ranges.
        newdocument.add_value(VALUE_SIZE,
                              Xapian::sortable_serialise(size));
        bool inc_tag_added = false;
        if (d.is_other_readable()) {
            inc_tag_added = true;
            newdocument.add_boolean_term("I*");
        } else if (d.is_group_readable()) {
            const char* group = d.get_group();
            if (group) {
                newdocument.add_boolean_term(string("I#") + group);
            }
        }
        const char* owner = d.get_owner();
        if (owner) {
            newdocument.add_boolean_term(string("O") + owner);
            if (!inc_tag_added && d.is_owner_readable())
                newdocument.add_boolean_term(string("I@") + owner);
        }

        string ext_term("E");
        for (string::const_iterator i = ext.begin(); i != ext.end(); ++i) {
            char ch = *i;
            if (ch >= 'A' && ch <= 'Z')
                ch |= 32;
            ext_term += ch;
        }
        newdocument.add_boolean_term(ext_term);
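        // e.g. the extension "PDF" yields the boolean term "Epdf" - ASCII
        // upper case letters are lower cased so E-prefix terms match
        // case-insensitively.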
        index_add_document(urlterm, last_altered, did, newdocument);
    } catch (const ReadError&) {
        skip(urlterm, context, string("can't read file: ") + strerror(errno),
             d.get_size(), d.get_mtime());
    } catch (const NoSuchFilter&) {
        string filter_entry;
        if (cmd_it != commands.end()) {
            filter_entry = cmd_it->first;
        } else {
            filter_entry = mimetype;
        }
        string m = "Filter for \"";
        m += filter_entry;
        m += "\" not installed";
        skip(urlterm, context, m, d.get_size(), d.get_mtime());
        commands[filter_entry] = Filter();
    } catch (const FileNotFound&) {
        skip(urlterm, context, "File removed during indexing",
             d.get_size(), d.get_mtime(),
             SKIP_VERBOSE_ONLY | SKIP_SHOW_FILENAME);
    } catch (const std::string& error) {
        skip(urlterm, context, error, d.get_size(), d.get_mtime());
    } catch (const std::bad_alloc&) {
        // Attempt to flag the file as failed and commit changes, though that
        // might fail too if we're low on memory rather than being asked to
        // allocate a ludicrous amount.
        skip(urlterm, context, "Out of memory trying to extract text from file",
             d.get_size(), d.get_mtime(),
             SKIP_SHOW_FILENAME);
        throw CommitAndExit("Caught std::bad_alloc", "");
    }
}
void
index_handle_deletion()
{
    if (updated.empty() || old_docs_not_seen == 0) return;

    if (verbose) {
        cout << "Deleting " << old_docs_not_seen
             << " old documents which weren't found" << endl;
    }
    Xapian::PostingIterator alldocs = db.postlist_begin(string());
    Xapian::docid did = *alldocs;
    while (did < updated.size()) {
        if (!updated[did]) {
            alldocs.skip_to(did);
            if (alldocs == db.postlist_end(string()))
                break;
            if (*alldocs != did) {
                // Document #did didn't exist before we started.
                did = *alldocs;
                continue;
            }
            db.delete_document(did);
            if (--old_docs_not_seen == 0)
                break;
        }
        ++did;
    }
}
// If we created a temporary directory then delete it.