2 * @brief Handle indexing a document from a file
4 /* Copyright 1999,2000,2001 BrightStation PLC
5 * Copyright 2001,2005 James Aylett
6 * Copyright 2001,2002 Ananova Ltd
7 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2017,2019 Olly Betts
8 * Copyright 2019 Bruno Baruffaldi
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License as
12 * published by the Free Software Foundation; either version 2 of the
13 * License, or (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
26 #ifndef OMEGA_INCLUDED_INDEX_FILE_H
27 #define OMEGA_INCLUDED_INDEX_FILE_H
29 #include <sys/types.h>
/// Forward declaration only; the class itself is defined elsewhere.
/// In this header it is used as a non-const reference parameter of
/// index_mimetype().
35 class DirectoryIterator
;
/// Bit flags: the two enumerators have the distinct single-bit values 0x01
/// and 0x02, so they can be combined with bitwise OR.
/// NOTE(review): presumably these are the values accepted by the `flags`
/// parameter of skip() - confirm against its definition.
37 enum skip_flags
{ SKIP_VERBOSE_ONLY
= 0x01, SKIP_SHOW_FILENAME
= 0x02 };
/// Enumeration whose values are passed to index_init() as its empty_body_
/// parameter.
/// NOTE(review): going by the enumerator names this presumably selects
/// between warning about, indexing, or skipping a document with an empty
/// body - confirm against the code which consumes it.  The end of this enum
/// is not visible in this excerpt.
39 enum empty_body_type
{
40 EMPTY_BODY_WARN
, EMPTY_BODY_INDEX
, EMPTY_BODY_SKIP
/// Enumeration whose values are passed to index_init() as its dup_action_
/// parameter.
/// NOTE(review): presumably selects how duplicate documents are handled -
/// confirm.  The end of this enum is not visible in this excerpt, so there
/// may be further enumerators.
43 enum dup_action_type
{
44 DUP_SKIP
, DUP_CHECK_LAZILY
47 // Commands which take a filename as the last argument, and output UTF-8
48 // text or some other mime type are common, so we handle these with a std::map.
// String members set from the constructors' output_type_ and
// output_charset_ arguments.  Neither has a default member initialiser, so
// each is an empty string unless a constructor assigns it.
// NOTE(review): presumably output_type holds a MIME type and output_charset
// a character set name - confirm against the code which reads them.
51 std::string output_type
;
52 std::string output_charset
;
// Defaults to nullptr.  Among the constructors visible in this excerpt,
// only Filter(Worker*) gives it another value.
60 /** Set if this is a mapping for a worker sub-process. */
61 Worker
* worker
= nullptr;
/// Construct a filter from a command string.
///
/// @param cmd_    Copied into the cmd member.
/// @param flags_  Copied into the flags member; defaults to 0.
///
/// output_type is value-initialised (so it is empty); output_charset is not
/// mentioned in the initialiser list.
64 explicit Filter(const std::string
& cmd_
, unsigned flags_
= 0)
65 : cmd(cmd_
), output_type(), flags(flags_
) { }
/// Construct a filter from a command string and an output type.
///
/// @param cmd_          Copied into the cmd member.
/// @param output_type_  Copied into the output_type member.
///
/// NOTE(review): the initialiser list also reads flags_, but the line
/// declaring that parameter is not visible in this excerpt - confirm its
/// type and default against the full file.
66 Filter(const std::string
& cmd_
, const std::string
& output_type_
,
68 : cmd(cmd_
), output_type(output_type_
), flags(flags_
) { }
/// Construct a filter from a command string, an output type and an output
/// character set.
///
/// @param cmd_             Copied into the cmd member.
/// @param output_type_     Copied into the output_type member.
/// @param output_charset_  Copied into the output_charset member.
///
/// NOTE(review): the initialiser list also reads flags_, but the line
/// declaring that parameter is not visible in this excerpt - confirm its
/// type and default against the full file.
69 Filter(const std::string
& cmd_
, const std::string
& output_type_
,
70 const std::string
& output_charset_
,
72 : cmd(cmd_
), output_type(output_type_
),
73 output_charset(output_charset_
), flags(flags_
) { }
/// Construct a filter which refers to a worker.
///
/// @param worker_  Pointer stored in the worker member.
///
/// Only worker appears in the initialiser list; the other members keep
/// whatever their default initialisation gives them.
74 explicit Filter(Worker
* worker_
) : worker(worker_
) { }
/// Return true if the USE_SHELL bit is set in flags, false otherwise.
75 bool use_shell() const { return flags
& USE_SHELL
; }
/// Test the input-related bits of flags.
///
/// NOTE(review): two alternative return statements are visible below - one
/// testing PIPE_IN, PIPE_DEV_STDIN and SEEK_DEV_STDIN together, the other
/// testing only PIPE_IN.  The lines which select between them (presumably
/// preprocessor conditionals) and the closing brace are not visible in this
/// excerpt - confirm against the full file.
76 bool input_on_stdin() const {
78 return flags
& (PIPE_IN
| PIPE_DEV_STDIN
| SEEK_DEV_STDIN
);
80 return flags
& PIPE_IN
;
/// The visible return statement yields true if either PIPE_DEV_STDIN or
/// SEEK_DEV_STDIN is set in flags.
///
/// NOTE(review): the remainder of this function is not visible in this
/// excerpt, so there may be an alternative return path - confirm against
/// the full file.
83 bool dev_stdin() const {
85 return flags
& (PIPE_DEV_STDIN
| SEEK_DEV_STDIN
);
/// Map from a std::string key to the Filter registered for that key.
/// Declared extern here, so the definition lives in another translation
/// unit.  index_library() and index_command() store entries in it.
/// NOTE(review): the key is named `type` by those functions, so it is
/// presumably a MIME type - confirm.
92 extern std::map
<std::string
, Filter
> commands
;
/// Register a worker for @a type.
///
/// Stores Filter(worker) in the commands map under the key @a type,
/// overwriting any entry already stored under that key.
///
/// NOTE(review): the return-type line and the braces of this definition
/// are not visible in this excerpt.
95 index_library(const std::string
& type
, Worker
* worker
)
97 commands
[type
] = Filter(worker
);
/// Register a filter for @a type.
///
/// Copies @a filter into the commands map under the key @a type,
/// overwriting any entry already stored under that key.
///
/// NOTE(review): the return-type line and the braces of this definition
/// are not visible in this excerpt.
101 index_command(const std::string
& type
, const Filter
& filter
)
103 commands
[type
] = filter
;
/// Overload of index_command() taking the key as a C string.
///
/// The map's key type is std::string, so @a type is converted to a
/// std::string for the lookup; @a filter is then copied into that entry,
/// overwriting any existing one.
///
/// NOTE(review): the return-type line and the braces of this definition
/// are not visible in this excerpt.
107 index_command(const char* type
, const Filter
& filter
)
109 commands
[type
] = filter
;
/// Declaration only - the definition is not in this excerpt.
///
/// @param urlterm   String passed by const reference.
/// @param context   String passed by const reference.
/// @param msg       String passed by const reference.
/// @param size      An off_t value.
/// @param last_mod  A time_t value.
/// @param flags     Defaults to 0.  NOTE(review): presumably a combination
///                  of skip_flags values - confirm against the definition.
///
/// NOTE(review): the return-type line is not visible in this excerpt.
113 skip(const std::string
& urlterm
, const std::string
& context
,
114 const std::string
& msg
,
115 off_t size
, time_t last_mod
, unsigned flags
= 0);
// Declaration only; takes no arguments.
// NOTE(review): the return-type line is not visible in this excerpt.
117 /// Call index_command() to set up the default command filters.
119 index_add_default_filters();
// Declaration only; takes no arguments.
// NOTE(review): the return-type line is not visible in this excerpt.
121 /// Call to set up the default libraries.
123 index_add_default_libraries();
/// Declaration of index_init(); the definition is not in this excerpt.
///
/// Visible parameters, in order:
///  - dbpath, root_, site_term_, host_term_: strings by const reference.
///  - stemmer: a Xapian::Stem by const reference.
///  - empty_body_, dup_action_: values of the empty_body_type and
///    dup_action_type enums declared in this header.
///  - sample_size_, title_size_: size_t values.
///  - overwrite, retry_failed_, delete_removed_documents, verbose_,
///    use_ctime_, spelling, ignore_exclusions_, description_as_sample:
///    bool switches.
///
/// NOTE(review): some lines of the parameter list, its end, and the
/// return-type line are not visible in this excerpt, so the list above may
/// be incomplete.  The visible source does not document what each parameter
/// means - confirm against the definition before relying on the names.
127 index_init(const std::string
& dbpath
, const Xapian::Stem
& stemmer
,
128 const std::string
& root_
,
129 const std::string
& site_term_
, const std::string
& host_term_
,
130 empty_body_type empty_body_
, dup_action_type dup_action_
,
131 size_t sample_size_
, size_t title_size_
,
133 bool overwrite
, bool retry_failed_
,
134 bool delete_removed_documents
, bool verbose_
, bool use_ctime_
,
135 bool spelling
, bool ignore_exclusions_
, bool description_as_sample
,
/// Declaration only - the definition is not in this excerpt.
///
/// @param urlterm  String passed by const reference.
///
/// NOTE(review): the return-type line is not visible in this excerpt.
139 index_remove_failed_entry(const std::string
& urlterm
);
/// Declaration only - the definition is not in this excerpt.
///
/// @param urlterm       String passed by const reference.
/// @param last_altered  A time_t value.
/// @param did           A Xapian::docid value.
/// @param doc           A Xapian::Document passed by const reference.
///
/// NOTE(review): the return-type line is not visible in this excerpt.
142 index_add_document(const std::string
& urlterm
, time_t last_altered
,
143 Xapian::docid did
, const Xapian::Document
& doc
);
// Declaration only.  file, urlterm, url and ext are strings passed by const
// reference; mimetype and pathterm are strings passed by value; d is a
// non-const reference to a DirectoryIterator.
// NOTE(review): the return-type line and the end of the parameter list are
// not visible in this excerpt, so there may be further parameters.
145 /// Index a file into the database.
147 index_mimetype(const std::string
& file
,
148 const std::string
& urlterm
,
149 const std::string
& url
,
150 const std::string
& ext
,
151 std::string mimetype
,
152 DirectoryIterator
& d
,
153 std::string pathterm
,
// Declaration only: takes no arguments and returns void.
156 /// Delete any previously indexed documents we haven't seen.
157 void index_handle_deletion();
159 /// Commit any pending changes.
162 /// Clean up and release any resources, etc.
165 #endif // OMEGA_INCLUDED_INDEX_FILE_H