Fix tg_termpos1 for 64-bit termpos
[xapian.git] / xapian-applications / omega / handler_libarchive.cc
blob0a5a7cc595373b3d6b95e351e98aea25b5ca119a
/** @file
 * @brief Extract text and metadata using libarchive.
 */
/* Copyright (C) 2020 Parth Kapadia
 * Copyright (C) 2022,2023 Olly Betts
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
22 #include <config.h>
23 #include "handler.h"
25 #include "msxmlparser.h"
26 #include "opendocmetaparser.h"
27 #include "opendocparser.h"
28 #include "stringutils.h"
29 #include "xlsxparser.h"
30 #include "xpsparser.h"
32 #include <archive.h>
33 #include <archive_entry.h>
35 #define DEFAULT_BLOCK_SIZE 10240
37 using namespace std;
39 static void
40 parse_metadata(const string& metadata)
42 OpenDocMetaParser metaparser;
43 metaparser.parse(metadata);
44 send_field(FIELD_TITLE, metaparser.title);
45 send_field(FIELD_KEYWORDS, metaparser.keywords);
46 send_field(FIELD_AUTHOR, metaparser.author);
47 send_field_created_date(metaparser.created);
48 send_field_page_count(metaparser.pages);
51 static bool
52 extract_opendoc(struct archive* archive_obj)
54 string styles;
55 OpenDocParser parser;
57 struct archive_entry* entry;
58 while (archive_read_next_header(archive_obj, &entry) == ARCHIVE_OK) {
59 size_t total;
60 ssize_t size;
61 string pathname = archive_entry_pathname(entry);
62 if (pathname == "content.xml") {
63 total = archive_entry_size(entry);
64 string content(total, '\0');
65 size = archive_read_data(archive_obj, &content[0], total);
67 if (size <= 0) {
68 send_field(FIELD_ERROR, "Failed to read content.xml");
69 return false;
71 content.resize(size);
72 parser.parse(content);
73 } else if (pathname == "styles.xml") {
74 total = archive_entry_size(entry);
75 styles.resize(total);
76 size = archive_read_data(archive_obj, &styles[0], total);
78 if (size <= 0) {
79 send_field(FIELD_ERROR, "Failed to read styles.xml");
80 return false;
82 styles.resize(size);
83 } else if (pathname == "meta.xml") {
84 total = archive_entry_size(entry);
85 string metadata(total, '\0');
86 size = archive_read_data(archive_obj, &metadata[0], total);
88 if (size > 0) {
89 // indexing file even if this fails
90 metadata.resize(size);
91 parse_metadata(metadata);
96 // We want to parse styles.xml after content.xml, but they could be stored
97 // in either order in the ZIP container.
98 parser.parse(styles);
100 send_field(FIELD_BODY, parser.dump);
101 return true;
104 static bool
105 extract_xlsx(struct archive* archive_obj)
107 int pages = 0;
108 string sheets;
109 XlsxParser parser;
111 struct archive_entry* entry;
112 while (archive_read_next_header(archive_obj, &entry) == ARCHIVE_OK) {
113 string pathname = archive_entry_pathname(entry);
114 if (pathname == "xl/styles.xml" ||
115 pathname == "xl/workbook.xml" ||
116 pathname == "xl/sharedStrings.xml") {
117 size_t total = archive_entry_size(entry);
118 string shared_strings(total, '\0');
119 ssize_t size = archive_read_data(archive_obj, &shared_strings[0],
120 total);
122 if (size > 0) {
123 shared_strings.resize(size);
124 parser.parse(shared_strings);
126 } else if (startswith(pathname, "xl/worksheets/sheet")) {
127 auto i = sheets.size();
128 size_t total = archive_entry_size(entry);
129 sheets.resize(i + total);
130 ssize_t size = archive_read_data(archive_obj, &sheets[i], total);
132 if (size <= 0) {
133 send_field(FIELD_ERROR, "Failed to read " + pathname);
134 return false;
136 sheets.resize(i + size);
137 ++pages;
138 } else if (pathname == "docProps/core.xml") {
139 size_t total = archive_entry_size(entry);
140 string metadata(total, '\0');
141 ssize_t size = archive_read_data(archive_obj, &metadata[0], total);
142 if (size > 0) {
143 metadata.resize(size);
144 parse_metadata(metadata);
148 parser.parse(sheets);
149 send_field(FIELD_BODY, parser.dump);
150 send_field_page_count(pages);
151 return true;
154 static bool
155 extract_msxml(struct archive* archive_obj,
156 const string& tail)
158 size_t total;
159 ssize_t size;
160 struct archive_entry* entry;
161 string content;
163 if (startswith(tail, "wordprocessingml.")) {
164 while (archive_read_next_header(archive_obj, &entry) == ARCHIVE_OK) {
165 string pathname = archive_entry_pathname(entry);
166 if (pathname == "word/document.xml") {
167 auto i = content.size();
168 total = archive_entry_size(entry);
169 content.resize(i + total);
170 size = archive_read_data(archive_obj, &content[i], total);
172 if (size <= 0) {
173 send_field(FIELD_ERROR, "Failed to read word/document.xml");
174 return false;
176 content.resize(i + size);
177 } else if (startswith(pathname, "word/header") ||
178 startswith(pathname, "word/footer")) {
179 auto i = content.size();
180 total = archive_entry_size(entry);
181 content.resize(i + total);
182 size = archive_read_data(archive_obj, &content[i], total);
184 if (size > 0) {
185 content.resize(i + size);
186 } else {
187 // Ignore this as header/footer may not be present
188 content.resize(i);
190 } else if (pathname == "docProps/core.xml") {
191 // docProps/core.xml stores meta data
192 total = archive_entry_size(entry);
193 string metadata(total, '\0');
194 size = archive_read_data(archive_obj, &metadata[0], total);
195 if (size > 0) {
196 metadata.resize(size);
197 parse_metadata(metadata);
201 } else if (startswith(tail, "presentationml.")) {
202 int pages = 0;
203 while (archive_read_next_header(archive_obj, &entry) == ARCHIVE_OK) {
204 string pathname = archive_entry_pathname(entry);
205 if (startswith(pathname, "ppt/slides/slide")) {
206 ++pages;
207 goto handle_pptx_content;
208 } else if (startswith(pathname, "ppt/notesSlides/notesSlide") ||
209 startswith(pathname, "ppt/comments/comment")) {
210 handle_pptx_content:
211 auto i = content.size();
212 total = archive_entry_size(entry);
213 content.resize(i + total);
214 size = archive_read_data(archive_obj, &content[i], total);
216 if (size <= 0) {
217 send_field(FIELD_ERROR, "Failed to read " + pathname);
218 return false;
220 content.resize(i + size);
221 } else if (pathname == "docProps/core.xml") {
222 total = archive_entry_size(entry);
223 string metadata(total, '\0');
224 size = archive_read_data(archive_obj, &metadata[0], total);
225 if (size > 0) {
226 metadata.resize(size);
227 parse_metadata(metadata);
231 send_field_page_count(pages);
234 MSXmlParser parser;
235 parser.parse(content);
236 send_field(FIELD_BODY, parser.dump);
237 return true;
240 static bool
241 extract_xps(struct archive* archive_obj)
243 int pages = 0;
244 string content;
245 XpsParser parser;
247 struct archive_entry* entry;
248 while (archive_read_next_header(archive_obj, &entry) == ARCHIVE_OK) {
249 string pathname = archive_entry_pathname(entry);
250 if (startswith(pathname, "Documents/") &&
251 endswith(pathname, ".fpage") &&
252 pathname.find("/Pages/") != string::npos) {
253 size_t total = archive_entry_size(entry);
254 content.resize(total);
255 ssize_t size = archive_read_data(archive_obj, &content[0], total);
257 if (size <= 0) {
258 send_field(FIELD_ERROR, "Failed to read " + pathname);
259 return false;
261 content.resize(size);
262 parser.parse(content);
263 ++pages;
264 } else if (pathname == "docProps/core.xml") {
265 // If present, docProps/core.xml stores meta data.
266 size_t total = archive_entry_size(entry);
267 content.resize(total);
268 ssize_t size = archive_read_data(archive_obj, &content[0], total);
269 if (size > 0) {
270 content.resize(size);
271 parse_metadata(content);
276 send_field(FIELD_BODY, parser.dump);
277 send_field_page_count(pages);
278 return true;
/** Initialisation hook for this handler.
 *
 *  There is nothing to set up here, so this does no work.
 *
 *  @return Always true.
 */
bool
initialise()
{
    return true;
}
287 void
288 extract(const string& filename,
289 const string& mimetype)
291 const char* file = filename.c_str();
292 struct archive* archive_obj = archive_read_new();
293 archive_read_support_format_zip(archive_obj);
294 // Block size will be determined by libarchive automatically for
295 // regular files. Specified block size will only be used for tape drives
296 // 10240 is chosen as default size (20 records - 512 bytes each)
297 int status_code = archive_read_open_filename(archive_obj, file,
298 DEFAULT_BLOCK_SIZE);
300 if (status_code != ARCHIVE_OK) {
301 send_field(FIELD_ERROR, "Failed to open file");
302 return;
305 if (startswith(mimetype, "application/vnd.sun.xml.") ||
306 startswith(mimetype, "application/vnd.oasis.opendocument.")) {
307 if (!extract_opendoc(archive_obj))
308 return;
309 } else if (startswith(mimetype,
310 "application/vnd.openxmlformats-officedocument."))
312 string tail(mimetype, 46);
313 if (startswith(tail, "spreadsheetml.")) {
314 if (!extract_xlsx(archive_obj))
315 return;
316 } else {
317 if (!extract_msxml(archive_obj, tail))
318 return;
320 } else if (mimetype == "application/oxps" ||
321 mimetype == "application/vnd.ms-xpsdocument") {
322 if (!extract_xps(archive_obj))
323 return;
326 status_code = archive_read_free(archive_obj);
327 if (status_code != ARCHIVE_OK) {
328 send_field(FIELD_ERROR, archive_error_string(archive_obj));
329 return;