2 * @brief Extract text and metadata using libarchive.
4 /* Copyright (C) 2020 Parth Kapadia
5 * Copyright (C) 2022,2023 Olly Betts
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2 of the
10 * License, or (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
25 #include "msxmlparser.h"
26 #include "opendocmetaparser.h"
27 #include "opendocparser.h"
28 #include "stringutils.h"
29 #include "xlsxparser.h"
30 #include "xpsparser.h"
33 #include <archive_entry.h>
35 #define DEFAULT_BLOCK_SIZE 10240
/** Parse a metadata document and emit the fields found in it.
 *
 *  The text is handed to an OpenDocMetaParser, and the parser's title,
 *  keywords, author, created and pages members are then sent out via the
 *  send_field*() helpers.
 *
 *  @param metadata  The metadata text to parse.
 *
 *  NOTE(review): this listing looks extraction-damaged (stray leading
 *  numbers, one token per line, gaps in the numbering), so the return type
 *  and the braces of this function are not visible here.
 */
40 parse_metadata(const string
& metadata
)
42 OpenDocMetaParser metaparser
;
43 metaparser
.parse(metadata
);
// Emit each field the parser collected.
44 send_field(FIELD_TITLE
, metaparser
.title
);
45 send_field(FIELD_KEYWORDS
, metaparser
.keywords
);
46 send_field(FIELD_AUTHOR
, metaparser
.author
);
47 send_field_created_date(metaparser
.created
);
48 send_field_page_count(metaparser
.pages
);
/** Extract body text and metadata from an OpenDocument-style archive.
 *
 *  Iterates over the archive entries and handles three of them by name:
 *  "content.xml" (read and passed to parser.parse()), "styles.xml" (read
 *  into the styles buffer) and "meta.xml" (read and passed to
 *  parse_metadata()).  Afterwards parser.dump is sent as FIELD_BODY.
 *
 *  @param archive_obj  The libarchive handle to read entries from.
 *
 *  NOTE(review): the declarations of styles, parser, total and size, the
 *  conditions guarding the FIELD_ERROR sends, and the return statements are
 *  not visible in this listing (the numbering skips those lines) - confirm
 *  against the full file.
 */
52 extract_opendoc(struct archive
* archive_obj
)
57 struct archive_entry
* entry
;
// Keep going while archive_read_next_header() reports ARCHIVE_OK.
58 while (archive_read_next_header(archive_obj
, &entry
) == ARCHIVE_OK
) {
61 string pathname
= archive_entry_pathname(entry
);
62 if (pathname
== "content.xml") {
// Size the buffer from the entry's recorded size, then read into it.
63 total
= archive_entry_size(entry
);
64 string
content(total
, '\0');
65 size
= archive_read_data(archive_obj
, &content
[0], total
);
68 send_field(FIELD_ERROR
, "Failed to read content.xml");
72 parser
.parse(content
);
73 } else if (pathname
== "styles.xml") {
// styles.xml is only read here; it is not parsed inside this branch.
74 total
= archive_entry_size(entry
);
76 size
= archive_read_data(archive_obj
, &styles
[0], total
);
79 send_field(FIELD_ERROR
, "Failed to read styles.xml");
83 } else if (pathname
== "meta.xml") {
84 total
= archive_entry_size(entry
);
85 string
metadata(total
, '\0');
86 size
= archive_read_data(archive_obj
, &metadata
[0], total
);
89 // indexing file even if this fails
// Shrink the buffer to the value archive_read_data() returned.
90 metadata
.resize(size
);
91 parse_metadata(metadata
);
96 // We want to parse styles.xml after content.xml, but they could be stored
97 // in either order in the ZIP container.
100 send_field(FIELD_BODY
, parser
.dump
);
/** Extract body text and metadata from an XLSX archive.
 *
 *  Entries named "xl/styles.xml", "xl/workbook.xml" or
 *  "xl/sharedStrings.xml" are read and parsed as soon as they are seen.
 *  Entries whose path starts with "xl/worksheets/sheet" are appended to the
 *  sheets buffer, which is parsed once the loop has finished.
 *  "docProps/core.xml" is passed to parse_metadata().  Finally parser.dump
 *  is sent as FIELD_BODY and pages is sent via send_field_page_count().
 *
 *  @param archive_obj  The libarchive handle to read entries from.
 *
 *  NOTE(review): the declarations of parser, sheets and pages, the final
 *  argument of the first archive_read_data() call, the guards around the
 *  error send, and the return statements are not visible in this listing -
 *  confirm against the full file.
 */
105 extract_xlsx(struct archive
* archive_obj
)
111 struct archive_entry
* entry
;
112 while (archive_read_next_header(archive_obj
, &entry
) == ARCHIVE_OK
) {
113 string pathname
= archive_entry_pathname(entry
);
114 if (pathname
== "xl/styles.xml" ||
115 pathname
== "xl/workbook.xml" ||
116 pathname
== "xl/sharedStrings.xml") {
// The buffer is named shared_strings but is used for all three entries.
117 size_t total
= archive_entry_size(entry
);
118 string
shared_strings(total
, '\0');
119 ssize_t size
= archive_read_data(archive_obj
, &shared_strings
[0],
123 shared_strings
.resize(size
);
124 parser
.parse(shared_strings
);
126 } else if (startswith(pathname
, "xl/worksheets/sheet")) {
// Append this worksheet after whatever is already in sheets: grow by the
// entry size, read at offset i, then trim to the amount reported as read.
127 auto i
= sheets
.size();
128 size_t total
= archive_entry_size(entry
);
129 sheets
.resize(i
+ total
);
130 ssize_t size
= archive_read_data(archive_obj
, &sheets
[i
], total
);
133 send_field(FIELD_ERROR
, "Failed to read " + pathname
);
136 sheets
.resize(i
+ size
);
138 } else if (pathname
== "docProps/core.xml") {
139 size_t total
= archive_entry_size(entry
);
140 string
metadata(total
, '\0');
141 ssize_t size
= archive_read_data(archive_obj
, &metadata
[0], total
);
143 metadata
.resize(size
);
144 parse_metadata(metadata
);
// Worksheets are parsed only now, after every entry has been visited.
148 parser
.parse(sheets
);
149 send_field(FIELD_BODY
, parser
.dump
);
150 send_field_page_count(pages
);
/** Extract body text and metadata from a non-spreadsheet MS Office XML
 *  archive, choosing the entries to read from the value of tail.
 *
 *  - tail starting with "wordprocessingml.": "word/document.xml" plus
 *    entries starting with "word/header" or "word/footer" are appended to
 *    the content buffer; "docProps/core.xml" goes to parse_metadata().
 *  - tail starting with "presentationml.": entries starting with
 *    "ppt/slides/slide" jump to the handle_pptx_content label; notes-slide
 *    and comment entries are appended to the content buffer;
 *    "docProps/core.xml" goes to parse_metadata(); pages is then sent via
 *    send_field_page_count().
 *
 *  Afterwards content is parsed and parser.dump is sent as FIELD_BODY.
 *
 *  @param archive_obj  The libarchive handle to read entries from.
 *
 *  NOTE(review): the second parameter (presumably tail, given how it is
 *  used below), the declarations of content, parser, pages, total and size,
 *  the handle_pptx_content label, the guards around the error sends, and
 *  the return statements are not visible in this listing - confirm against
 *  the full file.
 */
155 extract_msxml(struct archive
* archive_obj
,
160 struct archive_entry
* entry
;
163 if (startswith(tail
, "wordprocessingml.")) {
164 while (archive_read_next_header(archive_obj
, &entry
) == ARCHIVE_OK
) {
165 string pathname
= archive_entry_pathname(entry
);
166 if (pathname
== "word/document.xml") {
// Append to content: grow by the entry size, read at offset i, then trim.
167 auto i
= content
.size();
168 total
= archive_entry_size(entry
);
169 content
.resize(i
+ total
);
170 size
= archive_read_data(archive_obj
, &content
[i
], total
);
173 send_field(FIELD_ERROR
, "Failed to read word/document.xml");
176 content
.resize(i
+ size
);
177 } else if (startswith(pathname
, "word/header") ||
178 startswith(pathname
, "word/footer")) {
// Same append pattern as above; no error is sent in this branch.
179 auto i
= content
.size();
180 total
= archive_entry_size(entry
);
181 content
.resize(i
+ total
);
182 size
= archive_read_data(archive_obj
, &content
[i
], total
);
185 content
.resize(i
+ size
);
187 // Ignore this as header/footer may not be present
190 } else if (pathname
== "docProps/core.xml") {
191 // docProps/core.xml stores meta data
192 total
= archive_entry_size(entry
);
193 string
metadata(total
, '\0');
194 size
= archive_read_data(archive_obj
, &metadata
[0], total
);
196 metadata
.resize(size
);
197 parse_metadata(metadata
);
201 } else if (startswith(tail
, "presentationml.")) {
203 while (archive_read_next_header(archive_obj
, &entry
) == ARCHIVE_OK
) {
204 string pathname
= archive_entry_pathname(entry
);
205 if (startswith(pathname
, "ppt/slides/slide")) {
// NOTE(review): the handle_pptx_content label is not visible here;
// presumably it sits in the branch below so slides share that read code.
207 goto handle_pptx_content
;
208 } else if (startswith(pathname
, "ppt/notesSlides/notesSlide") ||
209 startswith(pathname
, "ppt/comments/comment")) {
211 auto i
= content
.size();
212 total
= archive_entry_size(entry
);
213 content
.resize(i
+ total
);
214 size
= archive_read_data(archive_obj
, &content
[i
], total
);
217 send_field(FIELD_ERROR
, "Failed to read " + pathname
);
220 content
.resize(i
+ size
);
221 } else if (pathname
== "docProps/core.xml") {
222 total
= archive_entry_size(entry
);
223 string
metadata(total
, '\0');
224 size
= archive_read_data(archive_obj
, &metadata
[0], total
);
226 metadata
.resize(size
);
227 parse_metadata(metadata
);
231 send_field_page_count(pages
);
// Everything accumulated in content is parsed in one go.
235 parser
.parse(content
);
236 send_field(FIELD_BODY
, parser
.dump
);
/** Extract body text and metadata from an XPS archive.
 *
 *  An entry is treated as a page when its path starts with "Documents/",
 *  ends with ".fpage" and contains "/Pages/"; each such entry is read into
 *  the content buffer and parsed straight away.  "docProps/core.xml" is
 *  read into the same buffer and passed to parse_metadata().  Afterwards
 *  parser.dump is sent as FIELD_BODY and pages is sent via
 *  send_field_page_count().
 *
 *  @param archive_obj  The libarchive handle to read entries from.
 *
 *  NOTE(review): the declarations of content, parser and pages, the guards
 *  around the error send, and the return statements are not visible in
 *  this listing - confirm against the full file.
 */
241 extract_xps(struct archive
* archive_obj
)
247 struct archive_entry
* entry
;
248 while (archive_read_next_header(archive_obj
, &entry
) == ARCHIVE_OK
) {
249 string pathname
= archive_entry_pathname(entry
);
250 if (startswith(pathname
, "Documents/") &&
251 endswith(pathname
, ".fpage") &&
252 pathname
.find("/Pages/") != string::npos
) {
// content is overwritten for each page (resized to this entry's size and
// read from offset 0) rather than appended to.
253 size_t total
= archive_entry_size(entry
);
254 content
.resize(total
);
255 ssize_t size
= archive_read_data(archive_obj
, &content
[0], total
);
258 send_field(FIELD_ERROR
, "Failed to read " + pathname
);
261 content
.resize(size
);
262 parser
.parse(content
);
264 } else if (pathname
== "docProps/core.xml") {
265 // If present, docProps/core.xml stores meta data.
// The page buffer is reused to hold the metadata text.
266 size_t total
= archive_entry_size(entry
);
267 content
.resize(total
);
268 ssize_t size
= archive_read_data(archive_obj
, &content
[0], total
);
270 content
.resize(size
);
271 parse_metadata(content
);
276 send_field(FIELD_BODY
, parser
.dump
);
277 send_field_page_count(pages
);
/** Open a file with libarchive and dispatch to a format-specific extractor
 *  chosen from the MIME type.
 *
 *  In the visible code only ZIP format support is enabled on the reader.
 *  Dispatch is:
 *  - "application/vnd.sun.xml." or "application/vnd.oasis.opendocument."
 *    prefix: extract_opendoc()
 *  - "application/vnd.openxmlformats-officedocument." prefix:
 *    extract_xlsx() when the remainder starts with "spreadsheetml.",
 *    extract_msxml() otherwise
 *  - "application/oxps" or "application/vnd.ms-xpsdocument": extract_xps()
 *
 *  @param filename  Path of the file to open.
 *  @param mimetype  MIME type used to select the extractor.
 *
 *  NOTE(review): the return type, the remaining arguments of
 *  archive_read_open_filename(), the statements run when an extractor
 *  reports failure, and the end of the function are not visible in this
 *  listing - confirm against the full file.
 */
288 extract(const string
& filename
,
289 const string
& mimetype
)
291 const char* file
= filename
.c_str();
292 struct archive
* archive_obj
= archive_read_new();
293 archive_read_support_format_zip(archive_obj
);
294 // Block size will be determined by libarchive automatically for
295 // regular files. Specified block size will only be used for tape drives
296 // 10240 is chosen as default size (20 records - 512 bytes each)
297 int status_code
= archive_read_open_filename(archive_obj
, file
,
300 if (status_code
!= ARCHIVE_OK
) {
301 send_field(FIELD_ERROR
, "Failed to open file");
305 if (startswith(mimetype
, "application/vnd.sun.xml.") ||
306 startswith(mimetype
, "application/vnd.oasis.opendocument.")) {
307 if (!extract_opendoc(archive_obj
))
309 } else if (startswith(mimetype
,
310 "application/vnd.openxmlformats-officedocument."))
// 46 is the length of the prefix matched just above, so tail holds the
// part of mimetype that follows "application/vnd.openxmlformats-officedocument.".
312 string
tail(mimetype
, 46);
313 if (startswith(tail
, "spreadsheetml.")) {
314 if (!extract_xlsx(archive_obj
))
317 if (!extract_msxml(archive_obj
, tail
))
320 } else if (mimetype
== "application/oxps" ||
321 mimetype
== "application/vnd.ms-xpsdocument") {
322 if (!extract_xps(archive_obj
))
326 status_code
= archive_read_free(archive_obj
);
// NOTE(review): archive_error_string() is called with archive_obj after
// archive_read_free() has already been called on it.  If archive_read_free()
// releases the object, this reads freed memory - confirm against the
// libarchive documentation and consider reporting a fixed message instead.
327 if (status_code
!= ARCHIVE_OK
) {
328 send_field(FIELD_ERROR
, archive_error_string(archive_obj
));