Fix bug in PL2+ implementation
[xapian.git] / xapian-applications / omega / handler_libmwaw.cc
blob b38d553cb162b51803cde2d01653ab115a3aabbe
1 /** @file
2 * @brief Extract text and metadata using libmwaw.
3 */
4 /* Copyright (C) 2019 Bruno Baruffaldi
5 * Copyright (C) 2020 Parth Kapadia
6 * Copyright (C) 2022,2023 Olly Betts
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
21 * USA
24 #include <config.h>
25 #include "handler.h"
26 #include "str.h"
27 #include "stringutils.h"
29 #include <librevenge-generators/librevenge-generators.h>
30 #include <librevenge-stream/librevenge-stream.h>
31 #include <libmwaw/libmwaw.hxx>
// Call handle_field() on the line [START, END) for field name FIELD, passing
// the length of FIELD as given by CONST_STRLEN (defined elsewhere; FIELD is a
// string literal at every call site in this file).  The trailing arguments
// select the handle_field() overload: either an (out, out_len) pair or a
// Field code.
//
// Written with the standard `...` / __VA_ARGS__ form rather than GNU's
// named-variadic extension (`OUT...`) so it doesn't rely on a compiler
// extension.
#define HANDLE_FIELD(START, END, FIELD, ...) \
    handle_field((START), (END), (FIELD), (CONST_STRLEN(FIELD)), __VA_ARGS__)
36 using namespace librevenge;
37 using namespace std;
// Handle a field for which we only take a single value - we avoid copying in
// this case.
//
// If the line [start, end) begins with the len bytes at field and has more
// data after them, skip the whitespace which follows and drop one trailing
// '\r' (presumably left over from a CRLF line ending).  If anything remains,
// out and out_len are set to describe it; the result points into the
// caller's buffer rather than being a copy.  If the line doesn't match, or
// the value is empty, out and out_len are left untouched.
static void
handle_field(const char* start,
             const char* end,
             const char* field,
             size_t len,
             const char*& out,
             size_t& out_len)
{
    if (size_t(end - start) <= len || memcmp(start, field, len) != 0) {
        // Not this field, or nothing follows the field name.
        return;
    }
    start += len;
    // Cast to unsigned char: passing a negative value (any byte >= 0x80 on
    // platforms where char is signed) to isspace() is undefined behaviour,
    // and metadata values can contain non-ASCII bytes.
    while (start != end && isspace(static_cast<unsigned char>(*start))) {
        ++start;
    }
    if (start != end && (end[-1] != '\r' || --end != start)) {
        out = start;
        out_len = end - start;
    }
}
59 // Handle a field for which we process multiple instances. We just send each
60 // occurrence as we see it.
61 static void
62 handle_field(const char* start,
63 const char* end,
64 const char* field,
65 size_t len,
66 Field code)
68 if (size_t(end - start) > len && memcmp(start, field, len) == 0) {
69 start += len;
70 while (start != end && isspace(*start)) start++;
71 if (start != end && (end[-1] != '\r' || --end != start)) {
72 send_field(code, start, end - start);
77 static void
78 parse_metadata(const char* data, size_t len)
80 const char* author;
81 size_t author_len = 0;
83 const char* p = data;
84 const char* end = p + len;
86 while (p != end) {
87 const char* start = p;
88 p = static_cast<const char*>(memchr(p, '\n', end - start));
89 const char* eol;
90 if (p)
91 eol = p++;
92 else
93 p = eol = end;
94 if ((end - start) > 5 && memcmp(start, "meta:", 5) == 0) {
95 start += 5;
96 switch (*start) {
97 case 'i': {
98 // Use dc:creator in preference to meta:initial-creator.
99 if (!author_len)
100 HANDLE_FIELD(start, eol, "initial-creator",
101 author, author_len);
102 break;
104 case 'k': {
105 HANDLE_FIELD(start, eol, "keyword", FIELD_KEYWORDS);
106 break;
109 } else if ((end - start) > 3 && memcmp(start, "dc:", 3) == 0) {
110 start += 3;
111 switch (*start) {
112 case 'c': {
113 // Use dc:creator in preference to meta:initial-creator.
114 HANDLE_FIELD(start, eol, "creator", author, author_len);
115 break;
117 case 's': {
118 HANDLE_FIELD(start, eol, "subject", FIELD_KEYWORDS);
119 break;
121 case 't': {
122 HANDLE_FIELD(start, eol, "title", FIELD_TITLE);
123 break;
126 } else if ((end - start) > 8 && memcmp(start, "dcterms:", 8) == 0) {
127 start += 8;
128 HANDLE_FIELD(start, eol, "available", FIELD_KEYWORDS);
132 if (author_len) {
133 send_field(FIELD_AUTHOR, author, author_len);
137 static void
138 parse_content(const RVNGStringVector& pages)
140 auto page_count = pages.size();
141 send_field_page_count(page_count);
142 for (unsigned i = 0; i < page_count; ++i) {
143 const RVNGString& page = pages[i];
144 send_field(FIELD_BODY, page.cstr(), page.size());
148 static void
149 extract_text(RVNGFileStream* input)
151 RVNGString dump;
152 RVNGTextTextGenerator content_gen(dump, false);
153 if (MWAWDocument::parse(input, &content_gen) != MWAWDocument::MWAW_R_OK) {
154 send_field(FIELD_ERROR, "Failed to extract text");
155 return;
157 send_field(FIELD_BODY, dump.cstr(), dump.size());
159 RVNGString metadata;
160 RVNGTextTextGenerator metadata_gen(metadata, true);
161 if (MWAWDocument::parse(input, &metadata_gen) != MWAWDocument::MWAW_R_OK) {
162 send_field(FIELD_ERROR, "Failed to extract metadata");
163 return;
166 parse_metadata(metadata.cstr(), metadata.size());
169 static void
170 extract_spreadsheet(RVNGFileStream* input)
172 RVNGStringVector pages_metadata;
173 RVNGTextSpreadsheetGenerator metadata(pages_metadata, true);
174 MWAWDocument::Result result = MWAWDocument::parse(input, &metadata);
175 if (result != MWAWDocument::MWAW_R_OK) {
176 send_field(FIELD_ERROR, "Failed to extract metadata");
177 return;
180 for (unsigned i = 0; i < pages_metadata.size(); ++i) {
181 const RVNGString& page = pages_metadata[i];
182 parse_metadata(page.cstr(), page.size());
185 RVNGStringVector pages;
186 RVNGTextSpreadsheetGenerator content(pages, false);
187 result = MWAWDocument::parse(input, &content);
188 if (result != MWAWDocument::MWAW_R_OK) {
189 send_field(FIELD_ERROR, "Failed to extract text");
190 return;
192 parse_content(pages);
195 static void
196 extract_presentation(RVNGFileStream* input)
198 RVNGStringVector pages;
199 RVNGTextPresentationGenerator content(pages);
200 MWAWDocument::Result result = MWAWDocument::parse(input, &content);
201 if (result != MWAWDocument::MWAW_R_OK) {
202 send_field(FIELD_ERROR, "Failed to extract text");
203 return;
205 parse_content(pages);
208 static void
209 extract_drawing(RVNGFileStream* input)
211 RVNGStringVector pages;
212 RVNGTextDrawingGenerator content(pages);
213 MWAWDocument::Result result = MWAWDocument::parse(input, &content);
214 if (result != MWAWDocument::MWAW_R_OK) {
215 send_field(FIELD_ERROR, "Failed to extract text");
216 return;
218 parse_content(pages);
// Performs no setup; always reports success.
bool
initialise()
{
    return true;
}
227 void
228 extract(const string& filename, const string&)
230 // To store the kind and type of document
231 MWAWDocument::Kind kind;
232 MWAWDocument::Type type;
233 MWAWDocument::Confidence confidence;
234 RVNGFileStream input(filename.c_str());
236 confidence = MWAWDocument::isFileFormatSupported(&input, type, kind);
237 if (confidence != MWAWDocument::MWAW_C_EXCELLENT) {
238 send_field(FIELD_ERROR, "File format not supported");
239 return;
242 switch (kind) {
243 case MWAWDocument::MWAW_K_TEXT:
244 extract_text(&input);
245 break;
246 case MWAWDocument::MWAW_K_SPREADSHEET:
247 case MWAWDocument::MWAW_K_DATABASE:
248 extract_spreadsheet(&input);
249 break;
250 case MWAWDocument::MWAW_K_PRESENTATION:
251 extract_presentation(&input);
252 break;
253 default:
254 extract_drawing(&input);
255 break;