2 * @brief Extract text and metadata using libmwaw.
4 /* Copyright (C) 2019 Bruno Baruffaldi
5 * Copyright (C) 2020 Parth Kapadia
6 * Copyright (C) 2022,2023 Olly Betts
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
27 #include "stringutils.h"
29 #include <librevenge-generators/librevenge-generators.h>
30 #include <librevenge-stream/librevenge-stream.h>
31 #include <libmwaw/libmwaw.hxx>
// Convenience wrapper around handle_field(): forwards START, END and FIELD,
// supplies CONST_STRLEN(FIELD) as the length argument, and passes any
// remaining arguments through the named variadic parameter OUT.  The
// `OUT...` spelling is a compiler extension rather than standard
// __VA_ARGS__.
// NOTE(review): CONST_STRLEN is not defined in this view - presumably it
// comes from "stringutils.h" and yields the length of a string literal;
// confirm.
33 #define HANDLE_FIELD(START, END, FIELD, OUT...) \
34 handle_field((START), (END), (FIELD), (CONST_STRLEN(FIELD)), OUT)
36 using namespace librevenge
;
39 // Handle a field for which we only take a single value - we avoid copying in this case.
// Single-value overload: checks whether the text beginning at `start` starts
// with `field` and, if so, records the extent of the value that follows.
// NOTE(review): the embedded line numbers jump (42 -> 49 -> 51, 52 -> 54), so
// the rest of the parameter list and some body statements are not visible in
// this view; the comments below describe only the statements that are shown.
42 handle_field(const char* start
,
// Proceed only if more than `len` bytes remain before `end` and the first
// `len` bytes equal `field`.
49 if (size_t(end
- start
) > len
&& memcmp(start
, field
, len
) == 0) {
// Skip whitespace up to (at most) `end`.
// NOTE(review): `*start` is a plain char; passing a negative value to
// isspace() is undefined behaviour, so a cast to unsigned char would be safer
// if the input can contain bytes >= 0x80.
51 while (start
!= end
&& isspace(*start
)) start
++;
// Require a non-empty remainder.  If its last byte is '\r', drop that byte by
// decrementing `end`, and reject the field if nothing is left afterwards.
52 if (start
!= end
&& (end
[-1] != '\r' || --end
!= start
)) {
// Store the trimmed length in `out_len` (declared outside the visible lines;
// presumably an output parameter - confirm).
54 out_len
= end
- start
;
59 // Handle a field for which we process multiple instances. We just send each
60 // occurrence as we see it.
// Multi-instance overload: when the text beginning at `start` starts with
// `field`, the value that follows is passed straight to send_field() under
// `code`.
// NOTE(review): the embedded line numbers jump (62 -> 68 -> 70), so the rest
// of the parameter list and at least one body statement are not visible in
// this view; the comments below describe only the statements that are shown.
62 handle_field(const char* start
,
// Proceed only if more than `len` bytes remain before `end` and the first
// `len` bytes equal `field`.
68 if (size_t(end
- start
) > len
&& memcmp(start
, field
, len
) == 0) {
// Skip whitespace up to (at most) `end`.
// NOTE(review): `*start` is a plain char; passing a negative value to
// isspace() is undefined behaviour, so a cast to unsigned char would be safer
// if the input can contain bytes >= 0x80.
70 while (start
!= end
&& isspace(*start
)) start
++;
// Require a non-empty remainder.  If its last byte is '\r', drop that byte by
// decrementing `end`, and reject the field if nothing is left afterwards.
71 if (start
!= end
&& (end
[-1] != '\r' || --end
!= start
)) {
// Send the trimmed value as a (pointer, length) pair.
72 send_field(code
, start
, end
- start
);
// Scan `len` bytes of metadata text at `data`, one '\n'-terminated line at a
// time, and hand lines starting with "meta:", "dc:" or "dcterms:" to
// HANDLE_FIELD.  Keyword-like fields and the title are sent as they are seen;
// the author is collected in author/author_len and sent at the end.
// NOTE(review): the embedded line numbers jump throughout this function, so
// the declarations of `p`, `eol` and `author`, the loop header and whatever
// advances `start` past each prefix are not visible in this view.
78 parse_metadata(const char* data
, size_t len
)
// Length of the author value recorded so far; starts at 0.
81 size_t author_len
= 0;
// One past the last byte to scan.
84 const char* end
= p
+ len
;
// Remember where the current line begins, then look for its terminating '\n'
// among the bytes that remain (start == p here, so end - start is the
// remaining byte count).
87 const char* start
= p
;
88 p
= static_cast<const char*>(memchr(p
, '\n', end
- start
));
// Dispatch on the namespace prefix.  These length guards are measured to
// `end` (end of the buffer), whereas the values handed to HANDLE_FIELD are
// delimited by `eol`.
94 if ((end
- start
) > 5 && memcmp(start
, "meta:", 5) == 0) {
98 // Use dc:creator in preference to meta:initial-creator.
100 HANDLE_FIELD(start
, eol
, "initial-creator",
105 HANDLE_FIELD(start
, eol
, "keyword", FIELD_KEYWORDS
);
109 } else if ((end
- start
) > 3 && memcmp(start
, "dc:", 3) == 0) {
113 // Use dc:creator in preference to meta:initial-creator.
114 HANDLE_FIELD(start
, eol
, "creator", author
, author_len
);
118 HANDLE_FIELD(start
, eol
, "subject", FIELD_KEYWORDS
);
122 HANDLE_FIELD(start
, eol
, "title", FIELD_TITLE
);
126 } else if ((end
- start
) > 8 && memcmp(start
, "dcterms:", 8) == 0) {
128 HANDLE_FIELD(start
, eol
, "available", FIELD_KEYWORDS
);
// Emit the author value collected above.
// NOTE(review): any guard on author_len around this call is not visible in
// this view.
133 send_field(FIELD_AUTHOR
, author
, author_len
);
// Report the number of entries in `pages` via send_field_page_count(), then
// send each entry's text as a FIELD_BODY field, in index order.
138 parse_content(const RVNGStringVector
& pages
)
140 auto page_count
= pages
.size();
141 send_field_page_count(page_count
);
// Index-based loop; `i` is unsigned while page_count has whatever type
// pages.size() returns.
142 for (unsigned i
= 0; i
< page_count
; ++i
) {
143 const RVNGString
& page
= pages
[i
];
// Passed as a (pointer, length) pair.
144 send_field(FIELD_BODY
, page
.cstr(), page
.size());
// Extract body text and then metadata from `input` by running
// MWAWDocument::parse() twice with two RVNGTextTextGenerator instances.
// NOTE(review): the declarations of `dump` and `metadata`, and whatever
// follows each error report (presumably an early return), are not visible in
// this view.
149 extract_text(RVNGFileStream
* input
)
// First pass: generator constructed with `false`; its output (`dump`) is sent
// as the document body.
152 RVNGTextTextGenerator
content_gen(dump
, false);
153 if (MWAWDocument::parse(input
, &content_gen
) != MWAWDocument::MWAW_R_OK
) {
154 send_field(FIELD_ERROR
, "Failed to extract text");
157 send_field(FIELD_BODY
, dump
.cstr(), dump
.size());
// Second pass over the same stream: generator constructed with `true`; its
// output (`metadata`) is handed to parse_metadata().
// NOTE(review): the boolean presumably selects document-info output - confirm
// against the librevenge generator documentation.
160 RVNGTextTextGenerator
metadata_gen(metadata
, true);
161 if (MWAWDocument::parse(input
, &metadata_gen
) != MWAWDocument::MWAW_R_OK
) {
162 send_field(FIELD_ERROR
, "Failed to extract metadata");
166 parse_metadata(metadata
.cstr(), metadata
.size());
// Extract metadata and then content from `input` using two
// RVNGTextSpreadsheetGenerator passes: the first (constructed with `true`)
// fills pages_metadata, each entry of which goes to parse_metadata(); the
// second (constructed with `false`) fills `pages`, which goes to
// parse_content().
// NOTE(review): whatever follows each error report (presumably an early
// return) is not visible in this view.
170 extract_spreadsheet(RVNGFileStream
* input
)
172 RVNGStringVector pages_metadata
;
173 RVNGTextSpreadsheetGenerator
metadata(pages_metadata
, true);
174 MWAWDocument::Result result
= MWAWDocument::parse(input
, &metadata
);
175 if (result
!= MWAWDocument::MWAW_R_OK
) {
176 send_field(FIELD_ERROR
, "Failed to extract metadata");
// Each entry produced by the metadata pass is parsed independently.
180 for (unsigned i
= 0; i
< pages_metadata
.size(); ++i
) {
181 const RVNGString
& page
= pages_metadata
[i
];
182 parse_metadata(page
.cstr(), page
.size());
// Second pass over the same stream, this time collecting content.
185 RVNGStringVector pages
;
186 RVNGTextSpreadsheetGenerator
content(pages
, false);
187 result
= MWAWDocument::parse(input
, &content
);
188 if (result
!= MWAWDocument::MWAW_R_OK
) {
189 send_field(FIELD_ERROR
, "Failed to extract text");
192 parse_content(pages
);
// Run MWAWDocument::parse() on `input` with an RVNGTextPresentationGenerator
// that collects its output into `pages`, report FIELD_ERROR if the result is
// not MWAW_R_OK, and pass `pages` to parse_content().
// NOTE(review): whatever follows the error report (presumably an early
// return) is not visible in this view.
196 extract_presentation(RVNGFileStream
* input
)
198 RVNGStringVector pages
;
199 RVNGTextPresentationGenerator
content(pages
);
200 MWAWDocument::Result result
= MWAWDocument::parse(input
, &content
);
201 if (result
!= MWAWDocument::MWAW_R_OK
) {
202 send_field(FIELD_ERROR
, "Failed to extract text");
205 parse_content(pages
);
// Run MWAWDocument::parse() on `input` with an RVNGTextDrawingGenerator that
// collects its output into `pages`, report FIELD_ERROR if the result is not
// MWAW_R_OK, and pass `pages` to parse_content().
// NOTE(review): whatever follows the error report (presumably an early
// return) is not visible in this view.
209 extract_drawing(RVNGFileStream
* input
)
211 RVNGStringVector pages
;
212 RVNGTextDrawingGenerator
content(pages
);
213 MWAWDocument::Result result
= MWAWDocument::parse(input
, &content
);
214 if (result
!= MWAWDocument::MWAW_R_OK
) {
215 send_field(FIELD_ERROR
, "Failed to extract text");
218 parse_content(pages
);
// Entry point for one file: opens `filename` as an RVNGFileStream, asks
// libmwaw whether the format is supported, and dispatches to the extractor
// matching the detected document kind.  The second parameter is unnamed and
// therefore unused here.
// NOTE(review): the switch header, the label preceding extract_drawing() and
// whatever follows the error report are not visible in this view (the
// embedded numbering jumps 238 -> 243 and 251 -> 254).
228 extract(const string
& filename
, const string
&)
230 // To store the kind and type of document
231 MWAWDocument::Kind kind
;
232 MWAWDocument::Type type
;
233 MWAWDocument::Confidence confidence
;
234 RVNGFileStream
input(filename
.c_str());
// `type` and `kind` are passed to isFileFormatSupported(), presumably as
// output arguments (they are uninitialised before the call).  Any confidence
// other than MWAW_C_EXCELLENT is reported as unsupported.
236 confidence
= MWAWDocument::isFileFormatSupported(&input
, type
, kind
);
237 if (confidence
!= MWAWDocument::MWAW_C_EXCELLENT
) {
238 send_field(FIELD_ERROR
, "File format not supported");
// Text documents.
243 case MWAWDocument::MWAW_K_TEXT
:
244 extract_text(&input
);
// Spreadsheets and databases share the spreadsheet extractor.
246 case MWAWDocument::MWAW_K_SPREADSHEET
:
247 case MWAWDocument::MWAW_K_DATABASE
:
248 extract_spreadsheet(&input
);
250 case MWAWDocument::MWAW_K_PRESENTATION
:
251 extract_presentation(&input
);
// Reached via a label that is not visible in this view.
254 extract_drawing(&input
);