2 * @brief Extract text and metadata from Apple documents using libetonyek.
4 /* Copyright (C) 2019 Bruno Baruffaldi
5 * Copyright (C) 2022,2023 Olly Betts
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2 of the
10 * License, or (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
24 #include "stringutils.h"
28 #include <librevenge-generators/librevenge-generators.h>
29 #include <librevenge-stream/librevenge-stream.h>
30 #include <libetonyek/libetonyek.h>
// Convenience wrapper around handle_field(): passes FIELD together with its
// length as given by CONST_STRLEN(FIELD), then forwards the remaining
// arguments unchanged.  The number of trailing arguments decides which
// handle_field() overload is called.
//
// This uses the standard variadic macro form (`...` with __VA_ARGS__) rather
// than a GNU-style named variadic parameter, so it doesn't rely on a compiler
// extension.  At least one trailing argument must be supplied.
#define HANDLE_FIELD(START, END, FIELD, ...) \
    handle_field((START), (END), (FIELD), (CONST_STRLEN(FIELD)), __VA_ARGS__)
35 using libetonyek::EtonyekDocument
;
36 using namespace librevenge
;
39 // Handle a field for which we only take a single value - we avoid copying in
// this case.
//
// If [start, end) is longer than len bytes and begins with the len bytes at
// field, whitespace at start is skipped, one trailing '\r' is dropped, and -
// provided something remains - the length of what is left is stored in
// out_len.
42 handle_field(const char* start
,
// Only proceed when the input is longer than len and starts with field.
49 if (size_t(end
- start
) > len
&& memcmp(start
, field
, len
) == 0) {
// Skip whitespace at start.
// NOTE(review): isspace() receives a plain char here; a byte >= 0x80 is
// negative where char is signed, which isspace() doesn't allow - consider
// casting to unsigned char.
51 while (start
!= end
&& isspace(*start
)) start
++;
// Ignore an empty value; a trailing '\r' is dropped first (presumably left
// over from a CRLF line ending).
52 if (start
!= end
&& (end
[-1] != '\r' || --end
!= start
)) {
// Report the length of the value which remains.
54 out_len
= end
- start
;
59 // Handle a field for which we process multiple instances. We just send each
60 // occurrence as we see it.
//
// If [start, end) is longer than len bytes and begins with the len bytes at
// field, whitespace at start is skipped, one trailing '\r' is dropped, and -
// provided something remains - what is left is passed to send_field() with
// the field code given by code.
62 handle_field(const char* start
,
// Only proceed when the input is longer than len and starts with field.
68 if (size_t(end
- start
) > len
&& memcmp(start
, field
, len
) == 0) {
// Skip whitespace at start.
// NOTE(review): isspace() receives a plain char here; a byte >= 0x80 is
// negative where char is signed, which isspace() doesn't allow - consider
// casting to unsigned char.
70 while (start
!= end
&& isspace(*start
)) start
++;
// Ignore an empty value; a trailing '\r' is dropped first (presumably left
// over from a CRLF line ending).
71 if (start
!= end
&& (end
[-1] != '\r' || --end
!= start
)) {
// Send this occurrence straight away.
72 send_field(code
, start
, end
- start
);
// Parse the metadata text in data (len bytes).
//
// Entries are recognised by a "meta:", "dc:" or "dcterms:" prefix and handed
// to HANDLE_FIELD():
//   meta:initial-creator, dc:creator -> author (sent once, as FIELD_AUTHOR)
//   meta:keyword, dc:subject, dcterms:available -> FIELD_KEYWORDS
//   dc:title -> FIELD_TITLE
78 parse_metadata(const char* data
, size_t len
)
// Length of the author value found so far (0 to start with).
81 size_t author_len
= 0;
// One past the last byte of the metadata.
84 const char* end
= p
+ len
;
// start marks the beginning of the current entry.
87 const char* start
= p
;
// Look for the newline ending the current entry; memchr() returns a null
// pointer if there isn't one.
88 p
= static_cast<const char*>(memchr(p
, '\n', end
- start
));
// The length tests in this if/else chain are made against end (the end of
// the whole buffer), so each memcmp() stays inside the buffer; the field
// values themselves are delimited by eol in the HANDLE_FIELD() calls.
94 if ((end
- start
) > 5 && memcmp(start
, "meta:", 5) == 0) {
98 // Use dc:creator in preference to meta:initial-creator.
100 HANDLE_FIELD(start
, eol
, "initial-creator",
// Keywords may occur several times; each occurrence is sent as it is seen.
105 HANDLE_FIELD(start
, eol
, "keyword", FIELD_KEYWORDS
);
109 } else if ((end
- start
) > 3 && memcmp(start
, "dc:", 3) == 0) {
113 // Use dc:creator in preference to meta:initial-creator.
114 HANDLE_FIELD(start
, eol
, "creator", author
, author_len
);
// dc:subject is sent as keywords.
118 HANDLE_FIELD(start
, eol
, "subject", FIELD_KEYWORDS
);
122 HANDLE_FIELD(start
, eol
, "title", FIELD_TITLE
);
126 } else if ((end
- start
) > 8 && memcmp(start
, "dcterms:", 8) == 0) {
// dcterms:available is also sent as keywords.
128 HANDLE_FIELD(start
, eol
, "available", FIELD_KEYWORDS
);
// Send the author value which was recorded above.
133 send_field(FIELD_AUTHOR
, author
, author_len
);
// Extract the text of the document read from input using librevenge's text
// presentation generator.
//
// If parsing fails, FIELD_ERROR is sent.  The number of entries the generator
// produced is sent as the page count and each entry is sent as FIELD_BODY.
138 extract_key(RVNGInputStream
* input
)
// The generator appends its output to content.
140 RVNGStringVector content
;
141 RVNGTextPresentationGenerator
document(content
);
142 if (!EtonyekDocument::parse(input
, &document
)) {
143 send_field(FIELD_ERROR
, "Failed to extract text");
146 unsigned size
= content
.size();
147 // Use the number of slides as the page count.
148 send_field_page_count(size
);
// Send the text of each slide in turn.
149 for (unsigned i
= 0; i
< size
; ++i
) {
150 const RVNGString
& slide
= content
[i
];
151 send_field(FIELD_BODY
, slide
.cstr(), slide
.size());
// Extract the text of the document read from input using librevenge's text
// spreadsheet generator.
//
// If parsing fails, FIELD_ERROR is sent.  The number of entries the generator
// produced is sent as the page count and each entry is sent as FIELD_BODY.
156 extract_numbers(RVNGInputStream
* input
)
// The generator appends its output to content.
158 RVNGStringVector content
;
159 RVNGTextSpreadsheetGenerator
document(content
);
161 if (!EtonyekDocument::parse(input
, &document
)) {
162 send_field(FIELD_ERROR
, "Failed to extract text");
165 unsigned size
= content
.size();
166 // Use the number of sheets as the page count.
167 send_field_page_count(size
);
// Send the text of each entry in turn (the local is called slide, but going
// by the comment above each entry here is a sheet).
168 for (unsigned i
= 0; i
< size
; ++i
) {
169 const RVNGString
& slide
= content
[i
];
170 send_field(FIELD_BODY
, slide
.cstr(), slide
.size());
// Extract metadata and body text from the document read from input.
//
// The document is parsed twice with librevenge's text generator: once to
// collect metadata, which is passed to parse_metadata(), and once to collect
// the body text, which is sent as FIELD_BODY.  FIELD_ERROR is sent if a parse
// fails.
175 extract_pages(RVNGInputStream
* input
)
// dump receives the body text and metadata receives the metadata text.
177 RVNGString dump
, metadata
;
// Extract metadata.  NOTE(review): the second constructor argument (true
// here, false below) presumably selects document-info output rather than body
// text - confirm against the librevenge documentation.
180 RVNGTextTextGenerator
data(metadata
, true);
181 if (!EtonyekDocument::parse(input
, &data
)) {
182 send_field(FIELD_ERROR
, "Failed to extract metadata");
185 parse_metadata(metadata
.cstr(), metadata
.size());
187 // Extract body text.
188 RVNGTextTextGenerator
content(dump
, false);
189 if (!EtonyekDocument::parse(input
, &content
)) {
190 send_field(FIELD_ERROR
, "Failed to extract text");
// Send the whole body text as a single field.
194 send_field(FIELD_BODY
, dump
.cstr(), dump
.size());
// Extract text and metadata from the document at filename.
//
// Opens filename as a librevenge input stream, asks libetonyek which type of
// document it is, and hands the stream to extract_pages(), extract_numbers()
// or extract_key() as appropriate.  FIELD_ERROR is sent if the format can't
// be detected or isn't one of those handled here.  The second parameter is
// unnamed and not used.
204 extract(const string
& filename
, const string
&)
// Owns the input stream for the duration of the extraction.
206 unique_ptr
<RVNGInputStream
> input
;
// A directory is opened with a directory stream; the file stream below is
// for the other case.
208 if (RVNGDirectoryStream::isDirectory(filename
.c_str()))
209 input
.reset(new RVNGDirectoryStream(filename
.c_str()));
211 input
.reset(new RVNGFileStream(filename
.c_str()));
// isSupported() reports how confident it is that the input is supported and
// stores the detected document type in type.
213 EtonyekDocument::Type type
= EtonyekDocument::TYPE_UNKNOWN
;
214 auto confidence
= EtonyekDocument::isSupported(input
.get(), &type
);
216 if (confidence
== EtonyekDocument::CONFIDENCE_NONE
) {
217 send_field(FIELD_ERROR
, "Format couldn't be detected");
// Hand over to the extractor for the detected document type.
222 case EtonyekDocument::TYPE_PAGES
:
223 extract_pages(input
.get());
225 case EtonyekDocument::TYPE_NUMBERS
:
226 extract_numbers(input
.get());
228 case EtonyekDocument::TYPE_KEYNOTE
:
229 extract_key(input
.get());
// Reported for a document type with no extractor above.
232 send_field(FIELD_ERROR
, "Format not supported");