2 * @brief Extract text and metadata using libabw.
4 /* Copyright (C) 2019 Bruno Baruffaldi
5 * Copyright (C) 2020 Parth Kapadia
6 * Copyright (C) 2022,2023 Olly Betts
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
25 #include "stringutils.h"
27 #include <librevenge-generators/librevenge-generators.h>
28 #include <librevenge-stream/librevenge-stream.h>
29 #include <libabw/libabw.h>
// Convenience wrapper for handle_field(): passes CONST_STRLEN(FIELD) as the
// field-name length so call sites only spell the field name once, and
// forwards the remaining OUT arguments unchanged (they select which
// handle_field() overload is called).
// NOTE(review): FIELD is presumably always a string literal and CONST_STRLEN
// presumably comes from stringutils.h - confirm.
// NOTE(review): the named variadic parameter `OUT...` is a GNU extension, not
// standard C++ (which uses `...` and __VA_ARGS__) - confirm that every
// supported compiler accepts it.
31 #define HANDLE_FIELD(START, END, FIELD, OUT...) \
32 handle_field((START), (END), (FIELD), (CONST_STRLEN(FIELD)), OUT)
34 using namespace librevenge
;
37 // Handle a field for which we only take a single value - we avoid copying in
//
// The field matches when more than len bytes lie between start and end and
// the first len bytes at start equal field.  Leading whitespace is then
// skipped, and if the value is still non-empty once a single trailing '\r'
// has been dropped, its length is stored in out_len.
// NOTE(review): not every line of this function is visible here.  Presumably
// start is advanced past the field name before the whitespace skip, and a
// pointer to the value is stored alongside out_len - confirm against the
// complete file.
40 handle_field(const char* start
,
47 if (size_t(end
- start
) > len
&& memcmp(start
, field
, len
) == 0) {
// Skip whitespace preceding the value.
// NOTE(review): isspace() has undefined behaviour when passed a negative
// value, which *start can be for bytes >= 0x80 if char is signed; consider
// casting to unsigned char.
49 while (start
!= end
&& isspace(*start
)) start
++;
// Only accept a non-empty value.  If the last byte is '\r' then --end drops
// it, and the value must still be non-empty afterwards.
50 if (start
!= end
&& (end
[-1] != '\r' || --end
!= start
)) {
52 out_len
= end
- start
;
57 // Handle a field for which we process multiple instances. We just send each
58 // occurrence as we see it.
//
// The field matches when more than len bytes lie between start and end and
// the first len bytes at start equal field.  Leading whitespace is then
// skipped, and if the value is still non-empty once a single trailing '\r'
// has been dropped, it is passed to send_field() under code.
// NOTE(review): not every line of this function is visible here.  Presumably
// start is advanced past the field name before the whitespace skip - confirm
// against the complete file.
60 handle_field(const char* start
,
66 if (size_t(end
- start
) > len
&& memcmp(start
, field
, len
) == 0) {
// Skip whitespace preceding the value.
// NOTE(review): isspace() has undefined behaviour when passed a negative
// value, which *start can be for bytes >= 0x80 if char is signed; consider
// casting to unsigned char.
68 while (start
!= end
&& isspace(*start
)) start
++;
// Only accept a non-empty value.  If the last byte is '\r' then --end drops
// it, and the value must still be non-empty afterwards.
69 if (start
!= end
&& (end
[-1] != '\r' || --end
!= start
)) {
70 send_field(code
, start
, end
- start
);
// Parse the metadata text in data (len bytes) and report recognised fields.
//
// The text is split at '\n' and each piece is checked for one of the
// prefixes "meta:", "dc:" or "dcterms:".  The fields handled are:
//   meta:initial-creator  -> author (fallback)
//   meta:keyword          -> FIELD_KEYWORDS
//   dc:creator            -> author (preferred)
//   dc:subject            -> FIELD_KEYWORDS
//   dc:title              -> FIELD_TITLE
//   dcterms:available     -> FIELD_KEYWORDS
// Keywords and title are sent as they are encountered; the author is
// remembered as a pointer/length pair and sent as FIELD_AUTHOR afterwards.
// NOTE(review): the loop header, the computation of eol and the code which
// dispatches on the character after the prefix are not visible here -
// confirm the details against the complete file.
76 parse_metadata(const char* data
, size_t len
)
// A zero author_len means no author has been recorded.
79 size_t author_len
= 0;
82 const char* end
= p
+ len
;
// start marks the beginning of the current line; since start == p at this
// point, end - start is the number of bytes left to search for '\n'.
85 const char* start
= p
;
86 p
= static_cast<const char*>(memchr(p
, '\n', end
- start
));
// The prefix length checks below are made against the end of the whole
// buffer, so each memcmp() stays within the data.
92 if ((end
- start
) > 5 && memcmp(start
, "meta:", 5) == 0) {
96 // Use dc:creator in preference to meta:initial-creator.
98 HANDLE_FIELD(start
, eol
, "initial-creator",
103 HANDLE_FIELD(start
, eol
, "keyword", FIELD_KEYWORDS
);
107 } else if ((end
- start
) > 3 && memcmp(start
, "dc:", 3) == 0) {
111 // Use dc:creator in preference to meta:initial-creator.
112 HANDLE_FIELD(start
, eol
, "creator", author
, author_len
);
116 HANDLE_FIELD(start
, eol
, "subject", FIELD_KEYWORDS
);
120 HANDLE_FIELD(start
, eol
, "title", FIELD_TITLE
);
124 } else if ((end
- start
) > 8 && memcmp(start
, "dcterms:", 8) == 0) {
// NOTE(review): dcterms:available is reported as keywords here - confirm
// that this is the intended field.
126 HANDLE_FIELD(start
, eol
, "available", FIELD_KEYWORDS
);
// Report whichever author value was recorded above.
// NOTE(review): presumably guarded by a check that author_len is non-zero -
// the guard is not visible here.
131 send_field(FIELD_AUTHOR
, author
, author_len
);
// Extract metadata and body text from the document at filename using libabw,
// reporting everything through send_field().
//
// The second parameter is unnamed and is not used.
//
// Failures are reported as FIELD_ERROR with one of the messages
// "Format not supported", "Failed to extract metadata" or
// "Failed to extract text".
// NOTE(review): presumably the function returns straight after reporting an
// error - the return statements are not visible here.
142 extract(const string
& filename
, const string
&)
144 RVNGFileStream
input(filename
.c_str());
// Reject files which libabw doesn't recognise before attempting to parse.
146 if (!libabw::AbiDocument::isFileFormatSupported(&input
)) {
147 send_field(FIELD_ERROR
, "Format not supported");
150 RVNGString metadata
, dump
;
// First pass: collect the document metadata as text.
// NOTE(review): the `true` argument presumably selects librevenge's
// "document info" output mode rather than body text - confirm against the
// RVNGTextTextGenerator documentation.
152 RVNGTextTextGenerator
metadata_gen(metadata
, true);
153 if (!libabw::AbiDocument::parse(&input
, &metadata_gen
)) {
154 send_field(FIELD_ERROR
, "Failed to extract metadata");
157 parse_metadata(metadata
.cstr(), metadata
.size());
159 // Extract body text.
// Second pass over the same input stream, this time with the generator's
// flag set to false.
// NOTE(review): this relies on parse() being usable twice on one stream -
// presumably libabw rewinds the input itself; confirm.
160 RVNGTextTextGenerator
content(dump
, false);
161 if (!libabw::AbiDocument::parse(&input
, &content
)) {
162 send_field(FIELD_ERROR
, "Failed to extract text");
165 send_field(FIELD_BODY
, dump
.cstr(), dump
.size());