Fix tg_termpos1 for 64-bit termpos
[xapian.git] / xapian-applications / omega / handler_libabw.cc
blob7fdf409d59f0c1bbd1d887ea326ff4ff953219f2
/** @file
 * @brief Extract text and metadata using libabw.
 */
/* Copyright (C) 2019 Bruno Baruffaldi
 * Copyright (C) 2020 Parth Kapadia
 * Copyright (C) 2022,2023 Olly Betts
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
23 #include <config.h>
24 #include "handler.h"
25 #include "stringutils.h"
27 #include <librevenge-generators/librevenge-generators.h>
28 #include <librevenge-stream/librevenge-stream.h>
29 #include <libabw/libabw.h>
// Call handle_field() on the range [START, END) for the string-literal field
// name FIELD, passing the name's length (computed with CONST_STRLEN) along
// with it.  The remaining arguments select the handle_field() overload:
// either an (out, out_len) pair or a Field code.
//
// Uses standard __VA_ARGS__ rather than the GNU-only named variadic parameter
// syntax so that the macro is portable to other compilers.
#define HANDLE_FIELD(START, END, FIELD, ...) \
    handle_field((START), (END), (FIELD), (CONST_STRLEN(FIELD)), __VA_ARGS__)
34 using namespace librevenge;
35 using namespace std;
// Handle a field for which we only take a single value - we avoid copying in
// this case.
//
// If [start, end) is longer than `len` bytes and begins with the `len` bytes
// at `field`, skip the field name and any whitespace which follows it, drop
// one trailing '\r' (if present), and if anything is left point `out` at it
// and store its length in `out_len`.  The value is not copied: `out` points
// into the caller's buffer.  If the field name doesn't match or the value is
// empty, `out` and `out_len` are left untouched.
static void
handle_field(const char* start,
             const char* end,
             const char* field,
             size_t len,
             const char*& out,
             size_t& out_len)
{
    if (size_t(end - start) > len && memcmp(start, field, len) == 0) {
        start += len;
        // Convert to unsigned char before calling isspace(): passing a
        // negative value (any non-ASCII byte where char is signed) is
        // undefined behaviour.
        while (start != end && isspace(static_cast<unsigned char>(*start)))
            start++;
        // A lone '\r' has already been consumed as whitespace above, so the
        // `--end != start` test only matters for a value followed by '\r'.
        if (start != end && (end[-1] != '\r' || --end != start)) {
            out = start;
            out_len = end - start;
        }
    }
}
57 // Handle a field for which we process multiple instances. We just send each
58 // occurrence as we see it.
59 static void
60 handle_field(const char* start,
61 const char* end,
62 const char* field,
63 size_t len,
64 Field code)
66 if (size_t(end - start) > len && memcmp(start, field, len) == 0) {
67 start += len;
68 while (start != end && isspace(*start)) start++;
69 if (start != end && (end[-1] != '\r' || --end != start)) {
70 send_field(code, start, end - start);
75 static void
76 parse_metadata(const char* data, size_t len)
78 const char* author;
79 size_t author_len = 0;
81 const char* p = data;
82 const char* end = p + len;
84 while (p != end) {
85 const char* start = p;
86 p = static_cast<const char*>(memchr(p, '\n', end - start));
87 const char* eol;
88 if (p)
89 eol = p++;
90 else
91 p = eol = end;
92 if ((end - start) > 5 && memcmp(start, "meta:", 5) == 0) {
93 start += 5;
94 switch (*start) {
95 case 'i': {
96 // Use dc:creator in preference to meta:initial-creator.
97 if (!author_len)
98 HANDLE_FIELD(start, eol, "initial-creator",
99 author, author_len);
100 break;
102 case 'k': {
103 HANDLE_FIELD(start, eol, "keyword", FIELD_KEYWORDS);
104 break;
107 } else if ((end - start) > 3 && memcmp(start, "dc:", 3) == 0) {
108 start += 3;
109 switch (*start) {
110 case 'c': {
111 // Use dc:creator in preference to meta:initial-creator.
112 HANDLE_FIELD(start, eol, "creator", author, author_len);
113 break;
115 case 's': {
116 HANDLE_FIELD(start, eol, "subject", FIELD_KEYWORDS);
117 break;
119 case 't': {
120 HANDLE_FIELD(start, eol, "title", FIELD_TITLE);
121 break;
124 } else if ((end - start) > 8 && memcmp(start, "dcterms:", 8) == 0) {
125 start += 8;
126 HANDLE_FIELD(start, eol, "available", FIELD_KEYWORDS);
130 if (author_len) {
131 send_field(FIELD_AUTHOR, author, author_len);
// No set-up is performed here, so this always returns true.
bool initialise() { return true; }
141 void
142 extract(const string& filename, const string&)
144 RVNGFileStream input(filename.c_str());
146 if (!libabw::AbiDocument::isFileFormatSupported(&input)) {
147 send_field(FIELD_ERROR, "Format not supported");
150 RVNGString metadata, dump;
152 RVNGTextTextGenerator metadata_gen(metadata, true);
153 if (!libabw::AbiDocument::parse(&input, &metadata_gen)) {
154 send_field(FIELD_ERROR, "Failed to extract metadata");
155 return;
157 parse_metadata(metadata.cstr(), metadata.size());
159 // Extract body text.
160 RVNGTextTextGenerator content(dump, false);
161 if (!libabw::AbiDocument::parse(&input, &content)) {
162 send_field(FIELD_ERROR, "Failed to extract text");
163 return;
165 send_field(FIELD_BODY, dump.cstr(), dump.size());