Fix tg_termpos1 for 64-bit termpos
[xapian.git] / xapian-applications / omega / handler_libextractor.cc
blobc0fcee184e32bd7c8e96ed4bb20d8abf6588c9a9
1 /** @file
2 * @brief Extract metadata using libextractor.
3 */
4 /* Copyright (C) 2020 Parth Kapadia
5 * Copyright (C) 2022,2023 Olly Betts
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2 of the
10 * License, or (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
20 * USA
22 #include <config.h>
23 #include "handler.h"
24 #include "parseint.h"
26 #include <extractor.h>
28 using namespace std;
30 /** Store metadata in its corresponding variable.
32 * @param cls last parameter from EXTRACTOR_extract (unused)
33 * @param plugin_name name of the plugin (unused)
34 * @param type mime-type of file according to libextractor
35 * @param format format information about data
36 * @param data_mime_type mimetype according to libextractor (unused)
37 * @param data actual meta-data found
38 * @param data_len number of bytes in data
40 static int
41 process_metadata(void*,
42 const char*,
43 enum EXTRACTOR_MetaType type,
44 enum EXTRACTOR_MetaFormat format,
45 const char*,
46 const char* data,
47 size_t data_len)
49 switch (format) {
50 case EXTRACTOR_METAFORMAT_UTF8:
51 break;
53 default:
54 // specific encoding unknown
55 // EXTRACTOR_METAFORMAT_UNKNOWN
56 // EXTRACTOR_METAFORMAT_BINARY
57 // EXTRACTOR_METAFORMAT_C_STRING
58 return 0;
61 // "data_len is strlen (data)+1"!
62 --data_len;
64 switch (type) {
65 case EXTRACTOR_METATYPE_BOOK_TITLE:
66 case EXTRACTOR_METATYPE_JOURNAL_NAME:
67 case EXTRACTOR_METATYPE_ORIGINAL_TITLE:
68 case EXTRACTOR_METATYPE_SUBJECT:
69 case EXTRACTOR_METATYPE_SUBTITLE:
70 case EXTRACTOR_METATYPE_TITLE:
71 send_field(FIELD_TITLE, data, data_len);
72 break;
74 case EXTRACTOR_METATYPE_PAGE_COUNT: {
75 unsigned p;
76 if (parse_unsigned(data, p)) {
77 send_field_page_count(int(p));
79 break;
82 case EXTRACTOR_METATYPE_ARTIST:
83 case EXTRACTOR_METATYPE_AUTHOR_NAME:
84 case EXTRACTOR_METATYPE_COMPOSER:
85 case EXTRACTOR_METATYPE_CONDUCTOR:
86 case EXTRACTOR_METATYPE_CREATOR:
87 case EXTRACTOR_METATYPE_MOVIE_DIRECTOR:
88 case EXTRACTOR_METATYPE_ORIGINAL_ARTIST:
89 case EXTRACTOR_METATYPE_ORIGINAL_PERFORMER:
90 case EXTRACTOR_METATYPE_ORIGINAL_WRITER:
91 case EXTRACTOR_METATYPE_PERFORMER:
92 case EXTRACTOR_METATYPE_WRITER:
93 send_field(FIELD_AUTHOR, data, data_len);
94 break;
96 case EXTRACTOR_METATYPE_KEYWORDS:
97 send_field(FIELD_KEYWORDS, data, data_len);
98 break;
100 case EXTRACTOR_METATYPE_ABSTRACT:
101 case EXTRACTOR_METATYPE_COMMENT:
102 case EXTRACTOR_METATYPE_DESCRIPTION:
103 case EXTRACTOR_METATYPE_LYRICS:
104 case EXTRACTOR_METATYPE_SUMMARY:
105 send_field(FIELD_BODY, data, data_len);
106 break;
108 default:
109 // Ignore other metadata.
110 break;
112 return 0;
115 static struct EXTRACTOR_PluginList* plugins;
117 bool
118 initialise()
120 // Add all default plugins.
121 plugins = EXTRACTOR_plugin_add_defaults(EXTRACTOR_OPTION_DEFAULT_POLICY);
122 return plugins != nullptr;
125 void
126 extract(const string& filename, const string&)
128 // If plugin not found/ File format not recognised/ corrupt file
129 // no data is extracted, rather than reporting an error.
130 EXTRACTOR_extract(plugins, filename.c_str(),
131 nullptr, 0,
132 &process_metadata, nullptr);