Fix tg_termpos1 for 64-bit termpos
[xapian.git] / xapian-applications / omega / handler_libgepub.cc
blobab063917129ab94f33f6f70d0881b81a22821120
1 /** @file
2 * @brief Extract text and metadata using libgepub
3 */
4 /* Copyright (C) 2023 Olly Betts
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation; either version 2 of the
9 * License, or (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
19 * USA
22 #include <config.h>
24 #include "handler.h"
26 #include "htmlparser.h"
28 #include <gepub-doc.h>
30 #include <string_view>
32 using namespace std;
34 static void
35 send_glib_field(Field field, gchar* data)
37 if (data) {
38 send_field(field, data);
39 g_free(data);
/** Initialisation entry point for this handler.
 *
 *  No work is performed here.
 *
 *  @return Always true.
 */
bool
initialise()
{
    return true;
}
49 void
50 extract(const string& filename, const string&)
52 GError* e = nullptr;
53 GepubDoc* doc = gepub_doc_new(filename.c_str(), &e);
54 if (!doc) {
55 send_field(FIELD_ERROR, "gepub_doc_new() failed: ");
56 send_field(FIELD_ERROR, e->message);
57 g_error_free(e);
58 return;
61 int chapters = gepub_doc_get_n_chapters(doc);
62 // "Page count" is not a concept which seems to perfectly fit with an EPUB
63 // so (at least for now) we report the number of chapters.
64 send_field_page_count(chapters);
65 for (int i = 0; i < chapters; ++i) {
66 gepub_doc_set_chapter(doc, i);
67 GBytes* html = gepub_doc_get_current(doc);
68 gsize size;
69 auto data = static_cast<const char*>(g_bytes_get_data(html, &size));
70 HtmlParser parser;
71 try {
72 parser.parse(string_view(data, size), "utf-8", false);
73 } catch (const string& newcharset) {
74 parser.parse(string_view(data, size), newcharset, true);
76 send_field(FIELD_BODY, parser.dump);
77 g_bytes_unref(html);
80 // Extract metadata.
81 send_glib_field(FIELD_AUTHOR,
82 gepub_doc_get_metadata(doc, GEPUB_META_AUTHOR));
83 send_glib_field(FIELD_KEYWORDS,
84 gepub_doc_get_metadata(doc, GEPUB_META_DESC));
85 send_glib_field(FIELD_TITLE,
86 gepub_doc_get_metadata(doc, GEPUB_META_TITLE));
88 g_object_unref(doc);