/** @file
 * @brief Extract text and metadata using libgepub
 */
/* Copyright (C) 2023 Olly Betts
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
 */
26 #include "htmlparser.h"
28 #include <gepub-doc.h>
30 #include <string_view>
// send_glib_field(): forwards a GLib-typed string (`gchar* data`) to
// send_field() under the given `field`.
//
// NOTE(review): only the parameter list and the send_field() call are
// visible in this view.  The return type, the braces, and any NULL check on
// or release of `data` are not shown here -- confirm against the complete
// file before relying on ownership or NULL-handling behaviour.
35 send_glib_field(Field field
, gchar
* data
)
38 send_field(field
, data
);
// extract(): opens the file named by `filename` with libgepub and reports
// its contents through the send_field*() helpers: a page count, one body
// field per chapter, then author/keywords/title metadata.  The second string
// parameter is unnamed and is not used by any visible line.
//
// NOTE(review): this view of the function is incomplete.  The declarations
// of `e`, `size` and `parser`, the enclosing braces, the `try` matching the
// `catch` below, and any error-path return or cleanup calls are not visible,
// so the comments here describe only the statements that are shown.
50 extract(const string
& filename
, const string
&)
// Open the document.  `&e` is passed as an out-parameter for error details
// (presumably `e` is a GError* -- its declaration is not visible here).
53 GepubDoc
* doc
= gepub_doc_new(filename
.c_str(), &e
);
// Failure report: sent as two FIELD_ERROR fields, a fixed prefix followed by
// the message carried in `e`.
// NOTE(review): the condition guarding these two calls is not visible;
// presumably they are reached only when `doc` is NULL -- confirm.
55 send_field(FIELD_ERROR
, "gepub_doc_new() failed: ");
56 send_field(FIELD_ERROR
, e
->message
);
61 int chapters
= gepub_doc_get_n_chapters(doc
);
62 // "Page count" is not a concept which seems to perfectly fit with an EPUB
63 // so (at least for now) we report the number of chapters.
64 send_field_page_count(chapters
);
// Walk the chapters in index order: select chapter i, fetch its content,
// parse it, and send the result as a body field.
65 for (int i
= 0; i
< chapters
; ++i
) {
66 gepub_doc_set_chapter(doc
, i
);
67 GBytes
* html
= gepub_doc_get_current(doc
);
// View the chapter bytes as a char buffer; g_bytes_get_data() also writes
// the buffer length through `&size`.
69 auto data
= static_cast<const char*>(g_bytes_get_data(html
, &size
));
// First attempt: parse the buffer as "utf-8".
72 parser
.parse(string_view(data
, size
), "utf-8", false);
// The handler catches a string, which is used as the charset for a second
// parse of the same buffer with the final argument set to true.
// NOTE(review): the meaning of that boolean is not shown here -- check the
// parse() declaration in htmlparser.h.
73 } catch (const string
& newcharset
) {
74 parser
.parse(string_view(data
, size
), newcharset
, true);
// `dump` is read as a data member of the parser (no call parentheses).
76 send_field(FIELD_BODY
, parser
.dump
);
// Metadata: author and title map to the same-named fields, while the EPUB
// description (GEPUB_META_DESC) is sent as FIELD_KEYWORDS.
81 send_glib_field(FIELD_AUTHOR
,
82 gepub_doc_get_metadata(doc
, GEPUB_META_AUTHOR
));
83 send_glib_field(FIELD_KEYWORDS
,
84 gepub_doc_get_metadata(doc
, GEPUB_META_DESC
));
85 send_glib_field(FIELD_TITLE
,
86 gepub_doc_get_metadata(doc
, GEPUB_META_TITLE
));