2 * @brief Extract text and metadata using poppler.
4 /* Copyright (C) 2019 Bruno Baruffaldi
5 * Copyright (C) 2022,2023 Olly Betts
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2 of the
10 * License, or (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
27 #include <poppler-document.h>
28 #include <poppler-page.h>
// Build a URI for `filename`: first obtain an absolute form of the path, then
// pass it to g_filename_to_uri().  `e` is handed through unchanged as that
// call's GError out-parameter.
// NOTE(review): this listing is an extract with lines missing (the return
// type, braces, any #else/#endif and the return statement are not visible), so
// the comments here describe only the statements that can actually be seen.
33 convert_to_uri(const string
& filename
, GError
** e
)
// When the GLib headers are version 2.58.0 or newer, a single call to
// g_canonicalize_filename() produces the absolute path.  NULL is passed as the
// base directory; per the GLib documentation that selects the current working
// directory.
35 #if GLIB_CHECK_VERSION(2,58,0)
36 gchar
* abs_filename
= g_canonicalize_filename(filename
.c_str(), NULL
);
// Hand-written equivalent -- presumably the branch for older GLib, though the
// #else directive is not visible in this extract.  A path that is already
// absolute is simply duplicated...
39 if (g_path_is_absolute(filename
.c_str())) {
40 abs_filename
= g_strdup(filename
.c_str());
// ...while any other path is joined onto the current working directory.
// NOTE(review): no g_free(cwd) is visible here -- confirm it is released in
// the omitted lines.
42 gchar
* cwd
= g_get_current_dir();
43 abs_filename
= g_build_filename(cwd
, filename
.c_str(), NULL
);
// Convert the absolute path to a URI.  The NULL second argument is the
// hostname parameter of g_filename_to_uri(); failures are reported via `e`.
// NOTE(review): releasing abs_filename and returning `uri` are not visible in
// this extract -- confirm against the full file.
47 gchar
* uri
= g_filename_to_uri(abs_filename
, NULL
, e
);
// Forward the string `data` to send_field() under the given `field`.
// The callers visible in this file pass the results of the
// poppler_document_get_author/title/keywords() calls.
// NOTE(review): only the forwarding call is visible in this extract.  The name
// suggests this wrapper exists to deal with GLib-allocated strings (e.g. a
// NULL check and a g_free() after sending), but neither can be seen here --
// confirm against the full file.
53 send_glib_field(Field field
, gchar
* data
)
56 send_field(field
, data
);
// Extract the text and metadata of the PDF at `filename` using poppler's GLib
// API and emit them through the send_field*() helpers.
// The second parameter is unnamed and is not used by any visible statement.
// NOTE(review): this listing is an extract with lines missing -- the return
// type, the declaration of `e`, the conditions guarding each error report, any
// early exits and any cleanup calls are not visible.  The comments here
// describe only the statements that can be seen.
68 extract(const string
& filename
, const string
&)
// poppler_document_new_from_file() takes a URI, so convert the filename first.
71 gchar
* uri
= convert_to_uri(filename
, &e
);
// Error report for the URI conversion: a fixed prefix followed by the GError
// message, sent as two separate FIELD_ERROR fields.
73 send_field(FIELD_ERROR
, "g_filename_to_uri() failed: ");
74 send_field(FIELD_ERROR
, e
->message
);
// Open the document.  The NULL second argument is the password parameter of
// poppler_document_new_from_file(), i.e. no password is supplied.
79 PopplerDocument
* doc
= poppler_document_new_from_file(uri
, NULL
, &e
);
// Error report for opening the document, in the same prefix + message form.
82 send_field(FIELD_ERROR
, "poppler_document_new_from_file() failed: ");
83 send_field(FIELD_ERROR
, e
->message
);
// Report the page count, then walk the pages by zero-based index.
88 int pages
= poppler_document_get_n_pages(doc
);
89 send_field_page_count(pages
);
90 // Extracting text from PDF file
91 for (int i
= 0; i
< pages
; ++i
) {
92 PopplerPage
* page
= poppler_document_get_page(doc
, i
);
// Error report naming the page index that could not be loaded.  Whether the
// loop stops or continues afterwards is not visible in this extract.
95 send_field(FIELD_ERROR
, "Failed to get page " + str(i
));
// Emit this page's text as body content.
// NOTE(review): the poppler docs say the string returned by
// poppler_page_get_text() must be released with g_free(), yet it is passed
// straight to send_field() here, whereas the metadata strings below go through
// send_glib_field().  Confirm the page text is not leaked, and that `page` is
// unreferenced in the omitted lines.
98 send_field(FIELD_BODY
, poppler_page_get_text(page
));
102 // Extract PDF metadata.
// Author, title and keywords are passed through send_glib_field(); the
// creation date goes to its own dedicated helper.
103 send_glib_field(FIELD_AUTHOR
, poppler_document_get_author(doc
));
104 send_glib_field(FIELD_TITLE
, poppler_document_get_title(doc
));
105 send_glib_field(FIELD_KEYWORDS
, poppler_document_get_keywords(doc
));
106 send_field_created_date(poppler_document_get_creation_date(doc
));