2 * @brief Extract text and metadata using LibreOfficeKit
4 /* Copyright (C) 2014-2023 Olly Betts
6 * This Source Code Form is subject to the terms of the Mozilla Public
7 * License, v. 2.0. If a copy of the MPL was not distributed with this
8 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
15 #include "htmlparser.h"
18 #include "urlencode.h"
28 #include <sys/types.h>
32 #include "safesysexits.h"
34 #define LOK_USE_UNSTABLE_API // So we can use lok::Document::getParts().
35 #include <LibreOfficeKit/LibreOfficeKit.hxx>
// Install location for Debian packages (also Fedora on 32-bit architectures):
#define LO_PATH_DEBIAN "/usr/lib/libreoffice/program"

// Install location for Fedora packages on 64-bit architectures:
#define LO_PATH_FEDORA64 "/usr/lib64/libreoffice/program"

// Install location on macOS.  May not actually work there currently though,
// see: https://gitlab.com/ojwb/lloconv/-/issues/11
#define LO_PATH_MACOS "/Applications/LibreOffice.app/Contents/Frameworks"

// Find a LibreOffice installation to use.
//
// The search order is:
//  1. The LO_PATH environment variable, returned as-is if it is set.
//  2. The fixed install locations above (the lib64 one only when pointers
//     are wider than 4 bytes).
//  3. The newest /opt/libreoffice<MAJOR>[.<MINOR>]/program directory whose
//     version is later than 4.2.
//
// A directory is only accepted in steps 2 and 3 if it contains a regular file
// called "versionrc".
//
// If nothing is found, a hint is written to stderr and the process exits with
// status EX_UNAVAILABLE, so this function never returns a null pointer.
//
// The returned pointer is either the environment string, a string literal, or
// the contents of a function-local static string, so it stays valid after the
// call returns.
static const char*
get_lo_path()
{
    const char* lo_path = getenv("LO_PATH");
    if (lo_path) return lo_path;

    // True if DIR contains a regular file called "versionrc".
    const auto is_install_dir = [](const std::string& dir) {
        struct stat sb;
        return stat((dir + "/versionrc").c_str(), &sb) == 0 &&
               S_ISREG(sb.st_mode);
    };

    if (is_install_dir(LO_PATH_MACOS)) return LO_PATH_MACOS;
    if (is_install_dir(LO_PATH_DEBIAN)) return LO_PATH_DEBIAN;
    if constexpr(sizeof(void*) > 4) {
        if (is_install_dir(LO_PATH_FEDORA64)) return LO_PATH_FEDORA64;
    }

    // Check install locations for .deb files from libreoffice.org,
    // e.g. /opt/libreoffice6.3/program
    DIR* opt = opendir("/opt");
    if (opt) {
        // We require at least LibreOffice 4.3.
        unsigned long best_major = 4;
        unsigned long best_minor = 2;
        // Static so that the c_str() we return outlives this call.
        static std::string best_rc;
        const size_t prefix_len = strlen("libreoffice");
        struct dirent* d;
        while ((d = readdir(opt)) != nullptr) {
#ifdef DT_UNKNOWN
            // Opportunistically skip non-directories if we can spot them
            // just by looking at d_type.
            if (d->d_type != DT_DIR && d->d_type != DT_UNKNOWN) {
                continue;
            }
#endif
            // strncmp() stops at the terminator, so names shorter than the
            // prefix are never read past their end.
            if (strncmp(d->d_name, "libreoffice", prefix_len) != 0) {
                continue;
            }

            // Parse "<MAJOR>" or "<MAJOR>.<MINOR>" following the prefix.  A
            // name with no digits parses as version 0 and is rejected by the
            // version comparison below.
            char* p = d->d_name + prefix_len;
            const unsigned long ver_major = strtoul(p, &p, 10);
            if (ver_major == ULONG_MAX) continue;
            unsigned long ver_minor = 0;
            if (*p == '.') {
                ver_minor = strtoul(p + 1, &p, 10);
                if (ver_minor == ULONG_MAX) continue;
            }

            std::string rc = "/opt/";
            rc += d->d_name;
            rc += "/program";
            if (!is_install_dir(rc)) {
                continue;
            }

            if (ver_major > best_major ||
                (ver_major == best_major && ver_minor > best_minor)) {
                // Remember the version so that only a strictly newer install
                // can replace this one.
                best_major = ver_major;
                best_minor = ver_minor;
                best_rc = std::move(rc);
            }
        }
        closedir(opt);
        if (!best_rc.empty()) {
            return best_rc.c_str();
        }
    }

    std::cerr << "LibreOffice install not found\n"
                 "Set LO_PATH in the environment to the 'program' directory - e.g.:\n"
                 "LO_PATH=/opt/libreoffice/program\n";
    _Exit(EX_UNAVAILABLE);
}
126 static string output_file
;
127 static string output_url
;
134 output_file
= get_tmpfile("tmp.html");
135 if (output_file
.empty()) {
136 cerr
<< "Couldn't create temporary directory\n";
139 url_encode_path(output_url
, output_file
);
141 const char* lo_path
= get_lo_path();
142 llo
= lok_cpp_init(lo_path
);
144 cerr
<< "Failed to initialise LibreOfficeKit\n";
151 extract(const string
& filename
, const string
&)
153 const char* format
= "html"; // FIXME or xhtml
154 const char* options
= "SkipImages";
156 url_encode_path(input_url
, filename
);
157 unique_ptr
<Document
> lodoc(llo
->documentLoad(input_url
.c_str(), options
));
159 const char* errmsg
= llo
->getError();
160 send_field(FIELD_ERROR
, errmsg
? errmsg
: "Failed to load document");
164 if (!lodoc
->saveAs(output_url
.c_str(), format
, options
)) {
165 const char* errmsg
= llo
->getError();
166 send_field(FIELD_ERROR
, errmsg
? errmsg
: "Failed to load export");
171 if (!load_file(output_file
, html
)) {
172 unlink(output_file
.c_str());
173 send_field(FIELD_ERROR
, "Failed to load LibreOffice HTML output");
177 p
.ignore_metarobots();
178 p
.parse(html
, "utf-8", true);
179 unlink(output_file
.c_str());
180 send_field(FIELD_BODY
, p
.dump
);
181 send_field(FIELD_TITLE
, p
.title
);
182 send_field(FIELD_KEYWORDS
, p
.keywords
);
183 send_field(FIELD_KEYWORDS
, p
.topic
);
184 send_field(FIELD_AUTHOR
, p
.author
);
185 send_field_created_date(p
.created
);
186 // The documentation comment in LibreOfficeKit.hxx says this method
187 // returns a count of "individual sheets in a Calc, or slides in Impress,
188 // and has no relevance for Writer" but it actually seems to return a
189 // page count for writer documents.
190 send_field_page_count(lodoc
->getParts());
191 } catch (const exception
& e
) {
192 send_field(FIELD_ERROR
, e
.what());