Fix tg_termpos1 for 64-bit termpos
[xapian.git] / xapian-applications / omega / handler_libreofficekit.cc
bloba335eb2066b76f88612e9f03055b0f9b6ee123e5
1 /** @file
2 * @brief Extract text and metadata using LibreOfficeKit
3 */
4 /* Copyright (C) 2014-2023 Olly Betts
6 * This Source Code Form is subject to the terms of the Mozilla Public
7 * License, v. 2.0. If a copy of the MPL was not distributed with this
8 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 */
11 #include <config.h>
13 #include "handler.h"
15 #include "htmlparser.h"
16 #include "loadfile.h"
17 #include "tmpdir.h"
18 #include "urlencode.h"
20 using namespace std;
#include <climits>
#include <cstdlib>
#include <cstring>
#include <exception>
#include <iostream>
#include <memory>
28 #include <sys/types.h>
29 #include <sys/stat.h>
30 #include <dirent.h>
31 #include <unistd.h>
32 #include "safesysexits.h"
34 #define LOK_USE_UNSTABLE_API // So we can use lok::Document::getParts().
35 #include <LibreOfficeKit/LibreOfficeKit.hxx>
37 using namespace std;
38 using namespace lok;
// Install location for Debian packages (also Fedora on 32-bit architectures):
#define LO_PATH_DEBIAN "/usr/lib/libreoffice/program"

// Install location for Fedora packages on 64-bit architectures:
#define LO_PATH_FEDORA64 "/usr/lib64/libreoffice/program"

// Install location on macOS.  May not actually work there currently though,
// see: https://gitlab.com/ojwb/lloconv/-/issues/11
#define LO_PATH_MACOS "/Applications/LibreOffice.app/Contents/Frameworks"

/** Test whether a directory looks like a LibreOffice "program" directory.
 *
 *  @param dir  Directory to check (no trailing slash).
 *
 *  @return true if @a dir contains a regular file named "versionrc".
 */
static bool
has_versionrc(const std::string& dir)
{
    struct stat sb;
    return stat((dir + "/versionrc").c_str(), &sb) == 0 &&
           S_ISREG(sb.st_mode);
}

/** Find a LibreOffice installation to use.
 *
 *  The candidates are tried in this order:
 *
 *  1. The value of the LO_PATH environment variable (used as-is if set).
 *  2. The fixed install locations above which apply to this platform.
 *  3. The newest /opt/libreoffice<major>[.<minor>]/program directory with
 *     a version greater than 4.2.
 *
 *  Candidates from (2) and (3) are only accepted if they contain a regular
 *  file called "versionrc".
 *
 *  If nothing suitable is found, a message is written to stderr and the
 *  process exits with status EX_UNAVAILABLE, so this function never returns
 *  a null pointer.
 *
 *  @return Path of the directory to pass to LibreOfficeKit.  The pointer
 *          refers to the environment, a string literal or a function-local
 *          static string, so the caller must not free it.
 */
static const char*
get_lo_path()
{
    // An explicit setting in the environment always wins.
    const char* lo_path = getenv("LO_PATH");
    if (lo_path) return lo_path;

#ifdef __APPLE__
    if (has_versionrc(LO_PATH_MACOS)) return LO_PATH_MACOS;
#else
    if (has_versionrc(LO_PATH_DEBIAN)) return LO_PATH_DEBIAN;
    if constexpr(sizeof(void*) > 4) {
        if (has_versionrc(LO_PATH_FEDORA64)) return LO_PATH_FEDORA64;
    }
#endif

    // Check install locations for .deb files from libreoffice.org,
    // e.g. /opt/libreoffice6.3/program
    DIR* opt = opendir("/opt");
    if (opt) {
        // We require at least LibreOffice 4.3.
        unsigned long best_major = 4;
        unsigned long best_minor = 2;
        // Static so that the pointer we return remains valid.
        static std::string best_rc;
        const auto prefix_len = strlen("libreoffice");
        struct dirent* d;
        while ((d = readdir(opt))) {
#ifdef DT_DIR
            // Opportunistically skip non-directories if we can spot them
            // just by looking at d_type.
            if (d->d_type != DT_DIR && d->d_type != DT_UNKNOWN) {
                continue;
            }
#endif
            // strncmp() stops at the terminating nul, so names shorter than
            // the prefix are handled without reading beyond the string.
            if (strncmp(d->d_name, "libreoffice", prefix_len) != 0) {
                continue;
            }

            // strtoul() skips leading whitespace and accepts a sign (a
            // negative value wraps to a huge unsigned one), so insist on
            // a digit before parsing each component of the version.
            char* p = d->d_name + prefix_len;
            if (*p < '0' || *p > '9') continue;
            unsigned long major = strtoul(p, &p, 10);
            if (major == ULONG_MAX) continue;
            unsigned long minor = 0;
            if (p[0] == '.' && p[1] >= '0' && p[1] <= '9') {
                minor = strtoul(p + 1, &p, 10);
                if (minor == ULONG_MAX) continue;
            }

            // Only look at the filing system for candidates which would
            // improve on the best version found so far.
            if (major < best_major ||
                (major == best_major && minor <= best_minor)) {
                continue;
            }

            std::string rc = "/opt/";
            rc += d->d_name;
            rc += "/program";
            if (!has_versionrc(rc)) {
                continue;
            }

            best_major = major;
            best_minor = minor;
            best_rc = std::move(rc);
        }
        closedir(opt);
        if (!best_rc.empty()) {
            return best_rc.c_str();
        }
    }

    std::cerr << "LibreOffice install not found\n"
        "Set LO_PATH in the environment to the 'program' directory - e.g.:\n"
        "LO_PATH=/opt/libreoffice/program\n"
        "export LO_PATH\n";
    _Exit(EX_UNAVAILABLE);
}
126 static string output_file;
127 static string output_url;
129 static Office* llo;
131 bool
132 initialise()
134 output_file = get_tmpfile("tmp.html");
135 if (output_file.empty()) {
136 cerr << "Couldn't create temporary directory\n";
137 return false;
139 url_encode_path(output_url, output_file);
141 const char* lo_path = get_lo_path();
142 llo = lok_cpp_init(lo_path);
143 if (!llo) {
144 cerr << "Failed to initialise LibreOfficeKit\n";
145 return false;
147 return true;
150 void
151 extract(const string& filename, const string&)
152 try {
153 const char* format = "html"; // FIXME or xhtml
154 const char* options = "SkipImages";
155 string input_url;
156 url_encode_path(input_url, filename);
157 unique_ptr<Document> lodoc(llo->documentLoad(input_url.c_str(), options));
158 if (!lodoc.get()) {
159 const char* errmsg = llo->getError();
160 send_field(FIELD_ERROR, errmsg ? errmsg : "Failed to load document");
161 return;
164 if (!lodoc->saveAs(output_url.c_str(), format, options)) {
165 const char* errmsg = llo->getError();
166 send_field(FIELD_ERROR, errmsg ? errmsg : "Failed to load export");
167 return;
170 string html;
171 if (!load_file(output_file, html)) {
172 unlink(output_file.c_str());
173 send_field(FIELD_ERROR, "Failed to load LibreOffice HTML output");
174 return;
176 HtmlParser p;
177 p.ignore_metarobots();
178 p.parse(html, "utf-8", true);
179 unlink(output_file.c_str());
180 send_field(FIELD_BODY, p.dump);
181 send_field(FIELD_TITLE, p.title);
182 send_field(FIELD_KEYWORDS, p.keywords);
183 send_field(FIELD_KEYWORDS, p.topic);
184 send_field(FIELD_AUTHOR, p.author);
185 send_field_created_date(p.created);
186 // The documentation comment in LibreOfficeKit.hxx says this method
187 // returns a count of "individual sheets in a Calc, or slides in Impress,
188 // and has no relevance for Writer" but it actually seems to return a
189 // page count for writer documents.
190 send_field_page_count(lodoc->getParts());
191 } catch (const exception& e) {
192 send_field(FIELD_ERROR, e.what());