[ci] Update macos jobs
[xapian.git] / xapian-applications / omega / handler_gmime.cc
blob333a4da8a31ea376599691da52e27b8419738954
1 /** @file
2 * @brief Extract text and metadata using gmime.
3 */
4 /* Copyright (C) 2019 Bruno Baruffaldi
5 * Copyright (C) 2022,2023 Olly Betts
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2 of the
10 * License, or (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
20 * USA
22 #include <config.h>
23 #include "handler.h"
25 #include "htmlparser.h"
26 #include "str.h"
27 #include "stringutils.h"
28 #include "utf8convert.h"
30 #include <glib.h>
31 #include <gmime/gmime.h>
32 #include <string.h>
34 #include "safefcntl.h"
36 using namespace std;
/** Work out which directory attachments should be saved into.
 *
 *  Reads the environment variable XAPIAN_OMEGA_ATTACHMENT_DIR.
 *
 *  @return  The variable's value with a trailing '/' appended if it didn't
 *           already end with one, or an empty string if the variable is
 *           unset or set to an empty value.
 */
static string
get_attachment_dir()
{
    const char* env = getenv("XAPIAN_OMEGA_ATTACHMENT_DIR");
    if (!env || *env == '\0') {
        // Unset or empty.
        return string();
    }

    string dir(env);
    // dir is known to be non-empty here, so back() is safe.
    if (dir.back() != '/') {
        dir += '/';
    }
    return dir;
}
52 /** Directory to save attachments into.
54 * If empty, attachments are not saved.
56 static string attachment_dir = get_attachment_dir();
58 /** Counter used to ensure saved attachments have unique names.
60 * Reset to 0 for each email processed.
62 static unsigned attachment_counter;
/** Longest leafname from an email which will be used as-is.
 *
 *  To aid debugging, attachment filenames from emails are used in many
 *  cases; this caps how long such a leafname may be.
 */
static constexpr size_t MAX_ATTACHMENT_LEAF_LEN{64};

/** Longest file extension from an email which will be used.
 *
 *  To aid debugging, attachment filenames from emails are used in many
 *  cases; this caps how long the extension may be.
 */
static constexpr size_t MAX_ATTACHMENT_EXT_LEN{14};

/** Basename used when the email's filename is missing or unsuitable.
 *
 *  To aid debugging, attachment filenames from emails are used in many
 *  cases.  When there's no supplied filename, or it isn't suitable, this
 *  basename is used instead (potentially with the extension from the
 *  supplied name, if there is one).
 */
static constexpr const char* FALLBACK_ATTACHMENT_BASENAME{"attachment"};

/** Buffer size used when reading and decoding MIME data. */
static constexpr unsigned SIZE{4096};
87 static void
88 extract_html(const string& text, string& charset)
90 HtmlParser parser;
91 if (charset.empty())
92 charset = "UTF-8";
93 try {
94 parser.ignore_metarobots();
95 parser.parse(text, charset, false);
96 } catch (const string& newcharset) {
97 parser.reset();
98 parser.ignore_metarobots();
99 parser.parse(text, newcharset, true);
101 send_field(FIELD_BODY, parser.dump);
104 static std::string
105 decode(GMimeDataWrapper* content, GMimeContentEncoding content_encoding)
107 string result;
108 char buffer[SIZE];
109 unsigned char data[SIZE];
110 auto u_buff = reinterpret_cast<unsigned char*>(buffer);
111 GMimeStream* sr = g_mime_data_wrapper_get_stream(content);
112 int state = 0;
113 guint32 save = 0;
114 int len;
115 switch (content_encoding) {
116 case GMIME_CONTENT_ENCODING_BASE64:
117 while ((len = g_mime_stream_read(sr, buffer, SIZE)) > 0) {
118 len = g_mime_encoding_base64_decode_step(u_buff, len, data,
119 &state, &save);
120 result.append(reinterpret_cast<char*>(data), len);
122 break;
123 case GMIME_CONTENT_ENCODING_UUENCODE:
124 while ((len = g_mime_stream_read(sr, buffer, SIZE)) > 0) {
125 len = g_mime_encoding_uudecode_step(u_buff, len, data,
126 &state, &save);
127 result.append(reinterpret_cast<char*>(data), len);
129 break;
130 case GMIME_CONTENT_ENCODING_QUOTEDPRINTABLE:
131 while ((len = g_mime_stream_read(sr, buffer, SIZE)) > 0) {
132 len = g_mime_encoding_quoted_decode_step(u_buff, len, data,
133 &state, &save);
134 result.append(reinterpret_cast<char*>(data), len);
136 break;
137 default:
138 while ((len = g_mime_stream_read(sr, buffer, SIZE)) > 0) {
139 result.append(reinterpret_cast<char*>(buffer), len);
141 break;
143 return result;
146 static bool
147 parse_mime_part(GMimePart* part,
148 GMimeContentType* ct,
149 bool attachments)
151 const char* type = g_mime_content_type_get_media_type(ct);
152 const char* subtype = g_mime_content_type_get_media_subtype(ct);
153 enum { OTHER = 0, TEXT_PLAIN, TEXT_HTML } t = OTHER;
154 if (g_ascii_strcasecmp(type, "text") == 0) {
155 if (g_ascii_strcasecmp(subtype, "plain") == 0) {
156 t = TEXT_PLAIN;
157 } else if (g_ascii_strcasecmp(subtype, "html") == 0) {
158 t = TEXT_HTML;
162 if (t == OTHER && !attachments) {
163 // We're not interested in this MIME part.
164 return false;
167 #if GMIME_MAJOR_VERSION >= 3
168 GMimeDataWrapper* content = g_mime_part_get_content(part);
169 #else
170 GMimeDataWrapper* content = g_mime_part_get_content_object(part);
171 #endif
172 if (!content) return false;
174 string data = decode(content, g_mime_part_get_content_encoding(part));
175 if (t == TEXT_PLAIN) {
176 // Always convert text/plain parts to UTF-8 (doesn't matter whether
177 // we're sending as a field or saving as an attachment).
178 const char* charset = g_mime_content_type_get_parameter(ct, "charset");
179 if (charset) {
180 charset = g_mime_charset_canon_name(charset);
181 convert_to_utf8(data, charset);
185 bool is_attachment = g_mime_part_is_attachment(part);
186 // If we're not saving attachments, treat all text/plain and text/html
187 // parts as inline.
188 if (t != OTHER && (!attachments || !is_attachment)) {
189 if (t == TEXT_PLAIN) {
190 // text/plain
191 send_field(FIELD_BODY, data);
192 } else {
193 // text/html
194 string charset;
195 const char* p = g_mime_content_type_get_parameter(ct, "charset");
196 if (p) {
197 charset = g_mime_charset_canon_name(p);
199 extract_html(data, charset);
201 return true;
204 // Save attachment.
205 string filename = attachment_dir;
206 // Prefix with `a` for attachment, `i` for inline.
207 filename += (is_attachment ? 'a' : 'i');
208 filename += str(attachment_counter++);
209 filename += '-';
211 // It's much easier to debug if the extracted files can be easily
212 // matched up with the attachments in the email so using any
213 // supplied filename is good, but we need to keep security concerns
214 // in mind.
215 const char* leaf = g_mime_part_get_filename(part);
216 if (!leaf)
217 leaf = g_mime_content_type_get_parameter(ct, "filename");
218 if (leaf) {
219 // Remove any path.
220 const char* slash = strrchr(leaf, '/');
221 if (slash) leaf = slash + 1;
222 slash = strrchr(leaf, '\\');
223 if (slash) leaf = slash + 1;
225 size_t len = strlen(leaf);
226 if (len < 3 || len > MAX_ATTACHMENT_LEAF_LEN || leaf[0] == '.') {
227 // Don't use the leafname as is, but use the extension if it's
228 // sensible.
229 leaf = strrchr(leaf + 1, '.');
230 if (leaf && strlen(leaf + 1) <= MAX_ATTACHMENT_EXT_LEN) {
231 // Use the extension.
232 } else {
233 // Don't use the leafname at all.
234 leaf = nullptr;
239 if (!leaf || leaf[0] == '.') {
240 filename += FALLBACK_ATTACHMENT_BASENAME;
242 if (leaf) {
243 for (size_t i = 0; leaf[i]; ++i) {
244 char ch = leaf[i];
245 // Only allow clearly safe characters.
246 if (C_isalnum(ch) || ch == '.' || ch == '-' || ch == '+')
247 filename += ch;
248 else
249 filename += '_';
253 // This ends up ignoring the supplied Content-Type. If the extracted
254 // attachments are fed back into the indexer then this will end up
255 // determining the MIME Content-Type from the extension or file
256 // contents instead.
258 // It seems odd to ignore the supplied Content-Type, but in practice
259 // the supplied MIME type is likely actually determined by the sending
260 // MUA from the extension and/or file contents or is just set to
261 // `application/octet-stream`, so arguably it's better to determine the
262 // type for ourselves in a consistent way. Otherwise the exact same
263 // attachment could be indexed or not depending who sent it.
264 int fd = open(filename.c_str(),
265 O_CREAT | O_EXCL | O_WRONLY | O_BINARY,
266 0664);
267 const char* p = data.data();
268 size_t count = data.size();
269 while (count) {
270 ssize_t r = write(fd, p, count);
271 if (rare(r < 0)) {
272 if (errno == EINTR) continue;
273 close(fd);
274 unlink(filename.c_str());
275 send_field(FIELD_ERROR, "saving attachment failed");
276 return false;
278 p += r;
279 count -= r;
281 close(fd);
282 send_field(FIELD_ATTACHMENT, filename);
283 return false;
286 static bool
287 parse_mime_object(GMimeObject* me, bool attachments)
289 GMimeContentType* ct = g_mime_object_get_content_type(me);
290 if (GMIME_IS_PART(me)) {
291 return parse_mime_part(reinterpret_cast<GMimePart*>(me),
293 attachments);
296 if (!GMIME_IS_MULTIPART(me))
297 return false;
299 GMimeMultipart* mpart = reinterpret_cast<GMimeMultipart*>(me);
300 const char* subtype = g_mime_content_type_get_media_subtype(ct);
301 int count = g_mime_multipart_get_count(mpart);
302 if (g_ascii_strcasecmp(subtype, "alternative") == 0) {
303 // Use the last MIME part which we get text from.
304 for (int i = count - 1; i >= 0; --i) {
305 GMimeObject* part = g_mime_multipart_get_part(mpart, i);
306 // Don't consider parts under an alternative as attachments.
307 if (parse_mime_object(part, false))
308 return true;
310 return false;
313 bool ret = false;
314 for (int i = 0; i < count; ++i) {
315 GMimeObject* part = g_mime_multipart_get_part(mpart, i);
316 ret |= parse_mime_object(part, attachments);
318 return ret;
321 static void
322 send_glib_field(Field field, gchar* data)
324 if (data) {
325 send_field(field, data);
326 g_free(data);
330 static void
331 extract_addresses(Field field, InternetAddressList* address_list)
333 #if GMIME_MAJOR_VERSION >= 3
334 auto value = internet_address_list_to_string(address_list, NULL, false);
335 #else
336 auto value = internet_address_list_to_string(address_list, false);
337 #endif
338 send_glib_field(field, value);
341 static void
342 extract_message(GMimeMessage* message)
344 attachment_counter = 0;
345 (void)parse_mime_object(g_mime_message_get_mime_part(message),
346 !attachment_dir.empty());
347 send_field(FIELD_TITLE, g_mime_message_get_subject(message));
349 #if GMIME_MAJOR_VERSION >= 3
350 extract_addresses(FIELD_AUTHOR, g_mime_message_get_from(message));
351 #else
352 send_field(FIELD_AUTHOR, g_mime_message_get_sender(message));
353 #endif
354 #if GMIME_MAJOR_VERSION >= 3
355 extract_addresses(FIELD_TO, g_mime_message_get_to(message));
356 #else
357 extract_addresses(FIELD_TO,
358 g_mime_message_get_recipients(message,
359 GMIME_RECIPIENT_TYPE_TO));
360 #endif
361 #if GMIME_MAJOR_VERSION >= 3
362 extract_addresses(FIELD_CC, g_mime_message_get_cc(message));
363 #else
364 extract_addresses(FIELD_CC,
365 g_mime_message_get_recipients(message,
366 GMIME_RECIPIENT_TYPE_CC));
367 #endif
368 #if GMIME_MAJOR_VERSION >= 3
369 extract_addresses(FIELD_BCC, g_mime_message_get_bcc(message));
370 #else
371 extract_addresses(FIELD_BCC,
372 g_mime_message_get_recipients(message,
373 GMIME_RECIPIENT_TYPE_BCC));
374 #endif
375 #if GMIME_MAJOR_VERSION >= 3
376 GDateTime* datetime = g_mime_message_get_date(message);
377 if (datetime) {
378 GDateTime* utc_datetime = g_date_time_to_utc(datetime);
379 if (utc_datetime) {
380 gint64 unix_time = g_date_time_to_unix(utc_datetime);
381 // Check value doesn't overflow time_t.
382 if (gint64(time_t(unix_time)) == unix_time) {
383 send_field_created_date(time_t(unix_time));
385 g_date_time_unref(utc_datetime);
388 #else
389 time_t datetime;
390 int tz_offset;
391 g_mime_message_get_date(message, &datetime, &tz_offset);
392 if (datetime != time_t(-1)) {
393 // The documentation doesn't clearly say, but from testing the
394 // time_t value is in UTC which is what we want so we don't need
395 // tz_offset.
397 // (If we did, tz_offset is not actually in hours as the docs say,
398 // but actually hours*100+minutes, e.g. +1300 for UTC+13).
399 send_field_created_date(datetime);
401 #endif
402 send_field(FIELD_MESSAGE_ID, g_mime_message_get_message_id(message));
403 GMimeObject* object = GMIME_OBJECT(message);
404 GMimeHeaderList* headers = g_mime_object_get_header_list(object);
405 #if GMIME_MAJOR_VERSION >= 3
406 int count = g_mime_header_list_get_count(headers);
407 for (int i = 0; i < count; ++i) {
408 GMimeHeader* header = g_mime_header_list_get_header_at(headers, i);
409 auto name = g_mime_header_get_name(header);
410 # define value g_mime_header_get_value(header)
411 #else
412 g_mime_header_list_foreach(headers,
413 [](const char* name,
414 const char* value,
415 gpointer) {
416 #endif
417 if (g_ascii_strcasecmp(name, "Comments") == 0 ||
418 g_ascii_strcasecmp(name, "Keywords") == 0) {
419 send_field(FIELD_KEYWORDS, value);
421 #if GMIME_MAJOR_VERSION >= 3
423 # undef value
424 #else
425 }, nullptr);
426 #endif
429 bool
430 initialise()
432 #if GMIME_MAJOR_VERSION >= 3
433 g_mime_init();
434 #else
435 g_mime_init(GMIME_ENABLE_RFC2047_WORKAROUNDS);
436 #endif
437 return true;
440 void
441 extract(const string& filename, const string&)
443 FILE* fp = fopen(filename.c_str(), "r");
445 if (fp == NULL) {
446 send_field(FIELD_ERROR, "fopen() failed");
447 return;
450 GMimeStream* stream = g_mime_stream_file_new(fp);
451 GMimeParser* parser = g_mime_parser_new_with_stream(stream);
452 #if GMIME_MAJOR_VERSION >= 3
453 GMimeMessage* message = g_mime_parser_construct_message(parser, NULL);
454 #else
455 GMimeMessage* message = g_mime_parser_construct_message(parser);
456 #endif
457 if (message) {
458 extract_message(message);
459 g_object_unref(message);
461 g_object_unref(parser);
462 g_object_unref(stream);