xapian-applications/omega/handler_gmime.cc

   1 /** @file
   2  * @brief Extract text and metadata using gmime.
   3  */
   4 /* Copyright (C) 2019 Bruno Baruffaldi
   5  * Copyright (C) 2022,2023 Olly Betts
   6  *
   7  * This program is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License as
   9  * published by the Free Software Foundation; either version 2 of the
  10  * License, or (at your option) any later version.
  11  *
  12  * This program is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15  * GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with this program; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
  20  * USA
  21  */
  22 #include <config.h>
  23 #include "handler.h"
  24
  25 #include "htmlparser.h"
  26 #include "str.h"
  27 #include "stringutils.h"
  28 #include "utf8convert.h"
  29
  30 #include <glib.h>
  31 #include <gmime/gmime.h>
  32 #include <string.h>
  33
  34 #include "safefcntl.h"
  35
  36 using namespace std;
  37
  38 static string
  39 get_attachment_dir()
  40 {
  41     string result;
  42     const char* p = getenv("XAPIAN_OMEGA_ATTACHMENT_DIR");
  43     if (p && *p) {
  44         result = p;
  45         if (result.back() != '/') {
  46             result += '/';
  47         }
  48     }
  49     return result;
  50 }
  51
  52 /** Directory to save attachments into.
  53  *
  54  *  If empty, attachments are not saved.
  55  */
  56 static string attachment_dir = get_attachment_dir();
  57
  58 /** Counter used to ensure saved attachments have unique names.
  59  *
  60  *  Reset to 0 for each email processed.
  61  */
  62 static unsigned attachment_counter;
  63
  64 /** To aid debugging attachment filenames from emails are used in many cases.
  65  *
  66  *  This constant limits how long an attachment leafname can be.
  67  */
  68 static constexpr size_t MAX_ATTACHMENT_LEAF_LEN = 64;
  69
  70 /** To aid debugging attachment filenames from emails are used in many cases.
  71  *
  72  *  This constant limits how long a file extension can be.
  73  */
  74 static constexpr size_t MAX_ATTACHMENT_EXT_LEN = 14;
  75
  76 /** To aid debugging attachment filenames from emails are used in many cases.
  77  *
  78  *  If there's no supplied filename or it isn't suitable then this basename
  79  *  is used instead (potentially with the extension if there's a supplied
  80  *  name).
  81  */
  82 static constexpr const char* FALLBACK_ATTACHMENT_BASENAME = "attachment";
  83
  84 /** Buffer size used when reading and decoding MIME data. */
  85 static constexpr unsigned SIZE = 4096;
  86
  87 static void
  88 extract_html(const string& text, string& charset)
  89 {
  90     HtmlParser parser;
  91     if (charset.empty())
  92         charset = "UTF-8";
  93     try {
  94         parser.ignore_metarobots();
  95         parser.parse(text, charset, false);
  96     } catch (const string& newcharset) {
  97         parser.reset();
  98         parser.ignore_metarobots();
  99         parser.parse(text, newcharset, true);
 100     }
 101     send_field(FIELD_BODY, parser.dump);
 102 }
 103
 104 static std::string
 105 decode(GMimeDataWrapper* content, GMimeContentEncoding content_encoding)
 106 {
 107     string result;
 108     char buffer[SIZE];
 109     unsigned char data[SIZE];
 110     auto u_buff = reinterpret_cast<unsigned char*>(buffer);
 111     GMimeStream* sr = g_mime_data_wrapper_get_stream(content);
 112     int state = 0;
 113     guint32 save = 0;
 114     int len;
 115     switch (content_encoding) {
 116         case GMIME_CONTENT_ENCODING_BASE64:
 117             while ((len = g_mime_stream_read(sr, buffer, SIZE)) > 0) {
 118                 len = g_mime_encoding_base64_decode_step(u_buff, len, data,
 119                                                          &state, &save);
 120                 result.append(reinterpret_cast<char*>(data), len);
 121             }
 122             break;
 123         case GMIME_CONTENT_ENCODING_UUENCODE:
 124             while ((len = g_mime_stream_read(sr, buffer, SIZE)) > 0) {
 125                 len = g_mime_encoding_uudecode_step(u_buff, len, data,
 126                                                     &state, &save);
 127                 result.append(reinterpret_cast<char*>(data), len);
 128             }
 129             break;
 130         case GMIME_CONTENT_ENCODING_QUOTEDPRINTABLE:
 131             while ((len = g_mime_stream_read(sr, buffer, SIZE)) > 0) {
 132                 len = g_mime_encoding_quoted_decode_step(u_buff, len, data,
 133                                                          &state, &save);
 134                 result.append(reinterpret_cast<char*>(data), len);
 135             }
 136             break;
 137         default:
 138             while ((len = g_mime_stream_read(sr, buffer, SIZE)) > 0) {
 139                 result.append(reinterpret_cast<char*>(buffer), len);
 140             }
 141             break;
 142     }
 143     return result;
 144 }
 145
 146 static bool
 147 parse_mime_part(GMimePart* part,
 148                 GMimeContentType* ct,
 149                 bool attachments)
 150 {
 151     const char* type = g_mime_content_type_get_media_type(ct);
 152     const char* subtype = g_mime_content_type_get_media_subtype(ct);
 153     enum { OTHER = 0, TEXT_PLAIN, TEXT_HTML } t = OTHER;
 154     if (g_ascii_strcasecmp(type, "text") == 0) {
 155         if (g_ascii_strcasecmp(subtype, "plain") == 0) {
 156             t = TEXT_PLAIN;
 157         } else if (g_ascii_strcasecmp(subtype, "html") == 0) {
 158             t = TEXT_HTML;
 159         }
 160     }
 161
 162     if (t == OTHER && !attachments) {
 163         // We're not interested in this MIME part.
 164         return false;
 165     }
 166
 167 #if GMIME_MAJOR_VERSION >= 3
 168     GMimeDataWrapper* content = g_mime_part_get_content(part);
 169 #else
 170     GMimeDataWrapper* content = g_mime_part_get_content_object(part);
 171 #endif
 172     if (!content) return false;
 173
 174     string data = decode(content, g_mime_part_get_content_encoding(part));
 175     if (t == TEXT_PLAIN) {
 176         // Always convert text/plain parts to UTF-8 (doesn't matter whether
 177         // we're sending as a field or saving as an attachment).
 178         const char* charset = g_mime_content_type_get_parameter(ct, "charset");
 179         if (charset) {
 180             charset = g_mime_charset_canon_name(charset);
 181             convert_to_utf8(data, charset);
 182         }
 183     }
 184
 185     bool is_attachment = g_mime_part_is_attachment(part);
 186     // If we're not saving attachments, treat all text/plain and text/html
 187     // parts as inline.
 188     if (t != OTHER && (!attachments || !is_attachment)) {
 189         if (t == TEXT_PLAIN) {
 190             // text/plain
 191             send_field(FIELD_BODY, data);
 192         } else {
 193             // text/html
 194             string charset;
 195             const char* p = g_mime_content_type_get_parameter(ct, "charset");
 196             if (p) {
 197                 charset = g_mime_charset_canon_name(p);
 198             }
 199             extract_html(data, charset);
 200         }
 201         return true;
 202     }
 203
 204     // Save attachment.
 205     string filename = attachment_dir;
 206     // Prefix with `a` for attachment, `i` for inline.
 207     filename += (is_attachment ? 'a' : 'i');
 208     filename += str(attachment_counter++);
 209     filename += '-';
 210
 211     // It's much easier to debug if the extracted files can be easily
 212     // matched up with the attachments in the email so using any
 213     // supplied filename is good, but we need to keep security concerns
 214     // in mind.
 215     const char* leaf = g_mime_part_get_filename(part);
 216     if (!leaf)
 217         leaf = g_mime_content_type_get_parameter(ct, "filename");
 218     if (leaf) {
 219         // Remove any path.
 220         const char* slash = strrchr(leaf, '/');
 221         if (slash) leaf = slash + 1;
 222         slash = strrchr(leaf, '\\');
 223         if (slash) leaf = slash + 1;
 224
 225         size_t len = strlen(leaf);
 226         if (len < 3 || len > MAX_ATTACHMENT_LEAF_LEN || leaf[0] == '.') {
 227             // Don't use the leafname as is, but use the extension if it's
 228             // sensible.
 229             leaf = strrchr(leaf + 1, '.');
 230             if (leaf && strlen(leaf + 1) <= MAX_ATTACHMENT_EXT_LEN) {
 231                 // Use the extension.
 232             } else {
 233                 // Don't use the leafname at all.
 234                 leaf = nullptr;
 235             }
 236         }
 237     }
 238
 239     if (!leaf || leaf[0] == '.') {
 240         filename += FALLBACK_ATTACHMENT_BASENAME;
 241     }
 242     if (leaf) {
 243         for (size_t i = 0; leaf[i]; ++i) {
 244             char ch = leaf[i];
 245             // Only allow clearly safe characters.
 246             if (C_isalnum(ch) || ch == '.' || ch == '-' || ch == '+')
 247                 filename += ch;
 248             else
 249                 filename += '_';
 250         }
 251     }
 252
 253     // This ends up ignoring the supplied Content-Type.  If the extracted
 254     // attachments are fed back into the indexer then this will end up
 255     // determining the MIME Content-Type from the extension or file
 256     // contents instead.
 257     //
 258     // It seems odd to ignore the supplied Content-Type, but in practice
 259     // the supplied MIME type is likely actually determined by the sending
 260     // MUA from the extension and/or file contents or is just set to
 261     // `application/octet-stream`, so arguably it's better to determine the
 262     // type for ourselves in a consistent way.  Otherwise the exact same
 263     // attachment could be indexed or not depending who sent it.
 264     int fd = open(filename.c_str(),
 265                   O_CREAT | O_EXCL | O_WRONLY | O_BINARY,
 266                   0664);
 267     const char* p = data.data();
 268     size_t count = data.size();
 269     while (count) {
 270         ssize_t r = write(fd, p, count);
 271         if (rare(r < 0)) {
 272             if (errno == EINTR) continue;
 273             close(fd);
 274             unlink(filename.c_str());
 275             send_field(FIELD_ERROR, "saving attachment failed");
 276             return false;
 277         }
 278         p += r;
 279         count -= r;
 280     }
 281     close(fd);
 282     send_field(FIELD_ATTACHMENT, filename);
 283     return false;
 284 }
 285
 286 static bool
 287 parse_mime_object(GMimeObject* me, bool attachments)
 288 {
 289     GMimeContentType* ct = g_mime_object_get_content_type(me);
 290     if (GMIME_IS_PART(me)) {
 291         return parse_mime_part(reinterpret_cast<GMimePart*>(me),
 292                                ct,
 293                                attachments);
 294     }
 295
 296     if (!GMIME_IS_MULTIPART(me))
 297         return false;
 298
 299     GMimeMultipart* mpart = reinterpret_cast<GMimeMultipart*>(me);
 300     const char* subtype = g_mime_content_type_get_media_subtype(ct);
 301     int count = g_mime_multipart_get_count(mpart);
 302     if (g_ascii_strcasecmp(subtype, "alternative") == 0) {
 303         // Use the last MIME part which we get text from.
 304         for (int i = count - 1; i >= 0; --i) {
 305             GMimeObject* part = g_mime_multipart_get_part(mpart, i);
 306             // Don't consider parts under an alternative as attachments.
 307             if (parse_mime_object(part, false))
 308                 return true;
 309         }
 310         return false;
 311     }
 312
 313     bool ret = false;
 314     for (int i = 0; i < count; ++i) {
 315         GMimeObject* part = g_mime_multipart_get_part(mpart, i);
 316         ret |= parse_mime_object(part, attachments);
 317     }
 318     return ret;
 319 }
 320
 321 static void
 322 send_glib_field(Field field, gchar* data)
 323 {
 324     if (data) {
 325         send_field(field, data);
 326         g_free(data);
 327     }
 328 }
 329
 330 static void
 331 extract_addresses(Field field, InternetAddressList* address_list)
 332 {
 333 #if GMIME_MAJOR_VERSION >= 3
 334     auto value = internet_address_list_to_string(address_list, NULL, false);
 335 #else
 336     auto value = internet_address_list_to_string(address_list, false);
 337 #endif
 338     send_glib_field(field, value);
 339 }
 340
 341 static void
 342 extract_message(GMimeMessage* message)
 343 {
 344     attachment_counter = 0;
 345     (void)parse_mime_object(g_mime_message_get_mime_part(message),
 346                             !attachment_dir.empty());
 347     send_field(FIELD_TITLE, g_mime_message_get_subject(message));
 348
 349 #if GMIME_MAJOR_VERSION >= 3
 350     extract_addresses(FIELD_AUTHOR, g_mime_message_get_from(message));
 351 #else
 352     send_field(FIELD_AUTHOR, g_mime_message_get_sender(message));
 353 #endif
 354 #if GMIME_MAJOR_VERSION >= 3
 355     extract_addresses(FIELD_TO, g_mime_message_get_to(message));
 356 #else
 357     extract_addresses(FIELD_TO,
 358                       g_mime_message_get_recipients(message,
 359                                                     GMIME_RECIPIENT_TYPE_TO));
 360 #endif
 361 #if GMIME_MAJOR_VERSION >= 3
 362     extract_addresses(FIELD_CC, g_mime_message_get_cc(message));
 363 #else
 364     extract_addresses(FIELD_CC,
 365                       g_mime_message_get_recipients(message,
 366                                                     GMIME_RECIPIENT_TYPE_CC));
 367 #endif
 368 #if GMIME_MAJOR_VERSION >= 3
 369     extract_addresses(FIELD_BCC, g_mime_message_get_bcc(message));
 370 #else
 371     extract_addresses(FIELD_BCC,
 372                       g_mime_message_get_recipients(message,
 373                                                     GMIME_RECIPIENT_TYPE_BCC));
 374 #endif
 375 #if GMIME_MAJOR_VERSION >= 3
 376     GDateTime* datetime = g_mime_message_get_date(message);
 377     if (datetime) {
 378         GDateTime* utc_datetime = g_date_time_to_utc(datetime);
 379         if (utc_datetime) {
 380             gint64 unix_time = g_date_time_to_unix(utc_datetime);
 381             // Check value doesn't overflow time_t.
 382             if (gint64(time_t(unix_time)) == unix_time) {
 383                 send_field_created_date(time_t(unix_time));
 384             }
 385             g_date_time_unref(utc_datetime);
 386         }
 387     }
 388 #else
 389     time_t datetime;
 390     int tz_offset;
 391     g_mime_message_get_date(message, &datetime, &tz_offset);
 392     if (datetime != time_t(-1)) {
 393         // The documentation doesn't clearly say, but from testing the
 394         // time_t value is in UTC which is what we want so we don't need
 395         // tz_offset.
 396         //
 397         // (If we did, tz_offset is not actually in hours as the docs say,
 398         // but actually hours*100+minutes, e.g. +1300 for UTC+13).
 399         send_field_created_date(datetime);
 400     }
 401 #endif
 402     send_field(FIELD_MESSAGE_ID, g_mime_message_get_message_id(message));
 403     GMimeObject* object = GMIME_OBJECT(message);
 404     GMimeHeaderList* headers = g_mime_object_get_header_list(object);
 405 #if GMIME_MAJOR_VERSION >= 3
 406     int count = g_mime_header_list_get_count(headers);
 407     for (int i = 0; i < count; ++i) {
 408         GMimeHeader* header = g_mime_header_list_get_header_at(headers, i);
 409         auto name = g_mime_header_get_name(header);
 410 # define value g_mime_header_get_value(header)
 411 #else
 412     g_mime_header_list_foreach(headers,
 413                                [](const char* name,
 414                                   const char* value,
 415                                   gpointer) {
 416 #endif
 417         if (g_ascii_strcasecmp(name, "Comments") == 0 ||
 418             g_ascii_strcasecmp(name, "Keywords") == 0) {
 419             send_field(FIELD_KEYWORDS, value);
 420         }
 421 #if GMIME_MAJOR_VERSION >= 3
 422     }
 423 # undef value
 424 #else
 425                                }, nullptr);
 426 #endif
 427 }
 428
 429 bool
 430 initialise()
 431 {
 432 #if GMIME_MAJOR_VERSION >= 3
 433     g_mime_init();
 434 #else
 435     g_mime_init(GMIME_ENABLE_RFC2047_WORKAROUNDS);
 436 #endif
 437     return true;
 438 }
 439
 440 void
 441 extract(const string& filename, const string&)
 442 {
 443     FILE* fp = fopen(filename.c_str(), "r");
 444
 445     if (fp == NULL) {
 446         send_field(FIELD_ERROR, "fopen() failed");
 447         return;
 448     }
 449
 450     GMimeStream* stream = g_mime_stream_file_new(fp);
 451     GMimeParser* parser = g_mime_parser_new_with_stream(stream);
 452 #if GMIME_MAJOR_VERSION >= 3
 453     GMimeMessage* message = g_mime_parser_construct_message(parser, NULL);
 454 #else
 455     GMimeMessage* message = g_mime_parser_construct_message(parser);
 456 #endif
 457     if (message) {
 458         extract_message(message);
 459         g_object_unref(message);
 460     }
 461     g_object_unref(parser);
 462     g_object_unref(stream);
 463 }