2 * @brief Extract text and metadata using gmime.
4 /* Copyright (C) 2019 Bruno Baruffaldi
5 * Copyright (C) 2022,2023 Olly Betts
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2 of the
10 * License, or (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
25 #include "htmlparser.h"
27 #include "stringutils.h"
28 #include "utf8convert.h"
31 #include <gmime/gmime.h>
34 #include "safefcntl.h"
// Fragment of the attachment-directory lookup.  The enclosing function
// header is not visible in this view; presumably this is the body of
// get_attachment_dir(), whose result initialises attachment_dir - TODO
// confirm.
//
// The directory is read from the XAPIAN_OMEGA_ATTACHMENT_DIR environment
// variable.
42 const char* p
= getenv("XAPIAN_OMEGA_ATTACHMENT_DIR");
// Test whether the path already ends with a slash.  NOTE(review): the body
// of this branch is not visible here; presumably a trailing slash is added
// so that a leafname can be appended directly - confirm.
45 if (result
.back() != '/') {
52 /** Directory to save attachments into.
54 * If empty, attachments are not saved.
 * extract_message() only enables attachment handling when this is
 * non-empty, and parse_mime_part() uses it as the prefix of each saved
 * file's name.
56 static string attachment_dir
= get_attachment_dir();
58 /** Counter used to ensure saved attachments have unique names.
60 * Reset to 0 for each email processed.
 * Post-incremented each time parse_mime_part() builds a filename.
62 static unsigned attachment_counter
;
64 /** To aid debugging, attachment filenames from emails are used in many cases.
66 * This constant limits how long an attachment leafname can be.
 * A supplied leafname longer than this (or shorter than 3 characters, or
 * starting with a dot) is not used as-is.
68 static constexpr size_t MAX_ATTACHMENT_LEAF_LEN
= 64;
70 /** To aid debugging, attachment filenames from emails are used in many cases.
72 * This constant limits how long a file extension can be.
 * The length compared against this limit is measured after the final dot,
 * so the dot itself is not counted.
74 static constexpr size_t MAX_ATTACHMENT_EXT_LEN
= 14;
76 /** To aid debugging, attachment filenames from emails are used in many cases.
78 * If there's no supplied filename or it isn't suitable then this basename
79 * is used instead (potentially with the extension if there's a supplied
 * (the remainder of this sentence is not visible in this view)
82 static constexpr const char* FALLBACK_ATTACHMENT_BASENAME
= "attachment";
84 /** Buffer size (in bytes) used when reading and decoding MIME data. */
85 static constexpr unsigned SIZE
= 4096;
// Extract the text of an HTML MIME part and send it as the body field.
//
// @param text     The HTML to parse.
// @param charset  The character set to use for the first parse attempt.
//
// NOTE(review): several lines of this function are not visible in this
// view, including the declaration of `parser` and the start of the try
// block.
88 extract_html(const string
& text
, string
& charset
)
// First attempt: parse using the charset supplied by the caller.
94 parser
.ignore_metarobots();
95 parser
.parse(text
, charset
, false);
// If a string is thrown during that parse it is caught here as `newcharset`
// and the document is parsed again using it as the charset, this time
// passing true as the final argument (presumably telling the parser that the
// charset came from the document itself - TODO confirm against the parser).
96 } catch (const string
& newcharset
) {
98 parser
.ignore_metarobots();
99 parser
.parse(text
, newcharset
, true);
// Send the text the parser accumulated as the message body.
101 send_field(FIELD_BODY
, parser
.dump
);
// Read a MIME part's content stream and decode its transfer encoding.
//
// @param content           Data wrapper whose stream is read.
// @param content_encoding  Transfer encoding, which selects the decoder.
//
// The stream is read in chunks of up to SIZE bytes into `buffer`; each chunk
// is decoded into `data` (where a decoder applies) and appended to `result`.
// The caller stores the value returned by this function in a string.
//
// NOTE(review): the declarations of `buffer`, `len` and `result`, the
// trailing arguments of the decode-step calls, the `break`/`default` lines of
// the switch and the return statement are not visible in this view.
105 decode(GMimeDataWrapper
* content
, GMimeContentEncoding content_encoding
)
// Scratch buffer receiving the decoded bytes of one chunk.
109 unsigned char data
[SIZE
];
// The decode-step functions are passed the input as unsigned char, so keep
// an alias of `buffer` with that type.
110 auto u_buff
= reinterpret_cast<unsigned char*>(buffer
);
111 GMimeStream
* sr
= g_mime_data_wrapper_get_stream(content
);
115 switch (content_encoding
) {
116 case GMIME_CONTENT_ENCODING_BASE64
:
// Each loop below continues until the read returns zero or a negative value.
117 while ((len
= g_mime_stream_read(sr
, buffer
, SIZE
)) > 0) {
// `len` is reused: on entry it is the number of bytes read, afterwards it is
// the value returned by the decode step, which is used as the append length.
118 len
= g_mime_encoding_base64_decode_step(u_buff
, len
, data
,
120 result
.append(reinterpret_cast<char*>(data
), len
);
123 case GMIME_CONTENT_ENCODING_UUENCODE
:
124 while ((len
= g_mime_stream_read(sr
, buffer
, SIZE
)) > 0) {
125 len
= g_mime_encoding_uudecode_step(u_buff
, len
, data
,
127 result
.append(reinterpret_cast<char*>(data
), len
);
130 case GMIME_CONTENT_ENCODING_QUOTEDPRINTABLE
:
131 while ((len
= g_mime_stream_read(sr
, buffer
, SIZE
)) > 0) {
132 len
= g_mime_encoding_quoted_decode_step(u_buff
, len
, data
,
134 result
.append(reinterpret_cast<char*>(data
), len
);
// Remaining case (presumably the switch's default - confirm): the bytes read
// are appended unchanged, with no decode step.
138 while ((len
= g_mime_stream_read(sr
, buffer
, SIZE
)) > 0) {
139 result
.append(reinterpret_cast<char*>(buffer
), len
);
// Handle a single MIME part (a leaf of the MIME tree).
//
// Depending on the part's media type and on whether attachments are being
// saved, the decoded content is either sent as body text (FIELD_BODY) or
// written to a new file under attachment_dir and reported via
// FIELD_ATTACHMENT.
//
// @param part  The MIME part to process.
// @param ct    The part's Content-Type.
//
// NOTE(review): the rest of the parameter list is not visible in this view;
// the body reads a flag named `attachments`, presumably the final parameter -
// confirm.  false is returned when the part has no content object; the other
// return statements are not visible here.
147 parse_mime_part(GMimePart
* part
,
148 GMimeContentType
* ct
,
// Classify the part.  Only text/plain and text/html are treated as text;
// the type and subtype comparisons ignore ASCII case.
151 const char* type
= g_mime_content_type_get_media_type(ct
);
152 const char* subtype
= g_mime_content_type_get_media_subtype(ct
);
153 enum { OTHER
= 0, TEXT_PLAIN
, TEXT_HTML
} t
= OTHER
;
154 if (g_ascii_strcasecmp(type
, "text") == 0) {
155 if (g_ascii_strcasecmp(subtype
, "plain") == 0) {
157 } else if (g_ascii_strcasecmp(subtype
, "html") == 0) {
// A non-text part is only of interest when attachments are being saved.
162 if (t
== OTHER
&& !attachments
) {
163 // We're not interested in this MIME part.
// The accessor for a part's content differs between GMime 3 and older
// versions.
167 #if GMIME_MAJOR_VERSION >= 3
168 GMimeDataWrapper
* content
= g_mime_part_get_content(part
);
170 GMimeDataWrapper
* content
= g_mime_part_get_content_object(part
);
172 if (!content
) return false;
// Undo the part's Content-Transfer-Encoding to get the raw content.
174 string data
= decode(content
, g_mime_part_get_content_encoding(part
));
175 if (t
== TEXT_PLAIN
) {
176 // Always convert text/plain parts to UTF-8 (doesn't matter whether
177 // we're sending as a field or saving as an attachment).
178 const char* charset
= g_mime_content_type_get_parameter(ct
, "charset");
// Canonicalise the charset name before converting.  NOTE(review): any check
// for a missing charset parameter is on a line not visible here.
180 charset
= g_mime_charset_canon_name(charset
);
181 convert_to_utf8(data
, charset
);
185 bool is_attachment
= g_mime_part_is_attachment(part
);
186 // If we're not saving attachments, treat all text/plain and text/html
// Text parts are sent as body text unless attachments are being saved and
// this part is an attachment.
188 if (t
!= OTHER
&& (!attachments
|| !is_attachment
)) {
189 if (t
== TEXT_PLAIN
) {
// `data` was converted to UTF-8 above, so it can be sent as-is.
191 send_field(FIELD_BODY
, data
);
// text/html: look up the declared charset, canonicalise it, and let
// extract_html() parse the markup and send the body.
195 const char* p
= g_mime_content_type_get_parameter(ct
, "charset");
197 charset
= g_mime_charset_canon_name(p
);
199 extract_html(data
, charset
);
// Otherwise save the part to a file.  The name starts with attachment_dir,
// then a one-letter kind, then the per-message counter (post-incremented
// here), which is what keeps names within one email distinct.
205 string filename
= attachment_dir
;
206 // Prefix with `a` for attachment, `i` for inline.
207 filename
+= (is_attachment
? 'a' : 'i');
208 filename
+= str(attachment_counter
++);
211 // It's much easier to debug if the extracted files can be easily
212 // matched up with the attachments in the email so using any
213 // supplied filename is good, but we need to keep security concerns
215 const char* leaf
= g_mime_part_get_filename(part
);
// Second source for a name: a "filename" parameter on the Content-Type
// (presumably only consulted when the lookup above found nothing - the
// guarding condition is not visible here).
217 leaf
= g_mime_content_type_get_parameter(ct
, "filename");
// Strip any directory components, for both forward-slash and backslash
// separators, so only the text after the last separator remains.
220 const char* slash
= strrchr(leaf
, '/');
221 if (slash
) leaf
= slash
+ 1;
222 slash
= strrchr(leaf
, '\\');
223 if (slash
) leaf
= slash
+ 1;
// Reject leafnames which are very short, too long, or start with a dot.
225 size_t len
= strlen(leaf
);
226 if (len
< 3 || len
> MAX_ATTACHMENT_LEAF_LEN
|| leaf
[0] == '.') {
227 // Don't use the leafname as is, but use the extension if it's
// Find the last dot, searching from the second character so that a leading
// dot is never taken as the start of the extension.
229 leaf
= strrchr(leaf
+ 1, '.');
// The extension length is measured after the dot.
230 if (leaf
&& strlen(leaf
+ 1) <= MAX_ATTACHMENT_EXT_LEN
) {
231 // Use the extension.
233 // Don't use the leafname at all.
// With no usable leafname, or with only an extension (leaf now points at a
// dot), supply the fallback basename.
239 if (!leaf
|| leaf
[0] == '.') {
240 filename
+= FALLBACK_ATTACHMENT_BASENAME
;
// Walk the leafname a character at a time.  NOTE(review): the declaration of
// `ch` and the statements handling accepted and rejected characters are not
// visible here.
243 for (size_t i
= 0; leaf
[i
]; ++i
) {
245 // Only allow clearly safe characters.
246 if (C_isalnum(ch
) || ch
== '.' || ch
== '-' || ch
== '+')
253 // This ends up ignoring the supplied Content-Type. If the extracted
254 // attachments are fed back into the indexer then this will end up
255 // determining the MIME Content-Type from the extension or file
258 // It seems odd to ignore the supplied Content-Type, but in practice
259 // the supplied MIME type is likely actually determined by the sending
260 // MUA from the extension and/or file contents or is just set to
261 // `application/octet-stream`, so arguably it's better to determine the
262 // type for ourselves in a consistent way. Otherwise the exact same
263 // attachment could be indexed or not depending who sent it.
// O_CREAT together with O_EXCL makes the open fail if the file already
// exists, so an existing file is never overwritten.  NOTE(review): the mode
// argument and the check of `fd` are on lines not visible here.
264 int fd
= open(filename
.c_str(),
265 O_CREAT
| O_EXCL
| O_WRONLY
| O_BINARY
,
// Write out the decoded content.  NOTE(review): the enclosing loop and the
// handling of partial writes are not visible here.
267 const char* p
= data
.data();
268 size_t count
= data
.size();
270 ssize_t r
= write(fd
, p
, count
);
// A write interrupted by a signal is simply retried.
272 if (errno
== EINTR
) continue;
// On failure, remove the file and report the error.
274 unlink(filename
.c_str());
275 send_field(FIELD_ERROR
, "saving attachment failed");
// Report the path of the saved file.
282 send_field(FIELD_ATTACHMENT
, filename
);
// Recursively process a MIME object.
//
// A leaf part is handed to parse_mime_part(); a multipart container has its
// children processed by recursive calls.
//
// @param me           The MIME object to process.
// @param attachments  Passed on to child parts of ordinary multiparts.
//                     Children of a multipart/alternative are always
//                     processed with false instead.
//
// NOTE(review): the return type and most return statements are not visible
// in this view.  The result is used as a boolean by the recursive calls and
// by being OR-ed into `ret`.
287 parse_mime_object(GMimeObject
* me
, bool attachments
)
289 GMimeContentType
* ct
= g_mime_object_get_content_type(me
);
// Leaf part: delegate (the remaining arguments of this call are on lines not
// visible here).
290 if (GMIME_IS_PART(me
)) {
291 return parse_mime_part(reinterpret_cast<GMimePart
*>(me
),
// Objects which are neither a leaf part nor a multipart take this branch
// (its body is not visible here).
296 if (!GMIME_IS_MULTIPART(me
))
299 GMimeMultipart
* mpart
= reinterpret_cast<GMimeMultipart
*>(me
);
300 const char* subtype
= g_mime_content_type_get_media_subtype(ct
);
301 int count
= g_mime_multipart_get_count(mpart
);
// multipart/alternative (subtype compared ignoring ASCII case): the children
// are visited from last to first.
302 if (g_ascii_strcasecmp(subtype
, "alternative") == 0) {
303 // Use the last MIME part which we get text from.
304 for (int i
= count
- 1; i
>= 0; --i
) {
305 GMimeObject
* part
= g_mime_multipart_get_part(mpart
, i
);
306 // Don't consider parts under an alternative as attachments.
307 if (parse_mime_object(part
, false))
// Any other multipart: process every child in order and combine the
// results with OR.
314 for (int i
= 0; i
< count
; ++i
) {
315 GMimeObject
* part
= g_mime_multipart_get_part(mpart
, i
);
316 ret
|= parse_mime_object(part
, attachments
);
// Send a glib string (gchar*) as the value of `field` via send_field().
//
// @param field  The field to send.
// @param data   The string to send.
//
// NOTE(review): only the send_field() call is visible in this view; any null
// check or freeing of `data` would be on lines not shown here - confirm.
322 send_glib_field(Field field
, gchar
* data
)
325 send_field(field
, data
);
// Convert an address list to a single string and send it as `field`.
//
// @param field         The field to send the addresses as.
// @param address_list  The addresses to format.
331 extract_addresses(Field field
, InternetAddressList
* address_list
)
// The GMime 3 form of internet_address_list_to_string() takes an extra
// argument (NULL is passed here); both forms pass false as the last argument.
333 #if GMIME_MAJOR_VERSION >= 3
334 auto value
= internet_address_list_to_string(address_list
, NULL
, false);
336 auto value
= internet_address_list_to_string(address_list
, false);
// The string came from the library, so send it through the glib-aware helper.
338 send_glib_field(field
, value
);
// Extract body text, attachments and header metadata from a parsed message
// and send each piece as a field.
//
// @param message  The parsed message.
//
// Many accessors differ between GMime 3 and older versions, hence the
// repeated GMIME_MAJOR_VERSION checks.  NOTE(review): the #else and #endif
// lines of those checks are not visible in this view; in each pair below the
// first call is the GMime 3 form and the second is presumably the older form.
342 extract_message(GMimeMessage
* message
)
// Restart attachment numbering for this email.
344 attachment_counter
= 0;
// Walk the MIME tree.  Attachment saving is enabled only when an attachment
// directory is configured; the result is deliberately ignored.
345 (void)parse_mime_object(g_mime_message_get_mime_part(message
),
346 !attachment_dir
.empty());
// Subject header becomes the title.
347 send_field(FIELD_TITLE
, g_mime_message_get_subject(message
));
// Author: an address list in GMime 3, a plain string from the sender
// accessor otherwise.
349 #if GMIME_MAJOR_VERSION >= 3
350 extract_addresses(FIELD_AUTHOR
, g_mime_message_get_from(message
));
352 send_field(FIELD_AUTHOR
, g_mime_message_get_sender(message
));
// To, Cc and Bcc recipients.
354 #if GMIME_MAJOR_VERSION >= 3
355 extract_addresses(FIELD_TO
, g_mime_message_get_to(message
));
357 extract_addresses(FIELD_TO
,
358 g_mime_message_get_recipients(message
,
359 GMIME_RECIPIENT_TYPE_TO
));
361 #if GMIME_MAJOR_VERSION >= 3
362 extract_addresses(FIELD_CC
, g_mime_message_get_cc(message
));
364 extract_addresses(FIELD_CC
,
365 g_mime_message_get_recipients(message
,
366 GMIME_RECIPIENT_TYPE_CC
));
368 #if GMIME_MAJOR_VERSION >= 3
369 extract_addresses(FIELD_BCC
, g_mime_message_get_bcc(message
));
371 extract_addresses(FIELD_BCC
,
372 g_mime_message_get_recipients(message
,
373 GMIME_RECIPIENT_TYPE_BCC
));
// Date.  In GMime 3 it arrives as a GDateTime, which is converted to UTC and
// then to a Unix timestamp.  NOTE(review): null checks on these pointers are
// on lines not visible here.
375 #if GMIME_MAJOR_VERSION >= 3
376 GDateTime
* datetime
= g_mime_message_get_date(message
);
378 GDateTime
* utc_datetime
= g_date_time_to_utc(datetime
);
380 gint64 unix_time
= g_date_time_to_unix(utc_datetime
);
381 // Check value doesn't overflow time_t.
// (The value is round-tripped through time_t and only sent if unchanged.)
382 if (gint64(time_t(unix_time
)) == unix_time
) {
383 send_field_created_date(time_t(unix_time
));
// Release the UTC copy created by the conversion above.
385 g_date_time_unref(utc_datetime
);
// Older GMime: the date is returned through out-parameters; a value of
// time_t(-1) is treated as "no date" and nothing is sent.
391 g_mime_message_get_date(message
, &datetime
, &tz_offset
);
392 if (datetime
!= time_t(-1)) {
393 // The documentation doesn't clearly say, but from testing the
394 // time_t value is in UTC which is what we want so we don't need
397 // (If we did, tz_offset is not actually in hours as the docs say,
398 // but actually hours*100+minutes, e.g. +1300 for UTC+13).
399 send_field_created_date(datetime
);
402 send_field(FIELD_MESSAGE_ID
, g_mime_message_get_message_id(message
));
// Scan the message's headers for keyword-like headers.
403 GMimeObject
* object
= GMIME_OBJECT(message
);
404 GMimeHeaderList
* headers
= g_mime_object_get_header_list(object
);
// GMime 3: iterate over the headers by index.
405 #if GMIME_MAJOR_VERSION >= 3
406 int count
= g_mime_header_list_get_count(headers
);
407 for (int i
= 0; i
< count
; ++i
) {
408 GMimeHeader
* header
= g_mime_header_list_get_header_at(headers
, i
);
409 auto name
= g_mime_header_get_name(header
);
// `value` is defined as a macro so that the header test below, which refers
// to `name` and `value`, can be shared with the older-GMime code path.
410 # define value g_mime_header_get_value(header)
// Older GMime: headers are visited via a callback (its definition is on
// lines not visible here; presumably it supplies `name` and `value` -
// confirm).
412 g_mime_header_list_foreach(headers
,
// Both "Comments" and "Keywords" headers (names compared ignoring ASCII
// case) are sent as keywords.
417 if (g_ascii_strcasecmp(name
, "Comments") == 0 ||
418 g_ascii_strcasecmp(name
, "Keywords") == 0) {
419 send_field(FIELD_KEYWORDS
, value
);
// Version-specific end of the loop or callback (the branches themselves are
// not visible here).
421 #if GMIME_MAJOR_VERSION >= 3
// GMime library initialisation fragment.  The enclosing function's header is
// not visible in this view.
//
// NOTE(review): the GMime 3 branch of this check is on lines not shown here;
// the call below, which passes GMIME_ENABLE_RFC2047_WORKAROUNDS, is
// presumably the branch for older GMime versions - confirm.
432 #if GMIME_MAJOR_VERSION >= 3
435 g_mime_init(GMIME_ENABLE_RFC2047_WORKAROUNDS
);
// Parse the email stored in a file and send its text and metadata as fields.
//
// @param filename  Path of the file to read.
//
// The second parameter is unnamed and is not used by the visible code.
//
// NOTE(review): the checks guarding the error path and the message handling
// (presumably tests of `fp` and `message`) are on lines not visible here.
441 extract(const string
& filename
, const string
&)
443 FILE* fp
= fopen(filename
.c_str(), "r");
// Report a failure to open the file.
446 send_field(FIELD_ERROR
, "fopen() failed");
// Wrap the FILE in a GMime stream and attach a parser to it.
450 GMimeStream
* stream
= g_mime_stream_file_new(fp
);
451 GMimeParser
* parser
= g_mime_parser_new_with_stream(stream
);
// The GMime 3 form of g_mime_parser_construct_message() takes an extra
// argument (NULL is passed here).
452 #if GMIME_MAJOR_VERSION >= 3
453 GMimeMessage
* message
= g_mime_parser_construct_message(parser
, NULL
);
455 GMimeMessage
* message
= g_mime_parser_construct_message(parser
);
// Extract everything of interest, then drop our reference to the message.
458 extract_message(message
);
459 g_object_unref(message
);
// Drop our references to the parser and the stream.
461 g_object_unref(parser
);
462 g_object_unref(stream
);