2 * Copyright 2011-2016, Haiku, Inc. All rights reserved.
3 * Copyright 2001-2003 Dr. Zoidberg Enterprises. All rights reserved.
17 #include <FindDirectory.h>
20 #include <parsedate.h>
25 #include <mail_encoding.h>
27 #include <AttributeUtilities.h>
28 #include <CharacterSet.h>
29 #include <CharacterSetRoster.h>
32 using namespace BPrivate
;
// Lookup table pairing a charset name string with a text conversion ID.
// Several names may share one ID (aliases); the first entry for an ID is the
// preferred name, because lookups by ID scan from the start of the array.
// The table ends with an entry whose name pointer is NULL.
37 struct CharsetConversionEntry
{
42 extern const CharsetConversionEntry mail_charsets
[] = {
43 // In order of authority, so when searching for the name for a particular
44 // numbered conversion, start at the beginning of the array.
45 {"iso-8859-1", B_ISO1_CONVERSION
}, // MIME STANDARD
46 {"iso-8859-2", B_ISO2_CONVERSION
}, // MIME STANDARD
47 {"iso-8859-3", B_ISO3_CONVERSION
}, // MIME STANDARD
48 {"iso-8859-4", B_ISO4_CONVERSION
}, // MIME STANDARD
49 {"iso-8859-5", B_ISO5_CONVERSION
}, // MIME STANDARD
50 {"iso-8859-6", B_ISO6_CONVERSION
}, // MIME STANDARD
51 {"iso-8859-7", B_ISO7_CONVERSION
}, // MIME STANDARD
52 {"iso-8859-8", B_ISO8_CONVERSION
}, // MIME STANDARD
53 {"iso-8859-9", B_ISO9_CONVERSION
}, // MIME STANDARD
54 {"iso-8859-10", B_ISO10_CONVERSION
}, // MIME STANDARD
55 {"iso-8859-13", B_ISO13_CONVERSION
}, // MIME STANDARD
56 {"iso-8859-14", B_ISO14_CONVERSION
}, // MIME STANDARD
57 {"iso-8859-15", B_ISO15_CONVERSION
}, // MIME STANDARD
// Two spellings of Shift-JIS map to the same conversion.
59 {"shift_jis", B_SJIS_CONVERSION
}, // MIME STANDARD
60 {"shift-jis", B_SJIS_CONVERSION
},
61 {"iso-2022-jp", B_JIS_CONVERSION
}, // MIME STANDARD
62 {"euc-jp", B_EUC_CONVERSION
}, // MIME STANDARD
// Three Korean names all map to the EUC-KR conversion.
64 {"euc-kr", B_EUC_KR_CONVERSION
}, // Shift encoding 7 bit and KSC-5601 if bit 8 is on. // MIME STANDARD
65 {"ksc5601", B_EUC_KR_CONVERSION
}, // Not sure if 7 or 8 bit. // COMPATIBLE?
66 {"ks_c_5601-1987", B_EUC_KR_CONVERSION
}, // Not sure if 7 or 8 bit. // COMPATIBLE with stupid MS software
68 {"koi8-r", B_KOI8R_CONVERSION
}, // MIME STANDARD
69 {"windows-1251",B_MS_WINDOWS_1251_CONVERSION
}, // MIME STANDARD
70 {"windows-1252",B_MS_WINDOWS_CONVERSION
}, // MIME STANDARD
72 {"dos-437", B_MS_DOS_CONVERSION
}, // WRONG NAME : MIME STANDARD NAME = NONE ( IBM437? )
73 {"dos-866", B_MS_DOS_866_CONVERSION
}, // WRONG NAME : MIME STANDARD NAME = NONE ( IBM866? )
74 {"x-mac-roman", B_MAC_ROMAN_CONVERSION
}, // WRONG NAME : MIME STANDARD NAME = NONE ( macintosh? + x-mac-roman? )
// NOTE(review): 24 and 25 are bare numeric conversion IDs; presumably they
// stand in for conversions that have no named constant here - confirm.
76 {"big5", 24}, // MIME STANDARD
78 {"gb18030", 25}, // WRONG NAME : MIME STANDARD NAME = NONE ( GB18030? )
79 {"gb2312", 25}, // COMPATIBLE
80 {"gbk", 25}, // COMPATIBLE
82 /* {"utf-16", B_UNICODE_CONVERSION}, Might not work due to NULs in text, needs testing. */
83 {"us-ascii", B_MAIL_US_ASCII_CONVERSION
}, // MIME STANDARD
84 {"utf-8", B_MAIL_UTF8_CONVERSION
/* Special code for no conversion */}, // MIME STANDARD
// Sentinel entry: consumers stop when the name pointer is NULL.
86 {NULL
, (uint32
) -1} /* End of list marker, NULL string pointer is the key. */
// File-scope state shared by SubjectToThread() for its lazily compiled regex.

// Counter bumped with atomic_add(); the caller that sees the previous value 0
// is the one that compiles the pattern.
90 static int32 gLocker
= 0;
// Number of registers handed to re_search(); SubjectToThread() counts the
// subexpressions of its pattern to derive it.
91 static size_t gNsub
= 1;
// Storage for the compiled pattern.
92 static re_pattern_buffer gRe
;
// NULL until the pattern is ready for use; SubjectToThread() tests this to
// decide whether compilation is still needed.
93 static re_pattern_buffer
*gRebuf
= NULL
;
// Byte translation table installed as gRe.translate; SubjectToThread() fills
// it so that 'a'..'z' map to upper case and every other byte maps to itself.
94 static unsigned char gTranslation
[256];
// Heuristically handles header text that contains raw 8-bit bytes instead of
// RFC 2047 encoded words.
//
// buffer       - in/out pointer to the text buffer.
// bufferLength - in/out allocated size of *buffer; set to the size of the
//                newly allocated buffer when a conversion succeeds.
// sourceLength - in/out length of the text (used below; its declaration is
//                not part of the lines shown here).
//
// The text is scanned for bytes with the high bit set. If any such byte is
// not followed by another high-bit byte ("singleton"), the text cannot be
// UTF-8 and is converted from ISO-8859-1 to UTF-8; otherwise it is assumed
// (without validation) to already be UTF-8.
98 handle_non_rfc2047_encoding(char **buffer
, size_t *bufferLength
,
101 char *string
= *buffer
;
102 int32 length
= *sourceLength
;
105 // check for 8-bit characters
106 for (i
= 0;i
< length
;i
++)
107 if (string
[i
] & 0x80)
112 // check for groups of 8-bit characters - this code is not very smart;
113 // it just can detect some sort of single-byte encoded stuff, the rest
114 // is regarded as UTF-8
116 int32 singletons
= 0,doubles
= 0;
118 for (i
= 0;i
< length
;i
++)
120 if (string
[i
] & 0x80)
// NOTE(review): this reads string[i + 1]; for i == length - 1 that is the byte
// just past the counted text - presumably the NUL terminator, confirm.
122 if ((string
[i
+ 1] & 0x80) == 0)
129 if (singletons
!= 0) // can't be valid UTF-8 anymore, so we assume ISO-Latin-1
// Destination is sized at four bytes per source byte plus one.
133 int32 destLength
= length
* 4 + 1;
// Remember the allocation size; convert_to_utf8() overwrites destLength with
// the number of bytes actually produced.
134 int32 destBufferLength
= destLength
;
135 char *dest
= (char*)malloc(destLength
);
139 if (convert_to_utf8(B_ISO1_CONVERSION
, string
, &length
,dest
,
140 &destLength
, &state
) == B_OK
) {
// Report the new buffer size and the converted text length to the caller.
142 *bufferLength
= destBufferLength
;
143 *sourceLength
= destLength
;
150 // we assume a valid UTF-8 string here, but yes, we don't check it
// Stores the read state of a mail on its node.
//
// node - the node whose attributes are written.
// flag - the read state; written as an int32 to B_MAIL_ATTR_READ.
//
// Besides the numeric attribute, the textual B_MAIL_ATTR_STATUS attribute is
// kept in sync ("Read", "Seen" or "New"), but only when the status currently
// stored is one of those three known values (compared case-insensitively), so
// that any other status text is left alone.
159 write_read_attr(BNode
& node
, read_flags flag
)
161 if (node
.WriteAttr(B_MAIL_ATTR_READ
, B_INT32_TYPE
, 0, &flag
, sizeof(int32
))
165 // Manage the status string only if it currently has a known state
166 BString currentStatus
;
// The status read succeeded but the text is none of New/Read/Seen: this is
// the "unknown state" case the comment above refers to.
167 if (node
.ReadAttrString(B_MAIL_ATTR_STATUS
, &currentStatus
) == B_OK
168 && currentStatus
.ICompare("New") != 0
169 && currentStatus
.ICompare("Read") != 0
170 && currentStatus
.ICompare("Seen") != 0) {
// Map the flag to its status text; anything other than B_READ or B_SEEN is
// written as "New".
174 const char* statusString
= flag
== B_READ
? "Read"
175 : flag
== B_SEEN
? "Seen" : "New";
// Note: strlen() bytes are written, i.e. without a terminating NUL.
176 if (node
.WriteAttr(B_MAIL_ATTR_STATUS
, B_STRING_TYPE
, 0, statusString
,
177 strlen(statusString
)) < 0)
// Retrieves the read state of a mail from its node.
//
// node - the node whose attributes are read.
// flag - out parameter receiving the read state.
//
// The int32 attribute B_MAIL_ATTR_READ is read first; the textual
// B_MAIL_ATTR_STATUS attribute is consulted afterwards.
// NOTE(review): presumably the status string is only a fallback for nodes
// without the numeric attribute - the deciding lines are not shown, confirm.
185 read_read_attr(BNode
& node
, read_flags
& flag
)
187 if (node
.ReadAttr(B_MAIL_ATTR_READ
, B_INT32_TYPE
, 0, &flag
, sizeof(int32
))
191 BString statusString
;
192 if (node
.ReadAttrString(B_MAIL_ATTR_STATUS
, &statusString
) == B_OK
) {
// ICompare() is used as a truth value here: non-zero means the status text
// differs from "New".
193 if (statusString
.ICompare("New"))
205 // The next couple of functions are our wrappers around convert_to_utf8 and
206 // convert_from_utf8 so that they can also convert from UTF-8 to UTF-8 by
207 // specifying the B_MAIL_UTF8_CONVERSION constant as the conversion operation.
208 // It also lets us add new conversions, like B_MAIL_US_ASCII_CONVERSION.
//
// mail_convert_to_utf8() parameters:
// srcEncoding - conversion ID of the source text.
// src, srcLen - source bytes; *srcLen is updated to the amount consumed.
// dst, dstLen - output buffer; *dstLen is updated to the amount produced.
// state       - conversion state, passed through to convert_to_utf8().
// substitute  - replacement character, passed through to convert_to_utf8()
//               and also used to overwrite NUL bytes in its output.
212 mail_convert_to_utf8(uint32 srcEncoding
, const char *src
, int32
*srcLen
,
213 char *dst
, int32
*dstLen
, int32
*state
, char substitute
)
// Keep the start of the output so it can be post-processed after dst moves.
216 char *originalDst
= dst
;
217 status_t returnCode
= -1;
// UTF-8 to UTF-8: plain copy of as many bytes as fit in the destination.
219 if (srcEncoding
== B_MAIL_UTF8_CONVERSION
) {
220 copyAmount
= *srcLen
;
221 if (*dstLen
< copyAmount
)
222 copyAmount
= *dstLen
;
223 memcpy (dst
, src
, copyAmount
);
224 *srcLen
= copyAmount
;
225 *dstLen
= copyAmount
;
// US-ASCII: copy byte by byte, clearing the high bit of 8-bit bytes.
227 } else if (srcEncoding
== B_MAIL_US_ASCII_CONVERSION
) {
229 unsigned char letter
;
230 copyAmount
= *srcLen
;
231 if (*dstLen
< copyAmount
)
232 copyAmount
= *dstLen
;
233 for (i
= 0; i
< copyAmount
; i
++) {
236 // Invalid, could also use substitute, but better to strip high bit.
237 *dst
++ = letter
- 0x80U
;
238 else if (letter
== 0x80U
)
239 // Can't convert to 0x00 since that's NUL, which would cause problems.
244 *srcLen
= copyAmount
;
245 *dstLen
= copyAmount
;
// Everything else is delegated to the system conversion routine.
248 returnCode
= convert_to_utf8 (srcEncoding
, src
, srcLen
,
249 dst
, dstLen
, state
, substitute
);
251 if (returnCode
== B_OK
) {
252 // Replace spurious NUL bytes, which should normally not be in the
253 // output of the decoding (not normal UTF-8 characters, and no NULs are
254 // in our usual input strings). They happen for some odd ISO-2022-JP
255 // byte pair combinations which are improperly handled by the BeOS
256 // routines. Like "\e$ByD\e(B" where \e is the ESC character $1B, the
257 // first ESC $ B switches to a Japanese character set, then the next
258 // two bytes "yD" specify a character, then ESC ( B switches back to
259 // the ASCII character set. The UTF-8 conversion yields a NUL byte.
261 for (i
= 0; i
< *dstLen
; i
++)
262 if (originalDst
[i
] == 0)
263 originalDst
[i
] = substitute
;
// Converts UTF-8 text to another encoding; counterpart of
// mail_convert_to_utf8().
//
// dstEncoding - conversion ID of the target encoding.
// src, srcLen - UTF-8 input; *srcLen is updated to the amount consumed.
// dst, dstLen - output buffer; *dstLen is updated to the amount produced.
// state       - conversion state carried between calls. In the US-ASCII path
//               it holds the number of source bytes still to skip.
// substitute  - replacement character handed to convert_from_utf8().
//
// B_MAIL_UTF8_CONVERSION is a plain copy, B_MAIL_US_ASCII_CONVERSION is done
// locally, everything else goes through convert_from_utf8(). For
// B_JIS_CONVERSION an extra step appends the escape sequence that switches
// the output back out of the Japanese character set.
270 mail_convert_from_utf8(uint32 dstEncoding
, const char *src
, int32
*srcLen
,
271 char *dst
, int32
*dstLen
, int32
*state
, char substitute
)
// Total output capacity, needed later to work out the space left over.
275 int32 originalDstLen
= *dstLen
;
// UTF-8 to UTF-8: copy as many bytes as fit.
279 if (dstEncoding
== B_MAIL_UTF8_CONVERSION
) {
280 copyAmount
= *srcLen
;
281 if (*dstLen
< copyAmount
)
282 copyAmount
= *dstLen
;
283 memcpy (dst
, src
, copyAmount
);
284 *srcLen
= copyAmount
;
285 *dstLen
= copyAmount
;
289 if (dstEncoding
== B_MAIL_US_ASCII_CONVERSION
) {
290 int32 characterLength
;
291 int32 dstRemaining
= *dstLen
;
292 unsigned char letter
;
293 int32 srcRemaining
= *srcLen
;
295 // state contains the number of source bytes to skip, left over from a
296 // partial UTF-8 character split over the end of the buffer from last
// The whole input is still part of the character being skipped.
298 if (srcRemaining
<= *state
) {
299 *state
-= srcRemaining
;
303 srcRemaining
-= *state
;
308 if (srcRemaining
<= 0 || dstRemaining
<= 0)
// Classify the lead byte to find how many source bytes the character spans.
312 characterLength
= 1; // Regular ASCII equivalent code.
313 else if (letter
< 0xC0)
314 characterLength
= 1; // Invalid in-between data byte 10xxxxxx.
315 else if (letter
< 0xE0)
317 else if (letter
< 0xF0)
319 else if (letter
< 0xF8)
321 else if (letter
< 0xFC)
323 else if (letter
< 0xFE)
326 characterLength
= 1; // 0xFE and 0xFF are invalid in UTF-8.
332 if (srcRemaining
< characterLength
) {
333 // Character split past the end of the buffer.
334 *state
= characterLength
- srcRemaining
;
337 src
+= characterLength
;
338 srcRemaining
-= characterLength
;
341 // Update with the amounts used.
342 *srcLen
= *srcLen
- srcRemaining
;
343 *dstLen
= *dstLen
- dstRemaining
;
// All other encodings: use the system conversion routine.
347 errorCode
= convert_from_utf8(dstEncoding
, src
, srcLen
, dst
, dstLen
, state
,
349 if (errorCode
!= B_OK
)
352 if (dstEncoding
!= B_JIS_CONVERSION
)
355 // B_JIS_CONVERSION (ISO-2022-JP) works by shifting between different
356 // character subsets. For E-mail headers (and other uses), it needs to be
357 // switched back to ASCII at the end (otherwise the last character gets
358 // lost or other weird things happen in the headers). Note that we can't
359 // just append the escape code since the convert_from_utf8 "state" will be
360 // wrong. So we append an ASCII letter and throw it away, leaving just the
361 // escape code. Well, it actually switches to the Roman character set, not
362 // ASCII, but that should be OK.
// Space left in the output after the main conversion.
364 tempDstLen
= originalDstLen
- *dstLen
;
365 if (tempDstLen
< 3) // Not enough space remaining in the output.
366 return B_OK
; // Sort of an error, but we did convert the rest OK.
// Convert a throw-away "a" directly after the text already produced.
368 errorCode
= convert_from_utf8(dstEncoding
, "a", &tempSrcLen
,
369 dst
+ *dstLen
, &tempDstLen
, state
, substitute
);
370 if (errorCode
!= B_OK
)
372 *dstLen
+= tempDstLen
- 1 /* don't include the ASCII letter */;
// Decodes RFC 2047 encoded words ("=?charset?e?text?=") in a header string to
// UTF-8, rewriting the buffer in place.
//
// bufp   - in/out pointer to the text buffer.
// bufLen - in/out allocated size of *bufp.
// strLen - length of the text in *bufp.
//
// Returns a negative error code held in ret, or otherwise the length of the
// resulting text (string - head).
//
// Text that contains raw 8-bit bytes is first passed through
// handle_non_rfc2047_encoding(). The main loop then walks the encoded words:
// plain text before each word is moved down to the write position, the word
// is decoded from quoted-printable or base64 and converted to UTF-8. Words
// whose charset is unknown are copied through unchanged.
378 rfc2047_to_utf8(char **bufp
, size_t *bufLen
, size_t strLen
)
381 char *charset
, *encoding
, *end
;
384 if (bufp
== NULL
|| *bufp
== NULL
)
387 char *string
= *bufp
;
389 //---------Handle *&&^%*&^ non-RFC compliant, 8bit mail
390 if (handle_non_rfc2047_encoding(bufp
,bufLen
,&strLen
))
393 // set up string length
395 strLen
= strlen(*bufp
);
// Temporarily NUL-terminate the text so the str*() searches below stay inside
// it; the saved byte is put back before returning.
396 char lastChar
= (*bufp
)[strLen
];
397 (*bufp
)[strLen
] = '\0';
399 //---------Whew! Now for RFC compliant mail
400 bool encodedWordFoundPreviously
= false;
// head stays at the start of the text, tail is the read position and string
// is the write position.
401 for (head
= tail
= string
;
402 ((charset
= strstr(tail
, "=?")) != NULL
)
403 && (((encoding
= strchr(charset
+ 2, '?')) != NULL
)
404 && encoding
[1] && (encoding
[2] == '?') && encoding
[3])
405 && (end
= strstr(encoding
+ 3, "?=")) != NULL
;
406 // found "=?...charset...?e?...text...?= (e == encoding)
407 // ^charset ^encoding ^end
410 // Copy non-encoded text (from tail up to charset) to the output.
411 // Ignore spaces between two encoded "words". RFC2047 says the words
412 // should be concatenated without the space (designed for Asian
413 // sentences which have no spaces yet need to be broken into "words" to
414 // keep within the line length limits).
415 bool nonSpaceFound
= false;
416 for (int i
= 0; i
< charset
-tail
; i
++) {
417 if (!isspace (tail
[i
])) {
418 nonSpaceFound
= true;
422 if (!encodedWordFoundPreviously
|| nonSpaceFound
) {
423 if (string
!= tail
&& tail
!= charset
)
424 memmove(string
, tail
, charset
-tail
);
425 string
+= charset
-tail
;
428 encodedWordFoundPreviously
= true;
430 // move things to point at what they should:
431 // =?...charset...?e?...text...?= (e == encoding)
432 // ^charset ^encoding ^end
437 // find the charset this text is in now
438 size_t cLen
= encoding
- 1 - charset
;
// 'B' or 'b' selects base64; anything else is treated as quoted-printable.
439 bool base64encoded
= toupper(*encoding
) == 'B';
441 uint32 convertID
= B_MAIL_NULL_CONVERSION
;
// NUL-terminated copy of the charset name for the comparisons below.
442 char charsetName
[cLen
+ 1];
443 memcpy(charsetName
, charset
, cLen
);
444 charsetName
[cLen
] = '\0';
445 if (strcasecmp(charsetName
, "us-ascii") == 0) {
446 convertID
= B_MAIL_US_ASCII_CONVERSION
;
447 } else if (strcasecmp(charsetName
, "utf-8") == 0) {
448 convertID
= B_MAIL_UTF8_CONVERSION
;
// Any other name is looked up in the character set roster.
450 const BCharacterSet
* charSet
451 = BCharacterSetRoster::FindCharacterSetByName(charsetName
);
452 if (charSet
!= NULL
) {
453 convertID
= charSet
->GetConversionID();
456 if (convertID
== B_MAIL_NULL_CONVERSION
) {
457 // unidentified charset
458 // what to do? doing nothing skips the encoded text;
459 // but we should keep it: we copy it to the output.
460 if (string
!= tail
&& tail
!= end
)
461 memmove(string
, tail
, end
-tail
);
465 // else we've successfully identified the charset
467 char *src
= encoding
+2;
468 int32 srcLen
= end
- 2 - src
;
469 // encoded text: src..src+srcLen
471 // decode text, get decoded length (reducing xforms)
// Decoding happens in place: source and destination are the same buffer.
472 srcLen
= !base64encoded
? decode_qp(src
, src
, srcLen
, 1)
473 : decode_base64(src
, src
, srcLen
);
475 // allocate space for the converted text
476 int32 dstLen
= end
-string
+ *bufLen
-strLen
;
477 char *dst
= (char*)malloc(dstLen
);
478 int32 cvLen
= srcLen
;
484 ret
= mail_convert_to_utf8(convertID
, src
, &cvLen
, dst
, &dstLen
,
487 // what to do? doing nothing skips the encoded text
488 // but we should keep it: we copy it to the output.
492 if (string
!= tail
&& tail
!= end
)
493 memmove(string
, tail
, end
-tail
);
497 /* convert_to_ is either returning something wrong or my
498 test data is screwed up. Whatever it is, Not Enough
499 Space is not the only cause of the below, so we just
500 assume it succeeds if it converts anything at all.
501 else if (cvLen < srcLen)
503 // not enough room to convert the data;
504 // grow *buf and retry
508 char *temp = (char*)realloc(*bufp, 2*(*bufLen + 1));
516 *bufLen = 2*(*bufLen + 1);
518 string = *bufp + (string-head);
519 tail = *bufp + (tail-head);
520 charset = *bufp + (charset-head);
521 encoding = *bufp + (encoding-head);
522 end = *bufp + (end-head);
523 src = *bufp + (src-head);
529 if (dstLen
> end
-string
) {
530 // copy the string forward...
531 memmove(string
+dstLen
, end
, strLen
- (end
-head
) + 1);
532 strLen
+= string
+dstLen
- end
;
533 end
= string
+ dstLen
;
536 memcpy(string
, dst
, dstLen
);
543 // copy everything that's left
544 size_t tailLen
= strLen
- (tail
- head
);
545 memmove(string
, tail
, tailLen
+1);
548 // replace the last char
549 (*bufp
)[strLen
] = lastChar
;
551 return ret
< B_OK
? ret
: string
-head
;
// Encodes UTF-8 header text as RFC 2047 encoded words.
//
// bufp     - in/out pointer to the text; on return *bufp points at a newly
//            malloc()ed, NUL-terminated result.
// length   - number of bytes of input text in *bufp.
// charset  - conversion ID of the character set to encode with.
// encoding - transfer encoding; compared against quoted_printable and base64.
//            Any other value leaves the converted words unencoded.
//
// The work is done in three passes: split the text into words, merge
// neighbouring words that need encoding, then emit everything into one string.
556 utf8_to_rfc2047 (char **bufp
, ssize_t length
, uint32 charset
, char encoding
)
559 BString originalWord
;
560 BString convertedWord
;
563 // Convert the word from UTF-8 to the desired character set. The
564 // converted version also includes the escape codes to return to ASCII
565 // mode, if relevant. Also note if it uses unprintable characters,
566 // which means it will need that special encoding treatment later.
567 void ConvertWordToCharset (uint32 charset
) {
569 int32 originalLength
= originalWord
.Length();
// Output buffer sized at five bytes per input byte plus one.
570 int32 convertedLength
= originalLength
* 5 + 1;
571 char *convertedBuffer
= convertedWord
.LockBuffer (convertedLength
);
572 mail_convert_from_utf8 (charset
, originalWord
.String(),
573 &originalLength
, convertedBuffer
, &convertedLength
, &state
);
// A byte with the high bit set, or a control character below 32, marks the
// word as needing encoding.
574 for (int i
= 0; i
< convertedLength
; i
++) {
575 if ((convertedBuffer
[i
] & (1 << 7)) ||
576 (convertedBuffer
[i
] >= 0 && convertedBuffer
[i
] < 32)) {
577 needsEncoding
= true;
581 convertedWord
.UnlockBuffer (convertedLength
);
584 struct word
*currentWord
;
587 // Break the header into words. White space characters (including tabs and
588 // newlines) separate the words. Each word includes any space before it as
589 // part of the word. Actually, quotes and other special characters
590 // (",()<>@) are treated as separate words of their own so that they don't
591 // get encoded (because MIME headers get the quotes parsed before character
592 // set unconversion is done). The reader is supposed to ignore all white
593 // space between encoded words, which can be inserted so that older mail
594 // parsers don't have overly long line length problems.
596 const char *source
= *bufp
;
597 const char *bufEnd
= *bufp
+ length
;
598 const char *specialChars
= "\"()<>@,";
600 while (source
< bufEnd
) {
601 currentWord
= new struct word
;
602 currentWord
->needsEncoding
= false;
606 // Include leading spaces as part of the word.
607 while (source
+ wordEnd
< bufEnd
&& isspace (source
[wordEnd
]))
610 if (source
+ wordEnd
< bufEnd
&&
611 strchr (specialChars
, source
[wordEnd
]) != NULL
) {
612 // Got a quote mark or other special character, which is treated as
613 // a word in itself since it shouldn't be encoded, which would hide
614 // it from the mail system.
617 // Find the end of the word. Leave wordEnd pointing just after the
618 // last character in the word.
619 while (source
+ wordEnd
< bufEnd
) {
620 if (isspace(source
[wordEnd
]) ||
621 strchr (specialChars
, source
[wordEnd
]) != NULL
)
623 if (wordEnd
> 51 /* Makes Base64 ISO-2022-JP "word" a multiple of 4 bytes */ &&
624 0xC0 == (0xC0 & (unsigned int) source
[wordEnd
])) {
625 // No English words are that long (46 is the longest),
626 // break up what is likely Asian text (which has no spaces)
627 // at the start of the next non-ASCII UTF-8 character (high
628 // two bits are both ones). Note that two encoded words in
629 // a row get joined together, even if there is a space
630 // between them in the final output text, according to the
631 // standard. Next word will also conveniently get
632 // encoded due to the 0xC0 test.
633 currentWord
->needsEncoding
= true;
639 currentWord
->originalWord
.SetTo (source
, wordEnd
);
640 currentWord
->ConvertWordToCharset (charset
);
641 words
.AddItem(currentWord
);
645 // Combine adjacent words which contain unprintable text so that the
646 // overhead of switching back and forth between regular text and specially
647 // encoded text is reduced. However, the combined word must be shorter
648 // than the maximum of 75 bytes, including character set specification and
649 // all those delimiters (worst case 22 bytes of overhead).
653 for (int32 i
= 0; (currentWord
= (struct word
*) words
.ItemAt (i
)) != NULL
; i
++) {
654 if (!currentWord
->needsEncoding
)
655 continue; // No need to combine unencoded words.
656 for (int32 g
= i
+1; (run
= (struct word
*) words
.ItemAt (g
)) != NULL
; g
++) {
657 if (!run
->needsEncoding
)
658 break; // Don't want to combine encoded and unencoded words.
// 53 = the 75 byte limit minus the 22 bytes of worst-case overhead.
659 if ((currentWord
->convertedWord
.Length() + run
->convertedWord
.Length() <= 53)) {
660 currentWord
->originalWord
.Append (run
->originalWord
);
661 currentWord
->ConvertWordToCharset (charset
);
665 } else // Can't merge this word, result would be too long.
670 // Combine the encoded and unencoded words into one line, doing the
671 // quoted-printable or base64 encoding. Insert an extra space between
672 // words which are both encoded to make word wrapping easier, since there
673 // is normally none, and you're allowed to insert space (the receiver
674 // throws it away if it is between encoded words).
677 bool previousWordNeededEncoding
= false;
// Find the name for the conversion ID in mail_charsets; the placeholder stays
// if the ID is not in the table.
679 const char *charset_dec
= "none-bug";
680 for (int32 i
= 0; mail_charsets
[i
].charset
!= NULL
; i
++) {
681 if (mail_charsets
[i
].flavor
== charset
) {
682 charset_dec
= mail_charsets
[i
].charset
;
// Take the words off the front of the list one at a time.
687 while ((currentWord
= (struct word
*)words
.RemoveItem((int32
)0)) != NULL
) {
688 if ((encoding
!= quoted_printable
&& encoding
!= base64
) ||
689 !currentWord
->needsEncoding
) {
690 rfc2047
.Append (currentWord
->convertedWord
);
692 // This word needs encoding. Try to insert a space between it and
693 // the previous word.
694 if (previousWordNeededEncoding
)
695 rfc2047
<< ' '; // Can insert as many spaces as you want between encoded words.
697 // Previous word is not encoded, spaces are significant. Try
698 // to move a space from the start of this word to be outside of
699 // the encoded text, so that there is a bit of space between
700 // this word and the previous one to enhance word wrapping
702 if (currentWord
->originalWord
.Length() > 1 &&
703 isspace (currentWord
->originalWord
[0])) {
704 rfc2047
<< currentWord
->originalWord
[0];
705 currentWord
->originalWord
.Remove (0 /* offset */, 1 /* length */);
706 currentWord
->ConvertWordToCharset (charset
);
710 char *encoded
= NULL
;
711 ssize_t encoded_len
= 0;
712 int32 convertedLength
= currentWord
->convertedWord
.Length ();
713 const char *convertedBuffer
= currentWord
->convertedWord
.String ();
// Quoted-printable output buffer: three bytes per input byte.
716 case quoted_printable
:
717 encoded
= (char *) malloc (convertedLength
* 3);
718 encoded_len
= encode_qp (encoded
, convertedBuffer
, convertedLength
, true /* headerMode */);
// Base64 output buffer: two bytes per input byte.
721 encoded
= (char *) malloc (convertedLength
* 2);
722 encoded_len
= encode_base64 (encoded
, convertedBuffer
, convertedLength
, true /* headerMode */);
724 default: // Unknown encoding type, shouldn't happen.
// Here encoded aliases the word's own buffer instead of a malloc()ed copy.
725 encoded
= (char *) convertedBuffer
;
726 encoded_len
= convertedLength
;
730 rfc2047
<< "=?" << charset_dec
<< '?' << encoding
<< '?';
731 rfc2047
.Append (encoded
, encoded_len
);
734 if (encoding
== quoted_printable
|| encoding
== base64
)
737 previousWordNeededEncoding
= currentWord
->needsEncoding
;
// Hand the result back in a fresh NUL-terminated allocation.
743 ssize_t finalLength
= rfc2047
.Length ();
744 *bufp
= (char *) (malloc (finalLength
+ 1));
745 memcpy (*bufp
, rfc2047
.String(), finalLength
);
746 (*bufp
)[finalLength
] = 0;
// Folds a long header line into several lines of at most 78 characters where
// possible, terminating each with CRLF. The string is modified in place.
//
// Split points are searched in this order of preference: between a comma and
// the following space, at the last space or tab that fits, and finally at the
// next space or tab even if the line then exceeds the limit. The white space
// at the split point starts the next line, which marks it as a continuation.
753 FoldLineAtWhiteSpaceAndAddCRLF(BString
&string
)
755 int inputLength
= string
.Length();
757 const int maxLineLength
= 78; // Doesn't include CRLF.
764 // If we don't need to wrap the text, just output the remainder, if any.
766 if (lineStartIndex
+ maxLineLength
>= inputLength
) {
767 if (lineStartIndex
< inputLength
) {
768 output
.Insert (string
, lineStartIndex
/* source offset */,
769 inputLength
- lineStartIndex
/* count */,
770 output
.Length() /* insert at */);
771 output
.Append (CRLF
);
776 // Look ahead for a convenient spot to split it, between a comma and
777 // space, which you often see between e-mail addresses like this:
778 // "Joe Who" joe@dot.com, "Someone Else" else@blot.com
// tempIndex is the furthest position a line of maximum length may reach.
780 tempIndex
= lineStartIndex
+ maxLineLength
;
781 if (tempIndex
> inputLength
)
782 tempIndex
= inputLength
;
783 splitIndex
= string
.FindLast (", ", tempIndex
);
784 if (splitIndex
>= lineStartIndex
)
785 splitIndex
++; // Point to the space character.
787 // If none of those exist, try splitting at any white space.
789 if (splitIndex
<= lineStartIndex
)
790 splitIndex
= string
.FindLast (" ", tempIndex
);
791 if (splitIndex
<= lineStartIndex
)
792 splitIndex
= string
.FindLast ("\t", tempIndex
);
794 // If none of those exist, allow for a longer word - split at the next
795 // available white space.
797 if (splitIndex
<= lineStartIndex
)
798 splitIndex
= string
.FindFirst (" ", lineStartIndex
+ 1);
799 if (splitIndex
<= lineStartIndex
)
800 splitIndex
= string
.FindFirst ("\t", lineStartIndex
+ 1);
802 // Give up, the whole rest of the line can't be split, just dump it
805 if (splitIndex
<= lineStartIndex
) {
806 if (lineStartIndex
< inputLength
) {
807 output
.Insert (string
, lineStartIndex
/* source offset */,
808 inputLength
- lineStartIndex
/* count */,
809 output
.Length() /* insert at */);
810 output
.Append (CRLF
);
815 // Do the split. The current line up to but not including the space
816 // gets output, followed by a CRLF. The space remains to become the
817 // start of the next line (and that tells the message reader that it is
818 // a continuation line).
820 output
.Insert (string
, lineStartIndex
/* source offset */,
821 splitIndex
- lineStartIndex
/* count */,
822 output
.Length() /* insert at */);
823 output
.Append (CRLF
);
824 lineStartIndex
= splitIndex
;
// Replace the caller's string with the folded text.
826 string
.SetTo (output
);
// Reads one logical header line from a stdio stream, joining folded
// continuation lines (lines that start with a space or tab).
//
// file   - stream to read from.
// buffer - in/out pointer to a buffer that is grown with realloc() as needed;
//          may point to NULL to have one allocated.
// buflen - in/out allocated size of *buffer.
//
// CRLF line ends are reduced to a single LF. cnt tracks the number of
// characters stored and is set to -1 on a read error.
831 readfoldedline(FILE *file
, char **buffer
, size_t *buflen
)
// Tolerate NULL arguments by starting with an empty buffer description.
833 ssize_t len
= buflen
&& *buflen
? *buflen
: 0;
834 char * buf
= buffer
&& *buffer
? *buffer
: NULL
;
835 ssize_t cnt
= 0; // Number of characters currently in the buffer.
839 // Make sure there is space in the buffer for two more characters (one
840 // for the next character, and one for the end of string NUL byte).
841 if (buf
== NULL
|| cnt
+ 2 >= len
) {
// Grow in steps of 64 bytes.
842 char *temp
= (char *)realloc(buf
, len
+ 64);
844 // Out of memory, however existing buffer remains allocated.
852 // Read the next character, or end of file, or IO error.
853 if ((c
= fgetc(file
)) == EOF
) {
857 cnt
= -1; // Error codes must be negative.
859 // Really is end of file. Also make it end of line if there is
860 // some text already read in. If the first thing read was EOF,
861 // just return an empty string.
864 if (buf
[cnt
-2] == '\r') {
876 // Convert CRLF end of line to just a LF. Do it before folding, in
877 // case we don't need to fold.
878 if (cnt
>= 2 && buf
[cnt
-2] == '\r') {
882 // If the current line is empty then return it (so that empty lines
883 // don't disappear if the next line starts with a space).
886 // Fold if first character on the next line is whitespace.
887 c
= fgetc(file
); // Note it's OK to read EOF and ungetc it too.
888 if (c
== ' ' || c
== '\t')
889 buf
[cnt
-1] = c
; // Replace \n with the white space character.
891 // Not folding, we finished reading a line; break out of the loop
898 if (buf
!= NULL
&& cnt
>= 0)
// Reads one logical header line from a BPositionIO, joining folded
// continuation lines (lines that start with a space or tab).
//
// in     - data source; read one byte at a time.
// buffer - in/out pointer to a buffer that is grown with realloc() as needed;
//          may point to NULL to have one allocated.
// buflen - in/out allocated size of *buffer.
//
// CRLF line ends are reduced to a single LF. On a read error cnt takes the
// value returned by Read(). The one byte of look-ahead used to detect folding
// is undone with a relative Seek() when the next line is not a continuation.
914 readfoldedline(BPositionIO
&in
, char **buffer
, size_t *buflen
)
// Tolerate NULL arguments by starting with an empty buffer description.
916 ssize_t len
= buflen
&& *buflen
? *buflen
: 0;
917 char * buf
= buffer
&& *buffer
? *buffer
: NULL
;
918 ssize_t cnt
= 0; // Number of characters currently in the buffer.
923 // Make sure there is space in the buffer for two more characters (one
924 // for the next character, and one for the end of string NUL byte).
925 if (buf
== NULL
|| cnt
+ 2 >= len
) {
// Grow in steps of 64 bytes.
926 char *temp
= (char *)realloc(buf
, len
+ 64);
928 // Out of memory, however existing buffer remains allocated.
936 errorCode
= in
.Read (&c
,1); // A really slow way of reading - unbuffered.
// Anything other than exactly one byte read is an error or end of data.
937 if (errorCode
!= 1) {
939 cnt
= errorCode
; // IO error encountered, just return the code.
941 // Really is end of file. Also make it end of line if there is
942 // some text already read in. If the first thing read was EOF,
943 // just return an empty string.
946 if (buf
[cnt
-2] == '\r') {
958 // Convert CRLF end of line to just a LF. Do it before folding, in
959 // case we don't need to fold.
960 if (cnt
>= 2 && buf
[cnt
-2] == '\r') {
964 // If the current line is empty then return it (so that empty lines
965 // don't disappear if the next line starts with a space).
968 // if first character on the next line is whitespace, fold lines
969 errorCode
= in
.Read(&c
,1);
970 if (errorCode
== 1) {
971 if (c
== ' ' || c
== '\t')
972 buf
[cnt
-1] = c
; // Replace \n with the white space character.
974 // Not folding, we finished reading a whole line.
975 in
.Seek(-1,SEEK_CUR
); // Undo the look-ahead character read.
978 } else if (errorCode
< 0) {
981 } else // No next line; at the end of the file. Return the line.
986 if (buf
!= NULL
&& cnt
>= 0)
// Extracts the next logical header line from an in-memory, NUL-terminated
// header block, joining folded continuation lines (lines that start with a
// space or tab).
//
// header - in/out read position inside the header text; advanced past the
//          characters consumed.
// buffer - in/out pointer to a buffer that is grown with realloc() as needed;
//          may point to NULL to have one allocated.
// buflen - in/out allocated size of *buffer.
//
// The NUL byte of the header text plays the role of end of file. CRLF line
// ends are reduced to a single LF.
1002 nextfoldedline(const char** header
, char **buffer
, size_t *buflen
)
// Tolerate NULL arguments by starting with an empty buffer description.
1004 ssize_t len
= buflen
&& *buflen
? *buflen
: 0;
1005 char * buf
= buffer
&& *buffer
? *buffer
: NULL
;
1006 ssize_t cnt
= 0; // Number of characters currently in the buffer.
1011 // Make sure there is space in the buffer for two more characters (one
1012 // for the next character, and one for the end of string NUL byte).
1013 if (buf
== NULL
|| cnt
+ 2 >= len
)
// Grow in steps of 64 bytes.
1015 char *temp
= (char *)realloc(buf
, len
+ 64);
1017 // Out of memory, however existing buffer remains allocated.
1025 // Read the next character, or end of file.
// The read position is advanced even when the terminating NUL is fetched.
1026 if ((c
= *(*header
)++) == 0) {
1027 // End of file. Also make it end of line if there is some text
1028 // already read in. If the first thing read was EOF, just return
1032 if (buf
[cnt
-2] == '\r') {
1043 // Convert CRLF end of line to just a LF. Do it before folding, in
1044 // case we don't need to fold.
1045 if (cnt
>= 2 && buf
[cnt
-2] == '\r') {
1049 // If the current line is empty then return it (so that empty lines
1050 // don't disappear if the next line starts with a space).
1053 // if first character on the next line is whitespace, fold lines
1055 if (c
== ' ' || c
== '\t')
1056 buf
[cnt
-1] = c
; // Replace \n with the white space character.
1058 // Not folding, we finished reading a line; break out of the loop
1059 (*header
)--; // Undo read of the non-whitespace.
1066 if (buf
!= NULL
&& cnt
>= 0)
// Removes leading and trailing white space (as classified by isspace()) from
// the string, editing its buffer in place.
1082 trim_white_space(BString
&string
)
1085 int32 length
= string
.Length();
// Lock one byte more than the text so the terminator can be rewritten.
1086 char *buffer
= string
.LockBuffer(length
+ 1);
// Trailing white space first: shrink length, then re-terminate.
1088 while (length
> 0 && isspace(buffer
[length
- 1]))
1090 buffer
[length
] = '\0';
// Empty loop body: it only advances i past the leading white space.
1092 for (i
= 0; buffer
[i
] && isspace(buffer
[i
]); i
++) {}
// Shift the remaining text, including its terminating NUL, to the front.
1095 memmove(buffer
,buffer
+ i
,length
+ 1);
1097 string
.UnlockBuffer(length
);
// Summary: four address patterns are tried in turn and the longest candidate
// name wins; the result is then cleaned of an embedded <address> part and of
// surrounding white space.
1101 /*! Tries to return a human-readable name from the specified
1102 header parameter (should be from "To:" or "From:").
1103 Tries to return the name rather than the eMail address.
1106 extract_address_name(BString
&header
)
1109 const char *start
= header
.String();
1110 const char *stop
= start
+ strlen (start
);
1112 // Find a string S in the header (email foo) that matches:
1113 // Old style name in brackets: foo@bar.com (S)
1114 // New style quotes: "S" <foo@bar.com>
1115 // New style no quotes if nothing else found: S <foo@bar.com>
1116 // If nothing else found then use the whole thing: S
// One iteration per pattern listed above.
1118 for (int i
= 0; i
<= 3; i
++) {
1119 // Set p1 to the first letter in the name and p2 to just past the last
1120 // letter in the name. p2 stays NULL if a name wasn't found in this
1122 const char *p1
= NULL
, *p2
= NULL
;
1125 case 0: // foo@bar.com (S)
1126 if ((p1
= strchr(start
,'(')) != NULL
) {
1127 p1
++; // Advance to first letter in the name.
1128 size_t nest
= 1; // Handle nested brackets.
1129 for (p2
= p1
; p2
< stop
; ++p2
)
1133 else if (*p2
== '(')
1139 p2
= NULL
; // False alarm, no terminating bracket.
1142 case 1: // "S" <foo@bar.com>
1143 if ((p1
= strchr(start
, '\"')) != NULL
)
1144 p2
= strchr(++p1
, '\"');
1146 case 2: // S <foo@bar.com>
// Only considered when the earlier patterns produced no name.
1148 if (name
.Length() == 0)
1149 p2
= strchr(start
, '<');
1153 if (name
.Length() == 0)
1158 // Remove leading and trailing space-like characters and save the
1159 // result if it is longer than any other likely names found.
1161 while (p1
< p2
&& (isspace (*p1
)))
1164 while (p1
< p2
&& (isspace (p2
[-1])))
1167 int newLength
= p2
- p1
;
1168 if (name
.Length() < newLength
)
1169 name
.SetTo(p1
, newLength
);
// Post-processing of the chosen name: deal with angle brackets inside it.
1173 int32 lessIndex
= name
.FindFirst('<');
1174 int32 greaterIndex
= name
.FindLast('>');
1176 if (lessIndex
== 0) {
1177 // Have an address of the form <address> and nothing else, so remove
1178 // the greater and less than signs, if any.
// The '>' is removed first so that lessIndex (0) is still valid afterwards.
1179 if (greaterIndex
> 0)
1180 name
.Remove(greaterIndex
, 1);
1181 name
.Remove(lessIndex
, 1);
1182 } else if (lessIndex
> 0 && lessIndex
< greaterIndex
) {
1183 // Yahoo stupidly inserts the e-mail address into the name string, so
1184 // this bit of code fixes: "Joe <joe@yahoo.com>" <joe@yahoo.com>
1185 name
.Remove(lessIndex
, greaterIndex
- lessIndex
+ 1);
1188 trim_white_space(name
);
// Summary: prefixes are matched with a regular expression that is compiled
// once (guarded by gRebuf and gLocker) and applied repeatedly until nothing
// matches any more.
1193 /*! Given a subject in a BString, remove the extraneous RE: re: and other stuff
1194 to get down to the core subject string, which should be identical for all
1195 messages posted about a topic. The input string is modified in place to
1196 become the output core subject string.
1199 SubjectToThread (BString
&string
)
1201 // a regex that matches a non-ASCII UTF8 character:
1203 "[\302-\337][\200-\277]" \
1204 "|\340[\302-\337][\200-\277]" \
1205 "|[\341-\357][\200-\277][\200-\277]" \
1206 "|\360[\220-\277][\200-\277][\200-\277]" \
1207 "|[\361-\367][\200-\277][\200-\277][\200-\277]" \
1208 "|\370[\210-\277][\200-\277][\200-\277][\200-\277]" \
1209 "|[\371-\373][\200-\277][\200-\277][\200-\277][\200-\277]" \
1210 "|\374[\204-\277][\200-\277][\200-\277][\200-\277][\200-\277]" \
1211 "|\375[\200-\277][\200-\277][\200-\277][\200-\277][\200-\277]"
1215 "|^(\\[[^]]*\\])(\\<| +| *(\\<(\\w|" U8C "){2,3} *(\\[[^\\]]*\\])? *:)+ *)" \
1216 "|^( +| *(\\<(\\w|" U8C "){2,3} *(\\[[^\\]]*\\])? *:)+ *)" \
1219 if (gRebuf
== NULL
&& atomic_add(&gLocker
, 1) == 0) {
1220 // the idea is to compile the regexp once to speed up testing
// Translation table: identity for every byte, then 'a'..'z' mapped to upper
// case, so that matching ignores ASCII case.
1222 for (int i
=0; i
<256; ++i
) gTranslation
[i
]=i
;
1223 for (int i
='a'; i
<='z'; ++i
) gTranslation
[i
]=toupper(i
);
1225 gRe
.translate
= gTranslation
;
1226 gRe
.regs_allocated
= REGS_FIXED
;
1227 re_syntax_options
= RE_SYNTAX_POSIX_EXTENDED
;
1229 const char *pattern
= PATTERN
;
1230 // count subexpressions in PATTERN
1231 for (unsigned int i
=0; pattern
[i
] != 0; ++i
)
1233 if (pattern
[i
] == '\\')
1235 else if (pattern
[i
] == '(')
1239 const char *err
= re_compile_pattern(pattern
,strlen(pattern
),&gRe
);
1243 fprintf(stderr
, "Failed to compile the regex: %s\n", err
);
// Callers that did not win the compile race wait here, a bounded number of
// times, for the compiled pattern to appear.
1246 while (gRebuf
== NULL
&& tries
-- > 0)
1251 struct re_registers regs
;
1252 // can't be static if this function is to be thread-safe
// Register arrays are sized by the subexpression count gathered above.
1254 regs
.num_regs
= gNsub
;
1255 regs
.start
= (regoff_t
*)malloc(gNsub
*sizeof(regoff_t
));
1256 regs
.end
= (regoff_t
*)malloc(gNsub
*sizeof(regoff_t
));
// Search the whole string again after every removal until no match remains.
1258 for (int start
= 0; (start
= re_search(gRebuf
, string
.String(),
1259 string
.Length(), 0, string
.Length(), &regs
)) >= 0;) {
1261 // we found something
1264 // don't delete [bemaildaemon]...
1265 if (start
== regs
.start
[1])
1266 start
= regs
.start
[2];
1268 string
.Remove(start
,regs
.end
[0]-start
);
1270 string
.Insert(' ',1,start
);
1272 // TODO: for some subjects this results in an endless loop, check
1274 if (regs
.end
[0] - start
<= 1)
1282 // Finally remove leading and trailing space. Some software, like
1283 // tm-edit 1.8, appends a space to the subject, which would break
1284 // threading if we left it in.
1285 trim_white_space(string
);
// ParseDateWithTimeZone(): turns a textual mail date into a time value,
// including dates that carry a numeric +HHMM or -HHMM zone.
//
// DateString: NUL-terminated date text. It is copied into an 80 byte local
//             buffer first, so longer input is cut off.
// Returns -1 when nothing is left after trimming or when parsedate()
// reports failure.
// NOTE(review): the final return statement and some declarations are not
// shown in this view (the embedded line numbers skip values); presumably
// dateAsTime is what gets returned - confirm.
1289 /*! Converts a date to a time. Handles numeric time zones too, unlike
1290 parsedate(). Returns -1 if it fails.
1293 ParseDateWithTimeZone(const char *DateString
)
1297 char tempDateString
[80];
// Room for a sign, four digits and the terminator.
1298 char tempZoneString
[6];
1299 time_t zoneDeltaTime
;
1303 // See if we can remove the time zone portion. parsedate understands time
1304 // zone 3 letter names, but doesn't understand the numeric +9999 time zone
1305 // format. To do: see if a newer parsedate exists.
// Work on a local copy. strncpy() leaves the buffer unterminated when the
// input does not fit, hence the explicit terminator right after it.
1307 strncpy (tempDateString
, DateString
, sizeof (tempDateString
));
1308 tempDateString
[sizeof (tempDateString
) - 1] = 0;
1310 // Remove trailing spaces.
1311 zonePntr
= tempDateString
+ strlen (tempDateString
) - 1;
1312 while (zonePntr
>= tempDateString
&& isspace (*zonePntr
))
1314 if (zonePntr
< tempDateString
)
1315 return -1; // Empty string.
1317 // Remove the trailing time zone in round brackets, like in
1318 // Fri, 22 Feb 2002 15:22:42 EST (-0500)
1319 // Thu, 25 Apr 1996 11:44:19 -0400 (EDT)
1320 if (tempDateString
[strlen(tempDateString
)-1] == ')')
// The last '(' is taken as the start of the bracketed part.
1322 zonePntr
= strrchr (tempDateString
, '(');
1323 if (zonePntr
!= NULL
)
1325 *zonePntr
-- = 0; // Zap the '(', then remove trailing spaces.
1326 while (zonePntr
>= tempDateString
&& isspace (*zonePntr
))
1328 if (zonePntr
< tempDateString
)
1329 return -1; // Empty string.
1333 // Look for a numeric time zone like Tue, 30 Dec 2003 05:01:40 +0000
// The scan runs backwards from the terminating NUL, so the last sign
// character in the text is tested first. The digit tests are chained with
// && and stop at the first failing one, so the look-ahead never moves past
// the terminator.
1334 for (zoneIndex
= strlen (tempDateString
); zoneIndex
>= 0; zoneIndex
--)
1336 zonePntr
= tempDateString
+ zoneIndex
;
1337 if (zonePntr
[0] == '+' || zonePntr
[0] == '-')
1339 if (zonePntr
[1] >= '0' && zonePntr
[1] <= '9' &&
1340 zonePntr
[2] >= '0' && zonePntr
[2] <= '9' &&
1341 zonePntr
[3] >= '0' && zonePntr
[3] <= '9' &&
1342 zonePntr
[4] >= '0' && zonePntr
[4] <= '9')
1348 // Remove the zone from the date string and any following time zone
1349 // letter codes. Also put in GMT so that the date gets parsed as GMT.
// The five zone characters are saved first; writing "GMT" plus terminator
// over them needs four bytes, which fit into the five just matched.
1350 memcpy (tempZoneString
, zonePntr
, 5);
1351 tempZoneString
[5] = 0;
1352 strcpy (zonePntr
, "GMT");
1354 else // No numeric time zone found.
1355 strcpy (tempZoneString
, "+0000");
1357 time (¤tTime
);
1358 dateAsTime
= parsedate (tempDateString
, currentTime
);
1359 if (dateAsTime
== (time_t) -1)
1360 return -1; // Failure.
// Turn the saved zone text into seconds: minutes first, then the string is
// cut after the hour digits so that atol() only sees the hours.
1362 zoneDeltaTime
= 60 * atol (tempZoneString
+ 3); // Get the last two digits - minutes.
1363 tempZoneString
[3] = 0;
1364 zoneDeltaTime
+= atol (tempZoneString
+ 1) * 60 * 60; // Get the first two digits - hours.
// For a '+' zone the offset is negated before it is added, i.e. it ends up
// subtracted from the parsed time; for a '-' zone it is added as is.
1365 if (tempZoneString
[0] == '+')
1366 zoneDeltaTime
= 0 - zoneDeltaTime
;
1367 dateAsTime
+= zoneDeltaTime
;
// parse_header(): reads header lines from input and stores each of them in
// headers as a string field named after the header.
//
// headers: receives one AddString() call per parsed line.
// input:   stream that is handed to readfoldedline().
// NOTE(review): the return type and value, the declaration of length and
// the release of buffer are not shown in this view (the embedded line
// numbers skip values).
1373 /*! Parses a mail header and fills the headers BMessage
1376 parse_header(BMessage
&headers
, BPositionIO
&input
)
// buffer and bufferSize are passed by address to readfoldedline() and to
// rfc2047_to_utf8() on every iteration.
1378 char *buffer
= NULL
;
1379 size_t bufferSize
= 0;
// The loop ends as soon as readfoldedline() returns fewer than 2 bytes.
1382 while ((length
= readfoldedline(input
, &buffer
, &bufferSize
)) >= 2) {
1384 // Don't include the \n at the end of the buffer.
1386 // convert to UTF-8 and null-terminate the buffer
1387 length
= rfc2047_to_utf8(&buffer
, &bufferSize
, length
);
1388 buffer
[length
] = '\0';
// Only the first ':' counts; the text before it becomes the field name.
1390 const char *delimiter
= strstr(buffer
, ":");
// NOTE(review): what happens to a line without ':' is not shown in this
// view; presumably the line is skipped - confirm.
1391 if (delimiter
== NULL
)
1394 BString
header(buffer
, delimiter
- buffer
);
1395 header
.CapitalizeEachWord();
1396 // unified case for later fetch
1398 delimiter
++; // Skip the colon.
1399 // Skip over leading white space and tabs.
1400 // TODO: (comments in brackets).
1401 while (isspace(*delimiter
))
1404 // TODO: implement joining of multiple header tags (i.e. multiple "Cc:"s)
// From here on delimiter points at the value text, which is stored under
// the capitalized field name.
1405 headers
.AddString(header
.String(), delimiter
);
// extract_from_header(): looks up field in the raw header text and leaves
// its decoded, trimmed value in target.
//
// header: complete header text; lines end in '\n', optionally preceded by
//         '\r'.
// field:  field name without the ':'; it is located with IFindFirst().
// NOTE(review): the third parameter, the declarations of target and temp,
// the loop constructs and the return statements are not shown in this view
// (the embedded line numbers skip values).
1414 extract_from_header(const BString
& header
, const BString
& field
,
1417 int32 headerLength
= header
.Length();
1418 int32 fieldEndPos
= 0;
// The search starts at fieldEndPos, which is moved behind each candidate
// below, so a repeated search continues after the previous hit.
1420 int32 pos
= header
.IFindFirst(field
, fieldEndPos
);
1423 fieldEndPos
= pos
+ field
.Length();
// A candidate is tested for sitting at offset 0 or right after a '\n' ...
1425 if (pos
!= 0 && header
.ByteAt(pos
- 1) != '\n')
// ... and for being followed directly by ':'. The statements guarded by
// these two tests are not shown in this view.
1427 if (header
.ByteAt(fieldEndPos
) == ':')
1432 int32 crPos
= fieldEndPos
;
1434 fieldEndPos
= crPos
;
// Find the end of the current line.
1435 crPos
= header
.FindFirst('\n', crPos
);
// NOTE(review): the condition for this fallback is not shown; presumably
// it applies when no further '\n' was found - confirm.
1437 crPos
= headerLength
;
// Copy this line's piece into temp and drop the last byte of temp when the
// byte before crPos is a '\r'.
1439 header
.CopyInto(temp
, fieldEndPos
, crPos
- fieldEndPos
);
1440 if (header
.ByteAt(crPos
- 1) == '\r') {
1441 temp
.Truncate(temp
.Length() - 1);
// End of header text reached, or look at the first byte of the next line:
// space and tab are treated differently from everything else. Presumably
// they mark a folded continuation line; the branch bodies are not shown.
1446 if (crPos
>= headerLength
)
1448 char nextByte
= header
.ByteAt(crPos
);
1449 if (nextByte
!= ' ' && nextByte
!= '\t')
// Decode in place: lock target's buffer, let rfc2047_to_utf8() work on it
// and unlock with the length it returned.
// NOTE(review): rfc2047_to_utf8() receives &buffer and &bufferSize; if it
// is able to replace the buffer, that would conflict with LockBuffer() -
// confirm against its implementation.
1454 size_t bufferSize
= target
.Length();
1455 char* buffer
= target
.LockBuffer(bufferSize
);
1456 size_t length
= rfc2047_to_utf8(&buffer
, &bufferSize
, bufferSize
);
1457 target
.UnlockBuffer(length
);
1459 trim_white_space(target
);
// extract_address(): reduces address, in place, to the bare e-mail address.
//
// address: in/out. A double-quoted part is removed first. If a '<' is found
//          the string is cut at the first '>' and everything up to and
//          including the '<' is dropped. The visible code further removes a
//          part in round brackets and trims white space.
// NOTE(review): string is taken from address.String() once, before any
// Remove() call, and is still indexed by the bracket scan further down -
// confirm that the pointer stays valid after address has been modified.
// NOTE(review): the declaration of first, the loop bodies and any early
// return are not shown in this view (the embedded line numbers skip values).
1466 extract_address(BString
&address
)
1468 const char *string
= address
.String();
1471 // first, remove all quoted text
// Only the first '"' is looked up here; the scan stops at the next '"' or
// at the end of the text, and nothing is removed without a closing quote.
1473 if ((first
= address
.FindFirst('"')) >= 0) {
1474 int32 last
= first
+ 1;
1475 while (string
[last
] && string
[last
] != '"')
1478 if (string
[last
] == '"')
1479 address
.Remove(first
, last
+ 1 - first
);
1482 // try to extract the address now
1484 if ((first
= address
.FindFirst('<')) >= 0) {
1485 // the world likes us and we can just get the address the easy way...
1486 int32 last
= address
.FindFirst('>');
// NOTE(review): no check of last is visible between the lookup above and
// the Truncate() below; confirm that a missing '>' is handled.
1488 address
.Truncate(last
);
1489 address
.Remove(0, first
+ 1);
1495 // then, see if there is anything in parenthesis to throw away
// Same scheme as for the quotes: first '(' up to the next ')', removed only
// when the closing bracket exists.
1497 if ((first
= address
.FindFirst('(')) >= 0) {
1498 int32 last
= first
+ 1;
1499 while (string
[last
] && string
[last
] != ')')
1502 if (string
[last
] == ')')
1503 address
.Remove(first
, last
+ 1 - first
);
1506 // now, there shouldn't be much else left
1508 trim_white_space(address
);
// get_address_list(): splits a comma separated address list and appends
// every entry to list.
//
// list:        receives one strdup() copy per entry, i.e. plain C strings
//              that have to be released with free() by whoever empties the
//              list.
// string:      the list text; NULL and the empty string are tested for up
//              front.
// cleanupFunc: applied to each trimmed entry before it is stored.
//              NOTE(review): whether NULL is accepted is not visible here.
// NOTE(review): the enclosing loop and the statements behind several of the
// conditions are not shown in this view (the embedded line numbers skip
// values).
1513 get_address_list(BList
&list
, const char *string
,
1514 void (*cleanupFunc
)(BString
&))
1516 if (string
== NULL
|| !string
[0])
// start marks where the current entry begins.
1519 const char *start
= string
;
// A double-quoted section is stepped over as a whole: string continues
// behind the closing quote, so a ',' inside quotes does not end the entry.
1522 if (string
[0] == '"') {
1523 const char *quoteEnd
= ++string
;
1525 while (quoteEnd
[0] && quoteEnd
[0] != '"')
1528 if (!quoteEnd
[0]) // string exceeds line!
1531 string
= quoteEnd
+ 1;
// An entry ends at ',' or at the end of the text.
1534 if (string
[0] == ',' || string
[0] == '\0') {
1535 BString
address(start
, string
- start
);
1536 trim_white_space(address
);
1539 cleanupFunc(address
);
// The list keeps its own heap copy; the BString is only a temporary.
1541 list
.AddItem(strdup(address
.String()));
// CopyMailFolderAttributes(): copies the attributes of the node
// Tracker/DefaultQueryTemplates/text_x-email, located below the directory
// that find_directory() reports for B_USER_SETTINGS_DIRECTORY, onto the
// node at targetPath.
//
// targetPath: path of the node that receives the attributes.
// Returns the result of BPrivate::CopyAttributes().
// NOTE(review): the declaration of path and the handling of status are not
// shown in this view (the embedded line numbers skip values).
1555 CopyMailFolderAttributes(const char* targetPath
)
1558 status_t status
= find_directory(B_USER_SETTINGS_DIRECTORY
, &path
);
// Build the path of the template node step by step.
1562 path
.Append("Tracker");
1563 path
.Append("DefaultQueryTemplates");
1564 path
.Append("text_x-email");
1566 BNode
source(path
.Path());
1567 BNode
target(targetPath
);
1568 return BPrivate::CopyAttributes(source
, target
);