2 * Copyright 2011, Haiku, Inc. All rights reserved.
3 * Copyright 2001-2003 Dr. Zoidberg Enterprises. All rights reserved.
19 #include <parsedate.h>
23 #include <mail_encoding.h>
25 #include <CharacterSet.h>
26 #include <CharacterSetRoster.h>
29 using namespace BPrivate
;
34 struct CharsetConversionEntry
{
// Lookup table pairing a charset name string with a conversion ID. Elsewhere
// in this file the string is read back as .charset and the ID as .flavor.
// Several names may share one conversion ID (aliases such as "shift_jis" and
// "shift-jis"); the trailing comment on each entry records whether the name is
// the MIME standard one. The table ends with a {NULL, (uint32) -1} sentinel,
// so scans must stop when the name pointer is NULL.
39 extern const CharsetConversionEntry mail_charsets
[] = {
40 // In order of authority, so when searching for the name for a particular
41 // numbered conversion, start at the beginning of the array.
42 {"iso-8859-1", B_ISO1_CONVERSION
}, // MIME STANDARD
43 {"iso-8859-2", B_ISO2_CONVERSION
}, // MIME STANDARD
44 {"iso-8859-3", B_ISO3_CONVERSION
}, // MIME STANDARD
45 {"iso-8859-4", B_ISO4_CONVERSION
}, // MIME STANDARD
46 {"iso-8859-5", B_ISO5_CONVERSION
}, // MIME STANDARD
47 {"iso-8859-6", B_ISO6_CONVERSION
}, // MIME STANDARD
48 {"iso-8859-7", B_ISO7_CONVERSION
}, // MIME STANDARD
49 {"iso-8859-8", B_ISO8_CONVERSION
}, // MIME STANDARD
50 {"iso-8859-9", B_ISO9_CONVERSION
}, // MIME STANDARD
51 {"iso-8859-10", B_ISO10_CONVERSION
}, // MIME STANDARD
52 {"iso-8859-13", B_ISO13_CONVERSION
}, // MIME STANDARD
53 {"iso-8859-14", B_ISO14_CONVERSION
}, // MIME STANDARD
54 {"iso-8859-15", B_ISO15_CONVERSION
}, // MIME STANDARD
56 {"shift_jis", B_SJIS_CONVERSION
}, // MIME STANDARD
57 {"shift-jis", B_SJIS_CONVERSION
},
58 {"iso-2022-jp", B_JIS_CONVERSION
}, // MIME STANDARD
59 {"euc-jp", B_EUC_CONVERSION
}, // MIME STANDARD
61 {"euc-kr", B_EUC_KR_CONVERSION
}, // Shift encoding 7 bit and KSC-5601 if bit 8 is on. // MIME STANDARD
62 {"ksc5601", B_EUC_KR_CONVERSION
}, // Not sure if 7 or 8 bit. // COMPATIBLE?
63 {"ks_c_5601-1987", B_EUC_KR_CONVERSION
}, // Not sure if 7 or 8 bit. // COMPATIBLE with stupid MS software
65 {"koi8-r", B_KOI8R_CONVERSION
}, // MIME STANDARD
66 {"windows-1251",B_MS_WINDOWS_1251_CONVERSION
}, // MIME STANDARD
67 {"windows-1252",B_MS_WINDOWS_CONVERSION
}, // MIME STANDARD
69 {"dos-437", B_MS_DOS_CONVERSION
}, // WRONG NAME : MIME STANDARD NAME = NONE ( IBM437? )
70 {"dos-866", B_MS_DOS_866_CONVERSION
}, // WRONG NAME : MIME STANDARD NAME = NONE ( IBM866? )
71 {"x-mac-roman", B_MAC_ROMAN_CONVERSION
}, // WRONG NAME : MIME STANDARD NAME = NONE ( macintosh? + x-mac-roman? )
// NOTE(review): the big5 and gb* entries below use bare numeric IDs (24, 25)
// instead of named constants, and all three gb* names share ID 25 --
// presumably no named constant was available here; confirm and replace the
// literals if one exists.
73 {"big5", 24}, // MIME STANDARD
75 {"gb18030", 25}, // WRONG NAME : MIME STANDARD NAME = NONE ( GB18030? )
76 {"gb2312", 25}, // COMPATIBLE
77 {"gbk", 25}, // COMPATIBLE
79 /* {"utf-16", B_UNICODE_CONVERSION}, Might not work due to NULs in text, needs testing. */
80 {"us-ascii", B_MAIL_US_ASCII_CONVERSION
}, // MIME STANDARD
81 {"utf-8", B_MAIL_UTF8_CONVERSION
/* Special code for no conversion */}, // MIME STANDARD
83 {NULL
, (uint32
) -1} /* End of list marker, NULL string pointer is the key. */
// File-scope state for the regular expression used by SubjectToThread().

// Counter passed to atomic_add(); the caller that gets 0 back while gRebuf is
// still NULL performs the one-time regex setup.
87 static int32 gLocker
= 0;
// Number of register slots used for each search: SubjectToThread() stores it
// in regs.num_regs and sizes the regs.start / regs.end arrays with it.
// NOTE(review): presumably incremented while counting the subexpressions in
// PATTERN; the increment itself is not visible here -- confirm.
88 static size_t gNsub
= 1;
// Pattern buffer that SubjectToThread() configures (translate,
// regs_allocated) and hands to re_compile_pattern().
89 static re_pattern_buffer gRe
;
// Buffer passed to re_search(); NULL until the regex has been set up.
// NOTE(review): the assignment is outside the visible lines -- presumably it
// is pointed at gRe after a successful compile; confirm.
90 static re_pattern_buffer
*gRebuf
= NULL
;
// Translation table installed as gRe.translate: identity for every byte
// except 'a'..'z', which are mapped through toupper().
91 static unsigned char gTranslation
[256];
95 handle_non_rfc2047_encoding(char **buffer
, size_t *bufferLength
,
98 char *string
= *buffer
;
99 int32 length
= *sourceLength
;
102 // check for 8-bit characters
103 for (i
= 0;i
< length
;i
++)
104 if (string
[i
] & 0x80)
109 // check for groups of 8-bit characters - this code is not very smart;
110 // it just can detect some sort of single-byte encoded stuff, the rest
111 // is regarded as UTF-8
113 int32 singletons
= 0,doubles
= 0;
115 for (i
= 0;i
< length
;i
++)
117 if (string
[i
] & 0x80)
119 if ((string
[i
+ 1] & 0x80) == 0)
126 if (singletons
!= 0) // can't be valid UTF-8 anymore, so we assume ISO-Latin-1
130 int32 destLength
= length
* 4 + 1;
131 int32 destBufferLength
= destLength
;
132 char *dest
= (char*)malloc(destLength
);
136 if (convert_to_utf8(B_ISO1_CONVERSION
, string
, &length
,dest
,
137 &destLength
, &state
) == B_OK
) {
139 *bufferLength
= destBufferLength
;
140 *sourceLength
= destLength
;
147 // we assume a valid UTF-8 string here, but yes, we don't check it
156 write_read_attr(BNode
& node
, read_flags flag
)
158 if (node
.WriteAttr(B_MAIL_ATTR_READ
, B_INT32_TYPE
, 0, &flag
, sizeof(int32
))
162 // manage the status string only if it currently has a "read" status
163 BString currentStatus
;
164 if (node
.ReadAttrString(B_MAIL_ATTR_STATUS
, &currentStatus
) == B_OK
) {
165 if (currentStatus
.ICompare("New") != 0
166 && currentStatus
.ICompare("Read") != 0
167 && currentStatus
.ICompare("Seen") != 0)
171 const char* statusString
= flag
== B_READ
? "Read"
172 : flag
== B_SEEN
? "Seen" : "New";
173 if (node
.WriteAttr(B_MAIL_ATTR_STATUS
, B_STRING_TYPE
, 0, statusString
,
174 strlen(statusString
)) < 0)
182 read_read_attr(BNode
& node
, read_flags
& flag
)
184 if (node
.ReadAttr(B_MAIL_ATTR_READ
, B_INT32_TYPE
, 0, &flag
, sizeof(int32
))
188 BString statusString
;
189 if (node
.ReadAttrString(B_MAIL_ATTR_STATUS
, &statusString
) == B_OK
) {
190 if (statusString
.ICompare("New"))
202 // The next couple of functions are our wrapper around convert_to_utf8 and
203 // convert_from_utf8 so that they can also convert from UTF-8 to UTF-8 by
204 // specifying the B_MAIL_UTF8_CONVERSION constant as the conversion operation.
205 // It also lets us add new conversions, like B_MAIL_US_ASCII_CONVERSION.
209 mail_convert_to_utf8(uint32 srcEncoding
, const char *src
, int32
*srcLen
,
210 char *dst
, int32
*dstLen
, int32
*state
, char substitute
)
213 char *originalDst
= dst
;
214 status_t returnCode
= -1;
216 if (srcEncoding
== B_MAIL_UTF8_CONVERSION
) {
217 copyAmount
= *srcLen
;
218 if (*dstLen
< copyAmount
)
219 copyAmount
= *dstLen
;
220 memcpy (dst
, src
, copyAmount
);
221 *srcLen
= copyAmount
;
222 *dstLen
= copyAmount
;
224 } else if (srcEncoding
== B_MAIL_US_ASCII_CONVERSION
) {
226 unsigned char letter
;
227 copyAmount
= *srcLen
;
228 if (*dstLen
< copyAmount
)
229 copyAmount
= *dstLen
;
230 for (i
= 0; i
< copyAmount
; i
++) {
233 // Invalid, could also use substitute, but better to strip high bit.
234 *dst
++ = letter
- 0x80U
;
235 else if (letter
== 0x80U
)
236 // Can't convert to 0x00 since that's NUL, which would cause problems.
241 *srcLen
= copyAmount
;
242 *dstLen
= copyAmount
;
245 returnCode
= convert_to_utf8 (srcEncoding
, src
, srcLen
,
246 dst
, dstLen
, state
, substitute
);
248 if (returnCode
== B_OK
) {
249 // Replace spurious NUL bytes, which should normally not be in the
250 // output of the decoding (not normal UTF-8 characters, and no NULs are
251 // in our usual input strings). They happen for some odd ISO-2022-JP
252 // byte pair combinations which are improperly handled by the BeOS
253 // routines. Like "\e$ByD\e(B" where \e is the ESC character $1B, the
254 // first ESC $ B switches to a Japanese character set, then the next
255 // two bytes "yD" specify a character, then ESC ( B switches back to
256 // the ASCII character set. The UTF-8 conversion yields a NUL byte.
258 for (i
= 0; i
< *dstLen
; i
++)
259 if (originalDst
[i
] == 0)
260 originalDst
[i
] = substitute
;
267 mail_convert_from_utf8(uint32 dstEncoding
, const char *src
, int32
*srcLen
,
268 char *dst
, int32
*dstLen
, int32
*state
, char substitute
)
272 int32 originalDstLen
= *dstLen
;
276 if (dstEncoding
== B_MAIL_UTF8_CONVERSION
) {
277 copyAmount
= *srcLen
;
278 if (*dstLen
< copyAmount
)
279 copyAmount
= *dstLen
;
280 memcpy (dst
, src
, copyAmount
);
281 *srcLen
= copyAmount
;
282 *dstLen
= copyAmount
;
286 if (dstEncoding
== B_MAIL_US_ASCII_CONVERSION
) {
287 int32 characterLength
;
288 int32 dstRemaining
= *dstLen
;
289 unsigned char letter
;
290 int32 srcRemaining
= *srcLen
;
292 // state contains the number of source bytes to skip, left over from a
293 // partial UTF-8 character split over the end of the buffer from last
295 if (srcRemaining
<= *state
) {
296 *state
-= srcRemaining
;
300 srcRemaining
-= *state
;
305 if (srcRemaining
<= 0 || dstRemaining
<= 0)
309 characterLength
= 1; // Regular ASCII equivalent code.
310 else if (letter
< 0xC0)
311 characterLength
= 1; // Invalid in-between data byte 10xxxxxx.
312 else if (letter
< 0xE0)
314 else if (letter
< 0xF0)
316 else if (letter
< 0xF8)
318 else if (letter
< 0xFC)
320 else if (letter
< 0xFE)
323 characterLength
= 1; // 0xFE and 0xFF are invalid in UTF-8.
329 if (srcRemaining
< characterLength
) {
330 // Character split past the end of the buffer.
331 *state
= characterLength
- srcRemaining
;
334 src
+= characterLength
;
335 srcRemaining
-= characterLength
;
338 // Update with the amounts used.
339 *srcLen
= *srcLen
- srcRemaining
;
340 *dstLen
= *dstLen
- dstRemaining
;
344 errorCode
= convert_from_utf8(dstEncoding
, src
, srcLen
, dst
, dstLen
, state
,
346 if (errorCode
!= B_OK
)
349 if (dstEncoding
!= B_JIS_CONVERSION
)
352 // B_JIS_CONVERSION (ISO-2022-JP) works by shifting between different
353 // character subsets. For E-mail headers (and other uses), it needs to be
354 // switched back to ASCII at the end (otherwise the last character gets
355 // lost or other weird things happen in the headers). Note that we can't
356 // just append the escape code since the convert_from_utf8 "state" will be
357 // wrong. So we append an ASCII letter and throw it away, leaving just the
358 // escape code. Well, it actually switches to the Roman character set, not
359 // ASCII, but that should be OK.
361 tempDstLen
= originalDstLen
- *dstLen
;
362 if (tempDstLen
< 3) // Not enough space remaining in the output.
363 return B_OK
; // Sort of an error, but we did convert the rest OK.
365 errorCode
= convert_from_utf8(dstEncoding
, "a", &tempSrcLen
,
366 dst
+ *dstLen
, &tempDstLen
, state
, substitute
);
367 if (errorCode
!= B_OK
)
369 *dstLen
+= tempDstLen
- 1 /* don't include the ASCII letter */;
375 rfc2047_to_utf8(char **bufp
, size_t *bufLen
, size_t strLen
)
378 char *charset
, *encoding
, *end
;
381 if (bufp
== NULL
|| *bufp
== NULL
)
384 char *string
= *bufp
;
386 //---------Handle *&&^%*&^ non-RFC compliant, 8bit mail
387 if (handle_non_rfc2047_encoding(bufp
,bufLen
,&strLen
))
390 // set up string length
392 strLen
= strlen(*bufp
);
393 char lastChar
= (*bufp
)[strLen
];
394 (*bufp
)[strLen
] = '\0';
396 //---------Whew! Now for RFC compliant mail
397 bool encodedWordFoundPreviously
= false;
398 for (head
= tail
= string
;
399 ((charset
= strstr(tail
, "=?")) != NULL
)
400 && (((encoding
= strchr(charset
+ 2, '?')) != NULL
)
401 && encoding
[1] && (encoding
[2] == '?') && encoding
[3])
402 && (end
= strstr(encoding
+ 3, "?=")) != NULL
;
403 // found "=?...charset...?e?...text...?= (e == encoding)
404 // ^charset ^encoding ^end
407 // Copy non-encoded text (from tail up to charset) to the output.
408 // Ignore spaces between two encoded "words". RFC2047 says the words
409 // should be concatenated without the space (designed for Asian
410 // sentences which have no spaces yet need to be broken into "words" to
411 // keep within the line length limits).
412 bool nonSpaceFound
= false;
413 for (int i
= 0; i
< charset
-tail
; i
++) {
414 if (!isspace (tail
[i
])) {
415 nonSpaceFound
= true;
419 if (!encodedWordFoundPreviously
|| nonSpaceFound
) {
420 if (string
!= tail
&& tail
!= charset
)
421 memmove(string
, tail
, charset
-tail
);
422 string
+= charset
-tail
;
425 encodedWordFoundPreviously
= true;
427 // move things to point at what they should:
428 // =?...charset...?e?...text...?= (e == encoding)
429 // ^charset ^encoding ^end
434 // find the charset this text is in now
435 size_t cLen
= encoding
- 1 - charset
;
436 bool base64encoded
= toupper(*encoding
) == 'B';
438 uint32 convertID
= B_MAIL_NULL_CONVERSION
;
439 char charsetName
[cLen
+ 1];
440 memcpy(charsetName
, charset
, cLen
);
441 charsetName
[cLen
] = '\0';
442 if (strcasecmp(charsetName
, "us-ascii") == 0) {
443 convertID
= B_MAIL_US_ASCII_CONVERSION
;
444 } else if (strcasecmp(charsetName
, "utf-8") == 0) {
445 convertID
= B_MAIL_UTF8_CONVERSION
;
447 const BCharacterSet
* charSet
448 = BCharacterSetRoster::FindCharacterSetByName(charsetName
);
449 if (charSet
!= NULL
) {
450 convertID
= charSet
->GetConversionID();
453 if (convertID
== B_MAIL_NULL_CONVERSION
) {
454 // unidentified charset
455 // what to do? doing nothing skips the encoded text;
456 // but we should keep it: we copy it to the output.
457 if (string
!= tail
&& tail
!= end
)
458 memmove(string
, tail
, end
-tail
);
462 // else we've successfully identified the charset
464 char *src
= encoding
+2;
465 int32 srcLen
= end
- 2 - src
;
466 // encoded text: src..src+srcLen
468 // decode text, get decoded length (reducing xforms)
469 srcLen
= !base64encoded
? decode_qp(src
, src
, srcLen
, 1)
470 : decode_base64(src
, src
, srcLen
);
472 // allocate space for the converted text
473 int32 dstLen
= end
-string
+ *bufLen
-strLen
;
474 char *dst
= (char*)malloc(dstLen
);
475 int32 cvLen
= srcLen
;
481 ret
= mail_convert_to_utf8(convertID
, src
, &cvLen
, dst
, &dstLen
,
484 // what to do? doing nothing skips the encoded text
485 // but we should keep it: we copy it to the output.
489 if (string
!= tail
&& tail
!= end
)
490 memmove(string
, tail
, end
-tail
);
494 /* convert_to_ is either returning something wrong or my
495 test data is screwed up. Whatever it is, Not Enough
496 Space is not the only cause of the below, so we just
497 assume it succeeds if it converts anything at all.
498 else if (cvLen < srcLen)
500 // not enough room to convert the data;
501 // grow *buf and retry
505 char *temp = (char*)realloc(*bufp, 2*(*bufLen + 1));
513 *bufLen = 2*(*bufLen + 1);
515 string = *bufp + (string-head);
516 tail = *bufp + (tail-head);
517 charset = *bufp + (charset-head);
518 encoding = *bufp + (encoding-head);
519 end = *bufp + (end-head);
520 src = *bufp + (src-head);
526 if (dstLen
> end
-string
) {
527 // copy the string forward...
528 memmove(string
+dstLen
, end
, strLen
- (end
-head
) + 1);
529 strLen
+= string
+dstLen
- end
;
530 end
= string
+ dstLen
;
533 memcpy(string
, dst
, dstLen
);
540 // copy everything that's left
541 size_t tailLen
= strLen
- (tail
- head
);
542 memmove(string
, tail
, tailLen
+1);
545 // replace the last char
546 (*bufp
)[strLen
] = lastChar
;
548 return ret
< B_OK
? ret
: string
-head
;
553 utf8_to_rfc2047 (char **bufp
, ssize_t length
, uint32 charset
, char encoding
)
556 BString originalWord
;
557 BString convertedWord
;
560 // Convert the word from UTF-8 to the desired character set. The
561 // converted version also includes the escape codes to return to ASCII
562 // mode, if relevant. Also note if it uses unprintable characters,
563 // which means it will need that special encoding treatment later.
564 void ConvertWordToCharset (uint32 charset
) {
566 int32 originalLength
= originalWord
.Length();
567 int32 convertedLength
= originalLength
* 5 + 1;
568 char *convertedBuffer
= convertedWord
.LockBuffer (convertedLength
);
569 mail_convert_from_utf8 (charset
, originalWord
.String(),
570 &originalLength
, convertedBuffer
, &convertedLength
, &state
);
571 for (int i
= 0; i
< convertedLength
; i
++) {
572 if ((convertedBuffer
[i
] & (1 << 7)) ||
573 (convertedBuffer
[i
] >= 0 && convertedBuffer
[i
] < 32)) {
574 needsEncoding
= true;
578 convertedWord
.UnlockBuffer (convertedLength
);
581 struct word
*currentWord
;
584 // Break the header into words. White space characters (including tabs and
585 // newlines) separate the words. Each word includes any space before it as
586 // part of the word. Actually, quotes and other special characters
587 // (",()<>@) are treated as separate words of their own so that they don't
588 // get encoded (because MIME headers get the quotes parsed before character
589 // set unconversion is done). The reader is supposed to ignore all white
590 // space between encoded words, which can be inserted so that older mail
591 // parsers don't have overly long line length problems.
593 const char *source
= *bufp
;
594 const char *bufEnd
= *bufp
+ length
;
595 const char *specialChars
= "\"()<>@,";
597 while (source
< bufEnd
) {
598 currentWord
= new struct word
;
599 currentWord
->needsEncoding
= false;
603 // Include leading spaces as part of the word.
604 while (source
+ wordEnd
< bufEnd
&& isspace (source
[wordEnd
]))
607 if (source
+ wordEnd
< bufEnd
&&
608 strchr (specialChars
, source
[wordEnd
]) != NULL
) {
609 // Got a quote mark or other special character, which is treated as
610 // a word in itself since it shouldn't be encoded, which would hide
611 // it from the mail system.
614 // Find the end of the word. Leave wordEnd pointing just after the
615 // last character in the word.
616 while (source
+ wordEnd
< bufEnd
) {
617 if (isspace(source
[wordEnd
]) ||
618 strchr (specialChars
, source
[wordEnd
]) != NULL
)
620 if (wordEnd
> 51 /* Makes Base64 ISO-2022-JP "word" a multiple of 4 bytes */ &&
621 0xC0 == (0xC0 & (unsigned int) source
[wordEnd
])) {
622 // No English words are that long (46 is the longest),
623 // break up what is likely Asian text (which has no spaces)
624 // at the start of the next non-ASCII UTF-8 character (high
625 // two bits are both ones). Note that two encoded words in
626 // a row get joined together, even if there is a space
627 // between them in the final output text, according to the
628 // standard. The next word will also conveniently get
629 // encoded due to the 0xC0 test.
630 currentWord
->needsEncoding
= true;
636 currentWord
->originalWord
.SetTo (source
, wordEnd
);
637 currentWord
->ConvertWordToCharset (charset
);
638 words
.AddItem(currentWord
);
642 // Combine adjacent words which contain unprintable text so that the
643 // overhead of switching back and forth between regular text and specially
644 // encoded text is reduced. However, the combined word must be shorter
645 // than the maximum of 75 bytes, including character set specification and
646 // all those delimiters (worst case 22 bytes of overhead).
650 for (int32 i
= 0; (currentWord
= (struct word
*) words
.ItemAt (i
)) != NULL
; i
++) {
651 if (!currentWord
->needsEncoding
)
652 continue; // No need to combine unencoded words.
653 for (int32 g
= i
+1; (run
= (struct word
*) words
.ItemAt (g
)) != NULL
; g
++) {
654 if (!run
->needsEncoding
)
655 break; // Don't want to combine encoded and unencoded words.
656 if ((currentWord
->convertedWord
.Length() + run
->convertedWord
.Length() <= 53)) {
657 currentWord
->originalWord
.Append (run
->originalWord
);
658 currentWord
->ConvertWordToCharset (charset
);
662 } else // Can't merge this word, result would be too long.
667 // Combine the encoded and unencoded words into one line, doing the
668 // quoted-printable or base64 encoding. Insert an extra space between
669 // words which are both encoded to make word wrapping easier, since there
670 // is normally none, and you're allowed to insert space (the receiver
671 // throws it away if it is between encoded words).
674 bool previousWordNeededEncoding
= false;
676 const char *charset_dec
= "none-bug";
677 for (int32 i
= 0; mail_charsets
[i
].charset
!= NULL
; i
++) {
678 if (mail_charsets
[i
].flavor
== charset
) {
679 charset_dec
= mail_charsets
[i
].charset
;
684 while ((currentWord
= (struct word
*)words
.RemoveItem((int32
)0)) != NULL
) {
685 if ((encoding
!= quoted_printable
&& encoding
!= base64
) ||
686 !currentWord
->needsEncoding
) {
687 rfc2047
.Append (currentWord
->convertedWord
);
689 // This word needs encoding. Try to insert a space between it and
690 // the previous word.
691 if (previousWordNeededEncoding
)
692 rfc2047
<< ' '; // Can insert as many spaces as you want between encoded words.
694 // Previous word is not encoded, spaces are significant. Try
695 // to move a space from the start of this word to be outside of
696 // the encoded text, so that there is a bit of space between
697 // this word and the previous one to enhance word wrapping
699 if (currentWord
->originalWord
.Length() > 1 &&
700 isspace (currentWord
->originalWord
[0])) {
701 rfc2047
<< currentWord
->originalWord
[0];
702 currentWord
->originalWord
.Remove (0 /* offset */, 1 /* length */);
703 currentWord
->ConvertWordToCharset (charset
);
707 char *encoded
= NULL
;
708 ssize_t encoded_len
= 0;
709 int32 convertedLength
= currentWord
->convertedWord
.Length ();
710 const char *convertedBuffer
= currentWord
->convertedWord
.String ();
713 case quoted_printable
:
714 encoded
= (char *) malloc (convertedLength
* 3);
715 encoded_len
= encode_qp (encoded
, convertedBuffer
, convertedLength
, true /* headerMode */);
718 encoded
= (char *) malloc (convertedLength
* 2);
719 encoded_len
= encode_base64 (encoded
, convertedBuffer
, convertedLength
, true /* headerMode */);
721 default: // Unknown encoding type, shouldn't happen.
722 encoded
= (char *) convertedBuffer
;
723 encoded_len
= convertedLength
;
727 rfc2047
<< "=?" << charset_dec
<< '?' << encoding
<< '?';
728 rfc2047
.Append (encoded
, encoded_len
);
731 if (encoding
== quoted_printable
|| encoding
== base64
)
734 previousWordNeededEncoding
= currentWord
->needsEncoding
;
740 ssize_t finalLength
= rfc2047
.Length ();
741 *bufp
= (char *) (malloc (finalLength
+ 1));
742 memcpy (*bufp
, rfc2047
.String(), finalLength
);
743 (*bufp
)[finalLength
] = 0;
750 FoldLineAtWhiteSpaceAndAddCRLF(BString
&string
)
752 int inputLength
= string
.Length();
754 const int maxLineLength
= 78; // Doesn't include CRLF.
761 // If we don't need to wrap the text, just output the remainder, if any.
763 if (lineStartIndex
+ maxLineLength
>= inputLength
) {
764 if (lineStartIndex
< inputLength
) {
765 output
.Insert (string
, lineStartIndex
/* source offset */,
766 inputLength
- lineStartIndex
/* count */,
767 output
.Length() /* insert at */);
768 output
.Append (CRLF
);
773 // Look ahead for a convenient spot to split it, between a comma and
774 // space, which you often see between e-mail addresses like this:
775 // "Joe Who" joe@dot.com, "Someone Else" else@blot.com
777 tempIndex
= lineStartIndex
+ maxLineLength
;
778 if (tempIndex
> inputLength
)
779 tempIndex
= inputLength
;
780 splitIndex
= string
.FindLast (", ", tempIndex
);
781 if (splitIndex
>= lineStartIndex
)
782 splitIndex
++; // Point to the space character.
784 // If none of those exist, try splitting at any white space.
786 if (splitIndex
<= lineStartIndex
)
787 splitIndex
= string
.FindLast (" ", tempIndex
);
788 if (splitIndex
<= lineStartIndex
)
789 splitIndex
= string
.FindLast ("\t", tempIndex
);
791 // If none of those exist, allow for a longer word - split at the next
792 // available white space.
794 if (splitIndex
<= lineStartIndex
)
795 splitIndex
= string
.FindFirst (" ", lineStartIndex
+ 1);
796 if (splitIndex
<= lineStartIndex
)
797 splitIndex
= string
.FindFirst ("\t", lineStartIndex
+ 1);
799 // Give up, the whole rest of the line can't be split, just dump it
802 if (splitIndex
<= lineStartIndex
) {
803 if (lineStartIndex
< inputLength
) {
804 output
.Insert (string
, lineStartIndex
/* source offset */,
805 inputLength
- lineStartIndex
/* count */,
806 output
.Length() /* insert at */);
807 output
.Append (CRLF
);
812 // Do the split. The current line up to but not including the space
813 // gets output, followed by a CRLF. The space remains to become the
814 // start of the next line (and that tells the message reader that it is
815 // a continuation line).
817 output
.Insert (string
, lineStartIndex
/* source offset */,
818 splitIndex
- lineStartIndex
/* count */,
819 output
.Length() /* insert at */);
820 output
.Append (CRLF
);
821 lineStartIndex
= splitIndex
;
823 string
.SetTo (output
);
828 readfoldedline(FILE *file
, char **buffer
, size_t *buflen
)
830 ssize_t len
= buflen
&& *buflen
? *buflen
: 0;
831 char * buf
= buffer
&& *buffer
? *buffer
: NULL
;
832 ssize_t cnt
= 0; // Number of characters currently in the buffer.
836 // Make sure there is space in the buffer for two more characters (one
837 // for the next character, and one for the end of string NUL byte).
838 if (buf
== NULL
|| cnt
+ 2 >= len
) {
839 char *temp
= (char *)realloc(buf
, len
+ 64);
841 // Out of memory, however existing buffer remains allocated.
849 // Read the next character, or end of file, or IO error.
850 if ((c
= fgetc(file
)) == EOF
) {
854 cnt
= -1; // Error codes must be negative.
856 // Really is end of file. Also make it end of line if there is
857 // some text already read in. If the first thing read was EOF,
858 // just return an empty string.
861 if (buf
[cnt
-2] == '\r') {
873 // Convert CRLF end of line to just a LF. Do it before folding, in
874 // case we don't need to fold.
875 if (cnt
>= 2 && buf
[cnt
-2] == '\r') {
879 // If the current line is empty then return it (so that empty lines
880 // don't disappear if the next line starts with a space).
883 // Fold if first character on the next line is whitespace.
884 c
= fgetc(file
); // Note it's OK to read EOF and ungetc it too.
885 if (c
== ' ' || c
== '\t')
886 buf
[cnt
-1] = c
; // Replace \n with the white space character.
888 // Not folding, we finished reading a line; break out of the loop
895 if (buf
!= NULL
&& cnt
>= 0)
911 readfoldedline(BPositionIO
&in
, char **buffer
, size_t *buflen
)
913 ssize_t len
= buflen
&& *buflen
? *buflen
: 0;
914 char * buf
= buffer
&& *buffer
? *buffer
: NULL
;
915 ssize_t cnt
= 0; // Number of characters currently in the buffer.
920 // Make sure there is space in the buffer for two more characters (one
921 // for the next character, and one for the end of string NUL byte).
922 if (buf
== NULL
|| cnt
+ 2 >= len
) {
923 char *temp
= (char *)realloc(buf
, len
+ 64);
925 // Out of memory, however existing buffer remains allocated.
933 errorCode
= in
.Read (&c
,1); // A really slow way of reading - unbuffered.
934 if (errorCode
!= 1) {
936 cnt
= errorCode
; // IO error encountered, just return the code.
938 // Really is end of file. Also make it end of line if there is
939 // some text already read in. If the first thing read was EOF,
940 // just return an empty string.
943 if (buf
[cnt
-2] == '\r') {
955 // Convert CRLF end of line to just a LF. Do it before folding, in
956 // case we don't need to fold.
957 if (cnt
>= 2 && buf
[cnt
-2] == '\r') {
961 // If the current line is empty then return it (so that empty lines
962 // don't disappear if the next line starts with a space).
965 // if first character on the next line is whitespace, fold lines
966 errorCode
= in
.Read(&c
,1);
967 if (errorCode
== 1) {
968 if (c
== ' ' || c
== '\t')
969 buf
[cnt
-1] = c
; // Replace \n with the white space character.
971 // Not folding, we finished reading a whole line.
972 in
.Seek(-1,SEEK_CUR
); // Undo the look-ahead character read.
975 } else if (errorCode
< 0) {
978 } else // No next line; at the end of the file. Return the line.
983 if (buf
!= NULL
&& cnt
>= 0)
999 nextfoldedline(const char** header
, char **buffer
, size_t *buflen
)
1001 ssize_t len
= buflen
&& *buflen
? *buflen
: 0;
1002 char * buf
= buffer
&& *buffer
? *buffer
: NULL
;
1003 ssize_t cnt
= 0; // Number of characters currently in the buffer.
1008 // Make sure there is space in the buffer for two more characters (one
1009 // for the next character, and one for the end of string NUL byte).
1010 if (buf
== NULL
|| cnt
+ 2 >= len
)
1012 char *temp
= (char *)realloc(buf
, len
+ 64);
1014 // Out of memory, however existing buffer remains allocated.
1022 // Read the next character, or end of file.
1023 if ((c
= *(*header
)++) == 0) {
1024 // End of file. Also make it end of line if there is some text
1025 // already read in. If the first thing read was EOF, just return
1029 if (buf
[cnt
-2] == '\r') {
1040 // Convert CRLF end of line to just a LF. Do it before folding, in
1041 // case we don't need to fold.
1042 if (cnt
>= 2 && buf
[cnt
-2] == '\r') {
1046 // If the current line is empty then return it (so that empty lines
1047 // don't disappear if the next line starts with a space).
1050 // if first character on the next line is whitespace, fold lines
1052 if (c
== ' ' || c
== '\t')
1053 buf
[cnt
-1] = c
; // Replace \n with the white space character.
1055 // Not folding, we finished reading a line; break out of the loop
1056 (*header
)--; // Undo read of the non-whitespace.
1063 if (buf
!= NULL
&& cnt
>= 0)
1079 trim_white_space(BString
&string
)
1082 int32 length
= string
.Length();
1083 char *buffer
= string
.LockBuffer(length
+ 1);
1085 while (length
> 0 && isspace(buffer
[length
- 1]))
1087 buffer
[length
] = '\0';
1089 for (i
= 0; buffer
[i
] && isspace(buffer
[i
]); i
++) {}
1092 memmove(buffer
,buffer
+ i
,length
+ 1);
1094 string
.UnlockBuffer(length
);
1098 /*! Tries to return a human-readable name from the specified
1099 header parameter (should be from "To:" or "From:").
1100 Tries to return the name rather than the e-mail address.
1103 extract_address_name(BString
&header
)
1106 const char *start
= header
.String();
1107 const char *stop
= start
+ strlen (start
);
1109 // Find a string S in the header (email foo) that matches:
1110 // Old style name in brackets: foo@bar.com (S)
1111 // New style quotes: "S" <foo@bar.com>
1112 // New style no quotes if nothing else found: S <foo@bar.com>
1113 // If nothing else found then use the whole thing: S
1115 for (int i
= 0; i
<= 3; i
++) {
1116 // Set p1 to the first letter in the name and p2 to just past the last
1117 // letter in the name. p2 stays NULL if a name wasn't found in this
1119 const char *p1
= NULL
, *p2
= NULL
;
1122 case 0: // foo@bar.com (S)
1123 if ((p1
= strchr(start
,'(')) != NULL
) {
1124 p1
++; // Advance to first letter in the name.
1125 size_t nest
= 1; // Handle nested brackets.
1126 for (p2
= p1
; p2
< stop
; ++p2
)
1130 else if (*p2
== '(')
1136 p2
= NULL
; // False alarm, no terminating bracket.
1139 case 1: // "S" <foo@bar.com>
1140 if ((p1
= strchr(start
, '\"')) != NULL
)
1141 p2
= strchr(++p1
, '\"');
1143 case 2: // S <foo@bar.com>
1145 if (name
.Length() == 0)
1146 p2
= strchr(start
, '<');
1150 if (name
.Length() == 0)
1155 // Remove leading and trailing space-like characters and save the
1156 // result if it is longer than any other likely names found.
1158 while (p1
< p2
&& (isspace (*p1
)))
1161 while (p1
< p2
&& (isspace (p2
[-1])))
1164 int newLength
= p2
- p1
;
1165 if (name
.Length() < newLength
)
1166 name
.SetTo(p1
, newLength
);
1170 int32 lessIndex
= name
.FindFirst('<');
1171 int32 greaterIndex
= name
.FindLast('>');
1173 if (lessIndex
== 0) {
1174 // Have an address of the form <address> and nothing else, so remove
1175 // the greater and less than signs, if any.
1176 if (greaterIndex
> 0)
1177 name
.Remove(greaterIndex
, 1);
1178 name
.Remove(lessIndex
, 1);
1179 } else if (lessIndex
> 0 && lessIndex
< greaterIndex
) {
1180 // Yahoo stupidly inserts the e-mail address into the name string, so
1181 // this bit of code fixes: "Joe <joe@yahoo.com>" <joe@yahoo.com>
1182 name
.Remove(lessIndex
, greaterIndex
- lessIndex
+ 1);
1185 trim_white_space(name
);
1190 /*! Given a subject in a BString, remove the extraneous "RE:"/"re:" markers
1191 and other decoration to get down to the core subject string, which should be
1192 identical for all messages posted about a topic. The input string is
1193 modified in place to become the output core subject string.
\param string The subject text; it is rewritten in place and has its leading
and trailing white space trimmed as the final step.
1196 SubjectToThread (BString
&string
)
1198 // regex alternatives that each match one non-ASCII (multi-byte) UTF8
// character, from two up to six bytes long; spliced into the pattern as U8C:
1200 "[\302-\337][\200-\277]" \
1201 "|\340[\302-\337][\200-\277]" \
1202 "|[\341-\357][\200-\277][\200-\277]" \
1203 "|\360[\220-\277][\200-\277][\200-\277]" \
1204 "|[\361-\367][\200-\277][\200-\277][\200-\277]" \
1205 "|\370[\210-\277][\200-\277][\200-\277][\200-\277]" \
1206 "|[\371-\373][\200-\277][\200-\277][\200-\277][\200-\277]" \
1207 "|\374[\204-\277][\200-\277][\200-\277][\200-\277][\200-\277]" \
1208 "|\375[\200-\277][\200-\277][\200-\277][\200-\277][\200-\277]"
1212 "|^(\\[[^]]*\\])(\\<| +| *(\\<(\\w|" U8C "){2,3} *(\\[[^\\]]*\\])? *:)+ *)" \
1213 "|^( +| *(\\<(\\w|" U8C "){2,3} *(\\[[^\\]]*\\])? *:)+ *)" \
1216 if (gRebuf
== NULL
&& atomic_add(&gLocker
, 1) == 0) {
1217 // the regexp is compiled only once, by the first caller that gets here,
// to speed up all later calls
// translation table: identity mapping, except that 'a'..'z' map to upper
// case, so that matching ignores the case of ASCII letters
1219 for (int i
=0; i
<256; ++i
) gTranslation
[i
]=i
;
1220 for (int i
='a'; i
<='z'; ++i
) gTranslation
[i
]=toupper(i
);
1222 gRe
.translate
= gTranslation
;
1223 gRe
.regs_allocated
= REGS_FIXED
;
1224 re_syntax_options
= RE_SYNTAX_POSIX_EXTENDED
;
1226 const char *pattern
= PATTERN
;
1227 // count subexpressions in PATTERN; a backslash is tested for before '('
1228 for (unsigned int i
=0; pattern
[i
] != 0; ++i
)
1230 if (pattern
[i
] == '\\')
1232 else if (pattern
[i
] == '(')
1236 const char *err
= re_compile_pattern(pattern
,strlen(pattern
),&gRe
);
1240 fprintf(stderr
, "Failed to compile the regex: %s\n", err
);
// poll until gRebuf has been set, giving up once `tries` runs out
1243 while (gRebuf
== NULL
&& tries
-- > 0)
1248 struct re_registers regs
;
1249 // can't be static if this function is to be thread-safe; both register
// arrays are allocated per call with gNsub entries each
1251 regs
.num_regs
= gNsub
;
1252 regs
.start
= (regoff_t
*)malloc(gNsub
*sizeof(regoff_t
));
1253 regs
.end
= (regoff_t
*)malloc(gNsub
*sizeof(regoff_t
));
// search the whole string again after every removal, until nothing matches
1255 for (int start
= 0; (start
= re_search(gRebuf
, string
.String(),
1256 string
.Length(), 0, string
.Length(), ®s
)) >= 0;) {
1258 // we found something; `start` is the offset where the match begins
1261 // don't delete [bemaildaemon]...: when the match begins with the first
// group, start removing at the second group instead
1262 if (start
== regs
.start
[1])
1263 start
= regs
.start
[2];
// cut everything from `start` up to the end of the whole match
1265 string
.Remove(start
,regs
.end
[0]-start
);
1267 string
.Insert(' ',1,start
);
1269 // TODO: for some subjects this results in an endless loop; the test below
// catches removals of at most one character. Find out why this happens.
1271 if (regs
.end
[0] - start
<= 1)
1279 // Finally remove leading and trailing space. Some software, like
1280 // tm-edit 1.8, appends a space to the subject, which would break
1281 // threading if we left it in.
1282 trim_white_space(string
);
1286 /*! Converts a date to a time. Handles numeric time zones too, unlike
1287 parsedate(). Returns -1 if it fails.
1290 ParseDateWithTimeZone(const char *DateString
)
1294 char tempDateString
[80];
1295 char tempZoneString
[6];
1296 time_t zoneDeltaTime
;
1300 // See if we can remove the time zone portion. parsedate understands time
1301 // zone 3 letter names, but doesn't understand the numeric +9999 time zone
1302 // format. To do: see if a newer parsedate exists.
1304 strncpy (tempDateString
, DateString
, sizeof (tempDateString
));
1305 tempDateString
[sizeof (tempDateString
) - 1] = 0;
1307 // Remove trailing spaces.
1308 zonePntr
= tempDateString
+ strlen (tempDateString
) - 1;
1309 while (zonePntr
>= tempDateString
&& isspace (*zonePntr
))
1311 if (zonePntr
< tempDateString
)
1312 return -1; // Empty string.
1314 // Remove the trailing time zone in round brackets, like in
1315 // Fri, 22 Feb 2002 15:22:42 EST (-0500)
1316 // Thu, 25 Apr 1996 11:44:19 -0400 (EDT)
1317 if (tempDateString
[strlen(tempDateString
)-1] == ')')
1319 zonePntr
= strrchr (tempDateString
, '(');
1320 if (zonePntr
!= NULL
)
1322 *zonePntr
-- = 0; // Zap the '(', then remove trailing spaces.
1323 while (zonePntr
>= tempDateString
&& isspace (*zonePntr
))
1325 if (zonePntr
< tempDateString
)
1326 return -1; // Empty string.
1330 // Look for a numeric time zone like Tue, 30 Dec 2003 05:01:40 +0000
1331 for (zoneIndex
= strlen (tempDateString
); zoneIndex
>= 0; zoneIndex
--)
1333 zonePntr
= tempDateString
+ zoneIndex
;
1334 if (zonePntr
[0] == '+' || zonePntr
[0] == '-')
1336 if (zonePntr
[1] >= '0' && zonePntr
[1] <= '9' &&
1337 zonePntr
[2] >= '0' && zonePntr
[2] <= '9' &&
1338 zonePntr
[3] >= '0' && zonePntr
[3] <= '9' &&
1339 zonePntr
[4] >= '0' && zonePntr
[4] <= '9')
1345 // Remove the zone from the date string and any following time zone
1346 // letter codes. Also put in GMT so that the date gets parsed as GMT.
1347 memcpy (tempZoneString
, zonePntr
, 5);
1348 tempZoneString
[5] = 0;
1349 strcpy (zonePntr
, "GMT");
1351 else // No numeric time zone found.
1352 strcpy (tempZoneString
, "+0000");
1354 time (¤tTime
);
1355 dateAsTime
= parsedate (tempDateString
, currentTime
);
1356 if (dateAsTime
== (time_t) -1)
1357 return -1; // Failure.
1359 zoneDeltaTime
= 60 * atol (tempZoneString
+ 3); // Get the last two digits - minutes.
1360 tempZoneString
[3] = 0;
1361 zoneDeltaTime
+= atol (tempZoneString
+ 1) * 60 * 60; // Get the first two digits - hours.
1362 if (tempZoneString
[0] == '+')
1363 zoneDeltaTime
= 0 - zoneDeltaTime
;
1364 dateAsTime
+= zoneDeltaTime
;
1370 /*! Parses a mail header and fills the headers BMessage
1373 parse_header(BMessage
&headers
, BPositionIO
&input
)
1375 char *buffer
= NULL
;
1376 size_t bufferSize
= 0;
1379 while ((length
= readfoldedline(input
, &buffer
, &bufferSize
)) >= 2) {
1381 // Don't include the \n at the end of the buffer.
1383 // convert to UTF-8 and null-terminate the buffer
1384 length
= rfc2047_to_utf8(&buffer
, &bufferSize
, length
);
1385 buffer
[length
] = '\0';
1387 const char *delimiter
= strstr(buffer
, ":");
1388 if (delimiter
== NULL
)
1391 BString
header(buffer
, delimiter
- buffer
);
1392 header
.CapitalizeEachWord();
1393 // unified case for later fetch
1395 delimiter
++; // Skip the colon.
1396 // Skip over leading white space and tabs.
1397 // TODO: (comments in brackets).
1398 while (isspace(*delimiter
))
1401 // TODO: implement joining of multiple header tags (i.e. multiple "Cc:"s)
1402 headers
.AddString(header
.String(), delimiter
);
1411 extract_from_header(const BString
& header
, const BString
& field
,
1414 int32 headerLength
= header
.Length();
1415 int32 fieldEndPos
= 0;
1417 int32 pos
= header
.IFindFirst(field
, fieldEndPos
);
1420 fieldEndPos
= pos
+ field
.Length();
1422 if (pos
!= 0 && header
.ByteAt(pos
- 1) != '\n')
1424 if (header
.ByteAt(fieldEndPos
) == ':')
1429 int32 crPos
= fieldEndPos
;
1431 fieldEndPos
= crPos
;
1432 crPos
= header
.FindFirst('\n', crPos
);
1434 crPos
= headerLength
;
1436 header
.CopyInto(temp
, fieldEndPos
, crPos
- fieldEndPos
);
1437 if (header
.ByteAt(crPos
- 1) == '\r') {
1438 temp
.Truncate(temp
.Length() - 1);
1443 if (crPos
>= headerLength
)
1445 char nextByte
= header
.ByteAt(crPos
);
1446 if (nextByte
!= ' ' && nextByte
!= '\t')
1451 size_t bufferSize
= target
.Length();
1452 char* buffer
= target
.LockBuffer(bufferSize
);
1453 size_t length
= rfc2047_to_utf8(&buffer
, &bufferSize
, bufferSize
);
1454 target
.UnlockBuffer(length
);
1456 trim_white_space(target
);
1463 extract_address(BString
&address
)
1465 const char *string
= address
.String();
1468 // first, remove all quoted text
1470 if ((first
= address
.FindFirst('"')) >= 0) {
1471 int32 last
= first
+ 1;
1472 while (string
[last
] && string
[last
] != '"')
1475 if (string
[last
] == '"')
1476 address
.Remove(first
, last
+ 1 - first
);
1479 // try to extract the address now
1481 if ((first
= address
.FindFirst('<')) >= 0) {
1482 // the world likes us and we can just get the address the easy way...
1483 int32 last
= address
.FindFirst('>');
1485 address
.Truncate(last
);
1486 address
.Remove(0, first
+ 1);
1492 // then, see if there is anything in parenthesis to throw away
1494 if ((first
= address
.FindFirst('(')) >= 0) {
1495 int32 last
= first
+ 1;
1496 while (string
[last
] && string
[last
] != ')')
1499 if (string
[last
] == ')')
1500 address
.Remove(first
, last
+ 1 - first
);
1503 // now, there shouldn't be much else left
1505 trim_white_space(address
);
1510 get_address_list(BList
&list
, const char *string
,
1511 void (*cleanupFunc
)(BString
&))
1513 if (string
== NULL
|| !string
[0])
1516 const char *start
= string
;
1519 if (string
[0] == '"') {
1520 const char *quoteEnd
= ++string
;
1522 while (quoteEnd
[0] && quoteEnd
[0] != '"')
1525 if (!quoteEnd
[0]) // string exceeds line!
1528 string
= quoteEnd
+ 1;
1531 if (string
[0] == ',' || string
[0] == '\0') {
1532 BString
address(start
, string
- start
);
1533 trim_white_space(address
);
1536 cleanupFunc(address
);
1538 list
.AddItem(strdup(address
.String()));